Compare commits

...

6 Commits

3 changed files with 41 additions and 24 deletions


@@ -25,7 +25,9 @@ def extract_post(post):
         ht.unwrap()
     for link in soup.select("a"): # convert <a href='https://example.com'>example.com</a> to just https://example.com
-        link.insert_after(link["href"])
+        if 'href' in link:
+            # apparently not all a tags have a href, which is understandable if you're doing normal web stuff, but on a social media platform??
+            link.insert_after(link["href"])
         link.decompose()
     text = soup.get_text()
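For context, a self-contained sketch of the link-flattening step this hunk guards (the function name and sample input are invented for illustration). One caveat: for a BeautifulSoup Tag, 'href' in link tests the tag's children, not its attributes, so link.attrs is the dependable membership test:

from bs4 import BeautifulSoup

def flatten_links(html):
    # Replace every <a href="...">text</a> with its bare URL.
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.select("a"):
        # Not every <a> has a href; checking link.attrs is reliable here,
        # whereas `'href' in link` searches the tag's *children* in bs4.
        if 'href' in link.attrs:
            link.insert_after(link["href"])
        link.decompose()
    return soup.get_text()

print(flatten_links("see <a href='https://example.com'>example.com</a> and <a>this</a>"))
# prints: see https://example.com and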

app/scrape.py Normal file → Executable file

@@ -30,22 +30,27 @@ def scrape_posts(account):
     c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
     last_post = c.fetchone()[0]
-    r = requests.get(outbox)
-    j = r.json()
-    # check for pleroma
-    pleroma = 'next' not in j
-    if pleroma:
-        if 'first' in j:
-            # backwards compatibility for older (pre-v1.0.7) pleroma instances
-            j = j['first']
-    else:
-        uri = "{}&min_id={}".format(outbox, last_post)
-        r = requests.get(uri)
+    done = False
+    try:
+        r = requests.get(outbox, timeout = 10)
+        j = r.json()
+        # check for pleroma
+        pleroma = 'next' not in j
+        if pleroma:
+            if 'first' in j:
+                # backwards compatibility for older (pre-v1.0.7) pleroma instances
+                j = j['first']
+        else:
+            uri = "{}&min_id={}".format(outbox, last_post)
+            r = requests.get(uri, timeout = 10)
+            j = r.json()
+    except:
+        print("Couldn't load or parse outbox at URL {}".format(outbox))
+        done = True
     # here we go!
     # warning: scraping posts from outbox.json is messy stuff
-    done = False
     while not done and len(j['orderedItems']) > 0:
         for oi in j['orderedItems']:
             if oi['type'] == "Create":
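A minimal sketch of the outbox fetch this hunk hardens, factored into a hypothetical helper (fetch_outbox and the narrower exception types are my choices, not the repo's; the diff's bare except: would also swallow KeyboardInterrupt and the like):

import requests

def fetch_outbox(outbox, last_post):
    # Hypothetical helper: load the first page of an ActivityPub outbox.
    try:
        j = requests.get(outbox, timeout=10).json()
        pleroma = 'next' not in j          # Pleroma outboxes lack a 'next' link
        if pleroma:
            if 'first' in j:
                # pre-v1.0.7 Pleroma inlines the first page
                j = j['first']
        else:
            # Mastodon: ask only for posts newer than the last one we stored
            uri = "{}&min_id={}".format(outbox, last_post)
            j = requests.get(uri, timeout=10).json()
        return j, pleroma
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts,
        # ValueError covers unparseable JSON
        print("Couldn't load or parse outbox at URL {}".format(outbox))
        return None, False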
@@ -78,19 +83,24 @@ def scrape_posts(account):
                     ))
                 except:
                     #TODO: error handling
-                    raise
+                    print("Failed to insert post {} for user {}".format(post_id, handle))
     if not done:
-        if pleroma:
-            if 'next' in j:
-                r = requests.get(j['next'], timeout = 10)
-            else:
-                done = True
-        else:
-            if 'prev' in j:
-                r = requests.get(j['prev'], timeout = 10)
-            else:
-                done = True
+        try:
+            if pleroma:
+                if 'next' in j:
+                    r = requests.get(j['next'], timeout = 10)
+                else:
+                    done = True
+            else:
+                if 'prev' in j:
+                    r = requests.get(j['prev'], timeout = 10)
+                else:
+                    done = True
+        except requests.Timeout:
+            print("Timed out while loading next page for {}".format(handle))
+        except:
+            print("Encountered unknown error while getting next page for {}".format(handle))
     if r.status_code == 429:
         # we are now being ratelimited, move on to the next user
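A sketch of the pagination step the second half of this hunk wraps in try/except, again as a hypothetical helper (next_page is invented, and the 429 check lives outside this logic in the diff). Note requests.Timeout subclasses RequestException, so the more specific handler must come first:

import requests

def next_page(j, pleroma, handle):
    # Follow outbox pagination: in this scheme Pleroma pages link via
    # 'next' and Mastodon pages via 'prev'. Returns the next page's
    # JSON, or None when there is nothing left or the request fails.
    key = 'next' if pleroma else 'prev'
    if key not in j:
        return None
    try:
        r = requests.get(j[key], timeout=10)
        if r.status_code == 429:
            # rate limited; give up on this user
            print("Rate limited while scraping {}".format(handle))
            return None
        return r.json()
    except requests.Timeout:
        print("Timed out while loading next page for {}".format(handle))
    except requests.RequestException:
        print("Encountered an error while getting next page for {}".format(handle))
    return None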


@@ -1,4 +1,4 @@
-from flask import Flask, render_template, session, request, redirect, url_for, send_file
+from flask import Flask, render_template, session, request, redirect, url_for, send_file, jsonify
 from flask_mysqldb import MySQL
 from mastodon import Mastodon
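The only change here brings jsonify into scope; the route that uses it is not part of this compare. As a generic illustration of what jsonify provides (the route and payload are invented):

from flask import Flask, jsonify

app = Flask(__name__)

@app.route("/api/ping")   # invented route, purely for illustration
def ping():
    # jsonify serialises the dict and sets Content-Type: application/json
    return jsonify({"status": "ok"})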
@@ -370,6 +370,11 @@ def img_bot_generic():
 def favicon():
     return send_file("static/favicon.ico")
 
+# @app.route("/.well-known/webfinger")
+# def webfinger():
+#     return render_template("webfinger.json", base_uri = cfg['base_uri']), 200, {'Content-type':'application/json'}
+
 def bot_check(bot):
     # check to ensure bot is owned by user
     c = mysql.connection.cursor()
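The new commented-out block stubs a WebFinger endpoint. A self-contained sketch of what such a route could look like if enabled; the base_uri value and response fields below are placeholders, and the commented code renders a webfinger.json template instead of building the document inline:

from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/.well-known/webfinger")
def webfinger():
    # Placeholder values; the real route would use cfg['base_uri'].
    base_uri = "https://bots.example.com"
    doc = {
        "subject": request.args.get("resource", ""),
        "links": [{
            "rel": "self",
            "type": "application/activity+json",
            "href": "{}/actor".format(base_uri),
        }],
    }
    # WebFinger responses are conventionally served as application/jrd+json
    return jsonify(doc), 200, {'Content-Type': 'application/jrd+json'}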