diff --git a/functions.py b/functions.py
new file mode 100644
index 0000000..5321941
--- /dev/null
+++ b/functions.py
@@ -0,0 +1,26 @@
+from bs4 import BeautifulSoup
+import html, re
+
+def extract_post(post):
+    post = html.unescape(post) # convert HTML escape codes to text
+    soup = BeautifulSoup(post, "html.parser")
+    for lb in soup.select("br"): # replace <br> with linebreak
+        lb.insert_after("\n")
+        lb.decompose()
+
+    for p in soup.select("p"): # ditto for <p>
+        p.insert_after("\n")
+        p.unwrap()
+
+    for ht in soup.select("a.hashtag"): # convert hashtags from links to text
+        ht.unwrap()
+
+    for link in soup.select("a"): # convert <a href="https://example.com">example.com</a> to just https://example.com
+        link.insert_after(link["href"])
+        link.decompose()
+
+    text = soup.get_text()
+    text = re.sub(r"https://([^/]+)/(@[^\s]+)", r"\2@\1", text) # put mastodon-style mentions back in
+    text = re.sub(r"https://([^/]+)/users/([^\s]+)", r"@\2@\1", text) # put pleroma-style mentions back in
+    text = text.rstrip("\n") # remove trailing newline(s)
+    return text
diff --git a/scrape.py b/scrape.py
--- a/scrape.py
+++ b/scrape.py
@@ -1,11 +1,23 @@
 import MySQLdb
 import requests
 import re
+import functions
+from multiprocessing import Pool
 
 def scrape_posts(handle, outbox):
+    c = db.cursor()
+    last_post = 0
+    c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
+    if c.fetchone()[0] > 0:
+        # we've downloaded this user's posts before
+        # find out the most recently downloaded post of theirs
+        c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
+        last_post = c.fetchone()[0]
+
     r = requests.get(outbox)
     j = r.json()
+
     # check for pleroma
     pleroma = 'next' not in j
     if pleroma:
         j = j['first']
@@ -20,6 +32,58 @@ def scrape_posts(handle, outbox):
         r = requests.get(uri)
         j = r.json()
 
+    # here we go!
+    # warning: scraping posts from outbox.json is messy stuff
+    done = False
+    while not done and len(j['orderedItems']) > 0:
+        for oi in j['orderedItems']:
+            if oi['type'] == "Create":
+                # this is a status/post/toot/florp/whatever
+                # first, check to see if we already have this in the database
+                post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
+                c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
+                if c.fetchone()[0] > 0:
+                    # this post is already in the DB.
+                    # we'll set done to true because we've caught up to where we were last time.
+                    done = True
+                    # we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
+                    continue
+
+                content = oi['object']['content']
+                # remove HTML tags and such from post
+                content = functions.extract_post(content)
+
+                if len(content) > 65535:
+                    # post is too long to go into the DB
+                    continue
+
+                try:
+                    c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
+                        handle,
+                        post_id,
+                        content,
+                        1 if (oi['object']['summary'] is not None and oi['object']['summary'] != "") else 0
+                    ))
+                except:
+                    #TODO: error handling
+                    raise
+
+        if not done:
+            if pleroma:
+                r = requests.get(j['next'], timeout = 10)
+            else:
+                r = requests.get(j['prev'], timeout = 10)
+
+            if r.status_code == 429:
+                # we are now being ratelimited, move on to the next user
+                done = True
+            else:
+                j = r.json()
+
+    db.commit()
+    c.close()
+    print("Finished {}".format(handle))
+
 print("Establishing DB connection")
 db = MySQLdb.connect(
     host = cfg['db_host'],
@@ -28,6 +92,11 @@ db = MySQLdb.connect(
     db=cfg['db_name']
 )
 
-c = db.cursor()
-
 print("Downloading posts")
+
+cursor = db.cursor()
+cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts`")
+accounts = cursor.fetchall()
+cursor.close()
+with Pool(8) as p:
+    p.starmap(scrape_posts, accounts) # starmap unpacks each (handle, outbox) row into the two arguments
diff --git a/webui.py b/webui.py
index e6c6800..0aac6c6 100644
--- a/webui.py
+++ b/webui.py
@@ -214,7 +214,7 @@ def bot_accounts_delete(id):
     c.close()
     mysql.connection.commit()
 
-    return redirect(url_for("/bot/accounts/{}".format(session['bot'])), 303)
+    return redirect("/bot/accounts/{}".format(session['bot']), 303)
 
 @app.route("/bot/create/", methods=['GET', 'POST'])
 def bot_create():
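
Note: a quick way to sanity-check the extract_post added in functions.py above. The sample status HTML is made up for illustration, but it exercises every branch (br, p, hashtag link, plain link, mention rewriting):

    from functions import extract_post

    sample = ('<p>hello <a class="hashtag" href="https://example.com/tags/test">#test</a><br>'
              'see <a href="https://example.com/@user">https://example.com/@user</a></p>')
    print(extract_post(sample))
    # expected output:
    # hello #test
    # see @user@example.com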
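
For reference, the while-loop in scrape_posts walks ActivityPub OrderedCollection pages. Trimmed to just the fields this code touches, a page looks roughly like the sketch below; the URLs and values are illustrative, and the exact layout differs between Mastodon and Pleroma (Pleroma embeds the first page in the outbox itself, which is what the j['first'] branch handles):

    {
      "type": "OrderedCollectionPage",
      "next": "https://example.com/users/alice/outbox?page=true&max_id=101",
      "prev": "https://example.com/users/alice/outbox?page=true&min_id=164",
      "orderedItems": [
        {
          "type": "Create",
          "object": {
            "id": "https://example.com/users/alice/statuses/123",
            "summary": null,
            "content": "<p>the post body, as rendered HTML</p>"
          }
        }
      ]
    }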
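
One caveat on the Pool(8) block in scrape.py: the forked workers all inherit the parent's single MySQLdb connection, and MySQL connections are not fork-safe, so concurrent scrapes can trample each other's protocol state. A minimal sketch of a safer per-worker setup, assuming the same cfg dict; the db_user/db_pass key names are guesses, since those lines fall outside the hunks shown above:

    from multiprocessing import Pool
    import MySQLdb

    db = None # one connection per worker process

    def init_worker():
        # runs once in each worker, giving it a private connection
        global db
        db = MySQLdb.connect(
            host=cfg['db_host'],
            user=cfg['db_user'],   # assumed key name
            passwd=cfg['db_pass'], # assumed key name
            db=cfg['db_name']
        )

    with Pool(8, initializer=init_worker) as p:
        p.starmap(scrape_posts, accounts)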