diff --git a/functions.py b/functions.py
new file mode 100644
index 0000000..5321941
--- /dev/null
+++ b/functions.py
@@ -0,0 +1,26 @@
+from bs4 import BeautifulSoup
+import html, re
+
+def extract_post(post):
+    post = html.unescape(post) # convert HTML escape codes to text
+    soup = BeautifulSoup(post, "html.parser")
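+    # e.g. "<p>hi<br>there</p>" ends up as "hi\nthere" by the time we're done here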
+    for lb in soup.select("br"): # replace <br> with linebreak
+        lb.insert_after("\n")
+        lb.decompose()
+
+    for p in soup.select("p"): # ditto for <p>
+        p.insert_after("\n")
+        p.unwrap()
+
+    for ht in soup.select("a.hashtag"): # convert hashtags from links to text
+        ht.unwrap()
+
+    for link in soup.select("a"): # convert <a href="https://example.com">example.com</a> links to their URLs
+        link.insert_after(link["href"])
+        link.decompose()
+
+    text = soup.get_text()
+    text = re.sub(r"https://([^/]+)/(@[^ ]+)", r"\2@\1", text) # put mastodon-style mentions back in
+    text = re.sub(r"https://([^/]+)/users/([^ ]+)", r"@\2@\1", text) # put pleroma-style mentions back in
+    text = text.rstrip("\n") # remove trailing newline(s)
+    return text
diff --git a/scrape.py b/scrape.py
--- a/scrape.py
+++ b/scrape.py
@@ -1,3 +1,5 @@
+from multiprocessing import Pool
import requests
import MySQLdb
import json, re
+import functions
@@ -9,5 +11,15 @@ def scrape_posts(handle, outbox):
+    c = db.cursor()
+    last_post = 0
+    c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
+    if c.fetchone()[0] > 0:
+        # we've downloaded this user's posts before
+        # find out the most recently downloaded post of theirs
+        c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
+        last_post = c.fetchone()[0]
+
    r = requests.get(outbox)
    j = r.json()
+    # check for pleroma
    pleroma = 'next' not in j
    if pleroma:
        j = j['first']
@@ -20,6 +32,58 @@ def scrape_posts(handle, outbox):
    r = requests.get(uri)
    j = r.json()
+    # here we go!
+    # warning: scraping posts from outbox.json is messy stuff
+    done = False
+    while not done and len(j['orderedItems']) > 0:
+        for oi in j['orderedItems']:
+            if oi['type'] == "Create":
+                # this is a status/post/toot/florp/whatever
+                # first, check to see if we already have this in the database
+                post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
+                c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
+                if c.fetchone()[0] > 0:
+                    # this post is already in the DB.
+                    # we'll set done to true because we've caught up to where we were last time.
+                    done = True
+                    # we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
+                    continue
+
+                content = oi['object']['content']
+                # remove HTML tags and such from post
+                content = functions.extract_post(content)
+
+                if len(content) > 65535:
+                    # post is too long to go into the DB
+                    continue
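+                    # (65535 is the byte limit of a MySQL TEXT column; len() counts
+                    # characters, so a post full of multi-byte emoji could still be too big)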
+
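+                # mastodon and pleroma put content warning text in the ActivityPub
+                # 'summary' field, so the 'cw' flag below just records whether one was present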
+                try:
+                    c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
+                        handle,
+                        post_id,
+                        content,
+                        1 if (oi['object']['summary'] is not None and oi['object']['summary'] != "") else 0
+                    ))
+                except:
+                    # TODO: error handling
+                    raise
+
+        if not done:
+            if pleroma:
+                r = requests.get(j['next'], timeout = 10)
+            else:
+                r = requests.get(j['prev'], timeout = 10)
+
+            if r.status_code == 429:
+                # we are now being ratelimited, move on to the next user
+                done = True
+            else:
+                j = r.json()
+
+    db.commit()
+    c.close()
+    print("Finished {}".format(handle))
+
print("Establishing DB connection")
db = MySQLdb.connect(
    host = cfg['db_host'],
@@ -28,6 +92,11 @@ db = MySQLdb.connect(
    db=cfg['db_name']
)
-c = db.cursor()
-
print("Downloading posts")
+
+cursor = db.cursor()
+cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts`")
+accounts = cursor.fetchall()
+cursor.close()
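+# note: every Pool worker inherits the module-level MySQL connection across fork(),
+# and MySQLdb connections aren't safe to share between processes; if this misbehaves,
+# each worker may need to open its own connection inside scrape_posts()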
+with Pool(8) as p:
+    p.starmap(scrape_posts, accounts) # each row is a (handle, outbox) tuple, which starmap unpacks
diff --git a/webui.py b/webui.py
index e6c6800..0aac6c6 100644
--- a/webui.py
+++ b/webui.py
@@ -214,7 +214,7 @@ def bot_accounts_delete(id):
    c.close()
    mysql.connection.commit()
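+    # note: Flask's url_for() expects an endpoint name rather than a URL path,
+    # which is why the fixed version redirects to the path directly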
-    return redirect(url_for("/bot/accounts/{}".format(session['bot'])), 303)
+    return redirect("/bot/accounts/{}".format(session['bot']), 303)
@app.route("/bot/create/", methods=['GET', 'POST'])
def bot_create():