scrape user posts

2025-08-01 08:34:48 +00:00 · 2019-09-06 12:38:50 +10:00 · 2019-09-06 12:38:50 +10:00 · 9ccd5586bf
commit 9ccd5586bf
parent dd3679d8db
3 changed files with 101 additions and 6 deletions
--- a/functions.py
+++ b/functions.py
@@ -0,0 +1,26 @@
+from bs4 import BeautifulSoup
+import html, re
+
+def extract_post(post):
+	post = html.unescape(post) # convert HTML escape codes to text
+	soup = BeautifulSoup(post, "html.parser")
+	for lb in soup.select("br"): # replace <br> with linebreak
+		lb.insert_after("\n")
+		lb.decompose()
+
+	for p in soup.select("p"): # ditto for <p>
+		p.insert_after("\n")
+		p.unwrap()
+
+	for ht in soup.select("a.hashtag"): # convert hashtags from links to text
+		ht.unwrap()
+
+	for link in soup.select("a"): # convert <a href='https://example.com'>example.com</a> to just https://example.com
+		link.insert_after(link["href"])
+		link.decompose()
+
+	text = soup.get_text()
+	text = re.sub("https://([^/]+)/(@[^ ]+)", r"\2@\1", text) # put mastodon-style mentions back in
+	text = re.sub("https://([^/]+)/users/([^ ]+)", r"@\2@\1", text) # put pleroma-style mentions back in
+	text = text.rstrip("\n") # remove trailing newline(s)
+	return text
--- a/service.py
+++ b/service.py
@@ -3,15 +3,27 @@ from mastodon import Mastodon
 import MySQLdb
 import requests
 from multiprocessing import Pool
-import json
+import json, re
+import functions

 cfg = json.load(open('config.json'))

-def scrape_posts(handle, outbox):
-	# check for min_id
+def scrape_posts(account):
+	handle = account[0]
+	outbox = account[1]
+	print("Scraping {}".format(handle))
+	c = db.cursor()
 	last_post = 0
+	c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
+	if c.fetchone()[0] > 0:
+		# we've downloaded this user's posts before
+		# find out the most recently downloaded post of theirs
+		c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
+		last_post = c.fetchone()[0]
+
 	r = requests.get(outbox)
 	j = r.json()
+	# check for pleroma
 	pleroma = 'next' not in j
 	if pleroma:
 		j = j['first']
@@ -20,6 +32,58 @@ def scrape_posts(handle, outbox):
 		r = requests.get(uri)
 		j = r.json()

+		# here we go!
+		# warning: scraping posts from outbox.json is messy stuff
+		done = False
+		while not done and len(j['orderedItems']) > 0:
+			for oi in j['orderedItems']:
+				if oi['type'] == "Create":
+					# this is a status/post/toot/florp/whatever
+					# first, check to see if we already have this in the database
+					post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
+					c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
+					if c.fetchone()[0] > 0:
+						# this post is already in the DB.
+						# we'll set done to true because we've caught up to where we were last time.
+						done = True
+						# we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
+						continue
+
+					content = oi['object']['content']
+					# remove HTML tags and such from post
+					content = functions.extract_post(content)
+
+					if len(content) > 65535:
+						# post is too long to go into the DB
+						continue
+
+					try:
+						c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
+							handle,
+							post_id,
+							content,
+							1 if (oi['object']['summary'] != None and oi['object']['summary'] != "") else 0
+						))
+					except:
+						#TODO: error handling
+						raise
+
+			if not done:
+				if pleroma:
+					r = requests.get(j['next'], timeout = 10)
+				else:
+					r = requests.get(j['prev'], timeout = 10)
+
+				if r.status_code == 429:
+					# we are now being ratelimited, move on to the next user
+					done = True
+				else:
+					j = r.json()
+
+		db.commit()
+		c.close()
+		print("Finished {}".format(handle))
+
 print("Establishing DB connection")
 db = MySQLdb.connect(
 	host = cfg['db_host'],
@@ -28,6 +92,11 @@ db = MySQLdb.connect(
 	db=cfg['db_name']
 )

-c = db.cursor()
-
 print("Downloading posts")
+
+cursor = db.cursor()
+cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts`")
+accounts = cursor.fetchall()
+cursor.close()
+with Pool(8) as p:
+	p.map(scrape_posts, accounts)
--- a/webui.py
+++ b/webui.py
@@ -214,7 +214,7 @@ def bot_accounts_delete(id):
 		c.close()
 		mysql.connection.commit()

-		return redirect(url_for("/bot/accounts/{}".format(session['bot'])), 303)
+		return redirect("/bot/accounts/{}".format(session['bot']), 303)

@app.route("/bot/create/", methods=['GET', 'POST'])
 def bot_create():
editor.table_modal.label.rows
editor.table_modal.label.columns