#!/usr/bin/env python3
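# FediBooks/app/scrape.py - fetch new posts from each fedi account's ActivityPub
# outbox and store them in the `posts` table (DB credentials come from config.json).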
import MySQLdb
import requests
import json, re
import functions

cfg = json.load(open('config.json'))

def scrape_posts(account):
	db = MySQLdb.connect(
		host=cfg['db_host'],
		user=cfg['db_user'],
		passwd=cfg['db_pass'],
		db=cfg['db_name'],
		use_unicode=True,
		charset="utf8mb4"
	)
	handle = account[0]
	outbox = account[1]
	# print("Scraping {}".format(handle))
	c = db.cursor()
	last_post = 0
	c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
	count = c.fetchone()
	if count is not None and int(count[0]) > 0:
		# we've downloaded this user's posts before
		# find out the most recently downloaded post of theirs
		c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
		last_post = c.fetchone()[0]

	done = False
	try:
		r = requests.get(outbox, timeout=10)
		j = r.json()
		# check for pleroma
		pleroma = 'next' not in j
		if pleroma:
			if 'first' in j:
				# backwards compatibility for older (pre-v1.0.7) pleroma instances
				j = j['first']
		else:
			uri = "{}&min_id={}".format(outbox, last_post)
			r = requests.get(uri, timeout=10)
			j = r.json()
	except:
		print("Couldn't load or parse outbox at URL {}".format(outbox))
		done = True
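	# Mastodon-style outboxes are walked via `prev` links starting from min_id;
	# Pleroma-style outboxes are walked via `next` links instead.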

	# here we go!
	# warning: scraping posts from outbox.json is messy stuff
	while not done and len(j['orderedItems']) > 0:
		for oi in j['orderedItems']:
			if oi['type'] == "Create":
				# this is a status/post/toot/florp/whatever
				# first, check to see if we already have this in the database
				post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
				c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
				count = c.fetchone()
				if count is not None and int(count[0]) > 0:
					# this post is already in the DB.
					# we'll set done to true because we've caught up to where we were last time.
					done = True
					# we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
					continue

				content = oi['object']['content']
				# remove HTML tags and such from post
				content = functions.extract_post(content)

				if len(content) > 65535:
					# post is too long to go into the DB
					continue
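
				# a non-empty `summary` on the object is treated as a content warning (cw = 1)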
				try:
					c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
						handle,
						post_id,
						content,
						1 if (oi['object']['summary'] is not None and oi['object']['summary'] != "") else 0
					))
				except:
					# TODO: error handling
					print("Failed to insert post {} for user {}".format(post_id, handle))

		if not done:
			try:
				if pleroma:
					if 'next' in j:
						r = requests.get(j['next'], timeout=10)
					else:
						done = True
				else:
					if 'prev' in j:
						r = requests.get(j['prev'], timeout=10)
					else:
						done = True
			except requests.Timeout:
				print("Timed out while loading next page for {}".format(handle))
				# stop here rather than re-reading the stale response
				done = True
			except:
				print("Encountered unknown error while getting next page for {}".format(handle))
				done = True
			if r.status_code == 429:
				# we are now being ratelimited, move on to the next user
				print("Hit rate limit while scraping {}".format(handle))
				done = True
			else:
				j = r.json()

		db.commit()

	db.commit()
	# print("Finished scraping {}".format(handle))

print("Establishing DB connection")
db = MySQLdb.connect(
	host=cfg['db_host'],
	user=cfg['db_user'],
	passwd=cfg['db_pass'],
	db=cfg['db_name'],
	use_unicode=True,
	charset="utf8mb4"
)

cursor = db.cursor()

print("Downloading posts")
cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts` ORDER BY RAND()")
accounts = cursor.fetchall()
cursor.close()
db.close()

functions.do_in_pool(scrape_posts, accounts, timeout=60)

print("Done!")