moved scraping functions to a separate file

This commit is contained in:
Lynne Megido 2019-09-10 12:07:53 +10:00
parent 989a9ec98b
commit f2d15b7d80
2 changed files with 102 additions and 85 deletions

100
scrape.py Normal file
View File

@ -0,0 +1,100 @@
import MySQLdb
import requests
from multiprocessing import Pool
import json, re
import functions
# Load the settings from config.json once, at import time. The context
# manager closes the file handle as soon as parsing is done, and the explicit
# encoding keeps the read independent of the platform's default locale.
with open('config.json', encoding='utf-8') as config_file:
    cfg = json.load(config_file)
# Seconds to wait on any single HTTP request before giving up on it.
_REQUEST_TIMEOUT = 10

# Captures the last path segment of a post URL,
# e.g. "123" from https://example.com/posts/123/
_POST_ID_RE = re.compile(r"([^\/]+)/?$")


def _fetch_json(url):
    """Fetch *url* and return its decoded JSON body, or None if unavailable.

    None is returned (after printing the reason) when the request fails or
    times out, when the server answers with 429 or any other error status,
    or when the body is not valid JSON. Callers treat None as "stop scraping
    this account".
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as exc:
        print("Request to {} failed: {}".format(url, exc))
        return None

    if response.status_code == 429:
        # we are now being ratelimited, move on to the next user
        print("Rate limited by {}".format(url))
        return None
    if not response.ok:
        print("Got HTTP {} from {}".format(response.status_code, url))
        return None

    try:
        return response.json()
    except ValueError:
        print("Response from {} was not valid JSON".format(url))
        return None


def _store_new_posts(c, handle, items):
    """Insert every not-yet-stored "Create" activity from *items*.

    Args:
        c: open database cursor used for the lookups and inserts.
        handle: value stored in (and matched against) `posts`.`fedi_id`.
        items: the `orderedItems` list of one outbox page.

    Returns:
        True if at least one item was already in the database, meaning we
        have caught up to where the previous run stopped. The remaining
        items on the page are still processed, in case some of them are new.
    """
    caught_up = False
    for item in items:
        if not isinstance(item, dict) or item.get('type') != "Create":
            # only "Create" activities are statuses/posts/toots
            continue

        post = item.get('object')
        if not isinstance(post, dict):
            continue

        post_url = post.get('id')
        match = _POST_ID_RE.search(post_url) if isinstance(post_url, str) else None
        if match is None:
            continue
        post_id = match.group(1)

        # first, check to see if we already have this in the database
        c.execute(
            "SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s",
            (handle, post_id)
        )
        if c.fetchone()[0] > 0:
            caught_up = True
            continue

        raw_content = post.get('content')
        if raw_content is None:
            continue

        # remove HTML tags and such from post
        content = functions.extract_post(raw_content)
        # NOTE(review): 65535 looks like a column size limit that is measured
        # in bytes, while len() counts characters - confirm against the schema.
        if len(content) > 65535:
            # post is too long to go into the DB
            continue

        summary = post.get('summary')
        has_cw = 1 if (summary is not None and summary != "") else 0
        c.execute(
            "INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)",
            (handle, post_id, content, has_cw)
        )
    return caught_up


def scrape_posts(account):
    """Download the posts of one account that are not in the database yet.

    Walks the account's outbox page by page, storing each new post in the
    `posts` table, until a page contains a post that is already stored, the
    pages run out, or a request fails or is rate limited. Uses the
    module-level `db` connection and commits once at the end.

    Args:
        account: a (handle, outbox URL) sequence, as selected from
            `fedi_accounts`.
    """
    handle = account[0]
    outbox = account[1]
    print("Scraping {}".format(handle))

    c = db.cursor()
    try:
        # find out the most recently downloaded post of this user, if any
        c.execute(
            "SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1",
            (handle,)
        )
        row = c.fetchone()
        last_post = row[0] if row else 0

        page = None
        next_key = 'prev'
        root = _fetch_json(outbox)
        if isinstance(root, dict):
            # check for pleroma
            if 'next' not in root:
                # the first page is embedded in the outbox document, and
                # paging continues through each page's 'next' link
                next_key = 'next'
                page = root.get('first')
            else:
                # only ask for posts newer than the last one we stored; pick
                # the separator that keeps the URL valid whether or not the
                # outbox URL already has a query string
                separator = '&' if '?' in outbox else '?'
                page = _fetch_json("{}{}min_id={}".format(outbox, separator, last_post))

        # here we go!
        # warning: scraping posts from outbox.json is messy stuff
        while isinstance(page, dict) and page.get('orderedItems'):
            if _store_new_posts(c, handle, page['orderedItems']):
                # we've caught up to where we were last time
                break
            next_url = page.get(next_key)
            if not next_url:
                # no further pages
                break
            page = _fetch_json(next_url)

        db.commit()
    finally:
        # release the cursor even if a query or the parsing blew up
        c.close()
def _connect():
    """Open a new MySQL connection using the settings in `cfg`."""
    return MySQLdb.connect(
        host=cfg['db_host'],
        user=cfg['db_user'],
        passwd=cfg['db_pass'],
        db=cfg['db_name']
    )


def _init_worker():
    """Pool initializer: give this worker process its own `db` connection.

    A MySQL connection must not be shared between processes, so every worker
    rebinds the module-level `db` that scrape_posts uses to a connection it
    opened itself.
    """
    global db
    db = _connect()


# The guard keeps worker processes that re-import this module (spawn start
# method) from re-running the script themselves.
if __name__ == "__main__":
    print("Establishing DB connection")
    db = _connect()
    cursor = db.cursor()

    print("Downloading posts")
    cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts` ORDER BY RAND()")
    accounts = cursor.fetchall()

    # The parent only needed the connection to read the account list. Close it
    # before the pool starts so no worker inherits a live connection.
    cursor.close()
    db.close()

    with Pool(8, initializer=_init_worker) as p:
        p.map(scrape_posts, accounts)

    print("Done!")

View File

@ -1,88 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from mastodon import Mastodon
import MySQLdb import MySQLdb
import requests
import markovify
from multiprocessing import Pool from multiprocessing import Pool
import json, re import json
import functions import functions
cfg = json.load(open('config.json')) cfg = json.load(open('config.json'))
def scrape_posts(account):
handle = account[0]
outbox = account[1]
print("Scraping {}".format(handle))
c = db.cursor()
last_post = 0
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
if c.fetchone()[0] > 0:
# we've downloaded this user's posts before
# find out the most recently downloaded post of theirs
c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
last_post = c.fetchone()[0]
r = requests.get(outbox)
j = r.json()
# check for pleroma
pleroma = 'next' not in j
if pleroma:
j = j['first']
else:
uri = "{}&min_id={}".format(outbox, last_post)
r = requests.get(uri)
j = r.json()
# here we go!
# warning: scraping posts from outbox.json is messy stuff
done = False
while not done and len(j['orderedItems']) > 0:
for oi in j['orderedItems']:
if oi['type'] == "Create":
# this is a status/post/toot/florp/whatever
# first, check to see if we already have this in the database
post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
if c.fetchone()[0] > 0:
# this post is already in the DB.
# we'll set done to true because we've caught up to where we were last time.
done = True
# we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
continue
content = oi['object']['content']
# remove HTML tags and such from post
content = functions.extract_post(content)
if len(content) > 65535:
# post is too long to go into the DB
continue
try:
c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
handle,
post_id,
content,
1 if (oi['object']['summary'] != None and oi['object']['summary'] != "") else 0
))
except:
#TODO: error handling
raise
if not done:
if pleroma:
r = requests.get(j['next'], timeout = 10)
else:
r = requests.get(j['prev'], timeout = 10)
if r.status_code == 429:
# we are now being ratelimited, move on to the next user
done = True
else:
j = r.json()
db.commit()
c.close()
print("Establishing DB connection") print("Establishing DB connection")
db = MySQLdb.connect( db = MySQLdb.connect(
@ -95,13 +18,7 @@ db = MySQLdb.connect(
print("Cleaning up database") print("Cleaning up database")
# delete any fedi accounts we no longer need # delete any fedi accounts we no longer need
cursor = db.cursor() cursor = db.cursor()
cursor.execute("DELETE FROM fedi_accounts WHERE handle NOT IN (SELECT fedi_id FROM bot_learned_accounts);") cursor.execute("DELETE FROM fedi_accounts WHERE handle NOT IN (SELECT fedi_id FROM bot_learned_accounts)")
print("Downloading posts")
cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts` ORDER BY RAND()")
accounts = cursor.fetchall()
# with Pool(8) as p:
# p.map(scrape_posts, accounts)
print("Generating posts") print("Generating posts")
cursor.execute("SELECT handle FROM bots WHERE enabled = TRUE AND TIMESTAMPDIFF(MINUTE, last_post, CURRENT_TIMESTAMP()) > post_frequency") cursor.execute("SELECT handle FROM bots WHERE enabled = TRUE AND TIMESTAMPDIFF(MINUTE, last_post, CURRENT_TIMESTAMP()) > post_frequency")