moved scraping functions to a separate file

This commit is contained in:
Lynne Megido 2019-09-10 12:07:53 +10:00
parent 989a9ec98b
commit f2d15b7d80
2 changed files with 102 additions and 85 deletions

100
scrape.py Normal file
View File

@@ -0,0 +1,100 @@
import MySQLdb
import requests
from multiprocessing import Pool
import json, re
import functions
# Load the bot configuration once at import time; the context manager makes
# sure the file handle is closed instead of being left to the garbage collector.
with open('config.json') as config_file:
    cfg = json.load(config_file)
def _fetch_page(url):
    """GET one outbox page and return its parsed JSON, or None on HTTP 429."""
    r = requests.get(url, timeout=10)
    if r.status_code == 429:
        # we are being ratelimited; let the caller move on to the next user
        return None
    return r.json()


def scrape_posts(account):
    """Download the posts we have not stored yet for a single account.

    Walks the account's outbox page by page and inserts every "Create"
    item that is not already in the `posts` table. Scraping stops when a
    previously stored post is encountered (after finishing the current
    page), when a page has no items or no link to a further page, or when
    the remote server answers with HTTP 429.

    Args:
        account: a sequence whose first item is the account handle and
            whose second item is the URL of the account's outbox.

    Uses the module-level ``db`` connection. Inserts are committed once at
    the end; the cursor is closed even if an exception is raised.
    """
    handle = account[0]
    outbox = account[1]
    print("Scraping {}".format(handle))

    c = db.cursor()
    try:
        last_post = 0
        c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
        if c.fetchone()[0] > 0:
            # we've downloaded this user's posts before
            # find out the most recently downloaded post of theirs
            c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
            last_post = c.fetchone()[0]

        j = _fetch_page(outbox)
        if j is None:
            return

        # check for pleroma: its outbox response has no 'next' link and
        # carries the first page under 'first' instead
        pleroma = 'next' not in j
        if pleroma:
            j = j['first']
        else:
            # only ask for posts newer than the last one we stored
            j = _fetch_page("{}&min_id={}".format(outbox, last_post))
            if j is None:
                return

        # here we go!
        # warning: scraping posts from outbox.json is messy stuff
        done = False
        while not done and j['orderedItems']:
            for oi in j['orderedItems']:
                if oi['type'] != "Create":
                    # only "Create" items are statuses/posts/toots
                    continue

                # first, check to see if we already have this in the database
                # extract 123 from https://example.com/posts/123/
                post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1)
                c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
                if c.fetchone()[0] > 0:
                    # this post is already in the DB, so we've caught up to
                    # where we were last time. keep iterating over the rest
                    # of this page in case it still holds some new posts.
                    done = True
                    continue

                # remove HTML tags and such from post
                content = functions.extract_post(oi['object']['content'])
                # NOTE(review): len() counts characters, not bytes; if the
                # column limit is byte-based this may let long non-ASCII
                # posts through - confirm against the schema.
                if len(content) > 65535:
                    # post is too long to go into the DB
                    continue

                # a missing, null or empty summary means "no content warning"
                has_cw = 1 if oi['object'].get('summary') else 0
                c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
                    handle,
                    post_id,
                    content,
                    has_cw
                ))

            if not done:
                # pleroma pages link onwards via 'next', others via 'prev'
                next_url = j.get('next' if pleroma else 'prev')
                if not next_url:
                    # no further page is advertised: nothing left to fetch
                    break
                page = _fetch_page(next_url)
                if page is None:
                    # ratelimited, move on to the next user
                    done = True
                else:
                    j = page

        db.commit()
    finally:
        c.close()
def _connect():
    """Open a new MySQL connection using the settings loaded into `cfg`."""
    return MySQLdb.connect(
        host=cfg['db_host'],
        user=cfg['db_user'],
        passwd=cfg['db_pass'],
        db=cfg['db_name']
    )


def _init_worker():
    """Pool initializer: give this worker process its own `db` connection.

    scrape_posts() reads the module-level `db`; a single connection must not
    be used by several processes at once, so each worker opens its own.
    """
    global db
    db = _connect()


# Guarded so that worker processes which re-import this module (e.g. with the
# "spawn" start method) do not start a scrape of their own.
if __name__ == "__main__":
    print("Establishing DB connection")
    db = _connect()
    cursor = db.cursor()

    print("Downloading posts")
    cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts` ORDER BY RAND()")
    accounts = cursor.fetchall()
    cursor.close()
    # The parent only needed the account list. Close its connection before
    # the pool starts so no worker inherits a live socket to the server.
    db.close()

    with Pool(8, initializer=_init_worker) as p:
        p.map(scrape_posts, accounts)

    print("Done!")

View File

@@ -1,88 +1,11 @@
#!/usr/bin/env python3
from mastodon import Mastodon
import MySQLdb
import requests
import markovify
from multiprocessing import Pool
import json, re
import json
import functions
# Load the bot configuration once at import time; the context manager makes
# sure the file handle is closed instead of being left to the garbage collector.
with open('config.json') as config_file:
    cfg = json.load(config_file)
def _fetch_page(url):
    """GET one outbox page and return its parsed JSON, or None on HTTP 429."""
    r = requests.get(url, timeout=10)
    if r.status_code == 429:
        # we are being ratelimited; let the caller move on to the next user
        return None
    return r.json()


def scrape_posts(account):
    """Download the posts we have not stored yet for a single account.

    Walks the account's outbox page by page and inserts every "Create"
    item that is not already in the `posts` table. Scraping stops when a
    previously stored post is encountered (after finishing the current
    page), when a page has no items or no link to a further page, or when
    the remote server answers with HTTP 429.

    Args:
        account: a sequence whose first item is the account handle and
            whose second item is the URL of the account's outbox.

    Uses the module-level ``db`` connection. Inserts are committed once at
    the end; the cursor is closed even if an exception is raised.
    """
    handle = account[0]
    outbox = account[1]
    print("Scraping {}".format(handle))

    c = db.cursor()
    try:
        last_post = 0
        c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
        if c.fetchone()[0] > 0:
            # we've downloaded this user's posts before
            # find out the most recently downloaded post of theirs
            c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
            last_post = c.fetchone()[0]

        j = _fetch_page(outbox)
        if j is None:
            return

        # check for pleroma: its outbox response has no 'next' link and
        # carries the first page under 'first' instead
        pleroma = 'next' not in j
        if pleroma:
            j = j['first']
        else:
            # only ask for posts newer than the last one we stored
            j = _fetch_page("{}&min_id={}".format(outbox, last_post))
            if j is None:
                return

        # here we go!
        # warning: scraping posts from outbox.json is messy stuff
        done = False
        while not done and j['orderedItems']:
            for oi in j['orderedItems']:
                if oi['type'] != "Create":
                    # only "Create" items are statuses/posts/toots
                    continue

                # first, check to see if we already have this in the database
                # extract 123 from https://example.com/posts/123/
                post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1)
                c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
                if c.fetchone()[0] > 0:
                    # this post is already in the DB, so we've caught up to
                    # where we were last time. keep iterating over the rest
                    # of this page in case it still holds some new posts.
                    done = True
                    continue

                # remove HTML tags and such from post
                content = functions.extract_post(oi['object']['content'])
                # NOTE(review): len() counts characters, not bytes; if the
                # column limit is byte-based this may let long non-ASCII
                # posts through - confirm against the schema.
                if len(content) > 65535:
                    # post is too long to go into the DB
                    continue

                # a missing, null or empty summary means "no content warning"
                has_cw = 1 if oi['object'].get('summary') else 0
                c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
                    handle,
                    post_id,
                    content,
                    has_cw
                ))

            if not done:
                # pleroma pages link onwards via 'next', others via 'prev'
                next_url = j.get('next' if pleroma else 'prev')
                if not next_url:
                    # no further page is advertised: nothing left to fetch
                    break
                page = _fetch_page(next_url)
                if page is None:
                    # ratelimited, move on to the next user
                    done = True
                else:
                    j = page

        db.commit()
    finally:
        c.close()
print("Establishing DB connection")
db = MySQLdb.connect(
@@ -95,13 +18,7 @@ db = MySQLdb.connect(
print("Cleaning up database")
# delete any fedi accounts we no longer need
cursor = db.cursor()
cursor.execute("DELETE FROM fedi_accounts WHERE handle NOT IN (SELECT fedi_id FROM bot_learned_accounts);")
print("Downloading posts")
cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts` ORDER BY RAND()")
accounts = cursor.fetchall()
# with Pool(8) as p:
# p.map(scrape_posts, accounts)
cursor.execute("DELETE FROM fedi_accounts WHERE handle NOT IN (SELECT fedi_id FROM bot_learned_accounts)")
print("Generating posts")
cursor.execute("SELECT handle FROM bots WHERE enabled = TRUE AND TIMESTAMPDIFF(MINUTE, last_post, CURRENT_TIMESTAMP()) > post_frequency")