mirror of
https://github.com/Lynnesbian/FediBooks/
synced 2024-11-16 21:08:57 +00:00
166 lines
4.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
from mastodon import Mastodon
|
|
import MySQLdb
|
|
import requests
|
|
import markovify
|
|
from multiprocessing import Pool
|
|
import json, re
|
|
import functions
|
|
|
|
# Load runtime configuration (DB credentials etc.) once at import time.
# fix: the previous one-liner leaked the open file handle; use a context
# manager so the file is closed deterministically.
with open('config.json') as config_file:
    cfg = json.load(config_file)
class nlt_fixed(markovify.NewlineText):
    """A NewlineText variant whose sentence filter accepts everything.

    markovify normally rejects sentences it deems unsuitable for the model;
    this subclass disables that filtering so every scraped post is learned.
    """

    def test_sentence_input(self, sentence):
        # Unconditionally accept the sentence — no input is ever rejected. <3
        return True
|
def scrape_posts(account):
    """Download new posts from one fediverse account into the `posts` table.

    account: a (handle, outbox_url) tuple as selected from `fedi_accounts`.
    Walks the ActivityPub outbox page by page, stopping when we reach a post
    already in the DB, run out of pages, or get ratelimited (HTTP 429).
    Uses the module-level `db` connection; commits once at the end.
    """
    handle = account[0]
    outbox = account[1]
    print("Scraping {}".format(handle))
    c = db.cursor()

    last_post = 0
    c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
    if c.fetchone()[0] > 0:
        # we've downloaded this user's posts before —
        # find out the most recently downloaded post of theirs
        c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
        last_post = c.fetchone()[0]

    # fix: these initial fetches previously had no timeout and could hang the
    # worker indefinitely; use the same 10s timeout as the pagination fetches.
    r = requests.get(outbox, timeout = 10)
    j = r.json()
    # check for pleroma — Pleroma embeds the first page under 'first', while
    # Mastodon exposes 'next' links and supports min_id filtering.
    pleroma = 'next' not in j
    if pleroma:
        j = j['first']
    else:
        uri = "{}&min_id={}".format(outbox, last_post)
        r = requests.get(uri, timeout = 10)
        j = r.json()

    # here we go!
    # warning: scraping posts from outbox.json is messy stuff
    done = False
    while not done and len(j['orderedItems']) > 0:
        for oi in j['orderedItems']:
            if oi['type'] == "Create":
                # this is a status/post/toot/florp/whatever
                # first, check to see if we already have this in the database
                post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
                c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
                if c.fetchone()[0] > 0:
                    # this post is already in the DB.
                    # we'll set done to true because we've caught up to where we were last time.
                    done = True
                    # we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
                    continue

                content = oi['object']['content']
                # remove HTML tags and such from post
                content = functions.extract_post(content)

                if len(content) > 65535:
                    # post is too long to go into the DB
                    continue

                try:
                    c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
                        handle,
                        post_id,
                        content,
                        # cw flag: a non-empty summary means the post was CW'd
                        # fix: compare with `is not None`, not `!= None` (PEP 8)
                        1 if (oi['object']['summary'] is not None and oi['object']['summary'] != "") else 0
                    ))
                except:
                    #TODO: error handling (currently a no-op — the exception
                    # is re-raised unchanged)
                    raise

        if not done:
            # fix: the final page of an outbox has no 'next'/'prev' link;
            # previously this raised KeyError instead of finishing cleanly.
            page_key = 'next' if pleroma else 'prev'
            if page_key not in j:
                break
            r = requests.get(j[page_key], timeout = 10)
            if r.status_code == 429:
                # we are now being ratelimited, move on to the next user
                done = True
            else:
                j = r.json()

    db.commit()
    c.close()
|
def make_post(bot):
    """Generate and publish one markov-chain post for a single bot.

    bot: a (handle, client_id, client_secret, access_token) row from the
    bots/credentials query at the bottom of this file. Builds a markov model
    from up to 1000 random stored posts of the accounts the bot learns from,
    then posts one generated sentence. Uses the module-level `db` connection.
    """
    print("Generating post for {}".format(bot[0]))
    client = Mastodon(
        client_id = bot[1],
        client_secret = bot[2],
        access_token = bot[3],
        # handle looks like "@user@instance.tld", so split("@")[2] is the
        # instance domain — TODO confirm handles always carry the leading @
        api_base_url = "https://{}".format(bot[0].split("@")[2])
    )

    c = db.cursor()
    # select 1000 random posts for the bot to learn from
    # TODO: optionally don't learn from CW'd posts
    c.execute("SELECT content FROM posts WHERE fedi_id IN (SELECT fedi_id FROM bot_learned_accounts WHERE bot_id = %s) ORDER BY RAND() LIMIT 1000", (bot[0],))

    # this line is a little gross/optimised but here's what it does
    # 1. fetch all of the results from the above query
    # 2. turn (('this',), ('format')) into ('this', 'format')
    # 3. convert the tuple to a list
    # 4. join the list into a string separated by newlines
    posts = "\n".join(list(sum(c.fetchall(), ())))

    model = nlt_fixed(posts)
    tries = 0
    sentence = None
    # even with such a high tries value for markovify, it still sometimes returns none.
    # so we implement our own tries function as well, and try ten times.
    while sentence is None and tries < 10:
        sentence = model.make_short_sentence(500, tries = 10000)
        tries += 1

    # TODO: mention handling

    # fix: use `is None` rather than `== None` (PEP 8)
    if sentence is None:
        # generation failed even after ten attempts
        # TODO: send an error email
        pass
    else:
        client.status_post(sentence)

    # TODO: update date of last post
|
print("Establishing DB connection")
# Module-level connection shared by scrape_posts and make_post below.
db = MySQLdb.connect(
    host=cfg['db_host'],
    user=cfg['db_user'],
    passwd=cfg['db_pass'],
    db=cfg['db_name']
)
|
|
print("Cleaning up database")
# delete any fedi accounts we no longer need
# (i.e. accounts no bot learns from any more)
cursor = db.cursor()
cursor.execute("DELETE FROM fedi_accounts WHERE handle NOT IN (SELECT fedi_id FROM bot_learned_accounts);")

print("Downloading posts")
# RAND() ordering spreads load so the same instances aren't always hit first
cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts` ORDER BY RAND()")
accounts = cursor.fetchall()
# NOTE(review): the scraping pool is commented out, so this run prints
# "Downloading posts" but never actually calls scrape_posts — confirm
# whether this is intentional.
# with Pool(8) as p:
# 	p.map(scrape_posts, accounts)
|
|
print("Generating posts")
# Pair each enabled bot with its API credentials. Column order matters:
# the resulting (handle, client_id, client_secret, access_token) tuples are
# consumed positionally by make_post above.
cursor.execute("""
SELECT
bots.handle, credentials.client_id, credentials.client_secret, credentials.secret
FROM
bots,
credentials
WHERE
bots.credentials_id = credentials.id
AND bots.enabled = TRUE;
""")
bots = cursor.fetchall()

# Generate and publish posts for all bots, eight workers at a time.
with Pool(8) as p:
	p.map(make_post, bots)
|
|
#TODO: other cron tasks should be done here, like updating profile pictures
|