2019-09-03 04:29:45 +00:00
#!/usr/bin/env python3
from mastodon import Mastodon
import MySQLdb
import requests
2019-09-08 03:32:21 +00:00
import markovify
2019-09-03 04:29:45 +00:00
from multiprocessing import Pool
2019-09-06 02:38:50 +00:00
import json , re
import functions
2019-09-03 04:29:45 +00:00
# load settings (database credentials etc.) from config.json.
# the file is opened in a with-block so the handle is closed as soon as it has been parsed.
with open('config.json') as config_file:
    cfg = json.load(config_file)
2019-09-08 03:32:21 +00:00
class nlt_fixed(markovify.NewlineText):
    """Variant of markovify.NewlineText that never rejects an input sentence."""

    def test_sentence_input(self, sentence):
        """Accept every sentence unconditionally - all sentences are valid <3"""
        return True
2019-09-06 02:38:50 +00:00
def scrape_posts(account):
    """Download an account's new posts from its outbox into the `posts` table.

    account is a (handle, outbox) pair: the handle is stored as `fedi_id`,
    and outbox is the URL the posts are fetched from.

    Uses the module-level `db` connection. Inserted rows are committed once,
    after scraping has finished; if an exception escapes, nothing is committed.
    """
    handle = account[0]
    outbox = account[1]
    print("Scraping {}".format(handle))

    c = db.cursor()
    try:
        last_post = 0
        c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
        if c.fetchone()[0] > 0:
            # we've downloaded this user's posts before
            # find out the most recently downloaded post of theirs
            c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
            last_post = c.fetchone()[0]

        # same timeout as the pagination requests below, so a stalled server can't hang the worker
        r = requests.get(outbox, timeout=10)
        j = r.json()

        # check for pleroma
        pleroma = 'next' not in j
        if pleroma:
            j = j['first']
        else:
            uri = "{}&min_id={}".format(outbox, last_post)
            r = requests.get(uri, timeout=10)
            j = r.json()

        # here we go!
        # warning: scraping posts from outbox.json is messy stuff
        done = False
        while not done and len(j['orderedItems']) > 0:
            for oi in j['orderedItems']:
                if oi['type'] != "Create":
                    # only "Create" items are statuses/posts/toots/florps/whatever
                    continue

                # first, check to see if we already have this in the database
                post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1)  # extract 123 from https://example.com/posts/123/
                c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
                if c.fetchone()[0] > 0:
                    # this post is already in the DB.
                    # we'll set done to true because we've caught up to where we were last time.
                    done = True
                    # we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
                    continue

                # remove HTML tags and such from post
                content = functions.extract_post(oi['object']['content'])
                if len(content) > 65535:
                    # post is too long to go into the DB
                    continue

                summary = oi['object']['summary']
                has_cw = 1 if (summary is not None and summary != "") else 0
                # TODO: error handling (for now any database error simply propagates)
                c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
                    handle,
                    post_id,
                    content,
                    has_cw
                ))

            if not done:
                # pleroma pages are followed via 'next', all others via 'prev'
                next_uri = j.get('next' if pleroma else 'prev')
                if next_uri is None:
                    # no further page is advertised: stop here rather than
                    # raising KeyError and losing the uncommitted inserts
                    break
                r = requests.get(next_uri, timeout=10)
                if r.status_code == 429:
                    # we are now being ratelimited, move on to the next user
                    done = True
                else:
                    j = r.json()

        db.commit()
    finally:
        # release the cursor even when a request or query raised
        c.close()
2019-09-08 03:59:50 +00:00
def make_post(handle):
    """Generate a post for one bot with a markov model and publish it.

    handle is a one-element row whose first item is the bot's handle.
    The instance domain is taken from the third "@"-separated part of the
    handle (index 2 of handle.split("@")).

    Uses the module-level `db` connection. If no sentence could be generated,
    nothing is posted.
    """
    handle = handle[0]
    print("Generating post for {}".format(handle))

    c = db.cursor()
    try:
        # NOTE(review): `bots` and `credentials` are joined here without a condition
        # linking a credentials row to the bot - confirm against the schema that
        # fetchone() is guaranteed to return the right credentials.
        c.execute("""
            SELECT
                learn_from_cw, client_id, client_secret, secret
            FROM
                bots, credentials
            WHERE
                bots.credentials_id = (SELECT
                        credentials_id
                    FROM
                        bots
                    WHERE
                        handle = %s)
            """, (handle,))
        bot = c.fetchone()

        client = Mastodon(
            client_id=bot[1],
            client_secret=bot[2],
            access_token=bot[3],
            api_base_url="https://{}".format(handle.split("@")[2])
        )

        # by default, only select posts that don't have CWs.
        # if learn_from_cw, then also select posts with CWs
        cw_list = [False]
        if bot[0]:
            cw_list = [False, True]

        # select 1000 random posts for the bot to learn from
        c.execute("SELECT content FROM posts WHERE fedi_id IN (SELECT fedi_id FROM bot_learned_accounts WHERE bot_id = %s) AND cw IN %s ORDER BY RAND() LIMIT 1000", (handle, cw_list))
        rows = c.fetchall()
    finally:
        # the cursor is not needed past this point; close it even if a query failed
        c.close()

    if not rows:
        # nothing to learn from, so there is nothing to generate
        print("No posts to learn from for {}".format(handle))
        return

    # every row holds a single column, so take it directly and join the
    # posts into one newline-separated string
    posts = "\n".join(row[0] for row in rows)
    model = nlt_fixed(posts)

    # even with such a high tries value for markovify, it still sometimes returns none.
    # so we retry up to ten times ourselves as well.
    sentence = None
    for _ in range(10):
        sentence = model.make_short_sentence(500, tries=10000)
        if sentence is not None:
            break

    # TODO: mention handling

    if sentence is None:
        # TODO: send an error email
        return

    client.status_post(sentence)
    # TODO: update date of last post
2019-09-07 03:31:11 +00:00
2019-09-03 04:29:45 +00:00
print("Establishing DB connection")
# module-level connection; scrape_posts and make_post reach it through the global name `db`
db = MySQLdb.connect(
    host=cfg['db_host'],
    user=cfg['db_user'],
    passwd=cfg['db_pass'],
    db=cfg['db_name']
)

print("Cleaning up database")
# delete any fedi accounts we no longer need
cursor = db.cursor()
cursor.execute("DELETE FROM fedi_accounts WHERE handle NOT IN (SELECT fedi_id FROM bot_learned_accounts);")
# MySQLdb connections do not autocommit by default, so commit explicitly -
# otherwise the cleanup is rolled back when the script exits
db.commit()

print("Downloading posts")
cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts` ORDER BY RAND()")
accounts = cursor.fetchall()
# with Pool(8) as p:
# 	p.map(scrape_posts, accounts)

print("Generating posts")
cursor.execute("SELECT handle FROM bots WHERE enabled = TRUE")
bots = cursor.fetchall()
# all parent-side queries are done, so release the cursor before handing off to the workers
cursor.close()

# NOTE(review): the workers appear to reuse the single `db` connection opened above -
# confirm that sharing one MySQL connection between pool processes is safe here.
with Pool(8) as p:
    p.map(make_post, bots)

#TODO: other cron tasks should be done here, like updating profile pictures