from bs4 import BeautifulSoup
import MySQLdb
from pebble import ProcessPool
from concurrent.futures import TimeoutError
import markovify
import requests
from Crypto.PublicKey import RSA
from Crypto.Hash import SHA256
from Crypto.Signature import PKCS1_v1_5
from base64 import b64decode, b64encode
from mastodon import Mastodon, MastodonUnauthorizedError
import html, re, json

cfg = json.load(open('config.json'))

class nlt_fixed(markovify.NewlineText):  # modified version of NewlineText that never rejects sentences
	def test_sentence_input(self, sentence):
		return True  # all sentences are valid <3

def extract_post(post):
	post = html.unescape(post)  # convert HTML escape codes to text
	soup = BeautifulSoup(post, "html.parser")
	for lb in soup.select("br"):  # replace <br> with linebreak
		lb.replace_with("\n")

	for p in soup.select("p"):  # ditto for <p>, but keep the paragraph's contents
		p.insert_after("\n")
		p.unwrap()

	for ht in soup.select("a.hashtag"):  # convert hashtags from links to text
		ht.unwrap()

	for link in soup.select("a"):  # convert <a href='https://example.com'>example.com</a> to just https://example.com
		if link.has_attr('href'):
			# apparently not all a tags have a href, which is understandable if you're doing normal web stuff, but on a social media platform??
			link.replace_with(link["href"])

	text = soup.get_text()

	text = re.sub(r"https://([^/]+)/(@[^\s]+)", r"\2@\1", text)  # put mastodon-style mentions back in
	text = re.sub(r"https://([^/]+)/users/([^\s/]+)", r"@\2@\1", text)  # put pleroma-style mentions back in

	text = text.rstrip("\n")  # remove trailing newline(s)
	return text
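# rough usage sketch (the HTML below is illustrative, not taken from real data):
#   extract_post('<p>hello <a href="https://example.com">example.com</a></p>')
# would return something like 'hello https://example.com'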

def generate_output(handle):
	db = MySQLdb.connect(
		host=cfg['db_host'],
		user=cfg['db_user'],
		passwd=cfg['db_pass'],
		db=cfg['db_name'],
		use_unicode=True,
		charset="utf8mb4"
	)
	# print("Generating post for {}".format(handle))
	dc = db.cursor(MySQLdb.cursors.DictCursor)
	c = db.cursor()
	dc.execute("""
		SELECT
			learn_from_cw,
			length,
			fake_mentions,
			fake_mentions_full,
			post_privacy,
			content_warning,
			client_id,
			client_secret,
			secret
		FROM
			bots, credentials
		WHERE
			bots.handle = %s
			AND bots.credentials_id = credentials.id
	""", (handle,))
	bot = dc.fetchone()

	# by default, only select posts that don't have CWs.
	# if learn_from_cw, then also select posts with CWs
	cw_list = [False]
	if bot['learn_from_cw']:
		cw_list = [False, True]

	# select 1000 random posts for the bot to learn from
	c.execute("SELECT content FROM posts WHERE fedi_id IN (SELECT fedi_id FROM bot_learned_accounts WHERE bot_id = %s) AND cw IN %s ORDER BY RAND() LIMIT 1000", (handle, cw_list))

	# this line is a little gross/optimised, but here's what it does
	# 1. fetch all of the results from the above query
	# 2. turn (('this',), ('format',)) into ('this', 'format')
	# 3. convert the tuple to a list
	# 4. join the list into a string separated by newlines
	posts = "\n".join(list(sum(c.fetchall(), ())))
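	# for example (illustrative values only):
	#   c.fetchall()   -> (('post one',), ('post two',))
	#   sum(..., ())   -> ('post one', 'post two')
	#   "\n".join(...) -> 'post one\npost two'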

	if len(posts) == 0:
		print("{} - No posts to learn from.".format(handle))
		return bot, None

	if bot['fake_mentions'] == 'never':
		# remove all mentions from the training data before the markov model sees it
		posts = re.sub(r"(?<!\S)@\w+(@[\w.]+)?\s?", "", posts)

	model = nlt_fixed(posts)
	tries = 0
	post = None

	# even with such a high tries value for markovify, it still sometimes returns None,
	# so we implement our own retry loop as well, and try up to five times.
	while post is None and tries < 5:
		post = model.make_short_sentence(bot['length'], tries=1000)
		tries += 1

	if post is None:
		# TODO: send an error email
		pass
	else:
		if "@" in post and bot['fake_mentions'] != 'never':
			# the unicode zero-width space is a (usually) invisible character.
			# we can insert it between the @ symbols in a handle to make it appear fine while not mentioning the user
			zws = "\u200B"
			if bot['fake_mentions'] == 'middle':
				# remove mentions at the start of a post
				post = re.sub(r"^(@\w+(@[\w.]+)?\s*)+", "", post)
			# TODO: does this regex catch all valid handles?
			if bot['fake_mentions_full']:
				post = re.sub(r"@(\w+)@([\w.]+)", r"@{}\1@{}\2".format(zws, zws), post)
			else:
				post = re.sub(r"@(\w+)@([\w.]+)", r"@{}\1".format(zws), post)
			# also format handles without instances, e.g. @user instead of @user@instan.ce
			post = re.sub(r"(?<!\S)@(\w+)", r"@{}\1".format(zws), post)
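			# e.g. (hypothetical handle) the substitutions above turn "@user@example.social" into
			# "@\u200Buser@\u200Bexample.social" (or "@\u200Buser" if fake_mentions_full is off),
			# which still reads as a mention but shouldn't trigger a real one.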

	return bot, post

def make_post(args):
	id = None
	acct = None
	if len(args) > 1:
		id = args[1]
		acct = args[3]
	handle = args[0]
	# print("Generating post for {}".format(handle))
	bot, post = generate_output(handle)
	# post will be None if there are no posts for the bot to learn from.
	# in such a case, we should just exit without doing anything.
	if post is None: return

	client = Mastodon(
		client_id=bot['client_id'],
		client_secret=bot['client_secret'],
		access_token=bot['secret'],
		api_base_url="https://{}".format(handle.split("@")[2])
	)

	db = MySQLdb.connect(
		host=cfg['db_host'],
		user=cfg['db_user'],
		passwd=cfg['db_pass'],
		db=cfg['db_name'],
		use_unicode=True,
		charset="utf8mb4"
	)
	c = db.cursor()

	# print(post)
	visibility = bot['post_privacy'] if len(args) == 1 else args[2]
	visibilities = ['public', 'unlisted', 'private']
	if visibilities.index(visibility) < visibilities.index(bot['post_privacy']):
		# if post_privacy is set to a more restricted level than the visibility of the post we're replying to, use the user's setting
		visibility = bot['post_privacy']
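	# for example (illustrative values): replying to a 'public' post while post_privacy is 'unlisted'
	# gives index 0 < 1, so the reply is posted as 'unlisted'; replying to a 'private' post while
	# post_privacy is 'public' gives 2 < 0, which is false, so the reply stays 'private'.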
	if acct is not None:
		post = "{} {}".format(acct, post)

	# ensure post isn't longer than bot['length']
	# TODO: ehhhhhhhhh
	post = post[:bot['length']]

	# send toot!!
	try:
		client.status_post(post, id, visibility=visibility, spoiler_text=bot['content_warning'])
	except MastodonUnauthorizedError:
		# the user has revoked the token given to the bot.
		# this needs to be dealt with properly later on, but for now, we'll just disable the bot
		c.execute("UPDATE bots SET enabled = FALSE WHERE handle = %s", (handle,))
	except:
		print("Failed to submit post for {}".format(handle))

	if id is None:
		# this wasn't a reply, it was a regular post, so update the last post date
		c.execute("UPDATE bots SET last_post = CURRENT_TIMESTAMP() WHERE handle = %s", (handle,))
	db.commit()
	c.close()

def task_done(future):
	try:
		result = future.result()  # blocks until results are ready
	except TimeoutError as error:
		if not future.silent: print("Timed out on {}.".format(future.function_data))

def do_in_pool(function, data, timeout=30, silent=False):
	with ProcessPool(max_workers=5, max_tasks=10) as pool:
		for i in data:
			future = pool.schedule(function, args=[i], timeout=timeout)
			future.silent = silent
			future.function_data = i
			future.add_done_callback(task_done)
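# rough usage sketch (handles are illustrative): each element of data is passed as the single
# argument to function in a worker process, so something like
#   do_in_pool(make_post, [["@bot@example.social"], ["@otherbot@example.social"]])
# would run make_post(["@bot@example.social"]) and make_post(["@otherbot@example.social"]) in parallel.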

def get_key():
	db = MySQLdb.connect(
		host=cfg['db_host'],
		user=cfg['db_user'],
		passwd=cfg['db_pass'],
		db=cfg['db_name'],
		use_unicode=True,
		charset="utf8mb4"
	)
	dc = db.cursor(MySQLdb.cursors.DictCursor)
	dc.execute("SELECT * FROM http_auth_key")
	key = dc.fetchone()
	if key is None:
		# generate a new key
		key = {}
		privkey = RSA.generate(4096)
		key['private'] = privkey.exportKey('PEM').decode('utf-8')
		key['public'] = privkey.publickey().exportKey('PEM').decode('utf-8')
		dc.execute("INSERT INTO http_auth_key (private, public) VALUES (%s, %s)", (key['private'], key['public']))
	dc.close()
	db.commit()
	return key
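# whether loaded from http_auth_key or freshly generated, the returned key is a dict-like mapping
# roughly of the form (illustrative; a stored row may carry extra columns):
#   {'private': '-----BEGIN RSA PRIVATE KEY-----...', 'public': '-----BEGIN PUBLIC KEY-----...'}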

def signed_get(url, timeout=10, additional_headers={}, request_json=True):
	headers = {}
	if request_json:
		headers = {
			"Accept": "application/json",
			"Content-Type": "application/json"
		}
	headers = {**headers, **additional_headers}

	# sign request headers
	key = RSA.importKey(get_key()['private'])
	sigstring = ''
	for header, value in headers.items():
		sigstring += '{}: {}\n'.format(header.lower(), value)
	sigstring = sigstring.rstrip("\n")

	pkcs = PKCS1_v1_5.new(key)
	h = SHA256.new()
	h.update(sigstring.encode('ascii'))
	signed_sigstring = b64encode(pkcs.sign(h)).decode('ascii')

	sig = {
		'keyId': "{}/actor".format(cfg['base_uri']),
		'algorithm': 'rsa-sha256',
		'headers': ' '.join(headers.keys()),
		'signature': signed_sigstring
	}
	sig_header = ['{}="{}"'.format(k, v) for k, v in sig.items()]
	headers['signature'] = ','.join(sig_header)

	# build the request with the signed headers (note: the request itself is never sent here)
	r = requests.Request('GET', url, headers)
	return r.headers
	# return requests.get(url, timeout=timeout)
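# for reference, the Signature header assembled above ends up looking roughly like
#   keyId="<base_uri>/actor",algorithm="rsa-sha256",headers="Accept Content-Type",signature="<base64 signature>"
# where <base_uri> comes from config.json and the headers list mirrors whatever headers were signed.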