from bs4 import BeautifulSoup import MySQLdb import markovify from mastodon import Mastodon, MastodonUnauthorizedError import html, re, json cfg = json.load(open('config.json')) class nlt_fixed(markovify.NewlineText): # modified version of NewlineText that never rejects sentences def test_sentence_input(self, sentence): return True # all sentences are valid <3 def extract_post(post): post = html.unescape(post) # convert HTML escape codes to text soup = BeautifulSoup(post, "html.parser") for lb in soup.select("br"): # replace
with linebreak lb.insert_after("\n") lb.decompose() for p in soup.select("p"): # ditto for

p.insert_after("\n") p.unwrap() for ht in soup.select("a.hashtag"): # convert hashtags from links to text ht.unwrap() for link in soup.select("a"): #ocnvert