from bs4 import BeautifulSoup
import html, re
def extract_post(post):
post = html.unescape(post) # convert HTML escape codes to text
soup = BeautifulSoup(post, "html.parser")
for lb in soup.select("br"): # replace
with linebreak
lb.insert_after("\n")
lb.decompose()
for p in soup.select("p"): # ditto for
p.insert_after("\n") p.unwrap() for ht in soup.select("a.hashtag"): # convert hashtags from links to text ht.unwrap() for link in soup.select("a"): #ocnvert