from bs4 import BeautifulSoup import html, re def extract_post(post): post = html.unescape(post) # convert HTML escape codes to text soup = BeautifulSoup(post, "html.parser") for lb in soup.select("br"): # replace
with linebreak lb.insert_after("\n") lb.decompose() for p in soup.select("p"): # ditto for

p.insert_after("\n") p.unwrap() for ht in soup.select("a.hashtag"): # convert hashtags from links to text ht.unwrap() for link in soup.select("a"): #ocnvert