1
0
Fork 0
mirror of https://github.com/Lynnesbian/FediBooks/ synced 2024-11-26 00:58:59 +00:00

Compare commits

..

No commits in common. "0997de535b7fda785cbdd9a6d4813d9a32175c66" and "954544205ec7f794d4a93f0b8a56ef49da60c816" have entirely different histories.

5 changed files with 25 additions and 20 deletions

1
.gitignore vendored
View file

@ -4,4 +4,3 @@ planning.txt
*.pyc
/debug
lynnesbian.json
test.py

View file

@ -21,10 +21,12 @@ def extract_post(post):
post = html.unescape(post) # convert HTML escape codes to text
soup = BeautifulSoup(post, "html.parser")
for lb in soup.select("br"): # replace <br> with linebreak
lb.replace_with("\n")
lb.insert_after("\n")
lb.decompose()
for p in soup.select("p"): # ditto for <p>
p.replace_with("\n")
p.insert_after("\n")
p.unwrap()
for ht in soup.select("a.hashtag"): # convert hashtags from links to text
ht.unwrap()
@ -32,7 +34,8 @@ def extract_post(post):
for link in soup.select("a"): # convert <a href='https://example.com'>example.com</a> to just https://example.com
if 'href' in link:
# apparently not all a tags have a href, which is understandable if you're doing normal web stuff, but on a social media platform??
link.replace_with(link["href"])
link.insert_after(link["href"])
link.decompose()
text = soup.get_text()
text = re.sub(r"https://([^/]+)/(@[^\s]+)", r"\2@\1", text) # put mastodon-style mentions back in
@ -188,19 +191,22 @@ def make_post(args):
db.commit()
c.close()
def task_done(future):
try:
result = future.result() # blocks until results are ready
except TimeoutError as error:
if not future.silent: print("Timed out on {}.".format(future.function_data))
def do_in_pool(function, data, timeout=30, silent=False):
with ProcessPool(max_workers=5, max_tasks=10) as pool:
for i in data:
future = pool.schedule(function, args=[i], timeout=timeout)
future.silent = silent
future.function_data = i
future.add_done_callback(task_done)
with ProcessPool(max_workers=cfg['service_threads']) as p:
index = 0
future = p.map(function, data)
iterator = future.result()
while True:
try:
result = next(iterator)
except StopIteration:
# all threads are done
break
except TimeoutError as error:
if not silent: print("Timed out on {}.".format(data[index]))
finally:
index += 1
def get_key():
db = MySQLdb.connect(

View file

@ -50,7 +50,7 @@ def scrape_posts(account):
# here we go!
# warning: scraping posts from outbox.json is messy stuff
while not done and 'orderedItems' in j and len(j['orderedItems']) > 0:
while not done and len(j['orderedItems']) > 0:
for oi in j['orderedItems']:
if oi['type'] == "Create":
# this is a status/post/toot/florp/whatever

View file

@ -30,7 +30,7 @@ def update_icon(bot):
if r.status_code != 200:
raise
except:
print("{} is down - can't update icon for {}.".format(url, bot['handle']))
print("{} is down.".format(url))
return
client = Mastodon(

View file

@ -1,6 +1,6 @@
Mastodon.py==1.5.1
markovify==0.8.0
beautifulsoup4==4.9.1
beautifulsoup4==4.9.0
requests==2.23.0
Flask==1.1.2
flask-mysqldb==0.2.0