Mirror of https://github.com/Lynnesbian/FediBooks/, synced 2024-11-25 16:48:58 +00:00
Compare commits: 6 commits (ed08240619 ... bd2b064153)

- bd2b064153
- dada8514e4
- 996d52d542
- 61f95f654a
- fc60b6d937
- 87b9f64d75

3 changed files with 41 additions and 24 deletions
```diff
@@ -25,7 +25,9 @@ def extract_post(post):
 	ht.unwrap()
 
 	for link in soup.select("a"): # convert <a href='https://example.com'>example.com</a> to just https://example.com
-		link.insert_after(link["href"])
+		if 'href' in link:
+			# apparently not all a tags have a href, which is understandable if you're doing normal web stuff, but on a social media platform??
+			link.insert_after(link["href"])
 		link.decompose()
 
 	text = soup.get_text()
```
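One caveat with the guard as committed: on a BeautifulSoup `Tag`, the `in` operator tests membership among the tag's *children*, not its attributes, so `'href' in link` is unlikely to do what the comment intends; `Tag.has_attr()` is the attribute check. A minimal sketch of the same link-flattening step with that guard (the `flatten_links` wrapper is illustrative, not from the repo):

```python
from bs4 import BeautifulSoup

def flatten_links(html):
	# Replace every <a> tag with its target URL, keeping the surrounding text.
	soup = BeautifulSoup(html, "html.parser")
	for link in soup.select("a"):
		# has_attr() checks the tag's attributes; `'href' in link` would
		# test the tag's child nodes instead.
		if link.has_attr("href"):
			link.insert_after(link["href"])
		link.decompose()
	return soup.get_text()

print(flatten_links("<p>see <a href='https://example.com'>example.com</a></p>"))
# prints: see https://example.com
```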
app/scrape.py: 54 changes (Normal file → Executable file)
```diff
@@ -30,22 +30,27 @@ def scrape_posts(account):
 	c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
 	last_post = c.fetchone()[0]
+	done = False
 
-	r = requests.get(outbox)
-	j = r.json()
-	# check for pleroma
-	pleroma = 'next' not in j
-	if pleroma:
-		if 'first' in j:
-			# backwards compatibility for older (pre-v1.0.7) pleroma instances
-			j = j['first']
-	else:
-		uri = "{}&min_id={}".format(outbox, last_post)
-		r = requests.get(uri)
+	try:
+		r = requests.get(outbox, timeout = 10)
+		j = r.json()
+		# check for pleroma
+		pleroma = 'next' not in j
+		if pleroma:
+			if 'first' in j:
+				# backwards compatibility for older (pre-v1.0.7) pleroma instances
+				j = j['first']
+		else:
+			uri = "{}&min_id={}".format(outbox, last_post)
+			r = requests.get(uri, timeout = 10)
+			j = r.json()
+	except:
+		print("Couldn't load or parse outbox at URL {}".format(outbox))
+		done = True
 
 	# here we go!
 	# warning: scraping posts from outbox.json is messy stuff
-	done = False
 	while not done and len(j['orderedItems']) > 0:
 		for oi in j['orderedItems']:
 			if oi['type'] == "Create":
```
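The reworked fetch boils down to: request the outbox with a timeout, detect Pleroma by the missing 'next' key, and bail out cleanly if anything fails. A standalone sketch of that flow (the function name and parameters are illustrative; the exception classes assume the `requests` library):

```python
import requests

def fetch_outbox_page(outbox, last_post=None):
	# Returns the first outbox page as a dict, or None on failure.
	try:
		r = requests.get(outbox, timeout=10)
		j = r.json()
		pleroma = 'next' not in j  # Pleroma outboxes lack a 'next' link here
		if pleroma:
			if 'first' in j:
				# older (pre-v1.0.7) Pleroma embeds the first page directly
				j = j['first']
		elif last_post is not None:
			# Mastodon: only fetch posts newer than the last one stored
			r = requests.get("{}&min_id={}".format(outbox, last_post), timeout=10)
			j = r.json()
		return j
	except (requests.RequestException, ValueError):
		# ValueError covers a response body that isn't valid JSON
		print("Couldn't load or parse outbox at URL {}".format(outbox))
		return None
```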
```diff
@@ -78,19 +83,24 @@ def scrape_posts(account):
 				))
 			except:
 				#TODO: error handling
 				raise
 				print("Failed to insert post {} for user {}".format(post_id, handle))
 
 		if not done:
-			if pleroma:
-				if 'next' in j:
-					r = requests.get(j['next'], timeout = 10)
-				else:
-					done = True
-			else:
-				if 'prev' in j:
-					r = requests.get(j['prev'], timeout = 10)
-				else:
-					done = True
+			try:
+				if pleroma:
+					if 'next' in j:
+						r = requests.get(j['next'], timeout = 10)
+					else:
+						done = True
+				else:
+					if 'prev' in j:
+						r = requests.get(j['prev'], timeout = 10)
+					else:
+						done = True
+			except requests.Timeout:
+				print("Timed out while loading next page for {}".format(handle))
+			except:
+				print("Encountered unknown error while getting next page for {}".format(handle))
 
 		if r.status_code == 429:
 			# we are now being ratelimited, move on to the next user
```
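Two details in this hunk are worth spelling out: Python tries `except` clauses in order, so the specific `requests.Timeout` handler must precede the bare `except:`, and the paging link differs by backend (the code follows 'next' on Pleroma and 'prev' on Mastodon). A rough sketch of that paging step (the `get_next_page` helper is illustrative, not from the repo):

```python
import requests

def get_next_page(j, pleroma, handle):
	# Follow the outbox paging link: 'next' on Pleroma, 'prev' on Mastodon.
	key = 'next' if pleroma else 'prev'
	if key not in j:
		return None  # no further pages; caller sets done = True
	try:
		r = requests.get(j[key], timeout=10)
		return r.json()
	except requests.Timeout:
		# must come before the broad handler: Python uses the first match
		print("Timed out while loading next page for {}".format(handle))
	except Exception:
		print("Encountered unknown error while getting next page for {}".format(handle))
	return None
```

One wrinkle the diff leaves in place: when a page fetch fails, the later `r.status_code == 429` check still reads the previous response object.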
```diff
@@ -1,4 +1,4 @@
-from flask import Flask, render_template, session, request, redirect, url_for, send_file
+from flask import Flask, render_template, session, request, redirect, url_for, send_file, jsonify
 from flask_mysqldb import MySQL
 
 from mastodon import Mastodon
```
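The only change here is importing `jsonify`, Flask's helper for turning a dict into a JSON response with the right Content-Type. A minimal usage sketch (the route and payload are illustrative, not from this codebase):

```python
from flask import Flask, jsonify

app = Flask(__name__)

@app.route("/api/ping")  # illustrative route, not from the diff
def ping():
	# jsonify serialises the dict and sets Content-Type: application/json
	return jsonify({"ok": True})
```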
```diff
@@ -370,6 +370,11 @@ def img_bot_generic():
 def favicon():
 	return send_file("static/favicon.ico")
 
+# @app.route("/.well-known/webfinger")
+# def webfinger():
+# 	return render_template("webfinger.json", base_uri = cfg['base_uri']), 200, {'Content-type':'application/json'}
+
+
 def bot_check(bot):
 	# check to ensure bot is owned by user
 	c = mysql.connection.cursor()
```
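The commented-out webfinger route relies on Flask's three-tuple return form, `(body, status, headers)`, to serve a rendered template with an explicit JSON content type. A small sketch of that pattern (the standalone app and the base URI are illustrative):

```python
from flask import Flask, render_template

app = Flask(__name__)

@app.route("/.well-known/webfinger")
def webfinger():
	# Flask accepts (body, status, headers) tuples as view return values,
	# so a template can be served as JSON without jsonify.
	body = render_template("webfinger.json", base_uri="https://example.com")
	return body, 200, {'Content-Type': 'application/json'}
```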