Mirror of https://github.com/Lynnesbian/FediBooks/, synced 2024-11-26 00:58:59 +00:00

Compare commits

No commits in common. "bd2b0641537e1c65561de229fc6ffa272a2d02a7" and "ed08240619a90c37181ecda1c9e5826fb9e712e2" have entirely different histories.

bd2b064153...ed08240619

3 changed files with 24 additions and 41 deletions
@@ -25,8 +25,6 @@ def extract_post(post):
        ht.unwrap()

    for link in soup.select("a"): # convert <a href='https://example.com'>example.com</a> to just https://example.com
        if 'href' in link:
            # apparently not all a tags have a href, which is understandable if you're doing normal web stuff, but on a social media platform??
            link.insert_after(link["href"])
            link.decompose()
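Worth flagging in this hunk: in BeautifulSoup 4, the test 'href' in link checks a tag's children, not its attributes, so the guard likely does not do what its comment intends; link.has_attr("href") is the attribute check. A minimal runnable sketch of the flattening step with that correction (flatten_links is an illustrative name, not from the repo):

from bs4 import BeautifulSoup

def flatten_links(html):
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.select("a"):
        if link.has_attr("href"):  # attribute check; 'href' in link would test children
            # replace <a href='https://example.com'>example.com</a>
            # with the bare URL https://example.com
            link.insert_after(link["href"])
            link.decompose()
    return soup.get_text()

print(flatten_links("see <a href='https://example.com'>example.com</a>"))
# -> see https://example.com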
18  app/scrape.py  Executable file → Normal file
@@ -30,10 +30,7 @@ def scrape_posts(account):
         c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
         last_post = c.fetchone()[0]
-
-    done = False
-
     try:
-        r = requests.get(outbox, timeout = 10)
+        r = requests.get(outbox)
         j = r.json()
         # check for pleroma
         pleroma = 'next' not in j
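For orientation, the outbox fetch on the timeout-carrying side of this diff reads roughly as the self-contained sketch below. The outbox URL and last_post value are placeholders; the Pleroma test ('next' absent, first page embedded under 'first') is taken straight from the hunks.

import requests

outbox = "https://example.com/users/alice/outbox?page=true"  # placeholder
last_post = 0  # id of the newest post already stored, 0 if none

done = False
try:
    r = requests.get(outbox, timeout = 10)
    j = r.json()
    # check for pleroma: Pleroma embeds the first page, Mastodon links it
    pleroma = 'next' not in j
    if pleroma:
        j = j['first']
    else:
        # Mastodon: only fetch posts newer than the last one stored
        r = requests.get("{}&min_id={}".format(outbox, last_post), timeout = 10)
        j = r.json()
except Exception:
    print("Couldn't load or parse outbox at URL {}".format(outbox))
    done = True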
@@ -43,14 +40,12 @@ def scrape_posts(account):
             j = j['first']
         else:
             uri = "{}&min_id={}".format(outbox, last_post)
-            r = requests.get(uri, timeout = 10)
+            r = requests.get(uri)
             j = r.json()
     except:
         print("Couldn't load or parse outbox at URL {}".format(outbox))
         done = True

     # here we go!
     # warning: scraping posts from outbox.json is messy stuff
+    done = False
     while not done and len(j['orderedItems']) > 0:
         for oi in j['orderedItems']:
             if oi['type'] == "Create":
@@ -83,10 +78,9 @@ def scrape_posts(account):
                 ))
             except:
                 #TODO: error handling
                 print("Failed to insert post {} for user {}".format(post_id, handle))
                 raise

     if not done:
         try:
             if pleroma:
                 if 'next' in j:
-                    r = requests.get(j['next'], timeout = 10)
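The failure path at the top of this hunk prints which post could not be inserted and re-raises. As a standalone sketch of that pattern, assuming a MySQLdb-style connection; the column list is illustrative, since the real schema isn't shown in this diff:

import MySQLdb  # assumed driver; the web app side uses flask_mysqldb

db = MySQLdb.connect(host="localhost", user="user", passwd="pass", db="fedibooks")  # placeholder credentials
c = db.cursor()
handle, post_id, content = "user@example.com", "1", "hello"  # placeholder row

try:
    c.execute(
        "INSERT INTO `posts` (`fedi_id`, `post_id`, `content`) VALUES (%s, %s, %s)",
        (handle, post_id, content)  # parameterised, like the SELECT earlier
    )
    db.commit()
except Exception:
    # report which post failed before re-raising, as the hunk does
    print("Failed to insert post {} for user {}".format(post_id, handle))
    raise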
@@ -97,10 +91,6 @@ def scrape_posts(account):
-                    r = requests.get(j['prev'], timeout = 10)
                 else:
                     done = True
-            except requests.Timeout:
-                print("Timed out while loading next page for {}".format(handle))
             except:
                 print("Encountered unknown error while getting next page for {}".format(handle))

         if r.status_code == 429:
             # we are now being ratelimited, move on to the next user
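Together, the last two hunks are the page-advance step: fetch j['next'] (Pleroma) or j['prev'] (Mastodon, as in the hunks), treat timeouts separately from other failures, and watch for HTTP 429. A sketch following the timeout-carrying side; the 429 branch is truncated in the diff, so stopping the scrape there is an assumption, and the try/else guard on r is a small deviation so the sketch never touches an unset variable after a failed fetch.

import requests

# placeholder state standing in for the loop's real variables
pleroma = False
handle = "user@example.com"
done = False
j = {'next': "https://example.com/outbox?page=2",
     'prev': "https://example.com/outbox?page=true&min_id=1"}

if not done:
    try:
        if pleroma:
            r = requests.get(j['next'], timeout = 10)
        else:
            r = requests.get(j['prev'], timeout = 10)
    except requests.Timeout:
        print("Timed out while loading next page for {}".format(handle))
        done = True
    except Exception:
        print("Encountered unknown error while getting next page for {}".format(handle))
        done = True
    else:
        if r.status_code == 429:
            # rate limited; the diff truncates here, stopping is an assumption
            done = True
        else:
            j = r.json()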
@@ -1,4 +1,4 @@
-from flask import Flask, render_template, session, request, redirect, url_for, send_file, jsonify
+from flask import Flask, render_template, session, request, redirect, url_for, send_file
 from flask_mysqldb import MySQL

 from mastodon import Mastodon
@@ -370,11 +370,6 @@ def img_bot_generic():
 def favicon():
     return send_file("static/favicon.ico")

-# @app.route("/.well-known/webfinger")
-# def webfinger():
-#     return render_template("webfinger.json", base_uri = cfg['base_uri']), 200, {'Content-type':'application/json'}
-
-
 def bot_check(bot):
     # check to ensure bot is owned by user
     c = mysql.connection.cursor()
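The five deleted lines here are a commented-out WebFinger stub present only on the bd2b064 side. The pattern it sketches is Flask's (body, status, headers) tuple return; a minimal version, with app and cfg assumed to stand in for the Flask app and config mapping used elsewhere in the file, and the base_uri a placeholder:

from flask import Flask, render_template

app = Flask(__name__)
cfg = {'base_uri': "https://example.com"}  # placeholder config

@app.route("/.well-known/webfinger")
def webfinger():
    # render a JSON template and return it with an explicit content type
    return render_template("webfinger.json", base_uri = cfg['base_uri']), \
        200, {'Content-type': 'application/json'}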