mirror of
https://github.com/Lynnesbian/FediBooks/
synced 2024-11-26 00:58:59 +00:00
Compare commits
No commits in common. "bd2b0641537e1c65561de229fc6ffa272a2d02a7" and "ed08240619a90c37181ecda1c9e5826fb9e712e2" have entirely different histories.
bd2b064153
...
ed08240619
3 changed files with 24 additions and 41 deletions
|
@ -25,9 +25,7 @@ def extract_post(post):
|
||||||
ht.unwrap()
|
ht.unwrap()
|
||||||
|
|
||||||
for link in soup.select("a"): #ocnvert <a href='https://example.com>example.com</a> to just https://example.com
|
for link in soup.select("a"): #ocnvert <a href='https://example.com>example.com</a> to just https://example.com
|
||||||
if 'href' in link:
|
link.insert_after(link["href"])
|
||||||
# apparently not all a tags have a href, which is understandable if you're doing normal web stuff, but on a social media platform??
|
|
||||||
link.insert_after(link["href"])
|
|
||||||
link.decompose()
|
link.decompose()
|
||||||
|
|
||||||
text = soup.get_text()
|
text = soup.get_text()
|
||||||
|
|
54
app/scrape.py
Executable file → Normal file
54
app/scrape.py
Executable file → Normal file
|
@ -30,27 +30,22 @@ def scrape_posts(account):
|
||||||
c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
|
c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
|
||||||
last_post = c.fetchone()[0]
|
last_post = c.fetchone()[0]
|
||||||
|
|
||||||
done = False
|
r = requests.get(outbox)
|
||||||
|
j = r.json()
|
||||||
try:
|
# check for pleroma
|
||||||
r = requests.get(outbox, timeout = 10)
|
pleroma = 'next' not in j
|
||||||
|
if pleroma:
|
||||||
|
if 'first' in j:
|
||||||
|
# backwards compatibility for older (pre-v1.0.7) pleroma instances
|
||||||
|
j = j['first']
|
||||||
|
else:
|
||||||
|
uri = "{}&min_id={}".format(outbox, last_post)
|
||||||
|
r = requests.get(uri)
|
||||||
j = r.json()
|
j = r.json()
|
||||||
# check for pleroma
|
|
||||||
pleroma = 'next' not in j
|
|
||||||
if pleroma:
|
|
||||||
if 'first' in j:
|
|
||||||
# backwards compatibility for older (pre-v1.0.7) pleroma instances
|
|
||||||
j = j['first']
|
|
||||||
else:
|
|
||||||
uri = "{}&min_id={}".format(outbox, last_post)
|
|
||||||
r = requests.get(uri, timeout = 10)
|
|
||||||
j = r.json()
|
|
||||||
except:
|
|
||||||
print("Couldn't load or parse outbox at URL {}".format(outbox))
|
|
||||||
done = True
|
|
||||||
|
|
||||||
# here we go!
|
# here we go!
|
||||||
# warning: scraping posts from outbox.json is messy stuff
|
# warning: scraping posts from outbox.json is messy stuff
|
||||||
|
done = False
|
||||||
while not done and len(j['orderedItems']) > 0:
|
while not done and len(j['orderedItems']) > 0:
|
||||||
for oi in j['orderedItems']:
|
for oi in j['orderedItems']:
|
||||||
if oi['type'] == "Create":
|
if oi['type'] == "Create":
|
||||||
|
@ -83,24 +78,19 @@ def scrape_posts(account):
|
||||||
))
|
))
|
||||||
except:
|
except:
|
||||||
#TODO: error handling
|
#TODO: error handling
|
||||||
print("Failed to insert post {} for user {}".format(post_id, handle))
|
raise
|
||||||
|
|
||||||
if not done:
|
if not done:
|
||||||
try:
|
if pleroma:
|
||||||
if pleroma:
|
if 'next' in j:
|
||||||
if 'next' in j:
|
r = requests.get(j['next'], timeout = 10)
|
||||||
r = requests.get(j['next'], timeout = 10)
|
|
||||||
else:
|
|
||||||
done = True
|
|
||||||
else:
|
else:
|
||||||
if 'prev' in j:
|
done = True
|
||||||
r = requests.get(j['prev'], timeout = 10)
|
else:
|
||||||
else:
|
if 'prev' in j:
|
||||||
done = True
|
r = requests.get(j['prev'], timeout = 10)
|
||||||
except requests.Timeout:
|
else:
|
||||||
print("Timed out while loading next page for {}".format(handle))
|
done = True
|
||||||
except:
|
|
||||||
print("Encountered unknown error while getting next page for {}".format(handle))
|
|
||||||
|
|
||||||
if r.status_code == 429:
|
if r.status_code == 429:
|
||||||
# we are now being ratelimited, move on to the next user
|
# we are now being ratelimited, move on to the next user
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from flask import Flask, render_template, session, request, redirect, url_for, send_file, jsonify
|
from flask import Flask, render_template, session, request, redirect, url_for, send_file
|
||||||
from flask_mysqldb import MySQL
|
from flask_mysqldb import MySQL
|
||||||
|
|
||||||
from mastodon import Mastodon
|
from mastodon import Mastodon
|
||||||
|
@ -370,11 +370,6 @@ def img_bot_generic():
|
||||||
def favicon():
|
def favicon():
|
||||||
return send_file("static/favicon.ico")
|
return send_file("static/favicon.ico")
|
||||||
|
|
||||||
# @app.route("/.well-known/webfinger")
|
|
||||||
# def webfinger():
|
|
||||||
# return render_template("webfinger.json", base_uri = cfg['base_uri']), 200, {'Content-type':'application/json'}
|
|
||||||
|
|
||||||
|
|
||||||
def bot_check(bot):
|
def bot_check(bot):
|
||||||
# check to ensure bot is owned by user
|
# check to ensure bot is owned by user
|
||||||
c = mysql.connection.cursor()
|
c = mysql.connection.cursor()
|
||||||
|
|
Loading…
Reference in a new issue