1
0
Fork 0
mirror of https://github.com/Lynnesbian/FediBooks/ synced 2024-11-25 16:48:58 +00:00

Compare commits

...

3 commits

4 changed files with 111 additions and 8 deletions

26
functions.py Normal file
View file

@ -0,0 +1,26 @@
from bs4 import BeautifulSoup
import html, re
def extract_post(post):
    """Convert the HTML body of a fediverse post into plain text.

    Line breaks and paragraphs become newlines, hashtag links are reduced to
    their visible text, every other link is replaced by its target URL, and
    profile URLs are rewritten back into ``@user@instance`` mentions.

    Args:
        post: The post content as an HTML string.

    Returns:
        The plain-text rendering of the post with trailing newlines removed.
    """
    # NOTE(review): entities are unescaped *before* parsing, so escaped markup
    # inside a post (e.g. "&lt;b&gt;") is parsed as a real tag and its angle
    # brackets never reach the output. Kept as-is because existing callers may
    # rely on it -- confirm whether this is intended.
    post = html.unescape(post)  # convert HTML escape codes to text
    soup = BeautifulSoup(post, "html.parser")

    for lb in soup.select("br"):  # replace <br> with a linebreak
        lb.replace_with("\n")

    for p in soup.select("p"):  # ditto for <p>, keeping the paragraph's contents
        p.insert_after("\n")
        p.unwrap()

    for ht in soup.select("a.hashtag"):  # convert hashtags from links to text
        ht.unwrap()

    # convert <a href='https://example.com'>example.com</a> to just https://example.com
    for link in soup.select("a"):
        href = link.get("href")
        if href is None:
            # An anchor without a target (e.g. <a name="...">) has no URL to
            # substitute; keep its visible text instead of raising KeyError.
            link.unwrap()
        else:
            link.replace_with(href)

    text = soup.get_text()

    # URLs cannot contain whitespace, so stop at any whitespace character (not
    # only a space); otherwise a mention at the end of a line would swallow
    # the beginning of the following line.
    text = re.sub(r"https://([^/\s]+)/(@\S+)", r"\2@\1", text)  # put mastodon-style mentions back in
    text = re.sub(r"https://([^/\s]+)/users/(\S+)", r"@\2@\1", text)  # put pleroma-style mentions back in

    return text.rstrip("\n")  # remove trailing newline(s)

81
service.py Normal file → Executable file
View file

@ -3,15 +3,27 @@ from mastodon import Mastodon
import MySQLdb import MySQLdb
import requests import requests
from multiprocessing import Pool from multiprocessing import Pool
import json import json, re
import functions
cfg = json.load(open('config.json')) cfg = json.load(open('config.json'))
def scrape_posts(handle, outbox): def scrape_posts(account):
# check for min_id handle = account[0]
outbox = account[1]
print("Scraping {}".format(handle))
c = db.cursor()
last_post = 0 last_post = 0
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
if c.fetchone()[0] > 0:
# we've downloaded this user's posts before
# find out the most recently downloaded post of theirs
c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
last_post = c.fetchone()[0]
r = requests.get(outbox) r = requests.get(outbox)
j = r.json() j = r.json()
# check for pleroma
pleroma = 'next' not in j pleroma = 'next' not in j
if pleroma: if pleroma:
j = j['first'] j = j['first']
@ -20,6 +32,58 @@ def scrape_posts(handle, outbox):
r = requests.get(uri) r = requests.get(uri)
j = r.json() j = r.json()
# here we go!
# warning: scraping posts from outbox.json is messy stuff
done = False
while not done and len(j['orderedItems']) > 0:
for oi in j['orderedItems']:
if oi['type'] == "Create":
# this is a status/post/toot/florp/whatever
# first, check to see if we already have this in the database
post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
if c.fetchone()[0] > 0:
# this post is already in the DB.
# we'll set done to true because we've caught up to where we were last time.
done = True
# we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
continue
content = oi['object']['content']
# remove HTML tags and such from post
content = functions.extract_post(content)
if len(content) > 65535:
# post is too long to go into the DB
continue
try:
c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
handle,
post_id,
content,
1 if (oi['object']['summary'] != None and oi['object']['summary'] != "") else 0
))
except:
#TODO: error handling
raise
if not done:
if pleroma:
r = requests.get(j['next'], timeout = 10)
else:
r = requests.get(j['prev'], timeout = 10)
if r.status_code == 429:
# we are now being ratelimited, move on to the next user
done = True
else:
j = r.json()
db.commit()
c.close()
print("Finished {}".format(handle))
print("Establishing DB connection") print("Establishing DB connection")
db = MySQLdb.connect( db = MySQLdb.connect(
host = cfg['db_host'], host = cfg['db_host'],
@ -28,6 +92,13 @@ db = MySQLdb.connect(
db=cfg['db_name'] db=cfg['db_name']
) )
c = db.cursor()
print("Downloading posts") print("Downloading posts")
cursor = db.cursor()
cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts` ORDER BY RAND()")
accounts = cursor.fetchall()
cursor.close()
with Pool(8) as p:
p.map(scrape_posts, accounts)
#TODO: other cron tasks should be done here, like updating profile pictures

View file

@ -23,7 +23,7 @@
<div class="panel-text"> <div class="panel-text">
{% set handle_list = user['fedi_id'].split('@') %} {% set handle_list = user['fedi_id'].split('@') %}
<div class="panel-name">@{{ handle_list[1] }}<span class="subtle tiny">@{{ handle_list[2] }}</span></div> <div class="panel-name">@{{ handle_list[1] }}<span class="subtle tiny">@{{ handle_list[2] }}</span></div>
<div class="panel-status">{{ "Active" if user['enabled'] else "Inactive" }}</div> <div class="panel-status">{{ "Active" if user['enabled'] else "Inactive" }}, {{ post_count[user['fedi_id']] }} posts in database</div>
</div> </div>
<div class="panel-actions"> <div class="panel-actions">
<a class="button btn-secondary" href="/bot/accounts/toggle/{{ user['fedi_id'] }}" title="Turn on/off"><i class="fas fa-power-off"></i></a><a class="button btn-dangerous" href="/bot/accounts/delete/{{ user['fedi_id'] }}" title="Delete"><i class="fas fa-trash"></i></a> <a class="button btn-secondary" href="/bot/accounts/toggle/{{ user['fedi_id'] }}" title="Turn on/off"><i class="fas fa-power-off"></i></a><a class="button btn-dangerous" href="/bot/accounts/delete/{{ user['fedi_id'] }}" title="Delete"><i class="fas fa-trash"></i></a>

View file

@ -128,6 +128,7 @@ def bot_accounts(id):
c.execute("SELECT COUNT(*) FROM `bot_learned_accounts` WHERE `bot_id` = %s", (id,)) c.execute("SELECT COUNT(*) FROM `bot_learned_accounts` WHERE `bot_id` = %s", (id,))
user_count = c.fetchone()[0] user_count = c.fetchone()[0]
users = {} users = {}
post_count = {}
if user_count > 0: if user_count > 0:
dc = mysql.connection.cursor(MySQLdb.cursors.DictCursor) dc = mysql.connection.cursor(MySQLdb.cursors.DictCursor)
@ -135,9 +136,14 @@ def bot_accounts(id):
users = dc.fetchall() users = dc.fetchall()
dc.close() dc.close()
post_count = {}
for user in users:
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (user['fedi_id'],))
post_count[user['fedi_id']] = c.fetchone()[0]
c.close() c.close()
return render_template("bot_accounts.html", users = users) return render_template("bot_accounts.html", users = users, post_count = post_count)
@app.route("/bot/accounts/add", methods = ['GET', 'POST']) @app.route("/bot/accounts/add", methods = ['GET', 'POST'])
def bot_accounts_add(): def bot_accounts_add():
@ -214,7 +220,7 @@ def bot_accounts_delete(id):
c.close() c.close()
mysql.connection.commit() mysql.connection.commit()
return redirect(url_for("/bot/accounts/{}".format(session['bot'])), 303) return redirect("/bot/accounts/{}".format(session['bot']), 303)
@app.route("/bot/create/", methods=['GET', 'POST']) @app.route("/bot/create/", methods=['GET', 'POST'])
def bot_create(): def bot_create():