mirror of
https://github.com/Lynnesbian/FediBooks/
synced 2024-11-25 16:48:58 +00:00
Compare commits
No commits in common. "c38f01b07adb8a701b749d72ac4c2dee38ded0ca" and "dd3679d8db4051d2570bf506b5346ede123030ea" have entirely different histories.
c38f01b07a
...
dd3679d8db
4 changed files with 8 additions and 111 deletions
26
functions.py
26
functions.py
|
@ -1,26 +0,0 @@
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import html, re
|
|
||||||
|
|
||||||
def extract_post(post):
    """Convert the HTML body of a fediverse post into plain text.

    Line breaks and paragraphs become newlines, hashtag links are reduced to
    their visible text, every other link is replaced by its target URL, and
    profile URLs are then rewritten as ``@user@instance`` mentions.

    :param post: HTML content of the post, as a string.
    :return: The plain-text rendering, with trailing newlines removed.
    """
    # NOTE(review): entities are decoded *before* parsing, so escaped markup
    # inside a post (e.g. "&lt;b&gt;") is parsed as real tags below --
    # confirm that this is intended.
    post = html.unescape(post)  # convert HTML escape codes to text
    soup = BeautifulSoup(post, "html.parser")

    for lb in soup.select("br"):  # replace <br> with linebreak
        lb.insert_after("\n")
        lb.decompose()

    for p in soup.select("p"):  # ditto for <p>
        p.insert_after("\n")
        p.unwrap()

    for ht in soup.select("a.hashtag"):  # convert hashtags from links to text
        ht.unwrap()

    # convert <a href='https://example.com'>example.com</a> to just
    # https://example.com
    for link in soup.select("a"):
        href = link.get("href")
        if href is None:
            # An anchor without a target has no URL to substitute; keep its
            # text instead of raising KeyError.
            link.unwrap()
            continue
        link.insert_after(href)
        link.decompose()

    text = soup.get_text()

    # The user part is matched with \S (not "[^ ]") and the host excludes
    # whitespace, so a mention at the end of a line does not swallow the
    # newline and the start of the following line.
    # put mastodon-style mentions back in
    text = re.sub(r"https://([^/\s]+)/(@\S+)", r"\2@\1", text)
    # put pleroma-style mentions back in
    text = re.sub(r"https://([^/\s]+)/users/(\S+)", r"@\2@\1", text)

    text = text.rstrip("\n")  # remove trailing newline(s)
    return text
|
|
81
service.py
Executable file → Normal file
81
service.py
Executable file → Normal file
|
@ -3,27 +3,15 @@ from mastodon import Mastodon
|
||||||
import MySQLdb
|
import MySQLdb
|
||||||
import requests
|
import requests
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
import json, re
|
import json
|
||||||
import functions
|
|
||||||
|
|
||||||
cfg = json.load(open('config.json'))
|
cfg = json.load(open('config.json'))
|
||||||
|
|
||||||
def scrape_posts(account):
|
def scrape_posts(handle, outbox):
|
||||||
handle = account[0]
|
# check for min_id
|
||||||
outbox = account[1]
|
|
||||||
print("Scraping {}".format(handle))
|
|
||||||
c = db.cursor()
|
|
||||||
last_post = 0
|
last_post = 0
|
||||||
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
|
|
||||||
if c.fetchone()[0] > 0:
|
|
||||||
# we've downloaded this user's posts before
|
|
||||||
# find out the most recently downloaded post of theirs
|
|
||||||
c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
|
|
||||||
last_post = c.fetchone()[0]
|
|
||||||
|
|
||||||
r = requests.get(outbox)
|
r = requests.get(outbox)
|
||||||
j = r.json()
|
j = r.json()
|
||||||
# check for pleroma
|
|
||||||
pleroma = 'next' not in j
|
pleroma = 'next' not in j
|
||||||
if pleroma:
|
if pleroma:
|
||||||
j = j['first']
|
j = j['first']
|
||||||
|
@ -32,58 +20,6 @@ def scrape_posts(account):
|
||||||
r = requests.get(uri)
|
r = requests.get(uri)
|
||||||
j = r.json()
|
j = r.json()
|
||||||
|
|
||||||
# here we go!
|
|
||||||
# warning: scraping posts from outbox.json is messy stuff
|
|
||||||
done = False
|
|
||||||
while not done and len(j['orderedItems']) > 0:
|
|
||||||
for oi in j['orderedItems']:
|
|
||||||
if oi['type'] == "Create":
|
|
||||||
# this is a status/post/toot/florp/whatever
|
|
||||||
# first, check to see if we already have this in the database
|
|
||||||
post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
|
|
||||||
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
|
|
||||||
if c.fetchone()[0] > 0:
|
|
||||||
# this post is already in the DB.
|
|
||||||
# we'll set done to true because we've caught up to where we were last time.
|
|
||||||
done = True
|
|
||||||
# we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
|
|
||||||
continue
|
|
||||||
|
|
||||||
content = oi['object']['content']
|
|
||||||
# remove HTML tags and such from post
|
|
||||||
content = functions.extract_post(content)
|
|
||||||
|
|
||||||
if len(content) > 65535:
|
|
||||||
# post is too long to go into the DB
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
|
|
||||||
handle,
|
|
||||||
post_id,
|
|
||||||
content,
|
|
||||||
1 if (oi['object']['summary'] != None and oi['object']['summary'] != "") else 0
|
|
||||||
))
|
|
||||||
except:
|
|
||||||
#TODO: error handling
|
|
||||||
raise
|
|
||||||
|
|
||||||
if not done:
|
|
||||||
if pleroma:
|
|
||||||
r = requests.get(j['next'], timeout = 10)
|
|
||||||
else:
|
|
||||||
r = requests.get(j['prev'], timeout = 10)
|
|
||||||
|
|
||||||
if r.status_code == 429:
|
|
||||||
# we are now being ratelimited, move on to the next user
|
|
||||||
done = True
|
|
||||||
else:
|
|
||||||
j = r.json()
|
|
||||||
|
|
||||||
db.commit()
|
|
||||||
c.close()
|
|
||||||
print("Finished {}".format(handle))
|
|
||||||
|
|
||||||
print("Establishing DB connection")
|
print("Establishing DB connection")
|
||||||
db = MySQLdb.connect(
|
db = MySQLdb.connect(
|
||||||
host = cfg['db_host'],
|
host = cfg['db_host'],
|
||||||
|
@ -92,13 +28,6 @@ db = MySQLdb.connect(
|
||||||
db=cfg['db_name']
|
db=cfg['db_name']
|
||||||
)
|
)
|
||||||
|
|
||||||
|
c = db.cursor()
|
||||||
|
|
||||||
print("Downloading posts")
|
print("Downloading posts")
|
||||||
|
|
||||||
cursor = db.cursor()
|
|
||||||
cursor.execute("SELECT `handle`, `outbox` FROM `fedi_accounts` ORDER BY RAND()")
|
|
||||||
accounts = cursor.fetchall()
|
|
||||||
cursor.close()
|
|
||||||
with Pool(8) as p:
|
|
||||||
p.map(scrape_posts, accounts)
|
|
||||||
|
|
||||||
#TODO: other cron tasks should be done here, like updating profile pictures
|
|
||||||
|
|
|
@ -23,7 +23,7 @@
|
||||||
<div class="panel-text">
|
<div class="panel-text">
|
||||||
{% set handle_list = user['fedi_id'].split('@') %}
|
{% set handle_list = user['fedi_id'].split('@') %}
|
||||||
<div class="panel-name">@{{ handle_list[1] }}<span class="subtle tiny">@{{ handle_list[2] }}</span></div>
|
<div class="panel-name">@{{ handle_list[1] }}<span class="subtle tiny">@{{ handle_list[2] }}</span></div>
|
||||||
<div class="panel-status">{{ "Active" if user['enabled'] else "Inactive" }}, {{ post_count[user['fedi_id']] }} posts in database</div>
|
<div class="panel-status">{{ "Active" if user['enabled'] else "Inactive" }}</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="panel-actions">
|
<div class="panel-actions">
|
||||||
<a class="button btn-secondary" href="/bot/accounts/toggle/{{ user['fedi_id'] }}" title="Turn on/off"><i class="fas fa-power-off"></i></a><a class="button btn-dangerous" href="/bot/accounts/delete/{{ user['fedi_id'] }}" title="Delete"><i class="fas fa-trash"></i></a>
|
<a class="button btn-secondary" href="/bot/accounts/toggle/{{ user['fedi_id'] }}" title="Turn on/off"><i class="fas fa-power-off"></i></a><a class="button btn-dangerous" href="/bot/accounts/delete/{{ user['fedi_id'] }}" title="Delete"><i class="fas fa-trash"></i></a>
|
||||||
|
|
10
webui.py
10
webui.py
|
@ -128,7 +128,6 @@ def bot_accounts(id):
|
||||||
c.execute("SELECT COUNT(*) FROM `bot_learned_accounts` WHERE `bot_id` = %s", (id,))
|
c.execute("SELECT COUNT(*) FROM `bot_learned_accounts` WHERE `bot_id` = %s", (id,))
|
||||||
user_count = c.fetchone()[0]
|
user_count = c.fetchone()[0]
|
||||||
users = {}
|
users = {}
|
||||||
post_count = {}
|
|
||||||
|
|
||||||
if user_count > 0:
|
if user_count > 0:
|
||||||
dc = mysql.connection.cursor(MySQLdb.cursors.DictCursor)
|
dc = mysql.connection.cursor(MySQLdb.cursors.DictCursor)
|
||||||
|
@ -136,14 +135,9 @@ def bot_accounts(id):
|
||||||
users = dc.fetchall()
|
users = dc.fetchall()
|
||||||
dc.close()
|
dc.close()
|
||||||
|
|
||||||
post_count = {}
|
|
||||||
for user in users:
|
|
||||||
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (user['fedi_id'],))
|
|
||||||
post_count[user['fedi_id']] = c.fetchone()[0]
|
|
||||||
|
|
||||||
c.close()
|
c.close()
|
||||||
|
|
||||||
return render_template("bot_accounts.html", users = users, post_count = post_count)
|
return render_template("bot_accounts.html", users = users)
|
||||||
|
|
||||||
@app.route("/bot/accounts/add", methods = ['GET', 'POST'])
|
@app.route("/bot/accounts/add", methods = ['GET', 'POST'])
|
||||||
def bot_accounts_add():
|
def bot_accounts_add():
|
||||||
|
@ -220,7 +214,7 @@ def bot_accounts_delete(id):
|
||||||
c.close()
|
c.close()
|
||||||
mysql.connection.commit()
|
mysql.connection.commit()
|
||||||
|
|
||||||
return redirect("/bot/accounts/{}".format(session['bot']), 303)
|
return redirect(url_for("/bot/accounts/{}".format(session['bot'])), 303)
|
||||||
|
|
||||||
@app.route("/bot/create/", methods=['GET', 'POST'])
|
@app.route("/bot/create/", methods=['GET', 'POST'])
|
||||||
def bot_create():
|
def bot_create():
|
||||||
|
|
Loading…
Reference in a new issue