mirror of
https://github.com/Lynnesbian/FediBooks/
synced 2024-11-25 16:48:58 +00:00
Compare commits
No commits in common. "fc8be5db40ee6bdd4852e20bf62beb81b45b8f7a" and "386d32367a01da8d87c46f9ac0ed97e6ff9c1f59" have entirely different histories.
fc8be5db40
...
386d32367a
3 changed files with 49 additions and 50 deletions
96
scrape.py
96
scrape.py
|
@ -41,55 +41,55 @@ def scrape_posts(account):
|
|||
r = requests.get(uri)
|
||||
j = r.json()
|
||||
|
||||
# here we go!
|
||||
# warning: scraping posts from outbox.json is messy stuff
|
||||
done = False
|
||||
while not done and len(j['orderedItems']) > 0:
|
||||
for oi in j['orderedItems']:
|
||||
if oi['type'] == "Create":
|
||||
# this is a status/post/toot/florp/whatever
|
||||
# first, check to see if we already have this in the database
|
||||
post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
|
||||
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
|
||||
count = c.fetchone()
|
||||
if count is not None and int(count[0]) > 0:
|
||||
# this post is already in the DB.
|
||||
# we'll set done to true because we've caught up to where we were last time.
|
||||
# here we go!
|
||||
# warning: scraping posts from outbox.json is messy stuff
|
||||
done = False
|
||||
while not done and len(j['orderedItems']) > 0:
|
||||
for oi in j['orderedItems']:
|
||||
if oi['type'] == "Create":
|
||||
# this is a status/post/toot/florp/whatever
|
||||
# first, check to see if we already have this in the database
|
||||
post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
|
||||
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
|
||||
count = c.fetchone()
|
||||
if count is not None and int(count[0]) > 0:
|
||||
# this post is already in the DB.
|
||||
# we'll set done to true because we've caught up to where we were last time.
|
||||
done = True
|
||||
# we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
|
||||
continue
|
||||
|
||||
content = oi['object']['content']
|
||||
# remove HTML tags and such from post
|
||||
content = functions.extract_post(content)
|
||||
|
||||
if len(content) > 65535:
|
||||
# post is too long to go into the DB
|
||||
continue
|
||||
|
||||
try:
|
||||
c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
|
||||
handle,
|
||||
post_id,
|
||||
content,
|
||||
1 if (oi['object']['summary'] != None and oi['object']['summary'] != "") else 0
|
||||
))
|
||||
except:
|
||||
#TODO: error handling
|
||||
raise
|
||||
|
||||
if not done:
|
||||
if pleroma:
|
||||
r = requests.get(j['next'], timeout = 10)
|
||||
else:
|
||||
r = requests.get(j['prev'], timeout = 10)
|
||||
|
||||
if r.status_code == 429:
|
||||
# we are now being ratelimited, move on to the next user
|
||||
print("Hit rate limit while scraping {}".format(handle))
|
||||
done = True
|
||||
# we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
|
||||
continue
|
||||
|
||||
content = oi['object']['content']
|
||||
# remove HTML tags and such from post
|
||||
content = functions.extract_post(content)
|
||||
|
||||
if len(content) > 65535:
|
||||
# post is too long to go into the DB
|
||||
continue
|
||||
|
||||
try:
|
||||
c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
|
||||
handle,
|
||||
post_id,
|
||||
content,
|
||||
1 if (oi['object']['summary'] != None and oi['object']['summary'] != "") else 0
|
||||
))
|
||||
except:
|
||||
#TODO: error handling
|
||||
raise
|
||||
|
||||
if not done:
|
||||
if pleroma:
|
||||
r = requests.get(j['next'], timeout = 10)
|
||||
else:
|
||||
r = requests.get(j['prev'], timeout = 10)
|
||||
|
||||
if r.status_code == 429:
|
||||
# we are now being ratelimited, move on to the next user
|
||||
print("Hit rate limit while scraping {}".format(handle))
|
||||
done = True
|
||||
else:
|
||||
j = r.json()
|
||||
else:
|
||||
j = r.json()
|
||||
|
||||
db.commit()
|
||||
|
||||
|
|
|
@ -22,7 +22,6 @@ CREATE TABLE IF NOT EXISTS `bots` (
|
|||
`push_private_key` BINARY(128) NOT NULL,
|
||||
`push_public_key` BINARY(128) NOT NULL,
|
||||
`push_secret` BINARY(16),
|
||||
`instance_type` VARCHAR(64) NOT NULL DEFAULT 'Mastodon',
|
||||
`enabled` BOOLEAN DEFAULT 0,
|
||||
`replies_enabled` BOOLEAN DEFAULT 1,
|
||||
`post_frequency` SMALLINT UNSIGNED DEFAULT 30,
|
||||
|
|
2
webui.py
2
webui.py
|
@ -483,7 +483,7 @@ def bot_create():
|
|||
secret = privated['auth']
|
||||
client.push_subscription_set("{}/push/{}".format(cfg['base_uri'], handle), publicd, mention_events = True)
|
||||
|
||||
c.execute("INSERT INTO `bots` (handle, user_id, credentials_id, push_public_key, push_private_key, push_secret, instance_type) VALUES (%s, %s, %s, %s, %s, %s)", (handle, session['user_id'], credentials_id, public, private, secret, session['instance_type']))
|
||||
c.execute("INSERT INTO `bots` (handle, user_id, credentials_id, push_public_key, push_private_key, push_secret) VALUES (%s, %s, %s, %s, %s, %s)", (handle, session['user_id'], credentials_id, public, private, secret))
|
||||
mysql.connection.commit()
|
||||
c.close()
|
||||
|
||||
|
|
Loading…
Reference in a new issue