1
0
Fork 0
mirror of https://github.com/Lynnesbian/FediBooks/ synced 2024-11-25 08:38:59 +00:00

added http timeouts to initial requests, better error handling

This commit is contained in:
Lynne Megido 2020-03-18 15:00:18 +10:00
parent ed08240619
commit 87b9f64d75
Signed by: lynnesbian
GPG key ID: F0A184B5213D9F90

View file

@@ -30,18 +30,22 @@ def scrape_posts(account):
c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,)) c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
last_post = c.fetchone()[0] last_post = c.fetchone()[0]
r = requests.get(outbox) try:
j = r.json() r = requests.get(outbox, timeout = 10)
# check for pleroma
pleroma = 'next' not in j
if pleroma:
if 'first' in j:
# backwards compatibility for older (pre-v1.0.7) pleroma instances
j = j['first']
else:
uri = "{}&min_id={}".format(outbox, last_post)
r = requests.get(uri)
j = r.json() j = r.json()
# check for pleroma
pleroma = 'next' not in j
if pleroma:
if 'first' in j:
# backwards compatibility for older (pre-v1.0.7) pleroma instances
j = j['first']
else:
uri = "{}&min_id={}".format(outbox, last_post)
r = requests.get(uri, timeout = 10)
j = r.json()
except:
print("Couldn't load or parse outbox at URL {}".format(outbox))
done = True
# here we go! # here we go!
# warning: scraping posts from outbox.json is messy stuff # warning: scraping posts from outbox.json is messy stuff
@@ -78,19 +82,24 @@ def scrape_posts(account):
)) ))
except: except:
#TODO: error handling #TODO: error handling
raise print("Failed to insert post {} for user {}".format(handle, post_id))
if not done: if not done:
if pleroma: try:
if 'next' in j: if pleroma:
r = requests.get(j['next'], timeout = 10) if 'next' in j:
r = requests.get(j['next'], timeout = 10)
else:
done = True
else: else:
done = True if 'prev' in j:
else: r = requests.get(j['prev'], timeout = 10)
if 'prev' in j: else:
r = requests.get(j['prev'], timeout = 10) done = True
else: except requests.Timeout:
done = True print("Timed out while loading next page for {}".format(handle))
except:
print("Encountered unknown error while getting next page for {}".format(handle))
if r.status_code == 429: if r.status_code == 429:
# we are now being ratelimited, move on to the next user # we are now being ratelimited, move on to the next user