added http timeouts to initial requests, better error handling

Lynne Megido 2020-03-18 15:00:18 +10:00
parent ed08240619
commit 87b9f64d75
Signed by: lynnesbian
GPG Key ID: F0A184B5213D9F90

@@ -30,7 +30,8 @@ def scrape_posts(account):
     c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
     last_post = c.fetchone()[0]
-    r = requests.get(outbox)
+    try:
+        r = requests.get(outbox, timeout = 10)
         j = r.json()
         # check for pleroma
         pleroma = 'next' not in j
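
The hunk above puts the first outbox request inside a try block and gives it a ten-second timeout, so a dead or overloaded instance can no longer hang the scraper indefinitely (requests.get() has no timeout by default). A minimal standalone sketch of the same pattern, with the catch-all narrowed to specific exceptions (fetch_outbox is a hypothetical name, not a function in this codebase):

import requests

def fetch_outbox(outbox):
    # the timeout stops requests.get() from blocking forever on an
    # unresponsive server
    try:
        r = requests.get(outbox, timeout=10)
        return r.json()
    except requests.Timeout:
        print("Timed out while fetching {}".format(outbox))
    except (requests.RequestException, ValueError):
        # ValueError covers a response body that isn't valid JSON
        print("Couldn't load or parse outbox at URL {}".format(outbox))
    return None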
@@ -40,8 +41,11 @@ def scrape_posts(account):
             j = j['first']
         else:
             uri = "{}&min_id={}".format(outbox, last_post)
-            r = requests.get(uri)
+            r = requests.get(uri, timeout = 10)
             j = r.json()
+    except:
+        print("Couldn't load or parse outbox at URL {}".format(outbox))
+        done = True
     # here we go!
     # warning: scraping posts from outbox.json is messy stuff
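
The surrounding code also decides how to page through the outbox: with no top-level 'next' link the server is treated as Pleroma and the embedded first page is taken, otherwise the outbox is re-requested with min_id so only posts newer than the last stored one come back. Condensed into a sketch (load_first_page is hypothetical, and since the lines between the two hunks aren't shown, the branch structure is inferred; the "&min_id=" concatenation assumes outbox already carries a query string, as in the diff):

import requests

def load_first_page(outbox, last_post):
    j = requests.get(outbox, timeout=10).json()
    pleroma = 'next' not in j  # Pleroma outboxes lack a top-level 'next' link
    if pleroma:
        j = j['first']  # the first page is embedded in the collection itself
    else:
        # Mastodon-style: ask the server only for posts newer than the
        # newest one already in the database
        uri = "{}&min_id={}".format(outbox, last_post)
        j = requests.get(uri, timeout=10).json()
    return pleroma, j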
@@ -78,9 +82,10 @@ def scrape_posts(account):
                 ))
             except:
                 #TODO: error handling
-                raise
+                print("Failed to insert post {} for user {}".format(handle, post_id))
     if not done:
+        try:
             if pleroma:
                 if 'next' in j:
                     r = requests.get(j['next'], timeout = 10)
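
Two smaller changes land in this hunk: a failed database insert now logs the post and moves on instead of re-raising, and the pagination below gains a try so network errors can't crash the loop. The insert handling reduces to something like this sketch (store_post and its column list are invented for illustration; the diff only shows the closing parentheses of the real statement):

def store_post(c, handle, post_id, content):
    # hypothetical REPLACE statement standing in for the one in the diff
    try:
        c.execute("REPLACE INTO `posts` (`fedi_id`, `post_id`, `content`) VALUES (%s, %s, %s)",
                  (handle, post_id, content))
        return True
    except Exception:
        # one malformed post shouldn't abort the whole scrape for this user
        print("Failed to insert post {} for user {}".format(handle, post_id))
        return False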
@@ -91,6 +96,10 @@ def scrape_posts(account):
                     r = requests.get(j['prev'], timeout = 10)
                 else:
                     done = True
+        except requests.Timeout:
+            print("Timed out while loading next page for {}".format(handle))
+        except:
+            print("Encountered unknown error while getting next page for {}".format(handle))
     if r.status_code == 429:
         # we are now being ratelimited, move on to the next user
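
The status check at the end is where rate limiting is detected; this commit simply moves on to the next user. For reference, a 429 response usually carries a Retry-After header, so a politer variant could look like the sketch below (an illustrative extension, not part of this change):

import time

def wait_if_ratelimited(r):
    # returns True if the caller should sleep out the cooldown and retry
    if r.status_code == 429:
        retry_after = r.headers.get("Retry-After", "")
        if retry_after.isdigit():
            time.sleep(int(retry_after))  # honour the server's cooldown
            return True
    return False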