mirror of
https://github.com/Lynnesbian/FediBooks/
synced 2024-11-25 08:38:59 +00:00
added http timeouts to initial requests, better error handling
This commit is contained in:
parent
ed08240619
commit
87b9f64d75
1 changed file with 30 additions and 21 deletions
|
@ -30,18 +30,22 @@ def scrape_posts(account):
|
|||
c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
|
||||
last_post = c.fetchone()[0]
|
||||
|
||||
r = requests.get(outbox)
|
||||
j = r.json()
|
||||
# check for pleroma
|
||||
pleroma = 'next' not in j
|
||||
if pleroma:
|
||||
if 'first' in j:
|
||||
# backwards compatibility for older (pre-v1.0.7) pleroma instances
|
||||
j = j['first']
|
||||
else:
|
||||
uri = "{}&min_id={}".format(outbox, last_post)
|
||||
r = requests.get(uri)
|
||||
try:
|
||||
r = requests.get(outbox, timeout = 10)
|
||||
j = r.json()
|
||||
# check for pleroma
|
||||
pleroma = 'next' not in j
|
||||
if pleroma:
|
||||
if 'first' in j:
|
||||
# backwards compatibility for older (pre-v1.0.7) pleroma instances
|
||||
j = j['first']
|
||||
else:
|
||||
uri = "{}&min_id={}".format(outbox, last_post)
|
||||
r = requests.get(uri, timeout = 10)
|
||||
j = r.json()
|
||||
except:
|
||||
print("Couldn't load or parse outbox at URL {}".format(outbox))
|
||||
done = True
|
||||
|
||||
# here we go!
|
||||
# warning: scraping posts from outbox.json is messy stuff
|
||||
|
@ -78,19 +82,24 @@ def scrape_posts(account):
|
|||
))
|
||||
except:
|
||||
#TODO: error handling
|
||||
raise
|
||||
print("Failed to insert post {} for user {}".format(handle, post_id))
|
||||
|
||||
if not done:
|
||||
if pleroma:
|
||||
if 'next' in j:
|
||||
r = requests.get(j['next'], timeout = 10)
|
||||
try:
|
||||
if pleroma:
|
||||
if 'next' in j:
|
||||
r = requests.get(j['next'], timeout = 10)
|
||||
else:
|
||||
done = True
|
||||
else:
|
||||
done = True
|
||||
else:
|
||||
if 'prev' in j:
|
||||
r = requests.get(j['prev'], timeout = 10)
|
||||
else:
|
||||
done = True
|
||||
if 'prev' in j:
|
||||
r = requests.get(j['prev'], timeout = 10)
|
||||
else:
|
||||
done = True
|
||||
except requests.Timeout:
|
||||
print("Timed out while loading next page for {}".format(handle))
|
||||
except:
|
||||
print("Encountered unknown error while getting next page for {}".format(handle))
|
||||
|
||||
if r.status_code == 429:
|
||||
# we are now being ratelimited, move on to the next user
|
||||
|
|
Loading…
Reference in a new issue