Mirror of https://github.com/Lynnesbian/FediBooks/, synced 2024-11-25 08:38:59 +00:00
added http timeouts to initial requests, better error handling
parent ed08240619
commit 87b9f64d75

1 changed file with 30 additions and 21 deletions
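For readers skimming the diff: without a `timeout`, `requests.get` can block indefinitely on a stalled server, wedging the whole scraper. Passing `timeout = 10` makes it raise `requests.Timeout` after ten seconds of connect or read inactivity (it is not a cap on total download time). A minimal sketch of the pattern, separate from the commit's own code; the helper name is illustrative:

	import requests

	def fetch_json(url):
		# Illustrative helper, not from the commit: bound the wait and
		# surface failures instead of hanging the scraper.
		try:
			r = requests.get(url, timeout = 10)  # raises requests.Timeout after ~10s of inactivity
			return r.json()                      # raises a ValueError subclass on non-JSON bodies
		except requests.Timeout:
			print("Timed out while loading {}".format(url))
		except requests.RequestException as e:
			print("Couldn't load {}: {}".format(url, e))
		return None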
@@ -30,7 +30,8 @@ def scrape_posts(account):
 	c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
 	last_post = c.fetchone()[0]
 
-	r = requests.get(outbox)
+	try:
+		r = requests.get(outbox, timeout = 10)
 		j = r.json()
 		# check for pleroma
 		pleroma = 'next' not in j
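An aside, not part of the commit: the `j = r.json()` that follows can itself raise, since a reachable server may still answer with a non-JSON body such as an HTML error page, which is presumably why the `try:` opened here stays open until the `except:` in the next hunk. A small illustration against a hypothetical URL:

	import requests

	# Hypothetical URL: a missing page typically returns HTML, not JSON.
	r = requests.get("https://example.com/missing/outbox.json", timeout = 10)
	try:
		j = r.json()
	except ValueError:  # requests raises a ValueError subclass when the body isn't JSON
		print("Response from {} wasn't valid JSON".format(r.url))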
@@ -40,8 +41,11 @@ def scrape_posts(account):
 			j = j['first']
 		else:
 			uri = "{}&min_id={}".format(outbox, last_post)
-			r = requests.get(uri)
+			r = requests.get(uri, timeout = 10)
 			j = r.json()
+	except:
+		print("Couldn't load or parse outbox at URL {}".format(outbox))
+		done = True
 
 	# here we go!
 	# warning: scraping posts from outbox.json is messy stuff
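A hedged review note of mine: the bare `except:` added here also swallows `KeyboardInterrupt` and `SystemExit`. A narrower sketch of the same recovery, reworked as a hypothetical helper that mirrors the diff's `outbox` and `done` names:

	import requests

	def load_outbox(outbox):
		# Sketch only: returns (json, done) in the spirit of the diff's flags.
		try:
			r = requests.get(outbox, timeout = 10)
			return r.json(), False
		except requests.Timeout:
			print("Timed out while loading outbox at URL {}".format(outbox))
		except (requests.RequestException, ValueError):
			# connection failures, or a body that wasn't JSON
			print("Couldn't load or parse outbox at URL {}".format(outbox))
		return None, True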
@@ -78,9 +82,10 @@ def scrape_posts(account):
 				))
 			except:
 				#TODO: error handling
-				raise
+				print("Failed to insert post {} for user {}".format(post_id, handle))
 
 		if not done:
+			try:
 				if pleroma:
 					if 'next' in j:
 						r = requests.get(j['next'], timeout = 10)
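Another hedged aside: replacing `raise` with a `print` stops one malformed post from aborting the whole scrape, but it also throws away the traceback. A middle-ground sketch; the wrapper and the `content` column are hypothetical, though `posts`, `fedi_id`, and `post_id` appear in the SELECT earlier in this file:

	import traceback

	def insert_post(c, handle, post_id, content):
		# Hypothetical wrapper around the diff's INSERT, for illustration only.
		try:
			c.execute(
				"INSERT INTO `posts` (`fedi_id`, `post_id`, `content`) VALUES (%s, %s, %s)",
				(handle, post_id, content)
			)
		except Exception:
			print("Failed to insert post {} for user {}".format(post_id, handle))
			traceback.print_exc()  # keep the cause visible without stopping the scrape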
@@ -91,6 +96,10 @@ def scrape_posts(account):
 						r = requests.get(j['prev'], timeout = 10)
 					else:
 						done = True
+			except requests.Timeout:
+				print("Timed out while loading next page for {}".format(handle))
+			except:
+				print("Encountered unknown error while getting next page for {}".format(handle))
 
 		if r.status_code == 429:
 			# we are now being ratelimited, move on to the next user
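One last observation, mine rather than the commit's: when the page fetch raises, `r` and `j` keep their values from the previous iteration, so the `r.status_code == 429` check below inspects a stale response and the loop may retry on old data. A sketch of one way around that, as a hypothetical helper that signals failure explicitly:

	import requests

	def get_next_page(url, handle):
		# Returns the parsed page, or None so the caller can stop paginating
		# instead of reusing a stale response object.
		try:
			r = requests.get(url, timeout = 10)
			if r.status_code == 429:
				print("Rate limited while fetching posts for {}".format(handle))
				return None
			return r.json()
		except requests.Timeout:
			print("Timed out while loading next page for {}".format(handle))
			return None
		except (requests.RequestException, ValueError):
			print("Couldn't load or parse next page for {}".format(handle))
			return None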