added http timeouts to initial requests, better error handling

Lynne Megido 2020-03-18 15:00:18 +10:00
parent ed08240619
commit 87b9f64d75
Signed by: lynnesbian
GPG Key ID: F0A184B5213D9F90

@@ -30,7 +30,8 @@ def scrape_posts(account):
     c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
     last_post = c.fetchone()[0]
-    r = requests.get(outbox)
+    try:
+        r = requests.get(outbox, timeout = 10)
         j = r.json()
         # check for pleroma
         pleroma = 'next' not in j
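
The hunk above puts the first outbox request inside a try block and gives it a ten-second timeout, so a dead or overloaded instance can no longer hang the scraper indefinitely (requests.get() has no timeout by default). A minimal standalone sketch of the same pattern, with the catch-all narrowed to specific exceptions (fetch_outbox is a hypothetical name, not a function in this codebase):

import requests

def fetch_outbox(outbox):
    # the timeout stops requests.get() from blocking forever on an
    # unresponsive server
    try:
        r = requests.get(outbox, timeout=10)
        return r.json()
    except requests.Timeout:
        print("Timed out while fetching {}".format(outbox))
    except (requests.RequestException, ValueError):
        # ValueError covers a response body that isn't valid JSON
        print("Couldn't load or parse outbox at URL {}".format(outbox))
    return None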
@@ -40,8 +41,11 @@ def scrape_posts(account):
             j = j['first']
         else:
             uri = "{}&min_id={}".format(outbox, last_post)
-            r = requests.get(uri)
+            r = requests.get(uri, timeout = 10)
             j = r.json()
+    except:
+        print("Couldn't load or parse outbox at URL {}".format(outbox))
+        done = True
     # here we go!
     # warning: scraping posts from outbox.json is messy stuff
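
The surrounding code also decides how to page through the outbox: with no top-level 'next' link the server is treated as Pleroma and the embedded first page is taken, otherwise the outbox is re-requested with min_id so only posts newer than the last stored one come back. Condensed into a sketch (load_first_page is hypothetical, and since the lines between the two hunks aren't shown, the branch structure is inferred; the "&min_id=" concatenation assumes outbox already carries a query string, as in the diff):

import requests

def load_first_page(outbox, last_post):
    j = requests.get(outbox, timeout=10).json()
    pleroma = 'next' not in j  # Pleroma outboxes lack a top-level 'next' link
    if pleroma:
        j = j['first']  # the first page is embedded in the collection itself
    else:
        # Mastodon-style: ask the server only for posts newer than the
        # newest one already in the database
        uri = "{}&min_id={}".format(outbox, last_post)
        j = requests.get(uri, timeout=10).json()
    return pleroma, j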
@@ -78,9 +82,10 @@ def scrape_posts(account):
                 ))
             except:
                 #TODO: error handling
-                raise
+                print("Failed to insert post {} for user {}".format(handle, post_id))
     if not done:
+        try:
             if pleroma:
                 if 'next' in j:
                     r = requests.get(j['next'], timeout = 10)
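
Two smaller changes land in this hunk: a failed database insert now logs the post and moves on instead of re-raising, and the pagination below gains a try so network errors can't crash the loop. The insert handling reduces to something like this sketch (store_post and its column list are invented for illustration; the diff only shows the closing parentheses of the real statement):

def store_post(c, handle, post_id, content):
    # hypothetical REPLACE statement standing in for the one in the diff
    try:
        c.execute("REPLACE INTO `posts` (`fedi_id`, `post_id`, `content`) VALUES (%s, %s, %s)",
                  (handle, post_id, content))
        return True
    except Exception:
        # one malformed post shouldn't abort the whole scrape for this user
        print("Failed to insert post {} for user {}".format(handle, post_id))
        return False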
@@ -91,6 +96,10 @@ def scrape_posts(account):
                     r = requests.get(j['prev'], timeout = 10)
                 else:
                     done = True
+        except requests.Timeout:
+            print("Timed out while loading next page for {}".format(handle))
+        except:
+            print("Encountered unknown error while getting next page for {}".format(handle))
     if r.status_code == 429:
         # we are now being ratelimited, move on to the next user
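
The status check at the end is where rate limiting is detected; this commit simply moves on to the next user. For reference, a 429 response usually carries a Retry-After header, so a politer variant could look like the sketch below (an illustrative extension, not part of this change):

import time

def wait_if_ratelimited(r):
    # returns True if the caller should sleep out the cooldown and retry
    if r.status_code == 429:
        retry_after = r.headers.get("Retry-After", "")
        if retry_after.isdigit():
            time.sleep(int(retry_after))  # honour the server's cooldown
            return True
    return False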