Fixed indentation issue; hopefully #21 is fixed now.

2025-08-02 17:04:48 +00:00 · 2019-09-17 18:12:32 +10:00 · 2019-09-17 18:12:32 +10:00 · 5d35519818
commit 5d35519818
parent 386d32367a
1 changed file with 48 additions and 48 deletions
--- a/scrape.py
+++ b/scrape.py
@@ -41,55 +41,55 @@ def scrape_posts(account):
 		r = requests.get(uri)
 		j = r.json()
-		# here we go!
+	# here we go!
-		# warning: scraping posts from outbox.json is messy stuff
+	# warning: scraping posts from outbox.json is messy stuff
-		done = False
+	done = False
-		while not done and len(j['orderedItems']) > 0:
+	while not done and len(j['orderedItems']) > 0:
-			for oi in j['orderedItems']:
+		for oi in j['orderedItems']:
-				if oi['type'] == "Create":
+			if oi['type'] == "Create":
-					# this is a status/post/toot/florp/whatever
+				# this is a status/post/toot/florp/whatever
-					# first, check to see if we already have this in the database
+				# first, check to see if we already have this in the database
-					post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
+				post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
-					c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
+				c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
-					count = c.fetchone()
+				count = c.fetchone()
-					if count is not None and int(count[0]) > 0:
+				if count is not None and int(count[0]) > 0:
-						# this post is already in the DB.
+					# this post is already in the DB.
-						# we'll set done to true because we've caught up to where we were last time.
+					# we'll set done to true because we've caught up to where we were last time.
 						done = True
 						# we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
 						continue
 					content = oi['object']['content']
 					# remove HTML tags and such from post
 					content = functions.extract_post(content)
 					if len(content) > 65535:
 						# post is too long to go into the DB
 						continue
 					try:
 						c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
 							handle,
 							post_id,
 							content,
 							1 if (oi['object']['summary'] != None and oi['object']['summary'] != "") else 0
 						))
 					except:
 						#TODO: error handling
 						raise
 			if not done:
 				if pleroma:
 					r = requests.get(j['next'], timeout = 10)
 				else:
 					r = requests.get(j['prev'], timeout = 10)
 				if r.status_code == 429:
 					# we are now being ratelimited, move on to the next user
 					print("Hit rate limit while scraping {}".format(handle))
 					done = True
-				else:
+					# we'll still iterate over the rest of the posts, though, in case there are still some new ones on this page.
-					j = r.json()
+					continue
 				content = oi['object']['content']
 				# remove HTML tags and such from post
 				content = functions.extract_post(content)
 				if len(content) > 65535:
 					# post is too long to go into the DB
 					continue
 				try:
 					c.execute("INSERT INTO `posts` (`fedi_id`, `post_id`, `content`, `cw`) VALUES (%s, %s, %s, %s)", (
 						handle,
 						post_id,
 						content,
 						1 if (oi['object']['summary'] != None and oi['object']['summary'] != "") else 0
 					))
 				except:
 					#TODO: error handling
 					raise
 		if not done:
 			if pleroma:
 				r = requests.get(j['next'], timeout = 10)
 			else:
 				r = requests.get(j['prev'], timeout = 10)
 			if r.status_code == 429:
 				# we are now being ratelimited, move on to the next user
 				print("Hit rate limit while scraping {}".format(handle))
 				done = True
 			else:
 				j = r.json()
 			db.commit()
editor.table_modal.label.rows
editor.table_modal.label.columns