mirror of
https://github.com/Lynnesbian/FediBooks/
synced 2024-11-25 08:38:59 +00:00
fixed some more issues that didn't happen before
This commit is contained in:
parent
8fceadf93d
commit
f982e54a2d
1 changed file with 6 additions and 3 deletions
|
@@ -1,3 +1,5 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import MySQLdb
|
import MySQLdb
|
||||||
import requests
|
import requests
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
|
@@ -14,7 +16,7 @@ def scrape_posts(account):
|
||||||
last_post = 0
|
last_post = 0
|
||||||
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
|
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s", (handle,))
|
||||||
count = c.fetchone()
|
count = c.fetchone()
|
||||||
if count is not None and count[0] > 0:
|
if count is not None and int(count[0]) > 0:
|
||||||
# we've downloaded this user's posts before
|
# we've downloaded this user's posts before
|
||||||
# find out the most recently downloaded post of theirs
|
# find out the most recently downloaded post of theirs
|
||||||
c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
|
c.execute("SELECT `post_id` FROM `posts` WHERE `fedi_id` = %s ORDER BY `id` DESC LIMIT 1", (handle,))
|
||||||
|
@@ -41,7 +43,8 @@ def scrape_posts(account):
|
||||||
# first, check to see if we already have this in the database
|
# first, check to see if we already have this in the database
|
||||||
post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
|
post_id = re.search(r"([^\/]+)/?$", oi['object']['id']).group(1) # extract 123 from https://example.com/posts/123/
|
||||||
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
|
c.execute("SELECT COUNT(*) FROM `posts` WHERE `fedi_id` = %s AND `post_id` = %s", (handle, post_id))
|
||||||
if c.fetchone()[0] > 0:
|
count = c.fetchone()
|
||||||
|
if count is not None and int(count[0]) > 0:
|
||||||
# this post is already in the DB.
|
# this post is already in the DB.
|
||||||
# we'll set done to true because we've caught up to where we were last time.
|
# we'll set done to true because we've caught up to where we were last time.
|
||||||
done = True
|
done = True
|
||||||
|
@@ -80,7 +83,7 @@ def scrape_posts(account):
|
||||||
j = r.json()
|
j = r.json()
|
||||||
|
|
||||||
db.commit()
|
db.commit()
|
||||||
c.close()
|
|
||||||
print("Finished scraping {}".format(handle))
|
print("Finished scraping {}".format(handle))
|
||||||
|
|
||||||
print("Establishing DB connection")
|
print("Establishing DB connection")
|
||||||
|
|
Loading…
Reference in a new issue