Break coach listing download per-page.

pull/2/head
Joshua Potter 2023-11-27 13:30:22 -07:00
parent 1710e1aefa
commit 23aa622126
2 changed files with 35 additions and 28 deletions

.gitignore (3 changes)

@@ -3,3 +3,6 @@
 # A symlink produced by default when running `nix build`.
 result
+
+# The results of the `chess.com` coach scraping.
+data/
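
The newly ignored data/ directory is where both caches set up in main.py below land: one text file of coach links per listing page, and one directory of downloaded files per coach. For orientation, here is how the two path templates from main.py expand; a sketch only, and the example page number, member name, and file name are made up:

# Illustration only: expanding the two path templates defined in main.py below.
DATA_COACH_LIST = "data/pages/{}.txt"
DATA_COACH_DIR = "data/coach/{}/{}"

print(DATA_COACH_LIST.format(1))                     # -> data/pages/1.txt
print(DATA_COACH_DIR.format("some-coach", "stats"))  # -> data/coach/some-coach/stats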

main.py (60 changes)

@@ -8,66 +8,70 @@ from bs4 import BeautifulSoup
 # References to paths we use to save any scraped content.
-DATA_COACH_LINKS = "data/coach_links.txt"
+DATA_COACH_LIST = "data/pages/{}.txt"
 DATA_COACH_DIR = "data/coach/{}/{}"


 async def scrape_coach_links(page_no):
     """Scrape a single coach page listing."""
     links = []
-    href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
     async with aiohttp.ClientSession() as session:
+        href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
         async with session.get(href) as response:
             if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {href}")
+                print(f"Encountered {response.status} when retrieving {href}.")
                 return
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             members = soup.find_all("a", class_="members-categories-username")
             for member in members:
                 links.append(member.get("href"))

     return links


-async def scrape_all_coach_links(max_pages=62):
-    """Scans through chess.com/coaches for all member links."""
-    if os.path.isfile(DATA_COACH_LINKS):
-        with open(DATA_COACH_LINKS, "r") as f:
-            return f.readlines()
+async def scrape_all_coach_links(max_pages=64):
+    """Scan through https://www.chess.com/coaches for all member links."""
+    links = []
     for i in range(1, max_pages + 1):
-        # Nest the file context manager here so I can `tail -f` the file.
-        with open(DATA_COACH_LINKS, "a") as f:
-            links = await scrape_coach_links(i)
-            for link in links:
-                f.write(f"{link}\n")
-        await asyncio.sleep(random.randint(2, 5))
+        filepath = DATA_COACH_LIST.format(i)
+        if os.path.isfile(filepath):
+            with open(filepath, "r") as f:
+                links = f.readlines()
+            print(f"{filepath} already exists.")
+        else:
+            links = await scrape_coach_links(i)
+            with open(filepath, "w") as f:
+                for link in links:
+                    f.write(f"{link}\n")
+            print(f"Downloaded page {i} of coach list.")
+            await asyncio.sleep(random.randint(3, 7))

     return links


 async def download_member_info(member_name, filename, href):
     """Download member-specific content.

-    @return: True if we downloaded content. False if the results already
-    existed locally.
+    @return: True if we downloaded content. False if the download already
+    exists locally.
     """
     target = DATA_COACH_DIR.format(member_name, filename)
     if os.path.isfile(target):
         return False
-    with open(target, "w") as f:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(href) as response:
-                if response.status != 200:
-                    print(f"Encountered {response.status} when retrieving {href}")
-                    return
+    async with aiohttp.ClientSession() as session:
+        async with session.get(href) as response:
+            if response.status != 200:
+                print(f"Encountered {response.status} when retrieving {href}")
+                return
+            with open(target, "w") as f:
                 f.write(await response.text())
     return True


 async def main():
     links = await scrape_all_coach_links()
-    for link in links:
-        href = link.strip()
+    for href in [link.strip() for link in links]:
         member_name = href[len("https://www.chess.com/member/") :]
         downloaded = await asyncio.gather(
             download_member_info(
@@ -86,14 +90,14 @@ async def main():
                 f"https://www.chess.com/callback/member/stats/{member_name}",
             ),
         )
-        # Only want to sleep if the files didn't already exist.
         if any(downloaded):
-            await asyncio.sleep(random.randint(2, 5))
-            print(f"Downloaded {member_name}")
+            await asyncio.sleep(random.randint(3, 7))
+            print(f"Downloaded {member_name} info.")
         else:
-            print(f"Skipping {member_name}")
+            print(f"Skipping {member_name} download.")


 if __name__ == "__main__":
+    os.makedirs("data/pages", exist_ok=True)
     os.makedirs("data/coach", exist_ok=True)
     asyncio.run(main())
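
One consequence of breaking the listing download up per page is that there is no longer a single data/coach_links.txt holding every link. A minimal sketch of how downstream code could stitch the cached pages back into one list, assuming the data/pages/{}.txt layout above (load_all_coach_links is a hypothetical helper, not part of this repository):

# Illustration only: fold every cached listing page back into one link list.
import glob


def load_all_coach_links(pages_dir="data/pages"):
    links = []
    # Lexicographic order is fine here; page order does not matter for the set of links.
    for path in sorted(glob.glob(f"{pages_dir}/*.txt")):
        with open(path, "r") as f:
            # scrape_all_coach_links writes one coach href per line.
            links.extend(line.strip() for line in f if line.strip())
    return links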