diff --git a/.gitignore b/.gitignore
index 6123955..a99b6f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,6 @@
 
 # A symlink produced by default when running `nix build`.
 result
+
+# The results of the `chess.com` coach scraping.
+data/
diff --git a/main.py b/main.py
index a77c3f1..e7fe86e 100644
--- a/main.py
+++ b/main.py
@@ -8,66 +8,70 @@ from bs4 import BeautifulSoup
 
 
 # References to paths we use to save any scraped content.
-DATA_COACH_LINKS = "data/coach_links.txt"
+DATA_COACH_LIST = "data/pages/{}.txt"
 DATA_COACH_DIR = "data/coach/{}/{}"
 
 
 async def scrape_coach_links(page_no):
     """Scrape a single coach page listing."""
     links = []
+    href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
     async with aiohttp.ClientSession() as session:
-        href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
         async with session.get(href) as response:
             if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {href}")
+                print(f"Encountered {response.status} when retrieving {href}.")
                 return
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             members = soup.find_all("a", class_="members-categories-username")
             for member in members:
                 links.append(member.get("href"))
 
     return links
 
 
-async def scrape_all_coach_links(max_pages=62):
-    """Scans through chess.com/coaches for all member links."""
-    if os.path.isfile(DATA_COACH_LINKS):
-        with open(DATA_COACH_LINKS, "r") as f:
-            return f.readlines()
+async def scrape_all_coach_links(max_pages=64):
+    """Scan through https://www.chess.com/coaches for all member links."""
+    links = []
     for i in range(1, max_pages + 1):
-        # Nest the file context manager here so I can `tail -f` the file.
-        with open(DATA_COACH_LINKS, "a") as f:
+        filepath = DATA_COACH_LIST.format(i)
+        if os.path.isfile(filepath):
+            with open(filepath, "r") as f:
+                links = f.readlines()
+            print(f"{filepath} already exists.")
+        else:
             links = await scrape_coach_links(i)
-            for link in links:
-                f.write(f"{link}\n")
-            await asyncio.sleep(random.randint(2, 5))
+            with open(filepath, "w") as f:
+                for link in links:
+                    f.write(f"{link}\n")
+            print(f"Downloaded page {i} of coach list.")
+            await asyncio.sleep(random.randint(3, 7))
+    return links
 
 
 async def download_member_info(member_name, filename, href):
     """Download member-specific content.
 
-    @return: True if we downloaded content. False if the results already
-        existed locally.
+    @return: True if we downloaded content. False if the download already
+        exists locally.
""" target = DATA_COACH_DIR.format(member_name, filename) if os.path.isfile(target): return False - with open(target, "w") as f: - async with aiohttp.ClientSession() as session: - async with session.get(href) as response: - if response.status != 200: - print(f"Encountered {response.status} when retrieving {href}") - return + async with aiohttp.ClientSession() as session: + async with session.get(href) as response: + if response.status != 200: + print(f"Encountered {response.status} when retrieving {href}") + return + with open(target, "w") as f: f.write(await response.text()) return True async def main(): links = await scrape_all_coach_links() - for link in links: - href = link.strip() + for href in [link.strip() for link in links]: member_name = href[len("https://www.chess.com/member/") :] downloaded = await asyncio.gather( download_member_info( @@ -86,14 +90,14 @@ async def main(): f"https://www.chess.com/callback/member/stats/{member_name}", ), ) - # Only want to sleep if the files didn't already exist. if any(downloaded): - await asyncio.sleep(random.randint(2, 5)) - print(f"Downloaded {member_name}") + await asyncio.sleep(random.randint(3, 7)) + print(f"Downloaded {member_name} info.") else: - print(f"Skipping {member_name}") + print(f"Skipping {member_name} download.") if __name__ == "__main__": + os.makedirs("data/pages", exist_ok=True) os.makedirs("data/coach", exist_ok=True) asyncio.run(main())