"""Scraper for https://www.chess.com/coaches.

Downloads the paginated coach listing and, for each listed coach, their
profile page, recent activity, and stats. Results are cached on disk so
re-runs only fetch what is missing.
"""

import aiohttp
import asyncio
import os
import os.path
import random

from bs4 import BeautifulSoup

# Paths we use to save any scraped content.
DATA_COACH_LIST = "data/pages/{page_no}.txt"
DATA_COACH_DIR = "data/coach/{member_name}"
DATA_COACH_FILE = "data/coach/{member_name}/{filename}"

# Identify ourselves to chess.com on every request.
USER_AGENT = "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"


async def chesscom_request(url):
    """GET `url`, returning the response body or None on a non-200 status."""
    body = None
    async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {url}.")
            else:
                body = await response.text()

    return body


async def scrape_coach_links(page_no):
    """Scrape a single page of the coach listing."""
    url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
    response = await chesscom_request(url)
    if response is None:
        # Return an empty list (not None) so callers can safely extend().
        return []

    links = []
    soup = BeautifulSoup(response, "html.parser")
    members = soup.find_all("a", class_="members-categories-username")
    for member in members:
        links.append(member.get("href"))

    return links


async def scrape_all_coach_links(max_pages=64):
    """Scan through https://www.chess.com/coaches for all member links."""
    links = []
    for i in range(1, max_pages + 1):
        filepath = DATA_COACH_LIST.format(page_no=i)
        if os.path.isfile(filepath):
            # Page was already scraped on a previous run; reuse the cache.
            with open(filepath, "r") as f:
                links.extend(f.readlines())
            print(f"{filepath} already exists.")
        else:
            # Cache only this page's links so each file maps to one page.
            page_links = await scrape_coach_links(i)
            with open(filepath, "w") as f:
                for link in page_links:
                    f.write(f"{link}\n")
            links.extend(page_links)
            print(f"Downloaded page {i} of coach list.")
            # Throttle between fresh requests to stay polite.
            await asyncio.sleep(random.randint(10, 15))

    return links


async def download_member_info(member_name, filename, url):
    """Download member-specific content.

    @return: True if we made a network request. False otherwise.
    """
    filepath = DATA_COACH_FILE.format(member_name=member_name, filename=filename)
    if os.path.isfile(filepath):
        return False

    response = await chesscom_request(url)
    if response is not None:
        with open(filepath, "w") as f:
            f.write(response)

    return True


async def main():
    links = await scrape_all_coach_links()
    for url in [link.strip() for link in links]:
        member_name = url[len("https://www.chess.com/member/"):]
        os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True)
        made_network_request = await asyncio.gather(
            download_member_info(
                member_name,
                f"{member_name}.html",
                url,
            ),
            download_member_info(
                member_name,
                "activity.json",
                f"https://www.chess.com/callback/member/activity/{member_name}?page=1",
            ),
            download_member_info(
                member_name,
                "stats.json",
                f"https://www.chess.com/callback/member/stats/{member_name}",
            ),
        )
        if any(made_network_request):
            # Only sleep if at least one download actually hit the network.
            await asyncio.sleep(random.randint(10, 15))
            print(f"Downloaded {member_name} info.")
        else:
            print(f"Skipping {member_name} download.")


if __name__ == "__main__":
    os.makedirs("data/pages", exist_ok=True)
    os.makedirs("data/coach", exist_ok=True)
    asyncio.run(main())
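
# Usage: a minimal sketch, assuming the script is saved as scrape.py (the
# filename is hypothetical) and both third-party dependencies are installed:
#
#   pip install aiohttp beautifulsoup4
#   python scrape.py
#
# Coach listing pages are cached under data/pages/ and per-coach content
# under data/coach/<member_name>/, so interrupted runs can be resumed.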