#!/usr/bin/env python3 import aiohttp import argparse import asyncio import os import os.path from bs4 import BeautifulSoup # The root directory containing downloaded files for a coach. DATA_COACH_DIR = "data/coach/{username}" # Where a part of coach-related data is stored. DATA_COACH_FILE = "data/coach/{username}/{filename}" # Where a part of all discovered coach usernames is stored. DATA_COACH_LIST = "data/pages/{page_no}.txt" # The "User-Agent" value set in every request to chess.com. USER_AGENT = "BoardWise chesscom-scraper ({user_agent})" # How long to wait between a batch of network requests. SLEEP_SECS = 3 def ANSI_COLOR(s): """Print colored output to the console.""" return f"\033[0;34m{s}\033[0m" # Blue async def chesscom_request(session, url): """Convenience function for network requests to chess.com. @param session The `aiohttp.ClientSession` context our requests are made from. @param url The URL to send a request to. @return The text response returned by the server at @url. """ async with session.get(url) as response: if response.status == 200: return await response.text() print(f"Encountered {response.status} when retrieving {url}.") async def _scrape_page_coach_usernames(session, page_no): """Scan through chess.com/coaches/?page= for all coaches' usernames. @param session The `aiohttp.ClientSession` context our requests are made from. @param page_no The page consisting of at most 25 coaches (at the time of writing) whose usernames are to be scraped. @return The list of scraped usernames on the specified coach listing page. """ url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}" response = await chesscom_request(session, url) if response is None: return usernames = [] soup = BeautifulSoup(response, "html.parser") members = soup.find_all("a", class_="members-categories-username") for member in members: href = member.get("href") username = href[len("https://www.chess.com/member/") :] usernames.append(username) return usernames async def _scrape_all_coach_usernames(session, max_pages=64): """Scan through chess.com/coaches for all coaches' usernames. @param session The `aiohttp.ClientSession` context our requests are made from. @param max_pages The number of pages we will at most iterate through. This number was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and traversing to the last page. @return The complete list of scraped usernames across every coach listing page. """ usernames = [] for page_no in range(1, max_pages + 1): filepath = DATA_COACH_LIST.format(page_no=page_no) try: with open(filepath, "r") as f: usernames.extend(f.readlines()) print(f"Skipping {ANSI_COLOR(filepath)}") except FileNotFoundError: page_usernames = await _scrape_page_coach_usernames(session, page_no) with open(filepath, "w") as f: for username in page_usernames: f.write(f"{username}\n") usernames.extend(page_usernames) print(f"Downloaded {ANSI_COLOR(filepath)}") await asyncio.sleep(SLEEP_SECS) return usernames async def _download_coach_file(session, url, username, filename): """Writes the contents of @url into `DATA_COACH_FILE`. @param session The `aiohttp.ClientSession` context our requests are made from. @param url The URL of the file to download. @param username The coach username corresponding to the downloaded file. @param filename The output file to write the downloaded content to. @return: True if we make a network request. False otherwise. """ filepath = DATA_COACH_FILE.format(username=username, filename=filename) if os.path.isfile(filepath): return False response = await chesscom_request(session, url) if response is not None: with open(filepath, "w") as f: f.write(response) return True async def _download_coach_data(session, username): """Download coach-related data to the `DATA_COACH_DIR` directory. This sends three parallel requests for: * the coach's profile, * the coach's recent activity, * the coach's stats. @param session The `aiohttp.ClientSession` context our requests are made from. @param username The coach username corresponding to the downloaded files. """ used_network = await asyncio.gather( _download_coach_file( session, url=f"https://www.chess.com/member/{username}", username=username, filename=f"{username}.html", ), _download_coach_file( session, url=f"https://www.chess.com/callback/member/activity/{username}?page=1", username=username, filename="activity.json", ), _download_coach_file( session, url=f"https://www.chess.com/callback/member/stats/{username}", username=username, filename="stats.json", ), ) if any(used_network): print(f"Downloaded {ANSI_COLOR(username)}") await asyncio.sleep(SLEEP_SECS) else: print(f"Skipping {ANSI_COLOR(username)}") async def main(): parser = argparse.ArgumentParser( prog="chesscom-scraper", description="HTML scraping of chess.com coaches.", ) parser.add_argument("-u", "--user-agent", required=True) args = parser.parse_args() os.makedirs("data/pages", exist_ok=True) os.makedirs("data/coach", exist_ok=True) async with aiohttp.ClientSession( headers={"User-Agent": USER_AGENT.format(user_agent=args.user_agent)} ) as session: # Retrieve all coaches on the platform. usernames = await _scrape_all_coach_usernames(session) # For each coach, download relevant data. for username in [u.strip() for u in usernames]: os.makedirs(DATA_COACH_DIR.format(username=username), exist_ok=True) await _download_coach_data(session, username) if __name__ == "__main__": asyncio.run(main())