From 99c89a3a6dc73c97ff8cf14b2af9e707bd09452a Mon Sep 17 00:00:00 2001
From: Joshua Potter
Date: Mon, 27 Nov 2023 20:04:47 -0700
Subject: [PATCH] Restructure and add documentation. Require specifying user-agent.

---
 README.md |  21 +++--
 main.py   | 230 ++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 164 insertions(+), 87 deletions(-)
 mode change 100644 => 100755 main.py

diff --git a/README.md b/README.md
index dd94e27..c1f64b9 100644
--- a/README.md
+++ b/README.md
@@ -2,25 +2,25 @@

 **Caution! Be careful running this script.**

-We intentionally delay each request sent anywhere from 10 to 15 seconds. Make
-sure any adjustments to this script appropriately rate-limit.
+We intentionally delay each batch of requests by 3 seconds. Make sure any
+adjustments to this script appropriately rate-limit.

 ## Overview

 This is a simple web scraper for [chess.com](https://www.chess.com/coaches)
 coaches. Running:
 ```bash
-$> python3 main.py
+$> python3 main.py --user-agent <user-agent>
 ```
-will query [chess.com](https://www.chess.com) for all listed coaches as well as
-specific information about each of them (their profile, recent activity, and
-stats). The result will be found in a newly created `data` directory with the
-following structure:
+will query [chess.com](https://www.chess.com) for all listed coach usernames as
+well as specific information about each corresponding coach (their profile,
+recent activity, and stats). The result will be found in a newly created `data`
+directory with the following structure:
 ```
 data
 ├── coach
-│   ├── <member_name>
-│   │   ├── <member_name>.html
+│   ├── <username>
+│   │   ├── <username>.html
 │   │   ├── activity.json
 │   │   └── stats.json
 │   ├── ...
@@ -29,9 +29,6 @@ data
 ├── ...
 ```

-Here, `member_name` corresponds to the name of the coach whereas `pages`
-contains a fragmented list of URLs to coach profiles.
-
 ## Development

 This script was written using Python (version 3.11.6). Packaging and dependency
diff --git a/main.py b/main.py
old mode 100644
new mode 100755
index 92c29e7..88ff856
--- a/main.py
+++ b/main.py
@@ -1,115 +1,195 @@
+#!/usr/bin/env python3
+
 import aiohttp
+import argparse
 import asyncio
 import os
 import os.path
-import random

 from bs4 import BeautifulSoup

-# References to paths we use to save any scraped content.
+# The root directory containing downloaded files for a coach.
+DATA_COACH_DIR = "data/coach/{username}"
+
+# Where a part of coach-related data is stored.
+DATA_COACH_FILE = "data/coach/{username}/{filename}"
+
+# Where a part of all discovered coach usernames is stored.
 DATA_COACH_LIST = "data/pages/{page_no}.txt"
-DATA_COACH_DIR = "data/coach/{member_name}"
-DATA_COACH_FILE = "data/coach/{member_name}/{filename}"

-USER_AGENT = "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"
+# The "User-Agent" value set in every request to chess.com.
+USER_AGENT = "BoardWise chesscom-scraper ({user_agent})"
+
+# How long to wait between a batch of network requests.
+SLEEP_SECS = 3


-async def chesscom_request(url):
-    body = None
-    async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
-        async with session.get(url) as response:
-            if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {url}.")
-            else:
-                body = await response.text()
-
-    return body
+def ANSI_COLOR(s):
+    """Wrap a string in ANSI escape codes for colored console output."""
+    return f"\033[0;34m{s}\033[0m"  # Blue


-async def scrape_coach_links(page_no):
-    """Scrape a single coach page listing."""
+async def chesscom_request(session, url):
+    """Convenience function for network requests to chess.com.
+
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param url
+        The URL to send a request to.
+    @return
+        The text response returned by the server at @url, or None on failure.
+    """
+    async with session.get(url) as response:
+        if response.status == 200:
+            return await response.text()
+        print(f"Encountered {response.status} when retrieving {url}.")
+
+
+async def _scrape_page_coach_usernames(session, page_no):
+    """Scan through chess.com/coaches/?page=<n> for all coaches' usernames.
+
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param page_no
+        The page consisting of at most 25 coaches (at the time of writing)
+        whose usernames are to be scraped.
+    @return
+        The list of scraped usernames on the specified coach listing page.
+    """
     url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
-    response = await chesscom_request(url)
+    response = await chesscom_request(session, url)
     if response is None:
         return

-    links = []
+    usernames = []
     soup = BeautifulSoup(response, "html.parser")
     members = soup.find_all("a", class_="members-categories-username")
     for member in members:
-        links.append(member.get("href"))
+        href = member.get("href")
+        username = href[len("https://www.chess.com/member/") :]
+        usernames.append(username)

-    return links
+    return usernames


-async def scrape_all_coach_links(max_pages=64):
-    """Scan through https://www.chess.com/coaches for all member links."""
-    links = []
-    for i in range(1, max_pages + 1):
-        filepath = DATA_COACH_LIST.format(page_no=i)
-        if os.path.isfile(filepath):
-            with open(filepath, "r") as f:
-                links.extend(f.readlines())
-            print(f"{filepath} already exists.")
-        else:
-            links.extend(await scrape_coach_links(i))
-            with open(filepath, "w") as f:
-                for link in links:
-                    f.write(f"{link}\n")
-            print(f"Downloaded page {i} of coach list.")
-            await asyncio.sleep(random.randint(10, 15))
+async def _scrape_all_coach_usernames(session, max_pages=64):
+    """Scan through chess.com/coaches for all coaches' usernames.

-    return links
-
-
-async def download_member_info(member_name, filename, url):
-    """Download member-specific content.
-
-    @return: True if we make a network request. False otherwise.
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param max_pages
+        The number of pages we will at most iterate through. This number was
+        determined by going to chess.com/coaches?sortBy=alphabetical&page=1
+        and traversing to the last page.
+    @return
+        The complete list of scraped usernames across every coach listing page.
""" - filepath = DATA_COACH_FILE.format(member_name=member_name, filename=filename) + usernames = [] + for page_no in range(1, max_pages + 1): + filepath = DATA_COACH_LIST.format(page_no=page_no) + try: + with open(filepath, "r") as f: + usernames.extend(f.readlines()) + print(f"Skipping {ANSI_COLOR(filepath)}") + except FileNotFoundError: + page_usernames = await _scrape_page_coach_usernames(session, page_no) + with open(filepath, "w") as f: + for username in page_usernames: + f.write(f"{username}\n") + usernames.extend(page_usernames) + print(f"Downloaded {ANSI_COLOR(filepath)}") + await asyncio.sleep(SLEEP_SECS) + + return usernames + + +async def _download_coach_file(session, url, username, filename): + """Writes the contents of @url into `DATA_COACH_FILE`. + + @param session + The `aiohttp.ClientSession` context our requests are made from. + @param url + The URL of the file to download. + @param username + The coach username corresponding to the downloaded file. + @param filename + The output file to write the downloaded content to. + @return: + True if we make a network request. False otherwise. + """ + filepath = DATA_COACH_FILE.format(username=username, filename=filename) if os.path.isfile(filepath): return False - response = await chesscom_request(url) + response = await chesscom_request(session, url) if response is not None: with open(filepath, "w") as f: f.write(response) - return True +async def _download_coach_data(session, username): + """Download coach-related data to the `DATA_COACH_DIR` directory. + + This sends three parallel requests for: + * the coach's profile, + * the coach's recent activity, + * the coach's stats. + + @param session + The `aiohttp.ClientSession` context our requests are made from. + @param username + The coach username corresponding to the downloaded files. 
+ """ + used_network = await asyncio.gather( + _download_coach_file( + session, + url=f"https://www.chess.com/member/{username}", + username=username, + filename=f"{username}.html", + ), + _download_coach_file( + session, + url=f"https://www.chess.com/callback/member/activity/{username}?page=1", + username=username, + filename="activity.json", + ), + _download_coach_file( + session, + url=f"https://www.chess.com/callback/member/stats/{username}", + username=username, + filename="stats.json", + ), + ) + if any(used_network): + print(f"Downloaded {ANSI_COLOR(username)}") + await asyncio.sleep(SLEEP_SECS) + else: + print(f"Skipping {ANSI_COLOR(username)}") + + async def main(): - links = await scrape_all_coach_links() - for url in [link.strip() for link in links]: - member_name = url[len("https://www.chess.com/member/") :] - os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True) - made_network_request = await asyncio.gather( - download_member_info( - member_name, - f"{member_name}.html", - url, - ), - download_member_info( - member_name, - "activity.json", - f"https://www.chess.com/callback/member/activity/{member_name}?page=1", - ), - download_member_info( - member_name, - "stats.json", - f"https://www.chess.com/callback/member/stats/{member_name}", - ), - ) - if any(made_network_request): - await asyncio.sleep(random.randint(10, 15)) - print(f"Downloaded {member_name} info.") - else: - print(f"Skipping {member_name} download.") + parser = argparse.ArgumentParser( + prog="chesscom-scraper", + description="HTML scraping of chess.com coaches.", + ) + parser.add_argument("-u", "--user-agent", required=True) + args = parser.parse_args() + + os.makedirs("data/pages", exist_ok=True) + os.makedirs("data/coach", exist_ok=True) + + async with aiohttp.ClientSession( + headers={"User-Agent": USER_AGENT.format(user_agent=args.user_agent)} + ) as session: + # Retrieve all coaches on the platform. + usernames = await _scrape_all_coach_usernames(session) + # For each coach, download relevant data. + for username in [u.strip() for u in usernames]: + os.makedirs(DATA_COACH_DIR.format(username=username), exist_ok=True) + await _download_coach_data(session, username) if __name__ == "__main__": - os.makedirs("data/pages", exist_ok=True) - os.makedirs("data/coach", exist_ok=True) asyncio.run(main())