From 99c89a3a6dc73c97ff8cf14b2af9e707bd09452a Mon Sep 17 00:00:00 2001
From: Joshua Potter
Date: Mon, 27 Nov 2023 20:04:47 -0700
Subject: [PATCH] Restructure and add documentation. Require specifying user-agent.

---
 README.md |  21 +++--
 main.py   | 230 ++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 164 insertions(+), 87 deletions(-)
 mode change 100644 => 100755 main.py

diff --git a/README.md b/README.md
index dd94e27..c1f64b9 100644
--- a/README.md
+++ b/README.md
@@ -2,25 +2,25 @@

 **Caution! Be careful running this script.**

-We intentionally delay each request sent anywhere from 10 to 15 seconds. Make
-sure any adjustments to this script appropriately rate-limit.
+We intentionally delay each batch of requests by 3 seconds. Make sure any
+adjustments to this script appropriately rate-limit.

 ## Overview

 This is a simple web scraper for [chess.com](https://www.chess.com/coaches)
 coaches. Running:
 ```bash
-$> python3 main.py
+$> python3 main.py --user-agent <user-agent>
 ```
-will query [chess.com](https://www.chess.com) for all listed coaches as well as
-specific information about each of them (their profile, recent activity, and
-stats). The result will be found in a newly created `data` directory with the
-following structure:
+will query [chess.com](https://www.chess.com) for all listed coach usernames as
+well as specific information about each corresponding coach (their profile,
+recent activity, and stats). The result will be found in a newly created `data`
+directory with the following structure:
 ```
 data
 ├── coach
-│   ├── <member_name>
-│   │   ├── <member_name>.html
+│   ├── <username>
+│   │   ├── <username>.html
 │   │   ├── activity.json
 │   │   └── stats.json
 │   ├── ...
@@ -29,9 +29,6 @@ data
 ├── ...
 ```

-Here, `member_name` corresponds to the name of the coach whereas `pages`
-contains a fragmented list of URLs to coach profiles.
-
 ## Development

 This script was written using Python (version 3.11.6). Packaging and dependency
diff --git a/main.py b/main.py
old mode 100644
new mode 100755
index 92c29e7..88ff856
--- a/main.py
+++ b/main.py
@@ -1,115 +1,195 @@
+#!/usr/bin/env python3
+
 import aiohttp
+import argparse
 import asyncio
 import os
 import os.path
-import random

 from bs4 import BeautifulSoup

-# References to paths we use to save any scraped content.
+# The root directory containing downloaded files for a coach.
+DATA_COACH_DIR = "data/coach/{username}"
+
+# Where a part of coach-related data is stored.
+DATA_COACH_FILE = "data/coach/{username}/{filename}"
+
+# Where a part of all discovered coach usernames is stored.
 DATA_COACH_LIST = "data/pages/{page_no}.txt"
-DATA_COACH_DIR = "data/coach/{member_name}"
-DATA_COACH_FILE = "data/coach/{member_name}/{filename}"

-USER_AGENT = "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"
+# The "User-Agent" value set in every request to chess.com.
+USER_AGENT = "BoardWise chesscom-scraper ({user_agent})"
+
+# How long to wait between a batch of network requests.
+SLEEP_SECS = 3


-async def chesscom_request(url):
-    body = None
-    async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
-        async with session.get(url) as response:
-            if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {url}.")
-            else:
-                body = await response.text()
-
-    return body
+def ANSI_COLOR(s):
+    """Wrap a string in ANSI escape codes for colored console output."""
+    return f"\033[0;34m{s}\033[0m"  # Blue


-async def scrape_coach_links(page_no):
-    """Scrape a single coach page listing."""
+async def chesscom_request(session, url):
+    """Convenience function for network requests to chess.com.
+
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param url
+        The URL to send a request to.
+    @return
+        The text response returned by the server at @url, or None on failure.
+    """
+    async with session.get(url) as response:
+        if response.status == 200:
+            return await response.text()
+        print(f"Encountered {response.status} when retrieving {url}.")
+
+
+async def _scrape_page_coach_usernames(session, page_no):
+    """Scan through chess.com/coaches/?page=<n> for all coaches' usernames.
+
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param page_no
+        The page consisting of at most 25 coaches (at the time of writing)
+        whose usernames are to be scraped.
+    @return
+        The list of scraped usernames on the specified coach listing page.
+    """
     url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
-    response = await chesscom_request(url)
+    response = await chesscom_request(session, url)
     if response is None:
         return

-    links = []
+    usernames = []
     soup = BeautifulSoup(response, "html.parser")
     members = soup.find_all("a", class_="members-categories-username")
     for member in members:
-        links.append(member.get("href"))
+        href = member.get("href")
+        username = href[len("https://www.chess.com/member/") :]
+        usernames.append(username)

-    return links
+    return usernames


-async def scrape_all_coach_links(max_pages=64):
-    """Scan through https://www.chess.com/coaches for all member links."""
-    links = []
-    for i in range(1, max_pages + 1):
-        filepath = DATA_COACH_LIST.format(page_no=i)
-        if os.path.isfile(filepath):
-            with open(filepath, "r") as f:
-                links.extend(f.readlines())
-            print(f"{filepath} already exists.")
-        else:
-            links.extend(await scrape_coach_links(i))
-            with open(filepath, "w") as f:
-                for link in links:
-                    f.write(f"{link}\n")
-            print(f"Downloaded page {i} of coach list.")
-            await asyncio.sleep(random.randint(10, 15))
+async def _scrape_all_coach_usernames(session, max_pages=64):
+    """Scan through chess.com/coaches for all coaches' usernames.

-    return links
-
-
-async def download_member_info(member_name, filename, url):
-    """Download member-specific content.
-
-    @return: True if we make a network request. False otherwise.
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param max_pages
+        The number of pages we will at most iterate through. This number was
+        determined by going to chess.com/coaches?sortBy=alphabetical&page=1
+        and traversing to the last page.
+    @return
+        The complete list of scraped usernames across every coach listing page.
""" - filepath = DATA_COACH_FILE.format(member_name=member_name, filename=filename) + usernames = [] + for page_no in range(1, max_pages + 1): + filepath = DATA_COACH_LIST.format(page_no=page_no) + try: + with open(filepath, "r") as f: + usernames.extend(f.readlines()) + print(f"Skipping {ANSI_COLOR(filepath)}") + except FileNotFoundError: + page_usernames = await _scrape_page_coach_usernames(session, page_no) + with open(filepath, "w") as f: + for username in page_usernames: + f.write(f"{username}\n") + usernames.extend(page_usernames) + print(f"Downloaded {ANSI_COLOR(filepath)}") + await asyncio.sleep(SLEEP_SECS) + + return usernames + + +async def _download_coach_file(session, url, username, filename): + """Writes the contents of @url into `DATA_COACH_FILE`. + + @param session + The `aiohttp.ClientSession` context our requests are made from. + @param url + The URL of the file to download. + @param username + The coach username corresponding to the downloaded file. + @param filename + The output file to write the downloaded content to. + @return: + True if we make a network request. False otherwise. + """ + filepath = DATA_COACH_FILE.format(username=username, filename=filename) if os.path.isfile(filepath): return False - response = await chesscom_request(url) + response = await chesscom_request(session, url) if response is not None: with open(filepath, "w") as f: f.write(response) - return True +async def _download_coach_data(session, username): + """Download coach-related data to the `DATA_COACH_DIR` directory. + + This sends three parallel requests for: + * the coach's profile, + * the coach's recent activity, + * the coach's stats. + + @param session + The `aiohttp.ClientSession` context our requests are made from. + @param username + The coach username corresponding to the downloaded files. 
+ """ + used_network = await asyncio.gather( + _download_coach_file( + session, + url=f"https://www.chess.com/member/{username}", + username=username, + filename=f"{username}.html", + ), + _download_coach_file( + session, + url=f"https://www.chess.com/callback/member/activity/{username}?page=1", + username=username, + filename="activity.json", + ), + _download_coach_file( + session, + url=f"https://www.chess.com/callback/member/stats/{username}", + username=username, + filename="stats.json", + ), + ) + if any(used_network): + print(f"Downloaded {ANSI_COLOR(username)}") + await asyncio.sleep(SLEEP_SECS) + else: + print(f"Skipping {ANSI_COLOR(username)}") + + async def main(): - links = await scrape_all_coach_links() - for url in [link.strip() for link in links]: - member_name = url[len("https://www.chess.com/member/") :] - os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True) - made_network_request = await asyncio.gather( - download_member_info( - member_name, - f"{member_name}.html", - url, - ), - download_member_info( - member_name, - "activity.json", - f"https://www.chess.com/callback/member/activity/{member_name}?page=1", - ), - download_member_info( - member_name, - "stats.json", - f"https://www.chess.com/callback/member/stats/{member_name}", - ), - ) - if any(made_network_request): - await asyncio.sleep(random.randint(10, 15)) - print(f"Downloaded {member_name} info.") - else: - print(f"Skipping {member_name} download.") + parser = argparse.ArgumentParser( + prog="chesscom-scraper", + description="HTML scraping of chess.com coaches.", + ) + parser.add_argument("-u", "--user-agent", required=True) + args = parser.parse_args() + + os.makedirs("data/pages", exist_ok=True) + os.makedirs("data/coach", exist_ok=True) + + async with aiohttp.ClientSession( + headers={"User-Agent": USER_AGENT.format(user_agent=args.user_agent)} + ) as session: + # Retrieve all coaches on the platform. + usernames = await _scrape_all_coach_usernames(session) + # For each coach, download relevant data. + for username in [u.strip() for u in usernames]: + os.makedirs(DATA_COACH_DIR.format(username=username), exist_ok=True) + await _download_coach_data(session, username) if __name__ == "__main__": - os.makedirs("data/pages", exist_ok=True) - os.makedirs("data/coach", exist_ok=True) asyncio.run(main())