Restructure and add documentation. Require specifying user-agent.

pull/2/head
Joshua Potter 2023-11-27 20:04:47 -07:00
parent 7308c47eb5
commit 99c89a3a6d
2 changed files with 164 additions and 87 deletions

README.md

@@ -2,25 +2,25 @@
 **Caution! Be careful running this script.**
-We intentionally delay each request sent anywhere from 10 to 15 seconds. Make
-sure any adjustments to this script appropriately rate-limit.
+We intentionally delay each batch of requests by 3 seconds. Make sure any
+adjustments to this script appropriately rate-limit.

 ## Overview

 This is a simple web scraper for [chess.com](https://www.chess.com/coaches)
 coaches. Running:

 ```bash
-$> python3 main.py
+$> python3 main.py --user-agent <your-email>
 ```

-will query [chess.com](https://www.chess.com) for all listed coaches as well as
-specific information about each of them (their profile, recent activity, and
-stats). The result will be found in a newly created `data` directory with the
-following structure:
+will query [chess.com](https://www.chess.com) for all listed coach usernames as
+well as specific information about each corresponding coach (their profile,
+recent activity, and stats). The result will be found in a newly created `data`
+directory with the following structure:
 ```
 data
 ├── coach
-│   ├── <member_name>
-│   │   ├── <member_name>.html
+│   ├── <username>
+│   │   ├── <username>.html
 │   │   ├── activity.json
 │   │   └── stats.json
 │   ├── ...
@@ -29,9 +29,6 @@ data
 ├── ...
 ```

-Here, `member_name` corresponds to the name of the coach whereas `pages`
-contains a fragmented list of URLs to coach profiles.
-
 ## Development

 This script was written using Python (version 3.11.6). Packaging and dependency
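
For orientation, the `data/` layout described above can be post-processed with a
few lines of Python; a minimal sketch, assuming a prior run of the scraper has
populated `data/coach/` (no specific fields inside `stats.json` are assumed):

```python
#!/usr/bin/env python3
"""Minimal sketch: walk the data/ tree produced by the scraper."""
import json
from pathlib import Path

# Each coach gets its own directory: data/coach/<username>/stats.json.
for stats_file in sorted(Path("data/coach").glob("*/stats.json")):
    username = stats_file.parent.name
    with stats_file.open() as f:
        stats = json.load(f)
    # Avoid assuming specific fields in chess.com's callback payload; just
    # report how many top-level entries were captured for each coach.
    print(f"{username}: {len(stats)} top-level stats entries")
```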

main.py Normal file → Executable file

@@ -1,115 +1,195 @@
+#!/usr/bin/env python3
 import aiohttp
+import argparse
 import asyncio
 import os
 import os.path
-import random

 from bs4 import BeautifulSoup

-# References to paths we use to save any scraped content.
-DATA_COACH_LIST = "data/pages/{page_no}.txt"
-DATA_COACH_DIR = "data/coach/{member_name}"
-DATA_COACH_FILE = "data/coach/{member_name}/{filename}"
-
-USER_AGENT = "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"
+# The root directory containing downloaded files for a coach.
+DATA_COACH_DIR = "data/coach/{username}"
+# Where a part of coach-related data is stored.
+DATA_COACH_FILE = "data/coach/{username}/{filename}"
+# Where a part of all discovered coach usernames is stored.
+DATA_COACH_LIST = "data/pages/{page_no}.txt"
+
+# The "User-Agent" value set in every request to chess.com.
+USER_AGENT = "BoardWise chesscom-scraper ({user_agent})"
+
+# How long to wait between a batch of network requests.
+SLEEP_SECS = 3


-async def chesscom_request(url):
-    body = None
-    async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
-        async with session.get(url) as response:
-            if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {url}.")
-            else:
-                body = await response.text()
-    return body
+def ANSI_COLOR(s):
+    """Print colored output to the console."""
+    return f"\033[0;34m{s}\033[0m"  # Blue
+
+
+async def chesscom_request(session, url):
+    """Convenience function for network requests to chess.com.
+
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param url
+        The URL to send a request to.
+    @return
+        The text response returned by the server at @url.
+    """
+    async with session.get(url) as response:
+        if response.status == 200:
+            return await response.text()
+        print(f"Encountered {response.status} when retrieving {url}.")


-async def scrape_coach_links(page_no):
-    """Scrape a single coach page listing."""
+async def _scrape_page_coach_usernames(session, page_no):
+    """Scan through chess.com/coaches/?page=<n> for all coaches' usernames.
+
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param page_no
+        The page consisting of at most 25 coaches (at the time of writing)
+        whose usernames are to be scraped.
+    @return
+        The list of scraped usernames on the specified coach listing page.
+    """
     url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
-    response = await chesscom_request(url)
+    response = await chesscom_request(session, url)
     if response is None:
         return

-    links = []
+    usernames = []
     soup = BeautifulSoup(response, "html.parser")
     members = soup.find_all("a", class_="members-categories-username")
     for member in members:
-        links.append(member.get("href"))
+        href = member.get("href")
+        username = href[len("https://www.chess.com/member/") :]
+        usernames.append(username)

-    return links
+    return usernames


-async def scrape_all_coach_links(max_pages=64):
-    """Scan through https://www.chess.com/coaches for all member links."""
-    links = []
-    for i in range(1, max_pages + 1):
-        filepath = DATA_COACH_LIST.format(page_no=i)
-        if os.path.isfile(filepath):
-            with open(filepath, "r") as f:
-                links.extend(f.readlines())
-            print(f"{filepath} already exists.")
-        else:
-            links.extend(await scrape_coach_links(i))
-            with open(filepath, "w") as f:
-                for link in links:
-                    f.write(f"{link}\n")
-            print(f"Downloaded page {i} of coach list.")
-            await asyncio.sleep(random.randint(10, 15))
-
-    return links
+async def _scrape_all_coach_usernames(session, max_pages=64):
+    """Scan through chess.com/coaches for all coaches' usernames.
+
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param max_pages
+        The number of pages we will at most iterate through. This number was
+        determined by going to chess.com/coaches?sortBy=alphabetical&page=1
+        and traversing to the last page.
+    @return
+        The complete list of scraped usernames across every coach listing page.
+    """
+    usernames = []
+    for page_no in range(1, max_pages + 1):
+        filepath = DATA_COACH_LIST.format(page_no=page_no)
+        try:
+            with open(filepath, "r") as f:
+                usernames.extend(f.readlines())
+            print(f"Skipping {ANSI_COLOR(filepath)}")
+        except FileNotFoundError:
+            page_usernames = await _scrape_page_coach_usernames(session, page_no)
+            with open(filepath, "w") as f:
+                for username in page_usernames:
+                    f.write(f"{username}\n")
+            usernames.extend(page_usernames)
+            print(f"Downloaded {ANSI_COLOR(filepath)}")
+            await asyncio.sleep(SLEEP_SECS)
+
+    return usernames


-async def download_member_info(member_name, filename, url):
-    """Download member-specific content.
-
-    @return: True if we make a network request. False otherwise.
+async def _download_coach_file(session, url, username, filename):
+    """Writes the contents of @url into `DATA_COACH_FILE`.
+
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param url
+        The URL of the file to download.
+    @param username
+        The coach username corresponding to the downloaded file.
+    @param filename
+        The output file to write the downloaded content to.
+    @return:
+        True if we make a network request. False otherwise.
     """
-    filepath = DATA_COACH_FILE.format(member_name=member_name, filename=filename)
+    filepath = DATA_COACH_FILE.format(username=username, filename=filename)
     if os.path.isfile(filepath):
         return False
-    response = await chesscom_request(url)
+    response = await chesscom_request(session, url)
     if response is not None:
         with open(filepath, "w") as f:
             f.write(response)
     return True


-async def main():
-    links = await scrape_all_coach_links()
-    for url in [link.strip() for link in links]:
-        member_name = url[len("https://www.chess.com/member/") :]
-        os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True)
-        made_network_request = await asyncio.gather(
-            download_member_info(
-                member_name,
-                f"{member_name}.html",
-                url,
-            ),
-            download_member_info(
-                member_name,
-                "activity.json",
-                f"https://www.chess.com/callback/member/activity/{member_name}?page=1",
-            ),
-            download_member_info(
-                member_name,
-                "stats.json",
-                f"https://www.chess.com/callback/member/stats/{member_name}",
-            ),
-        )
-        if any(made_network_request):
-            await asyncio.sleep(random.randint(10, 15))
-            print(f"Downloaded {member_name} info.")
-        else:
-            print(f"Skipping {member_name} download.")
+async def _download_coach_data(session, username):
+    """Download coach-related data to the `DATA_COACH_DIR` directory.
+
+    This sends three parallel requests for:
+    * the coach's profile,
+    * the coach's recent activity,
+    * the coach's stats.
+
+    @param session
+        The `aiohttp.ClientSession` context our requests are made from.
+    @param username
+        The coach username corresponding to the downloaded files.
+    """
+    used_network = await asyncio.gather(
+        _download_coach_file(
+            session,
+            url=f"https://www.chess.com/member/{username}",
+            username=username,
+            filename=f"{username}.html",
+        ),
+        _download_coach_file(
+            session,
+            url=f"https://www.chess.com/callback/member/activity/{username}?page=1",
+            username=username,
+            filename="activity.json",
+        ),
+        _download_coach_file(
+            session,
+            url=f"https://www.chess.com/callback/member/stats/{username}",
+            username=username,
+            filename="stats.json",
+        ),
+    )
+    if any(used_network):
+        print(f"Downloaded {ANSI_COLOR(username)}")
+        await asyncio.sleep(SLEEP_SECS)
+    else:
+        print(f"Skipping {ANSI_COLOR(username)}")
+
+
+async def main():
+    parser = argparse.ArgumentParser(
+        prog="chesscom-scraper",
+        description="HTML scraping of chess.com coaches.",
+    )
+    parser.add_argument("-u", "--user-agent", required=True)
+    args = parser.parse_args()
+
+    os.makedirs("data/pages", exist_ok=True)
+    os.makedirs("data/coach", exist_ok=True)
+
+    async with aiohttp.ClientSession(
+        headers={"User-Agent": USER_AGENT.format(user_agent=args.user_agent)}
+    ) as session:
+        # Retrieve all coaches on the platform.
+        usernames = await _scrape_all_coach_usernames(session)
+        # For each coach, download relevant data.
+        for username in [u.strip() for u in usernames]:
+            os.makedirs(DATA_COACH_DIR.format(username=username), exist_ok=True)
+            await _download_coach_data(session, username)


 if __name__ == "__main__":
-    os.makedirs("data/pages", exist_ok=True)
-    os.makedirs("data/coach", exist_ok=True)
     asyncio.run(main())
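
The required `--user-agent` value is interpolated into the `User-Agent` header
sent with every request. A minimal, self-contained sketch of the same request
pattern, with the coach username and e-mail as placeholders rather than values
taken from this commit:

```python
#!/usr/bin/env python3
"""Minimal sketch of the request pattern used above (placeholder values)."""
import asyncio

import aiohttp

USER_AGENT = "BoardWise chesscom-scraper ({user_agent})"
SLEEP_SECS = 3


async def fetch_one(username: str, user_agent: str) -> str | None:
    """Fetch a single coach profile page, identifying ourselves politely."""
    headers = {"User-Agent": USER_AGENT.format(user_agent=user_agent)}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(f"https://www.chess.com/member/{username}") as response:
            if response.status != 200:
                return None
            body = await response.text()
    # Pause before any follow-up request, mirroring the scraper's rate limit.
    await asyncio.sleep(SLEEP_SECS)
    return body


if __name__ == "__main__":
    # Placeholder values; substitute a real coach username and contact e-mail.
    body = asyncio.run(fetch_one("some-coach", "you@example.com"))
    print(body[:200] if body else "request failed")
```

Any adjustment to this pattern should keep at least the same delay between
requests, per the caution at the top of the README.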