Restructure and add documentation. Require specifying user-agent.
parent
7308c47eb5
commit
99c89a3a6d
21
README.md
21
README.md
|
@ -2,25 +2,25 @@
|
|||
|
||||
**Caution! Be careful running this script.**
|
||||
|
||||
We intentionally delay each request sent anywhere from 10 to 15 seconds. Make
|
||||
sure any adjustments to this script appropriately rate-limit.
|
||||
We intentionally delay each batch of requests by 3 seconds. Make sure any
|
||||
adjustments to this script appropriately rate-limit.
|
||||
|
||||
## Overview
|
||||
|
||||
This is a simple web scraper for [chess.com](https://www.chess.com/coaches)
|
||||
coaches. Running:
|
||||
```bash
|
||||
$> python3 main.py
|
||||
$> python3 main.py --user-agent <your-email>
|
||||
```
|
||||
will query [chess.com](https://www.chess.com) for all listed coaches as well as
|
||||
specific information about each of them (their profile, recent activity, and
|
||||
stats). The result will be found in a newly created `data` directory with the
|
||||
following structure:
|
||||
will query [chess.com](https://www.chess.com) for all listed coach usernames as
|
||||
well as specific information about each corresponding coach (their profile,
|
||||
recent activity, and stats). The result will be found in a newly created `data`
|
||||
directory with the following structure:
|
||||
```
|
||||
data
|
||||
├── coach
|
||||
│ ├── <member_name>
|
||||
│ │ ├── <member_name>.html
|
||||
│ ├── <username>
|
||||
│ │ ├── <username>.html
|
||||
│ │ ├── activity.json
|
||||
│ │ └── stats.json
|
||||
│ ├── ...
|
||||
|
@ -29,9 +29,6 @@ data
|
|||
├── ...
|
||||
```
|
||||
|
||||
Here, `member_name` corresponds to the name of the coach whereas `pages`
|
||||
contains a fragmented list of URLs to coach profiles.
|
||||
|
||||
## Development
|
||||
|
||||
This script was written using Python (version 3.11.6). Packaging and dependency
|
||||
|
|
|
@ -1,115 +1,195 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import aiohttp
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import os.path
|
||||
import random
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# References to paths we use to save any scraped content.
|
||||
# The root directory containing downloaded files for a coach.
|
||||
DATA_COACH_DIR = "data/coach/{username}"
|
||||
|
||||
# Where a part of coach-related data is stored.
|
||||
DATA_COACH_FILE = "data/coach/{username}/{filename}"
|
||||
|
||||
# Where a part of all discovered coach usernames is stored.
|
||||
DATA_COACH_LIST = "data/pages/{page_no}.txt"
|
||||
DATA_COACH_DIR = "data/coach/{member_name}"
|
||||
DATA_COACH_FILE = "data/coach/{member_name}/{filename}"
|
||||
|
||||
USER_AGENT = "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"
|
||||
# The "User-Agent" value set in every request to chess.com.
|
||||
USER_AGENT = "BoardWise chesscom-scraper ({user_agent})"
|
||||
|
||||
# How long to wait, in seconds, between batches of network requests.
|
||||
SLEEP_SECS = 3
|
||||
|
||||
|
||||
async def chesscom_request(url):
|
||||
body = None
|
||||
async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
|
||||
def ANSI_COLOR(s):
|
||||
"""Wrap @s in ANSI escape codes so it renders blue on the console."""
|
||||
return f"\033[0;34m{s}\033[0m" # Blue
|
||||
|
||||
|
||||
async def chesscom_request(session, url):
|
||||
"""Convenience function for network requests to chess.com.
|
||||
|
||||
@param session
|
||||
The `aiohttp.ClientSession` context our requests are made from.
|
||||
@param url
|
||||
The URL to send a request to.
|
||||
@return
|
||||
The text response returned by the server at @url.
|
||||
"""
|
||||
async with session.get(url) as response:
|
||||
if response.status != 200:
|
||||
if response.status == 200:
|
||||
return await response.text()
|
||||
print(f"Encountered {response.status} when retrieving {url}.")
|
||||
else:
|
||||
body = await response.text()
|
||||
|
||||
return body
|
||||
|
||||
|
||||
async def scrape_coach_links(page_no):
|
||||
"""Scrape a single coach page listing."""
|
||||
async def _scrape_page_coach_usernames(session, page_no):
|
||||
"""Scan through chess.com/coaches/?page=<n> for all coaches' usernames.
|
||||
|
||||
@param session
|
||||
The `aiohttp.ClientSession` context our requests are made from.
|
||||
@param page_no
|
||||
The page consisting of at most 25 coaches (at the time of writing)
|
||||
whose usernames are to be scraped.
|
||||
@return
|
||||
The list of scraped usernames on the specified coach listing page.
|
||||
"""
|
||||
url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
|
||||
response = await chesscom_request(url)
|
||||
response = await chesscom_request(session, url)
|
||||
if response is None:
|
||||
return
|
||||
|
||||
links = []
|
||||
usernames = []
|
||||
soup = BeautifulSoup(response, "html.parser")
|
||||
members = soup.find_all("a", class_="members-categories-username")
|
||||
for member in members:
|
||||
links.append(member.get("href"))
|
||||
href = member.get("href")
|
||||
username = href[len("https://www.chess.com/member/") :]
|
||||
usernames.append(username)
|
||||
|
||||
return links
|
||||
return usernames
|
||||
|
||||
|
||||
async def scrape_all_coach_links(max_pages=64):
|
||||
"""Scan through https://www.chess.com/coaches for all member links."""
|
||||
links = []
|
||||
for i in range(1, max_pages + 1):
|
||||
filepath = DATA_COACH_LIST.format(page_no=i)
|
||||
if os.path.isfile(filepath):
|
||||
with open(filepath, "r") as f:
|
||||
links.extend(f.readlines())
|
||||
print(f"{filepath} already exists.")
|
||||
else:
|
||||
links.extend(await scrape_coach_links(i))
|
||||
with open(filepath, "w") as f:
|
||||
for link in links:
|
||||
f.write(f"{link}\n")
|
||||
print(f"Downloaded page {i} of coach list.")
|
||||
await asyncio.sleep(random.randint(10, 15))
|
||||
async def _scrape_all_coach_usernames(session, max_pages=64):
|
||||
"""Scan through chess.com/coaches for all coaches' usernames.
|
||||
|
||||
return links
|
||||
|
||||
|
||||
async def download_member_info(member_name, filename, url):
|
||||
"""Download member-specific content.
|
||||
|
||||
@return: True if we make a network request. False otherwise.
|
||||
@param session
|
||||
The `aiohttp.ClientSession` context our requests are made from.
|
||||
@param max_pages
|
||||
The maximum number of pages to iterate through. This number was
|
||||
determined by going to chess.com/coaches?sortBy=alphabetical&page=1
|
||||
and traversing to the last page.
|
||||
@return
|
||||
The complete list of scraped usernames across every coach listing page.
|
||||
"""
|
||||
filepath = DATA_COACH_FILE.format(member_name=member_name, filename=filename)
|
||||
usernames = []
|
||||
for page_no in range(1, max_pages + 1):
|
||||
filepath = DATA_COACH_LIST.format(page_no=page_no)
|
||||
try:
|
||||
with open(filepath, "r") as f:
|
||||
usernames.extend(f.readlines())
|
||||
print(f"Skipping {ANSI_COLOR(filepath)}")
|
||||
except FileNotFoundError:
|
||||
page_usernames = await _scrape_page_coach_usernames(session, page_no)
|
||||
with open(filepath, "w") as f:
|
||||
for username in page_usernames:
|
||||
f.write(f"{username}\n")
|
||||
usernames.extend(page_usernames)
|
||||
print(f"Downloaded {ANSI_COLOR(filepath)}")
|
||||
await asyncio.sleep(SLEEP_SECS)
|
||||
|
||||
return usernames
|
||||
|
||||
|
||||
async def _download_coach_file(session, url, username, filename):
|
||||
"""Writes the contents of @url into `DATA_COACH_FILE`.
|
||||
|
||||
@param session
|
||||
The `aiohttp.ClientSession` context our requests are made from.
|
||||
@param url
|
||||
The URL of the file to download.
|
||||
@param username
|
||||
The coach username corresponding to the downloaded file.
|
||||
@param filename
|
||||
The output file to write the downloaded content to.
|
||||
@return
|
||||
True if we make a network request. False otherwise.
|
||||
"""
|
||||
filepath = DATA_COACH_FILE.format(username=username, filename=filename)
|
||||
if os.path.isfile(filepath):
|
||||
return False
|
||||
|
||||
response = await chesscom_request(url)
|
||||
response = await chesscom_request(session, url)
|
||||
if response is not None:
|
||||
with open(filepath, "w") as f:
|
||||
f.write(response)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
async def main():
|
||||
links = await scrape_all_coach_links()
|
||||
for url in [link.strip() for link in links]:
|
||||
member_name = url[len("https://www.chess.com/member/") :]
|
||||
os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True)
|
||||
made_network_request = await asyncio.gather(
|
||||
download_member_info(
|
||||
member_name,
|
||||
f"{member_name}.html",
|
||||
url,
|
||||
async def _download_coach_data(session, username):
|
||||
"""Download coach-related data to the `DATA_COACH_DIR` directory.
|
||||
|
||||
This sends three parallel requests for:
|
||||
* the coach's profile,
|
||||
* the coach's recent activity,
|
||||
* the coach's stats.
|
||||
|
||||
@param session
|
||||
The `aiohttp.ClientSession` context our requests are made from.
|
||||
@param username
|
||||
The coach username corresponding to the downloaded files.
|
||||
"""
|
||||
used_network = await asyncio.gather(
|
||||
_download_coach_file(
|
||||
session,
|
||||
url=f"https://www.chess.com/member/{username}",
|
||||
username=username,
|
||||
filename=f"{username}.html",
|
||||
),
|
||||
download_member_info(
|
||||
member_name,
|
||||
"activity.json",
|
||||
f"https://www.chess.com/callback/member/activity/{member_name}?page=1",
|
||||
_download_coach_file(
|
||||
session,
|
||||
url=f"https://www.chess.com/callback/member/activity/{username}?page=1",
|
||||
username=username,
|
||||
filename="activity.json",
|
||||
),
|
||||
download_member_info(
|
||||
member_name,
|
||||
"stats.json",
|
||||
f"https://www.chess.com/callback/member/stats/{member_name}",
|
||||
_download_coach_file(
|
||||
session,
|
||||
url=f"https://www.chess.com/callback/member/stats/{username}",
|
||||
username=username,
|
||||
filename="stats.json",
|
||||
),
|
||||
)
|
||||
if any(made_network_request):
|
||||
await asyncio.sleep(random.randint(10, 15))
|
||||
print(f"Downloaded {member_name} info.")
|
||||
if any(used_network):
|
||||
print(f"Downloaded {ANSI_COLOR(username)}")
|
||||
await asyncio.sleep(SLEEP_SECS)
|
||||
else:
|
||||
print(f"Skipping {member_name} download.")
|
||||
print(f"Skipping {ANSI_COLOR(username)}")
|
||||
|
||||
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="chesscom-scraper",
|
||||
description="HTML scraping of chess.com coaches.",
|
||||
)
|
||||
parser.add_argument("-u", "--user-agent", required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
os.makedirs("data/pages", exist_ok=True)
|
||||
os.makedirs("data/coach", exist_ok=True)
|
||||
|
||||
async with aiohttp.ClientSession(
|
||||
headers={"User-Agent": USER_AGENT.format(user_agent=args.user_agent)}
|
||||
) as session:
|
||||
# Retrieve all coaches on the platform.
|
||||
usernames = await _scrape_all_coach_usernames(session)
|
||||
# For each coach, download relevant data.
|
||||
for username in [u.strip() for u in usernames]:
|
||||
os.makedirs(DATA_COACH_DIR.format(username=username), exist_ok=True)
|
||||
await _download_coach_data(session, username)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
os.makedirs("data/pages", exist_ok=True)
|
||||
os.makedirs("data/coach", exist_ok=True)
|
||||
asyncio.run(main())
|
||||
|
|
Loading…
Reference in New Issue