# Scraper for chess.com coach listings and member profiles.

import asyncio
import os
import os.path
import random

import aiohttp
from bs4 import BeautifulSoup
# References to paths we use to save any scraped content.
DATA_COACH_LIST = "data/pages/{page_no}.txt"
DATA_COACH_DIR = "data/coach/{member_name}"
# NOTE: must keep a {filename} placeholder — download_member_info formats
# this template with both member_name= and filename=.
DATA_COACH_FILE = "data/coach/{member_name}/{filename}"
async def chesscom_requeset(url):
    """Fetch ``url`` and return the response body as text.

    The body is returned even for non-200 responses (the status is logged
    first). Note: the function name keeps its historical typo so any
    existing callers continue to work.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {url}.")
            # Must await while the session/response is still open; the
            # original returned an un-awaited coroutine that would be
            # consumed only after the connection had been closed.
            return await response.text()
async def scrape_coach_links(page_no):
    """Scrape a single coach page listing.

    @return: List of member profile hrefs found on the page. Empty on an
    HTTP error, so callers can always ``extend`` with the result (the
    original returned ``None`` here, which crashed callers).
    """
    links = []
    url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
    async with aiohttp.ClientSession(
        headers={
            "User-Agent": "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)",
        }
    ) as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {url}.")
                return links
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            members = soup.find_all("a", class_="members-categories-username")
            for member in members:
                links.append(member.get("href"))

    return links
async def scrape_all_coach_links(max_pages=64):
    """Scan through https://www.chess.com/coaches for all member links.

    Pages already cached on disk (DATA_COACH_LIST) are reused rather than
    re-fetched; a random delay follows each real download to stay polite.

    @return: List of member links accumulated across all pages.
    """
    links = []
    for i in range(1, max_pages + 1):
        filepath = DATA_COACH_LIST.format(page_no=i)
        if os.path.isfile(filepath):
            with open(filepath, "r") as f:
                links.extend(f.readlines())
            print(f"{filepath} already exists.")
        else:
            # Cache only THIS page's links. The original wrote the whole
            # accumulated list into every page file, duplicating all
            # earlier pages' links in each cache file.
            page_links = await scrape_coach_links(i)
            with open(filepath, "w") as f:
                for link in page_links:
                    f.write(f"{link}\n")
            links.extend(page_links)
            print(f"Downloaded page {i} of coach list.")
            await asyncio.sleep(random.randint(3, 7))

    return links
async def download_member_info(member_name, filename, url):
    """Download member-specific content to DATA_COACH_FILE.

    @param member_name: Username segment used to build the target path.
    @param filename: Basename of the file to write under the member dir.
    @param url: Resource to fetch.
    @return: True if we downloaded content. False if the download already
        exists locally or the request failed (the original fell through
        to an implicit None on HTTP error; False keeps the documented
        bool contract and is equally falsy for callers using any()).
    """
    filepath = DATA_COACH_FILE.format(member_name=member_name, filename=filename)
    if os.path.isfile(filepath):
        return False
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {url}")
                return False
            with open(filepath, "w") as f:
                f.write(await response.text())

    return True
async def main():
    """Collect all coach links, then fetch each member's page, activity
    feed, and stats, sleeping between members that required a download."""
    coach_links = await scrape_all_coach_links()
    for raw_link in coach_links:
        url = raw_link.strip()
        # The username is everything after the member-profile prefix.
        member_name = url[len("https://www.chess.com/member/") :]
        member_dir = DATA_COACH_DIR.format(member_name=member_name)
        os.makedirs(member_dir, exist_ok=True)

        activity_url = (
            f"https://www.chess.com/callback/member/activity/{member_name}?page=1"
        )
        stats_url = f"https://www.chess.com/callback/member/stats/{member_name}"
        results = await asyncio.gather(
            download_member_info(member_name, f"{member_name}.html", url),
            download_member_info(member_name, "activity.json", activity_url),
            download_member_info(member_name, "stats.json", stats_url),
        )

        if not any(results):
            # Everything was already cached locally; no need to throttle.
            print(f"Skipping {member_name} download.")
        else:
            await asyncio.sleep(random.randint(3, 7))
            print(f"Downloaded {member_name} info.")
if __name__ == "__main__":
    # Ensure output directories exist before any scraping begins.
    os.makedirs("data/pages", exist_ok=True)
    os.makedirs("data/coach", exist_ok=True)
    asyncio.run(main())