Add a scraper for lichess. (#2)

pull/3/head v0.1.2
Joshua Potter 2023-11-30 15:36:44 -07:00 committed by GitHub
parent 10801b560c
commit bc2ffeae9d
4 changed files with 153 additions and 20 deletions

README.md

@@ -1,25 +1,29 @@
-# chesscom-scraper
+# coach-scraper
 **Caution! Be careful running this script.**
-We intentionally delay each batch of requests by 3 seconds. Make sure any
-adjustments to this script appropriately rate-limit.
+We intentionally delay each batch of requests. Make sure any adjustments to this
+script appropriately rate-limit.
 ## Overview
-This is a simple web scraper for [chess.com](https://www.chess.com/coaches)
-coaches. The program searches for all listed coaches as well as specific
-information about each of them (their profile, recent activity, and stats). The
-result will be found in a newly created `data` directory with the following
-structure:
+This is a simple web scraper for coaches listed on:
+* [chess.com](https://www.chess.com/coaches)
+* [lichess.org](https://www.lichess.org/coach)
+The program searches for coach usernames as well as specific information about
+each of them (their profile, recent activity, and stats). The result will be
+found in a newly created `data` directory with the following structure:
 ```
 data
-├── coach
-│   ├── <username>
-│   │   ├── <username>.html
-│   │   ├── activity.json
-│   │   └── stats.json
-│   ├── ...
+└── <site>
+│   ├── coaches
+│   │   ├── <username>
+│   │   │   ├── <username>.html
+│   │   │   ├── export.json
+│   │   │   └── ...
+│   │   ├── ...
+└── pages
+    ├── <n>.txt
+    ├── ...
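Once a run completes, the exports can be consumed straight from this tree. Below is a minimal sketch that assumes only the directory layout shown above; the iteration and printing are illustrative and not part of the project.

```python
# Minimal sketch: read every data/<site>/coaches/<username>/export.json
# produced by a scrape. Only the directory layout above is assumed.
import json
from pathlib import Path

for export_path in sorted(Path("data").glob("*/coaches/*/export.json")):
    site, username = export_path.parts[1], export_path.parts[3]
    with export_path.open() as f:
        export = json.load(f)
    print(f"{site}/{username}: {export}")
```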


@@ -3,6 +3,7 @@ import argparse
 import asyncio
 from app.chesscom import Scraper as ChesscomScraper
+from app.lichess import Scraper as LichessScraper
 from app.scraper import Site
@@ -18,6 +19,7 @@ async def run():
         required=True,
         choices=[
             Site.CHESSCOM.value,
+            Site.LICHESS.value,
         ],
     )
     args = parser.parse_args()
@@ -27,6 +29,8 @@ async def run():
     ) as session:
         if args.site == Site.CHESSCOM.value:
             scraper = ChesscomScraper(session)
+        elif args.site == Site.LICHESS.value:
+            scraper = LichessScraper(session)
         await scraper.scrape()

app/chesscom.py

@@ -179,15 +179,12 @@ class Scraper(BaseScraper):
     async def export(self, username: str) -> Export:
         """Transform coach-specific data into uniform format."""
-        stat_export: Export = {}
+        export: Export = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
-                stat_export = self._load_stats_json(json.load(f))
+                export.update(self._load_stats_json(json.load(f)))
         except FileNotFoundError:
             pass
-        export: Export = {
-            "fide_rapid": None,
-        }
-        export.update(stat_export)
         return export
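Note that with this change `"fide_rapid"` is no longer guaranteed to appear in the returned export; it is only present when `stats.json` supplied it. A hedged sketch of defensive access follows (the helper is hypothetical; `Export` is the type imported in the diffs above).

```python
# Hypothetical helper, not part of the diff: read "fide_rapid" defensively now
# that the key may be absent from an export.
from app.scraper import Export


def fide_rapid_of(export: Export):
    return export.get("fide_rapid")  # None when stats.json had no rating
```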

app/lichess.py Normal file

@@ -0,0 +1,128 @@
import aiohttp
import asyncio
import os
import os.path

from app.scraper import AnsiColor, BaseScraper, Export, Site
from bs4 import BeautifulSoup
from typing import List

# The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical
# and traversing to the last page.
MAX_PAGES = 162

# How long to wait between each network request.
SLEEP_SECS = 5


class Scraper(BaseScraper):
    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.LICHESS.value, session=session)

    async def download_usernames(self) -> List[str]:
        """Scan through lichess.org/coach for all coaches' usernames.

        @return
            The complete list of scraped usernames across every coach listing
            page.
        """
        usernames = []
        for page_no in range(1, MAX_PAGES + 1):
            filepath = self.path_page_file(page_no)
            try:
                with open(filepath, "r") as f:
                    self.log(
                        [
                            (AnsiColor.INFO, "[INFO]"),
                            (None, ": Reading file "),
                            (AnsiColor.DATA, filepath),
                        ]
                    )
                    usernames.extend([line.strip() for line in f.readlines()])
            except FileNotFoundError:
                page_usernames = await self._scrape_page(page_no)
                if not page_usernames:
                    self.log(
                        [
                            (AnsiColor.ERROR, "[ERROR]"),
                            (None, ": Could not scrape page "),
                            (AnsiColor.DATA, str(page_no)),
                        ]
                    )
                    continue
                with open(filepath, "w") as f:
                    for username in page_usernames:
                        f.write(f"{username}\n")
                usernames.extend(page_usernames)
                self.log(
                    [
                        (AnsiColor.INFO, "[INFO]"),
                        (None, ": Downloaded page "),
                        (AnsiColor.DATA, filepath),
                    ]
                )
                await asyncio.sleep(SLEEP_SECS)

        return usernames

    async def _scrape_page(self, page_no: int):
        """Scan through lichess.org/coach/.../?page=<n> for all coaches'
        usernames.

        @param page_no
            The page consisting of at most 10 coaches (at the time of writing)
            whose usernames are to be scraped.
        @return
            The list of scraped usernames on the specified coach listing page.
        """
        url = f"https://lichess.org/coach/all/all/alphabetical?page={page_no}"
        response, status_code = await self.request(url)
        if response is None:
            self.log(
                [
                    (AnsiColor.ERROR, "[ERROR]"),
                    (None, ": Received status "),
                    (AnsiColor.DATA, f"{status_code} "),
                    (None, "when downloading page "),
                    (AnsiColor.DATA, str(page_no)),
                ]
            )
            return

        usernames = []
        soup = BeautifulSoup(response, "html.parser")
        members = soup.find_all("article", class_="coach-widget")
        for member in members:
            anchor = member.find("a", class_="overlay")
            if anchor:
                href = anchor.get("href")
                username = href[len("/coach/") :]
                usernames.append(username)

        return usernames

    async def download_profile(self, username: str):
        """For each coach, download coach-specific data.

        @param username
            The coach username corresponding to the downloaded files.
        """
        filepath = self.path_coach_file(username, f"{username}.html")
        if os.path.isfile(filepath):
            return False

        response, _unused_status = await self.request(
            url=f"https://lichess.org/coach/{username}"
        )
        if response is not None:
            with open(filepath, "w") as f:
                f.write(response)

        return True

    async def export(self, username: str) -> Export:
        """Transform coach-specific data into uniform format."""
        export: Export = {}
        return export
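For completeness, here is a minimal driver sketch wiring this scraper up the same way the entry-point diff above does. The bare `aiohttp.ClientSession()` is an assumption, since the real entry point may configure the session (headers, timeouts, etc.).

```python
# Minimal sketch: run the lichess scraper end to end. Assumes BaseScraper
# provides the async scrape() method awaited in the entry-point diff above;
# the unconfigured ClientSession is an assumption.
import asyncio

import aiohttp

from app.lichess import Scraper as LichessScraper


async def run():
    async with aiohttp.ClientSession() as session:
        scraper = LichessScraper(session)
        await scraper.scrape()


if __name__ == "__main__":
    asyncio.run(run())
```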