Add a scraper for lichess. (#2)

pull/3/head v0.1.2
Joshua Potter 2023-11-30 15:36:44 -07:00 committed by GitHub
parent 10801b560c
commit bc2ffeae9d
4 changed files with 153 additions and 20 deletions

@@ -1,25 +1,29 @@
-# chesscom-scraper
+# coach-scraper
 
 **Caution! Be careful running this script.**
 
-We intentionally delay each batch of requests by 3 seconds. Make sure any
-adjustments to this script appropriately rate-limit.
+We intentionally delay each batch of requests. Make sure any adjustments to this
+script appropriately rate-limit.
 
 ## Overview
 
-This is a simple web scraper for [chess.com](https://www.chess.com/coaches)
-coaches. The program searches for all listed coaches as well as specific
-information about each of them (their profile, recent activity, and stats). The
-result will be found in a newly created `data` directory with the following
-structure:
+This is a simple web scraper for coaches listed on:
+
+* [chess.com](https://www.chess.com/coaches)
+* [lichess.org](https://www.lichess.org/coach)
+
+The program searches for coach usernames as well as specific information about
+each of them (their profile, recent activity, and stats). The result will be
+found in a newly created `data` directory with the following structure:
 
 ```
 data
-├── coach
-│   ├── <username>
-│   │   ├── <username>.html
-│   │   ├── activity.json
-│   │   └── stats.json
-│   ├── ...
-└── pages
-    ├── <n>.txt
-    ├── ...
+└── <site>
+    ├── coaches
+    │   ├── <username>
+    │   │   ├── <username>.html
+    │   │   ├── export.json
+    │   │   └── ...
+    │   ├── ...
+    └── pages
+        ├── <n>.txt
+        ├── ...
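The scrapers resolve these locations through `path_page_file` and `path_coach_file` helpers on `BaseScraper` (defined in `app/scraper.py`, which this commit does not touch). A standalone sketch of the layout they presumably produce, with the `data` root and a `lichess` site name hard-coded purely for illustration:

```
import os.path

# Hypothetical equivalents of the BaseScraper path helpers used by the
# scrapers in this diff; the real methods live in app/scraper.py.
def path_page_file(site: str, page_no: int) -> str:
    # e.g. data/lichess/pages/3.txt
    return os.path.join("data", site, "pages", f"{page_no}.txt")


def path_coach_file(site: str, username: str, filename: str) -> str:
    # e.g. data/lichess/coaches/<username>/<username>.html
    return os.path.join("data", site, "coaches", username, filename)
```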

@@ -3,6 +3,7 @@ import argparse
 import asyncio
 from app.chesscom import Scraper as ChesscomScraper
+from app.lichess import Scraper as LichessScraper
 from app.scraper import Site
@@ -18,6 +19,7 @@ async def run():
         required=True,
         choices=[
             Site.CHESSCOM.value,
+            Site.LICHESS.value,
         ],
     )
     args = parser.parse_args()
@@ -27,6 +29,8 @@ async def run():
     ) as session:
         if args.site == Site.CHESSCOM.value:
             scraper = ChesscomScraper(session)
+        elif args.site == Site.LICHESS.value:
+            scraper = LichessScraper(session)
         await scraper.scrape()
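Both branches rely on the `Site` enum imported from `app/scraper.py`, which this commit leaves untouched. A minimal sketch of the members the dispatch above assumes (the string values are guesses, not taken from the repository):

```
from enum import Enum

# Assumed shape of app/scraper.py's Site enum; only the members used by the
# CLI dispatch are shown, and the values are illustrative.
class Site(Enum):
    CHESSCOM = "chesscom"
    LICHESS = "lichess"
```

With that in place, the new scraper is selected from the command line via something like `--site lichess`; the exact flag spelling is inferred from the `args.site` attribute above.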

@@ -179,15 +179,12 @@ class Scraper(BaseScraper):
     async def export(self, username: str) -> Export:
         """Transform coach-specific data into uniform format."""
-        stat_export: Export = {}
+        export: Export = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
-                stat_export = self._load_stats_json(json.load(f))
+                export.update(self._load_stats_json(json.load(f)))
         except FileNotFoundError:
             pass
-        export: Export = {
-            "fide_rapid": None,
-        }
-        export.update(stat_export)
         return export
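The net effect is that `export()` no longer seeds a `fide_rapid: None` placeholder: a coach whose `stats.json` was never downloaded now exports an empty mapping. A behaviour sketch (the rating value is made up; only `fide_rapid` appears in the diff):

```
from typing import Optional


def merge_stats(stats: Optional[dict]) -> dict:
    # Sketch of the new export() flow: start empty, merge stats when present.
    export: dict = {}
    if stats is not None:
        export.update(stats)
    return export


assert merge_stats({"fide_rapid": 2450}) == {"fide_rapid": 2450}
assert merge_stats(None) == {}  # previously this case yielded {"fide_rapid": None}
```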

app/lichess.py Normal file

@@ -0,0 +1,128 @@
import aiohttp
import asyncio
import os
import os.path

from app.scraper import AnsiColor, BaseScraper, Export, Site
from bs4 import BeautifulSoup
from typing import List

# The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical
# and traversing to the last page.
MAX_PAGES = 162

# How long to wait between each network request.
SLEEP_SECS = 5


class Scraper(BaseScraper):
    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.LICHESS.value, session=session)

    async def download_usernames(self) -> List[str]:
        """Scan through lichess.org/coach for all coaches' usernames.

        @return
            The complete list of scraped usernames across every coach listing
            page.
        """
        usernames = []
        for page_no in range(1, MAX_PAGES + 1):
            filepath = self.path_page_file(page_no)
            try:
                with open(filepath, "r") as f:
                    self.log(
                        [
                            (AnsiColor.INFO, "[INFO]"),
                            (None, ": Reading file "),
                            (AnsiColor.DATA, filepath),
                        ]
                    )
                    usernames.extend([line.strip() for line in f.readlines()])
            except FileNotFoundError:
                page_usernames = await self._scrape_page(page_no)
                if not page_usernames:
                    self.log(
                        [
                            (AnsiColor.ERROR, "[ERROR]"),
                            (None, ": Could not scrape page "),
                            (AnsiColor.DATA, str(page_no)),
                        ]
                    )
                    continue
                with open(filepath, "w") as f:
                    for username in page_usernames:
                        f.write(f"{username}\n")
                usernames.extend(page_usernames)
                self.log(
                    [
                        (AnsiColor.INFO, "[INFO]"),
                        (None, ": Downloaded page "),
                        (AnsiColor.DATA, filepath),
                    ]
                )
                await asyncio.sleep(SLEEP_SECS)

        return usernames

    async def _scrape_page(self, page_no: int):
        """Scan through lichess.org/coach/.../?page=<n> for all coaches'
        usernames.

        @param page_no
            The page consisting of at most 10 coaches (at the time of writing)
            whose usernames are to be scraped.
        @return
            The list of scraped usernames on the specified coach listing page.
        """
        url = f"https://lichess.org/coach/all/all/alphabetical?page={page_no}"
        response, status_code = await self.request(url)
        if response is None:
            self.log(
                [
                    (AnsiColor.ERROR, "[ERROR]"),
                    (None, ": Received status "),
                    (AnsiColor.DATA, f"{status_code} "),
                    (None, "when downloading page "),
                    (AnsiColor.DATA, str(page_no)),
                ]
            )
            return

        usernames = []
        soup = BeautifulSoup(response, "html.parser")
        members = soup.find_all("article", class_="coach-widget")
        for member in members:
            anchor = member.find("a", class_="overlay")
            if anchor:
                href = anchor.get("href")
                username = href[len("/coach/") :]
                usernames.append(username)

        return usernames

    async def download_profile(self, username: str):
        """For each coach, download coach-specific data.

        @param username
            The coach username corresponding to the downloaded files.
        """
        filepath = self.path_coach_file(username, f"{username}.html")
        if os.path.isfile(filepath):
            return False

        response, _unused_status = await self.request(
            url=f"https://lichess.org/coach/{username}"
        )
        if response is not None:
            with open(filepath, "w") as f:
                f.write(response)
        return True

    async def export(self, username: str) -> Export:
        """Transform coach-specific data into uniform format."""
        export: Export = {}
        return export
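For reference, a minimal driver showing how the new scraper could be exercised on its own. This mirrors the CLI dispatch earlier in the commit; note that `scrape()` is inherited from `BaseScraper` (not shown in this diff) and that the real CLI builds the `aiohttp.ClientSession` with additional arguments omitted here:

```
import aiohttp
import asyncio

from app.lichess import Scraper as LichessScraper


async def main():
    async with aiohttp.ClientSession() as session:
        scraper = LichessScraper(session)
        # Presumably walks every listing page (sleeping SLEEP_SECS between
        # requests), downloads each coach's profile, and writes the export.
        await scraper.scrape()


if __name__ == "__main__":
    asyncio.run(main())
```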