coach-scraper/app/lichess.py

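"""Scraping and exporting logic for lichess.org coach profiles.

The Scraper walks the public coach listing, caches each listing page's
usernames to disk, and downloads every coach's profile and stats pages.
The Exporter then reads the saved ``stats.html`` to extract rapid, blitz,
and bullet ratings. (This module docstring is an editorial summary of the
code below.)
"""
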
import asyncio
import os
import os.path
from typing import List

import aiohttp
from bs4 import BeautifulSoup, SoupStrainer

from app.exporter import BaseExporter
from app.repo import AnsiColor, Site
from app.scraper import BaseScraper

# The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical
# and traversing to the last page.
MAX_PAGES = 162

# How long to wait between each network request.
SLEEP_SECS = 5


class Scraper(BaseScraper):
    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.LICHESS.value, session=session)

    async def download_usernames(self) -> List[str]:
        """Scan through lichess.org/coach for all coaches' usernames.

        @return
            The complete list of scraped usernames across every coach listing
            page.
        """
        usernames = []
        for page_no in range(1, MAX_PAGES + 1):
            filepath = self.path_page_file(page_no)
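            # Use the cached page file when it already exists; otherwise scrape
            # the listing page and persist its usernames for future runs.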
            try:
                with open(filepath, "r") as f:
                    self.log(
                        [
                            (AnsiColor.INFO, "[INFO]"),
                            (None, ": Reading file "),
                            (AnsiColor.DATA, filepath),
                        ]
                    )
                    usernames.extend([line.strip() for line in f.readlines()])
            except FileNotFoundError:
                page_usernames = await self._scrape_page(page_no)
                if not page_usernames:
                    self.log(
                        [
                            (AnsiColor.ERROR, "[ERROR]"),
                            (None, ": Could not scrape page "),
                            (AnsiColor.DATA, str(page_no)),
                        ]
                    )
                    continue

                with open(filepath, "w") as f:
                    for username in page_usernames:
                        f.write(f"{username}\n")

                usernames.extend(page_usernames)
                self.log(
                    [
                        (AnsiColor.INFO, "[INFO]"),
                        (None, ": Downloaded page "),
                        (AnsiColor.DATA, filepath),
                    ]
                )
                await asyncio.sleep(SLEEP_SECS)

        return usernames

    async def _scrape_page(self, page_no: int):
        """Scan through lichess.org/coach/.../?page=<n> for all coaches'
        usernames.

        @param page_no
            The page consisting of at most 10 coaches (at the time of writing)
            whose usernames are to be scraped.
        @return
            The list of scraped usernames on the specified coach listing page.
        """
        url = f"https://lichess.org/coach/all/all/alphabetical?page={page_no}"
        response, status_code = await self.request(url)
        if response is None:
            self.log(
                [
                    (AnsiColor.ERROR, "[ERROR]"),
                    (None, ": Received status "),
                    (AnsiColor.DATA, f"{status_code} "),
                    (None, "when downloading page "),
                    (AnsiColor.DATA, str(page_no)),
                ]
            )
            return

        usernames = []
        soup = BeautifulSoup(response, "lxml")
        members = soup.find_all("article", class_="coach-widget")
        for member in members:
            anchor = member.find("a", class_="overlay")
            if anchor:
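                # Coach profile links look like "/coach/<username>"; strip the
                # prefix to recover the bare username.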
                href = anchor.get("href")
                username = href[len("/coach/") :]
                usernames.append(username)

        return usernames

    async def download_profile(self, username: str):
        """For each coach, download coach-specific data.

        @param username
            The coach username corresponding to the downloaded files.
        """
        used_network1 = await self._download_profile_file(
            url=f"https://lichess.org/coach/{username}",
            username=username,
            filename=self.path_coach_file(username, f"{username}.html"),
        )
        used_network2 = await self._download_profile_file(
            url=f"https://lichess.org/@/{username}",
            username=username,
            filename=self.path_coach_file(username, "stats.html"),
        )

        if any([used_network1, used_network2]):
            self.log(
                [
                    (AnsiColor.INFO, "[INFO]"),
                    (None, ": Downloaded data for coach "),
                    (AnsiColor.DATA, username),
                ]
            )
            await asyncio.sleep(SLEEP_SECS)
        else:
            self.log(
                [
                    (AnsiColor.INFO, "[INFO]"),
                    (None, ": Skipping download for coach "),
                    (AnsiColor.DATA, username),
                ]
            )

    async def _download_profile_file(self, url: str, username: str, filename: str):
        """Writes the contents of url into the specified file.

        @param url
            The URL of the file to download.
        @param username
            The coach username corresponding to the downloaded file.
        @param filename
            The output file to write the downloaded content to.
        @return:
            True if we make a network request. False otherwise.
        """
        if os.path.isfile(filename):
            return False

        response, _unused_status = await self.request(url)
        if response is not None:
            with open(filename, "w") as f:
                f.write(response)

        return True


def _stats_filter(elem, attrs):
    """Includes only relevant segments of the `stats.html` file."""
    if "sub-ratings" in attrs.get("class", ""):
        return True


class Exporter(BaseExporter):
    def __init__(self, username: str):
        super().__init__(site=Site.LICHESS.value, username=username)

        self.stats_soup = None
        try:
            with open(self.path_coach_file(username, "stats.html"), "r") as f:
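                # Parse only the sub-ratings section of the page (see
                # _stats_filter); the rest of stats.html is not needed here.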
                stats_strainer = SoupStrainer(_stats_filter)
                self.stats_soup = BeautifulSoup(
                    f.read(), "lxml", parse_only=stats_strainer
                )
        except FileNotFoundError:
            pass

    def export_rapid(self):
        return self._find_rating("rapid")

    def export_blitz(self):
        return self._find_rating("blitz")

    def export_bullet(self):
        return self._find_rating("bullet")

    def _find_rating(self, name):
        if self.stats_soup is None:
            return None

        anchor = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
        if anchor is None:
            return None

        rating = anchor.find("rating")
        if rating is None:
            return None

        strong = rating.find("strong")
        if strong is None:
            return None
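
        # Lichess renders provisional ratings with a trailing "?"; strip it
        # before parsing the number.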
        value = strong.get_text()
        if value.endswith("?"):
            value = value[:-1]

        try:
            return int(value)
        except ValueError:
            return None
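

# A minimal usage sketch (not part of the original module), assuming the
# Scraper/Exporter entry points above are driven from a single asyncio loop:
#
#     async def main():
#         async with aiohttp.ClientSession() as session:
#             scraper = Scraper(session)
#             usernames = await scraper.download_usernames()
#             for username in usernames:
#                 await scraper.download_profile(username)
#         if usernames:
#             print(Exporter(usernames[0]).export_rapid())
#
#     asyncio.run(main())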