# coach-scraper/app/lichess.py — Lichess coach scraping pipeline.
from __future__ import annotations

import asyncio
import os
import os.path
from typing import List

import aiohttp
from bs4 import BeautifulSoup, SoupStrainer, Tag
from lingua import LanguageDetector

from app.locale import Locale, native_to_locale
from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline
from app.types import Site, Title
# The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical
# and traversing to the last page.
MAX_PAGES: int = 162
# How long to wait (in seconds) between each network request, to avoid
# hammering lichess.org.
SLEEP_SECS: int = 5
class Fetcher(BaseFetcher):
    """Downloads Lichess coach listing pages and per-coach HTML files.

    Results are cached on disk (via the paths provided by the base class) so
    that re-runs do not repeat network requests.
    """

    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.LICHESS, session=session)

    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        """Return the coach usernames listed on listing page *page_no*.

        Returns the cached page contents when a page file already exists,
        an empty list for pages beyond MAX_PAGES, or None when the network
        request failed (the caller is expected to skip the page).
        """
        if page_no > MAX_PAGES:
            return []

        print(f"{self.site.value}: Scraping page {page_no}/{MAX_PAGES}")

        # Serve from the on-disk cache when possible.
        filepath = self.path_page_file(page_no)
        try:
            with open(filepath, "r") as f:
                return [line.strip() for line in f]
        except FileNotFoundError:
            pass

        # Throttle: be polite between live requests to lichess.org.
        if self.has_made_request:
            await asyncio.sleep(SLEEP_SECS)

        url = f"https://lichess.org/coach/all/all/alphabetical?page={page_no}"
        response, status_code = await self.fetch(url)

        if response is None:
            return None  # Skips this page.

        usernames = []
        soup = BeautifulSoup(response, "lxml")
        members = soup.find_all("article", class_="coach-widget")
        for member in members:
            a = member.find("a", class_="overlay")
            if not a:
                continue
            href = a.get("href")
            # Guard: an overlay anchor without an href (or with a
            # multi-valued attribute) would otherwise raise on slicing.
            if not isinstance(href, str):
                continue
            usernames.append(href[len("/coach/") :])

        # Cache the page so re-runs do not hit the network again.
        with open(filepath, "w") as f:
            for username in usernames:
                f.write(f"{username}\n")

        return usernames

    async def download_user_files(self, username: str) -> None:
        """Download the coach profile and stats pages for *username*.

        Files already present on disk are not downloaded again; the two
        remaining downloads (if any) run concurrently.
        """
        maybe_download = [
            (
                f"https://lichess.org/coach/{username}",
                self.path_coach_file(username, f"{username}.html"),
            ),
            (
                f"https://lichess.org/@/{username}",
                self.path_coach_file(username, "stats.html"),
            ),
        ]

        to_download = [
            (d_url, d_filename)
            for d_url, d_filename in maybe_download
            if not os.path.isfile(d_filename)
        ]
        if not to_download:
            return

        # Throttle before the burst of concurrent requests.
        if self.has_made_request:
            await asyncio.sleep(SLEEP_SECS)
        await asyncio.gather(
            *[self._download_file(url=d[0], filename=d[1]) for d in to_download]
        )

    async def _download_file(self, url: str, filename: str) -> None:
        """Fetch *url* and persist the body to *filename*; no-op on failure."""
        response, _unused_status = await self.fetch(url)
        if response is not None:
            with open(filename, "w") as f:
                f.write(response)
def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
if "coach-widget" in attrs.get("class", ""):
return True
2023-12-05 19:54:12 +00:00
return False
2023-12-05 19:54:12 +00:00
def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
2023-12-07 02:52:40 +00:00
if "user-link" in attrs.get("class", ""):
return True
if "profile-side" in attrs.get("class", ""):
return True
2023-12-01 14:10:58 +00:00
if "sub-ratings" in attrs.get("class", ""):
return True
2023-12-05 19:54:12 +00:00
return False
2023-12-01 14:10:58 +00:00
class Extractor(BaseExtractor):
    """Extracts coach fields from the cached Lichess profile and stats HTML.

    Each soup is None when the corresponding file has not been downloaded;
    every getter degrades to None (or an empty list) in that case.
    """

    def __init__(self, fetcher: BaseFetcher, detector: LanguageDetector, username: str):
        super().__init__(fetcher, detector, username)

        # Soup over the coach profile page (restricted to the coach widget).
        self.profile_soup = None
        try:
            filename = self.fetcher.path_coach_file(username, f"{username}.html")
            with open(filename, "r") as f:
                self.profile_soup = BeautifulSoup(
                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                )
        except FileNotFoundError:
            pass

        # Soup over the user stats page (restricted to name/side/ratings).
        self.stats_soup = None
        try:
            filename = self.fetcher.path_coach_file(username, "stats.html")
            with open(filename, "r") as f:
                self.stats_soup = BeautifulSoup(
                    f.read(), "lxml", parse_only=SoupStrainer(_stats_filter)
                )
        except FileNotFoundError:
            pass

    def get_name(self) -> str | None:
        """Return the coach's display name from the stats sidebar, or None."""
        if self.stats_soup is None:
            return None
        profile_side = self.stats_soup.find("div", class_="profile-side")
        if not isinstance(profile_side, Tag):
            return None
        user_infos = profile_side.find("div", class_="user-infos")
        if not isinstance(user_infos, Tag):
            return None
        name = user_infos.find("strong", class_="name")
        if not isinstance(name, Tag):
            return None
        return name.get_text().strip()

    def get_image_url(self) -> str | None:
        """Return the coach picture URL, only if hosted on image.lichess1.org."""
        if self.profile_soup is None:
            return None
        picture = self.profile_soup.find("img", class_="picture")
        if not isinstance(picture, Tag):
            return None
        src = picture.get("src", "")
        if not isinstance(src, str):
            return None
        if "image.lichess1.org" not in src:
            return None
        return src

    def get_title(self) -> Title | None:
        """Return the coach's chess title (GM, IM, ...), or None if absent
        or not a recognized Title value."""
        if self.stats_soup is None:
            return None
        utitle = self.stats_soup.find("span", class_="utitle")
        if not isinstance(utitle, Tag):
            return None
        title = utitle.get_text().strip()
        try:
            return Title(title)
        except ValueError:
            return None

    def get_languages(self) -> List[Locale] | None:
        """Return the locales for the languages listed on the profile page.

        Unrecognized language names are skipped.
        """
        if self.profile_soup is None:
            return None
        tr = self.profile_soup.find("tr", class_="languages")
        if not isinstance(tr, Tag):
            return None
        td = tr.find("td")
        if not isinstance(td, Tag):
            return None
        codes = []
        # Bug fix: previously read tr.get_text(), which glued the row label
        # onto the first language name and silently dropped that language.
        # The <td> cell holds only the comma-separated language list.
        for lang in [s.strip() for s in td.get_text().split(",")]:
            if lang in native_to_locale:
                codes.append(native_to_locale[lang])
        return codes

    def get_rapid(self) -> int | None:
        """Return the coach's rapid rating, or None."""
        return self._find_rating("rapid")

    def get_blitz(self) -> int | None:
        """Return the coach's blitz rating, or None."""
        return self._find_rating("blitz")

    def get_bullet(self) -> int | None:
        """Return the coach's bullet rating, or None."""
        return self._find_rating("bullet")

    def _find_rating(self, name: str) -> int | None:
        """Return the rating for time control *name* from the stats page.

        Returns None when the soup, the perf link, or the rating markup is
        missing, or when the rating text is not an integer.
        """
        if self.stats_soup is None:
            return None
        a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
        if not isinstance(a, Tag):
            return None
        rating = a.find("rating")
        if not isinstance(rating, Tag):
            return None
        strong = rating.find("strong")
        if not isinstance(strong, Tag):
            return None
        value = strong.get_text()
        # A trailing "?" marks a provisional rating (e.g. "1500?").
        # removesuffix also handles an empty string safely, where the old
        # value[-1] indexing raised IndexError.
        value = value.removesuffix("?")
        try:
            return int(value)
        except ValueError:
            return None
class Pipeline(BasePipeline):
    """Scraping pipeline specialized for lichess.org coaches."""

    def get_fetcher(self, session: aiohttp.ClientSession):
        """Build the Lichess fetcher bound to the given HTTP session."""
        return Fetcher(session=session)

    def get_extractor(
        self, fetcher: BaseFetcher, detector: LanguageDetector, username: str
    ):
        """Build the Lichess extractor for a single coach username."""
        return Extractor(fetcher=fetcher, detector=detector, username=username)