diff --git a/app/__main__.py b/app/__main__.py
index 9df2e71..7ee542b 100644
--- a/app/__main__.py
+++ b/app/__main__.py
@@ -8,7 +8,7 @@ import psycopg2
 from app.chesscom import Pipeline as ChesscomPipeline
 from app.database import backup_database
 from app.lichess import Pipeline as LichessPipeline
-from app.pipeline import Site
+from app.site import Site
 
 # The number of parallel extraction jobs that are run at a time.
 WORKER_COUNT = 10
diff --git a/app/chesscom.py b/app/chesscom.py
index 3f0b9cf..98129b0 100644
--- a/app/chesscom.py
+++ b/app/chesscom.py
@@ -2,15 +2,15 @@ import asyncio
 import json
 import os
 import os.path
-from typing import List, Union
+from typing import List
 
 import aiohttp
-from bs4 import BeautifulSoup, SoupStrainer
+from bs4 import BeautifulSoup, SoupStrainer, Tag
 
 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.pipeline import Site
+from app.site import Site
 
 # The number of coach listing pages we will at most iterate through. This number
 # was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
@@ -25,7 +25,7 @@ class Fetcher(BaseFetcher):
     def __init__(self, session: aiohttp.ClientSession):
         super().__init__(site=Site.CHESSCOM, session=session)
 
-    async def scrape_usernames(self, page_no: int) -> List[str]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         if page_no > MAX_PAGES:
             return []
 
@@ -100,15 +100,16 @@ class Fetcher(BaseFetcher):
             f.write(response)
 
 
-def _profile_filter(elem, attrs):
+def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "profile-header-info" in attrs.get("class", ""):
         return True
     if "profile-card-info" in attrs.get("class", ""):
         return True
+    return False
 
 
 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: BaseFetcher, username: str):
         super().__init__(fetcher, username)
 
         self.profile_soup = None
@@ -131,29 +132,37 @@ class Extractor(BaseExtractor):
         except FileNotFoundError:
             pass
 
-    def get_name(self) -> Union[str, None]:
-        try:
-            name = self.profile_soup.find("div", class_="profile-card-name")
-            return name.get_text().strip()
-        except AttributeError:
+    def get_name(self) -> str | None:
+        if self.profile_soup is None:
             return None
-
-    def get_image_url(self) -> Union[str, None]:
-        try:
-            div = self.profile_soup.find("div", class_="profile-header-avatar")
-            src = div.find("img").get("src", "")
-            if "images.chesscomfiles.com" in src:
-                return src
-        except AttributeError:
+        name = self.profile_soup.find("div", class_="profile-card-name")
+        if not isinstance(name, Tag):
             return None
+        return name.get_text().strip()
 
-    def get_rapid(self) -> Union[int, None]:
+    def get_image_url(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        div = self.profile_soup.find("div", class_="profile-header-avatar")
+        if not isinstance(div, Tag):
+            return None
+        img = div.find("img")
+        if not isinstance(img, Tag):
+            return None
+        src = img.get("src", "")
+        if not isinstance(src, str):
+            return None
+        if "images.chesscomfiles.com" not in src:
+            return None
+        return src
+
+    def get_rapid(self) -> int | None:
         return self.stats_json.get("rapid", {}).get("rating")
 
-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         return self.stats_json.get("lightning", {}).get("rating")
 
-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         return self.stats_json.get("bullet", {}).get("rating")
 
 
@@ -161,5 +170,5 @@ class Pipeline(BasePipeline):
     def get_fetcher(self, session: aiohttp.ClientSession):
         return Fetcher(session)
 
-    def get_extractor(self, fetcher: Fetcher, username: str):
+    def get_extractor(self, fetcher: BaseFetcher, username: str):
         return Extractor(fetcher, username)
diff --git a/app/database.py b/app/database.py
index e598135..6b4275d 100644
--- a/app/database.py
+++ b/app/database.py
@@ -1,12 +1,26 @@
 import sys
 from datetime import datetime
+from typing import Literal
 
 from typing_extensions import TypedDict
 
+from app.site import Site
+
 SCHEMA_NAME = "coach_scraper"
 TABLE_NAME = "export"
 
 
+RowKey = (
+    Literal["site"]
+    | Literal["username"]
+    | Literal["name"]
+    | Literal["image_url"]
+    | Literal["rapid"]
+    | Literal["blitz"]
+    | Literal["bullet"]
+)
+
+
 class Row(TypedDict, total=False):
     """Representation of a row of the export table.
 
@@ -14,7 +28,7 @@
     """
 
     # Website the given coach was sourced from.
-    site: str
+    site: Site
     # Username used on the source site.
     username: str
     # Real name.
@@ -110,4 +124,5 @@ def upsert_row(conn, row: Row):
         )
         conn.commit()
     finally:
-        cursor.close()
+        if cursor:
+            cursor.close()
diff --git a/app/lichess.py b/app/lichess.py
index 9b67d48..e7e14e5 100644
--- a/app/lichess.py
+++ b/app/lichess.py
@@ -1,15 +1,15 @@
 import asyncio
 import os
 import os.path
-from typing import List, Union
+from typing import List
 
 import aiohttp
-from bs4 import BeautifulSoup, SoupStrainer
+from bs4 import BeautifulSoup, SoupStrainer, Tag
 
 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.pipeline import Site
+from app.site import Site
 
 # The number of pages we will at most iterate through. This number was
 # determined by going to https://lichess.org/coach/all/all/alphabetical
@@ -24,7 +24,7 @@ class Fetcher(BaseFetcher):
     def __init__(self, session: aiohttp.ClientSession):
         super().__init__(site=Site.LICHESS, session=session)
 
-    async def scrape_usernames(self, page_no: int) -> List[str]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         if page_no > MAX_PAGES:
             return []
 
@@ -96,20 +96,22 @@ class Fetcher(BaseFetcher):
             f.write(response)
 
 
-def _profile_filter(elem, attrs):
+def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "coach-widget" in attrs.get("class", ""):
         return True
+    return False
 
 
-def _stats_filter(elem, attrs):
+def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "profile-side" in attrs.get("class", ""):
         return True
     if "sub-ratings" in attrs.get("class", ""):
         return True
+    return False
 
 
 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: BaseFetcher, username: str):
         super().__init__(fetcher, username)
 
         self.profile_soup = None
@@ -132,43 +134,60 @@ class Extractor(BaseExtractor):
         except FileNotFoundError:
             pass
 
-    def get_name(self) -> Union[str, None]:
-        try:
-            profile_side = self.stats_soup.find("div", class_="profile-side")
-            user_infos = profile_side.find("div", class_="user-infos")
-            name = user_infos.find("strong", class_="name")
-            return name.get_text().strip()
-        except AttributeError:
+    def get_name(self) -> str | None:
+        if self.stats_soup is None:
             return None
-
-    def get_image_url(self) -> Union[str, None]:
-        try:
-            picture = self.profile_soup.find("img", class_="picture")
-            src = picture.get("src", "")
-            if "image.lichess1.org" in src:
-                return src
-        except AttributeError:
+        profile_side = self.stats_soup.find("div", class_="profile-side")
+        if not isinstance(profile_side, Tag):
             return None
+        user_infos = profile_side.find("div", class_="user-infos")
+        if not isinstance(user_infos, Tag):
+            return None
+        name = user_infos.find("strong", class_="name")
+        if not isinstance(name, Tag):
+            return None
+        return name.get_text().strip()
 
-    def get_rapid(self) -> Union[int, None]:
+    def get_image_url(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        picture = self.profile_soup.find("img", class_="picture")
+        if not isinstance(picture, Tag):
+            return None
+        src = picture.get("src", "")
+        if not isinstance(src, str):
+            return None
+        if "image.lichess1.org" not in src:
+            return None
+        return src
+
+    def get_rapid(self) -> int | None:
         return self._find_rating("rapid")
 
-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         return self._find_rating("blitz")
 
-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         return self._find_rating("bullet")
 
-    def _find_rating(self, name) -> Union[int, None]:
+    def _find_rating(self, name) -> int | None:
+        if self.stats_soup is None:
+            return None
+        a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+        if not isinstance(a, Tag):
+            return None
+        rating = a.find("rating")
+        if not isinstance(rating, Tag):
+            return None
+        strong = rating.find("strong")
+        if not isinstance(strong, Tag):
+            return None
+        value = strong.get_text()
+        if value[-1] == "?":
+            value = value[:-1]
         try:
-            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-            rating = a.find("rating")
-            strong = rating.find("strong")
-            value = strong.get_text()
-            if value[-1] == "?":
-                value = value[:-1]
             return int(value)
-        except (AttributeError, ValueError):
+        except ValueError:
             return None
 
 
@@ -176,5 +195,5 @@ class Pipeline(BasePipeline):
     def get_fetcher(self, session: aiohttp.ClientSession):
         return Fetcher(session)
 
-    def get_extractor(self, fetcher: Fetcher, username: str):
+    def get_extractor(self, fetcher: BaseFetcher, username: str):
         return Extractor(fetcher, username)
diff --git a/app/pipeline.py b/app/pipeline.py
index fca5ed9..a9280a6 100644
--- a/app/pipeline.py
+++ b/app/pipeline.py
@@ -1,16 +1,11 @@
 import asyncio
-import enum
 import os.path
-from typing import Any, List, Tuple, Union
+from typing import Any, List, Tuple
 
 import aiohttp
 
-from app.database import Row, upsert_row
-
-
-class Site(enum.Enum):
-    CHESSCOM = "chesscom"
-    LICHESS = "lichess"
+from app.database import Row, RowKey, upsert_row
+from app.site import Site
 
 
 class Fetcher:
@@ -48,7 +43,7 @@ class Fetcher:
     def path_page_file(self, page_no: int):
         return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
 
-    async def fetch(self, url: str) -> Tuple[Union[str, None], int]:
+    async def fetch(self, url: str) -> Tuple[str | None, int]:
         """Make network requests using the internal session.
 
         @param url
@@ -63,7 +58,7 @@ class Fetcher:
             return await response.text(), 200
         return None, response.status
 
-    async def scrape_usernames(self, page_no: int) -> Union[List[str], None]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         """Source the specified site for all coach usernames.
 
         All pages should be downloaded at `self.path_page_file()`. Any cached
@@ -93,7 +88,7 @@ class Fetcher:
         raise NotImplementedError()
 
 
-def _insert(row: Row, key: str, value: Any):
+def _insert(row: Row, key: RowKey, value: Any):
     if value is not None:
         row[key] = value
 
@@ -103,19 +98,19 @@ class Extractor:
         self.fetcher = fetcher
         self.username = username
 
-    def get_name(self) -> Union[str, None]:
+    def get_name(self) -> str | None:
         raise NotImplementedError()
 
-    def get_image_url(self) -> Union[str, None]:
+    def get_image_url(self) -> str | None:
         raise NotImplementedError()
 
-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
         raise NotImplementedError()
 
-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         raise NotImplementedError()
 
-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         raise NotImplementedError()
 
     def extract(self) -> Row:
@@ -160,7 +155,7 @@ class Pipeline:
 
     async def process(self, conn, session: aiohttp.ClientSession):
         fetcher = self.get_fetcher(session)
-        queue = asyncio.Queue()
+        queue: asyncio.Queue = asyncio.Queue()
 
         # Create a batch of workers to process the jobs put into the queue.
         workers = []
@@ -171,14 +166,11 @@ class Pipeline:
         # Begin downloading all coach usernames and files. The workers will
         # run concurrently to extract all the relvant information and write
         page_no = 1
-        usernames = [None]
-        while len(usernames):
+        usernames: List[str] | None = [""]
+        while usernames is None or len(usernames):
             usernames = await fetcher.scrape_usernames(page_no)
             page_no += 1
-            if usernames is None:
-                usernames = [None]
-                continue
-            for username in usernames:
+            for username in usernames or []:
                 await fetcher._download_user_files(username)
                 extractor = self.get_extractor(fetcher, username)
                 queue.put_nowait((conn, extractor))
diff --git a/app/site.py b/app/site.py
new file mode 100644
index 0000000..ce5eb48
--- /dev/null
+++ b/app/site.py
@@ -0,0 +1,6 @@
+import enum
+
+
+class Site(enum.Enum):
+    CHESSCOM = "chesscom"
+    LICHESS = "lichess"
diff --git a/flake.nix b/flake.nix
index 562a693..e945c66 100644
--- a/flake.nix
+++ b/flake.nix
@@ -58,6 +58,27 @@
           })
         ) pypkgs-build-requirements
       );
+
+      types = with pkgs.python311Packages; {
+        beautifulsoup4 = buildPythonPackage rec {
+          pname = "types-beautifulsoup4";
+          version = "4.12.0.7";
+          src = pkgs.fetchPypi {
+            inherit pname version;
+            sha256 = "sha256-WZgAKNKb9V0Ns1nvowW3W6zwy5Lj8/az/UCPJTHfJ0w";
+          };
+          doCheck = false;
+        };
+        psycopg2 = buildPythonPackage rec {
+          pname = "types-psycopg2";
+          version = "2.9.21.19";
+          src = pkgs.fetchPypi {
+            inherit pname version;
+            sha256 = "sha256-7DquUi3enEEUFZe8QRI7TJVftAk7H8fsbuYHeVoKCI8=";
+          };
+          doCheck = false;
+        };
+      };
     in
     {
       packages = {
@@ -88,6 +109,9 @@
           pyls-isort
           python-lsp-black
           python-lsp-server
+          types.beautifulsoup4
+          types.psycopg2
+          typing-extensions
         ]);
       };
     });
diff --git a/poetry.lock b/poetry.lock
index 90592f3..a8abcc9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -449,42 +449,6 @@ files = [
     {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
 ]
 
-[[package]]
-name = "types-beautifulsoup4"
-version = "4.12.0.7"
-description = "Typing stubs for beautifulsoup4"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "types-beautifulsoup4-4.12.0.7.tar.gz", hash = "sha256:59980028d29bf55d0db359efa305b75bacf0cb92e3f3f6b3fd408f2531df274c"},
-    {file = "types_beautifulsoup4-4.12.0.7-py3-none-any.whl", hash = "sha256:8b03b054cb2e62abf82bbbeda57a07257026f4ed9010ef17d8f8eff43bb1f9b7"},
-]
-
-[package.dependencies]
-types-html5lib = "*"
-
-[[package]]
-name = "types-html5lib"
-version = "1.1.11.15"
-description = "Typing stubs for html5lib"
-optional = false
-python-versions = "*"
-files = [
-    {file = "types-html5lib-1.1.11.15.tar.gz", hash = "sha256:80e1a2062d22a3affe5c28d97da30bffbf3a076d393c80fc6f1671216c1bd492"},
-    {file = "types_html5lib-1.1.11.15-py3-none-any.whl", hash = "sha256:16fe936d99b9f7fc210e2e21a2aed1b6bbbc554ad8242a6ef75f6f2bddb27e58"},
-]
-
-[[package]]
-name = "typing-extensions"
-version = "4.8.0"
-description = "Backported and Experimental Type Hints for Python 3.8+"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"},
-    {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"},
-]
-
 [[package]]
 name = "yarl"
 version = "1.9.3"
@@ -591,4 +555,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "9e4078c4f5eeffbc90b895528738c457dadd671a784ab8c411a5c3fe91925e34"
+content-hash = "4a667c2246018c259f23d7fe07102b3b680693beccbc685467b1e29c2c3e2db6"
diff --git a/pyproject.toml b/pyproject.toml
index c18ca63..6ea0d68 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,13 +12,13 @@ aiohttp = "^3.8.6"
 lxml = "^4.9.3"
 psycopg2 = "^2.9.9"
 
-[tool.poetry.group.dev.dependencies]
-types-beautifulsoup4 = "^4.12.0.7"
-typing-extensions = "^4.8.0"
-
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
 app = "app.__main__:main"
+
+[[tool.mypy.overrides]]
+module = "aiohttp"
+ignore_missing_imports = true
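
A minimal sketch (illustration only, not part of the patch) of what the new `RowKey` alias buys: with `key` narrowed to the `Literal` union from app/database.py, mypy rejects column names that do not exist on the export table, which the old `key: str` signature silently allowed. The sketch assumes the patched `app` package is importable; the misspelled "blits" call is hypothetical and exists only to trigger the error.

    from app.database import Row, RowKey
    from app.site import Site

    def insert(row: Row, key: RowKey, value) -> None:
        # Same shape as _insert() in app/pipeline.py: only record fields
        # the extractor actually produced.
        if value is not None:
            row[key] = value

    row: Row = {"site": Site.LICHESS, "username": "example"}
    insert(row, "blitz", 1800)  # OK: "blitz" is one of the RowKey literals
    insert(row, "blits", 1800)  # flagged by mypy: not a valid RowKey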