Fix all mypy warnings. (#9)

Joshua Potter 2023-12-05 12:54:12 -07:00 committed by GitHub
parent 8d7f1e7c4a
commit 82dbef21b6
9 changed files with 153 additions and 124 deletions

app/__main__.py

@@ -8,7 +8,7 @@ import psycopg2
 from app.chesscom import Pipeline as ChesscomPipeline
 from app.database import backup_database
 from app.lichess import Pipeline as LichessPipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of parallel extraction jobs that are run at a time.
 WORKER_COUNT = 10

app/chesscom.py

@@ -2,15 +2,15 @@ import asyncio
 import json
 import os
 import os.path
-from typing import List, Union
+from typing import List

 import aiohttp
-from bs4 import BeautifulSoup, SoupStrainer
+from bs4 import BeautifulSoup, SoupStrainer, Tag

 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of coach listing pages we will at most iterate through. This number
 # was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and

@@ -25,7 +25,7 @@ class Fetcher(BaseFetcher):
     def __init__(self, session: aiohttp.ClientSession):
         super().__init__(site=Site.CHESSCOM, session=session)

-    async def scrape_usernames(self, page_no: int) -> List[str]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         if page_no > MAX_PAGES:
             return []

@@ -100,15 +100,16 @@ class Fetcher(BaseFetcher):
             f.write(response)

-def _profile_filter(elem, attrs):
+def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "profile-header-info" in attrs.get("class", ""):
         return True
     if "profile-card-info" in attrs.get("class", ""):
         return True
+    return False


 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: BaseFetcher, username: str):
         super().__init__(fetcher, username)
         self.profile_soup = None
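A note on the two changes to `_profile_filter`: bs4 passes a callable strainer filter the tag's name and its attribute dict during parsing, which is what the `(elem, attrs)` signature reflects, and once the return type is declared `-> bool`, mypy flags the path that previously fell off the end with "Missing return statement", hence the explicit `return False`. A minimal sketch of how such a filter is presumably wired up (the HTML string is illustrative only):

    from bs4 import BeautifulSoup, SoupStrainer, Tag


    def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
        # Every path must now return a bool to satisfy the "-> bool" annotation.
        return "profile-header-info" in attrs.get("class", "")


    # Parse only the elements the filter accepts.
    html = '<div class="profile-header-info">name</div><div>skipped</div>'
    soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(_profile_filter))
    print(soup.get_text())  # -> name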
@@ -131,29 +132,37 @@ class Extractor(BaseExtractor):
         except FileNotFoundError:
             pass

-    def get_name(self) -> Union[str, None]:
-        try:
-            name = self.profile_soup.find("div", class_="profile-card-name")
-            return name.get_text().strip()
-        except AttributeError:
-            return None
+    def get_name(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        name = self.profile_soup.find("div", class_="profile-card-name")
+        if not isinstance(name, Tag):
+            return None
+        return name.get_text().strip()

-    def get_image_url(self) -> Union[str, None]:
-        try:
-            div = self.profile_soup.find("div", class_="profile-header-avatar")
-            src = div.find("img").get("src", "")
-            if "images.chesscomfiles.com" in src:
-                return src
-        except AttributeError:
-            return None
+    def get_image_url(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        div = self.profile_soup.find("div", class_="profile-header-avatar")
+        if not isinstance(div, Tag):
+            return None
+        img = div.find("img")
+        if not isinstance(img, Tag):
+            return None
+        src = img.get("src", "")
+        if not isinstance(src, str):
+            return None
+        if "images.chesscomfiles.com" not in src:
+            return None
+        return src

-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
         return self.stats_json.get("rapid", {}).get("rating")

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         return self.stats_json.get("lightning", {}).get("rating")

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         return self.stats_json.get("bullet", {}).get("rating")
@@ -161,5 +170,5 @@ class Pipeline(BasePipeline):
     def get_fetcher(self, session: aiohttp.ClientSession):
         return Fetcher(session)

-    def get_extractor(self, fetcher: Fetcher, username: str):
+    def get_extractor(self, fetcher: BaseFetcher, username: str):
         return Extractor(fetcher, username)
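Widening `fetcher: Fetcher` to `fetcher: BaseFetcher` in `get_extractor` is a Liskov substitution fix: mypy rejects an override whose parameter type is narrower than the one declared on the supertype, and the `Extractor.__init__` signature follows suit so the now-`BaseFetcher`-typed value can still be passed through. A minimal reproduction with illustrative names:

    class BaseFetcher: ...
    class ChesscomFetcher(BaseFetcher): ...


    class BasePipeline:
        def get_extractor(self, fetcher: BaseFetcher, username: str): ...


    class Pipeline(BasePipeline):
        # mypy [override], roughly: Argument 1 of "get_extractor" is
        # incompatible with supertype "BasePipeline". Parameter types may
        # only widen in an override, never narrow.
        def get_extractor(self, fetcher: ChesscomFetcher, username: str): ...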

app/database.py

@@ -1,12 +1,26 @@
 import sys
 from datetime import datetime
+from typing import Literal

 from typing_extensions import TypedDict

+from app.site import Site
+
 SCHEMA_NAME = "coach_scraper"
 TABLE_NAME = "export"

+RowKey = (
+    Literal["site"]
+    | Literal["username"]
+    | Literal["name"]
+    | Literal["image_url"]
+    | Literal["rapid"]
+    | Literal["blitz"]
+    | Literal["bullet"]
+)
+

 class Row(TypedDict, total=False):
     """Representation of a row of the export table.
@@ -14,7 +28,7 @@ class Row(TypedDict, total=False):
     """

     # Website the given coach was sourced from.
-    site: str
+    site: Site
     # Username used on the source site.
     username: str
     # Real name.
@@ -110,4 +124,5 @@ def upsert_row(conn, row: Row):
         )
         conn.commit()
     finally:
-        cursor.close()
+        if cursor:
+            cursor.close()
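The `if cursor:` guard handles a possibly-unset value in the `finally` block: assuming `upsert_row` initializes `cursor = None` before the `try` (the usual shape of this pattern), `cursor` is still `None` when `conn.cursor()` itself raises, and unconditionally calling `.close()` would both fail at runtime and be flagged by mypy. A sketch of that shape:

    def upsert_row(conn, row):
        cursor = None  # assumed initialization; the cursor is created inside the try
        try:
            cursor = conn.cursor()
            # ... execute the upsert statement ...
            conn.commit()
        finally:
            if cursor:  # still None if conn.cursor() raised
                cursor.close()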

app/lichess.py

@@ -1,15 +1,15 @@
 import asyncio
 import os
 import os.path
-from typing import List, Union
+from typing import List

 import aiohttp
-from bs4 import BeautifulSoup, SoupStrainer
+from bs4 import BeautifulSoup, SoupStrainer, Tag

 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of pages we will at most iterate through. This number was
 # determined by going to https://lichess.org/coach/all/all/alphabetical

@@ -24,7 +24,7 @@ class Fetcher(BaseFetcher):
     def __init__(self, session: aiohttp.ClientSession):
         super().__init__(site=Site.LICHESS, session=session)

-    async def scrape_usernames(self, page_no: int) -> List[str]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         if page_no > MAX_PAGES:
             return []

@@ -96,20 +96,22 @@ class Fetcher(BaseFetcher):
             f.write(response)

-def _profile_filter(elem, attrs):
+def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "coach-widget" in attrs.get("class", ""):
         return True
+    return False

-def _stats_filter(elem, attrs):
+def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "profile-side" in attrs.get("class", ""):
         return True
     if "sub-ratings" in attrs.get("class", ""):
         return True
+    return False


 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: BaseFetcher, username: str):
         super().__init__(fetcher, username)
         self.profile_soup = None
@@ -132,43 +134,60 @@ class Extractor(BaseExtractor):
         except FileNotFoundError:
             pass

-    def get_name(self) -> Union[str, None]:
-        try:
-            profile_side = self.stats_soup.find("div", class_="profile-side")
-            user_infos = profile_side.find("div", class_="user-infos")
-            name = user_infos.find("strong", class_="name")
-            return name.get_text().strip()
-        except AttributeError:
-            return None
+    def get_name(self) -> str | None:
+        if self.stats_soup is None:
+            return None
+        profile_side = self.stats_soup.find("div", class_="profile-side")
+        if not isinstance(profile_side, Tag):
+            return None
+        user_infos = profile_side.find("div", class_="user-infos")
+        if not isinstance(user_infos, Tag):
+            return None
+        name = user_infos.find("strong", class_="name")
+        if not isinstance(name, Tag):
+            return None
+        return name.get_text().strip()

-    def get_image_url(self) -> Union[str, None]:
-        try:
-            picture = self.profile_soup.find("img", class_="picture")
-            src = picture.get("src", "")
-            if "image.lichess1.org" in src:
-                return src
-        except AttributeError:
-            return None
+    def get_image_url(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        picture = self.profile_soup.find("img", class_="picture")
+        if not isinstance(picture, Tag):
+            return None
+        src = picture.get("src", "")
+        if not isinstance(src, str):
+            return None
+        if "image.lichess1.org" not in src:
+            return None
+        return src

-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
         return self._find_rating("rapid")

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         return self._find_rating("blitz")

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         return self._find_rating("bullet")

-    def _find_rating(self, name) -> Union[int, None]:
-        try:
-            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-            rating = a.find("rating")
-            strong = rating.find("strong")
-            value = strong.get_text()
-            if value[-1] == "?":
-                value = value[:-1]
-            return int(value)
-        except (AttributeError, ValueError):
-            return None
+    def _find_rating(self, name) -> int | None:
+        if self.stats_soup is None:
+            return None
+        a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+        if not isinstance(a, Tag):
+            return None
+        rating = a.find("rating")
+        if not isinstance(rating, Tag):
+            return None
+        strong = rating.find("strong")
+        if not isinstance(strong, Tag):
+            return None
+        value = strong.get_text()
+        if value[-1] == "?":
+            value = value[:-1]
+        try:
+            return int(value)
+        except ValueError:
+            return None
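The reshuffled `_find_rating` keeps the trailing-`?` handling (Lichess appends `?` to provisional ratings) but narrows the exception clause: the `isinstance` checks now cover everything the old blanket `AttributeError` did, so only the `int()` parse still needs guarding. For example:

    int("2450")   # -> 2450
    int("2450?")  # raises ValueError: invalid literal for int() with base 10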
@@ -176,5 +195,5 @@ class Pipeline(BasePipeline):
     def get_fetcher(self, session: aiohttp.ClientSession):
         return Fetcher(session)

-    def get_extractor(self, fetcher: Fetcher, username: str):
+    def get_extractor(self, fetcher: BaseFetcher, username: str):
         return Extractor(fetcher, username)

app/pipeline.py

@@ -1,16 +1,11 @@
 import asyncio
-import enum
 import os.path
-from typing import Any, List, Tuple, Union
+from typing import Any, List, Tuple

 import aiohttp

-from app.database import Row, upsert_row
+from app.database import Row, RowKey, upsert_row
+from app.site import Site

-
-class Site(enum.Enum):
-    CHESSCOM = "chesscom"
-    LICHESS = "lichess"

 class Fetcher:
@@ -48,7 +43,7 @@ class Fetcher:
     def path_page_file(self, page_no: int):
         return os.path.join(self.path_pages_dir(), f"{page_no}.txt")

-    async def fetch(self, url: str) -> Tuple[Union[str, None], int]:
+    async def fetch(self, url: str) -> Tuple[str | None, int]:
         """Make network requests using the internal session.

         @param url
@@ -63,7 +58,7 @@ class Fetcher:
                 return await response.text(), 200
             return None, response.status

-    async def scrape_usernames(self, page_no: int) -> Union[List[str], None]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         """Source the specified site for all coach usernames.

         All pages should be downloaded at `self.path_page_file()`. Any cached
@@ -93,7 +88,7 @@ class Fetcher:
         raise NotImplementedError()

-def _insert(row: Row, key: str, value: Any):
+def _insert(row: Row, key: RowKey, value: Any):
     if value is not None:
         row[key] = value
@@ -103,19 +98,19 @@ class Extractor:
         self.fetcher = fetcher
         self.username = username

-    def get_name(self) -> Union[str, None]:
+    def get_name(self) -> str | None:
         raise NotImplementedError()

-    def get_image_url(self) -> Union[str, None]:
+    def get_image_url(self) -> str | None:
         raise NotImplementedError()

-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
         raise NotImplementedError()

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         raise NotImplementedError()

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         raise NotImplementedError()

     def extract(self) -> Row:
@@ -160,7 +155,7 @@ class Pipeline:
     async def process(self, conn, session: aiohttp.ClientSession):
         fetcher = self.get_fetcher(session)
-        queue = asyncio.Queue()
+        queue: asyncio.Queue = asyncio.Queue()

         # Create a batch of workers to process the jobs put into the queue.
         workers = []
@@ -171,14 +166,11 @@ class Pipeline:
         # Begin downloading all coach usernames and files. The workers will
         # run concurrently to extract all the relvant information and write
         page_no = 1
-        usernames = [None]
-        while len(usernames):
+        usernames: List[str] | None = [""]
+        while usernames is None or len(usernames):
             usernames = await fetcher.scrape_usernames(page_no)
             page_no += 1
-            if usernames is None:
-                usernames = [None]
-                continue
-            for username in usernames:
+            for username in usernames or []:
                 await fetcher._download_user_files(username)
                 extractor = self.get_extractor(fetcher, username)
                 queue.put_nowait((conn, extractor))
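The loop rewrite gives `usernames` one consistent type. It previously started life as `[None]` (inferred by mypy as a list of `None`) and was then reassigned to `List[str]` or `None`, which cannot type-check. The new version declares `List[str] | None` up front, uses a throwaway non-empty list as a sentinel to enter the loop, treats a `None` result as "move on to the next page", and an empty list as "stop". A self-contained toy version with a stubbed scraper:

    import asyncio
    from typing import List


    async def scrape_usernames(page_no: int) -> List[str] | None:
        # Stub: page 1 yields names, page 2 fails (None), later pages are empty.
        return {1: ["alice", "bob"], 2: None}.get(page_no, [])


    async def main() -> None:
        page_no = 1
        usernames: List[str] | None = [""]  # non-empty sentinel to enter the loop
        while usernames is None or len(usernames):
            usernames = await scrape_usernames(page_no)
            page_no += 1
            for username in usernames or []:  # body skipped when the page was None
                print("processing", username)


    asyncio.run(main())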

app/site.py (new file)

@@ -0,0 +1,6 @@
+import enum
+
+
+class Site(enum.Enum):
+    CHESSCOM = "chesscom"
+    LICHESS = "lichess"
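Moving `Site` into its own leaf module is what makes the rest of the commit hang together without an import cycle: app/database.py now annotates `site: Site`, and app/pipeline.py already imports `Row`, `RowKey`, and `upsert_row` from app/database.py, so leaving `Site` in app/pipeline.py would have made the two modules import each other. Usage is unchanged:

    from app.site import Site

    assert Site.CHESSCOM.value == "chesscom"
    assert Site("lichess") is Site.LICHESS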

flake.nix

@@ -58,6 +58,27 @@
         })
       ) pypkgs-build-requirements
     );
+    types = with pkgs.python311Packages; {
+      beautifulsoup4 = buildPythonPackage rec {
+        pname = "types-beautifulsoup4";
+        version = "4.12.0.7";
+        src = pkgs.fetchPypi {
+          inherit pname version;
+          sha256 = "sha256-WZgAKNKb9V0Ns1nvowW3W6zwy5Lj8/az/UCPJTHfJ0w";
+        };
+        doCheck = false;
+      };
+      psycopg2 = buildPythonPackage rec {
+        pname = "types-psycopg2";
+        version = "2.9.21.19";
+        src = pkgs.fetchPypi {
+          inherit pname version;
+          sha256 = "sha256-7DquUi3enEEUFZe8QRI7TJVftAk7H8fsbuYHeVoKCI8=";
+        };
+        doCheck = false;
+      };
+    };
   in
   {
     packages = {

@@ -88,6 +109,9 @@
         pyls-isort
         python-lsp-black
         python-lsp-server
+        types.beautifulsoup4
+        types.psycopg2
+        typing-extensions
       ]);
     };
   });

poetry.lock (generated)

@@ -449,42 +449,6 @@ files = [
     {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
 ]

-[[package]]
-name = "types-beautifulsoup4"
-version = "4.12.0.7"
-description = "Typing stubs for beautifulsoup4"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "types-beautifulsoup4-4.12.0.7.tar.gz", hash = "sha256:59980028d29bf55d0db359efa305b75bacf0cb92e3f3f6b3fd408f2531df274c"},
-    {file = "types_beautifulsoup4-4.12.0.7-py3-none-any.whl", hash = "sha256:8b03b054cb2e62abf82bbbeda57a07257026f4ed9010ef17d8f8eff43bb1f9b7"},
-]
-
-[package.dependencies]
-types-html5lib = "*"
-
-[[package]]
-name = "types-html5lib"
-version = "1.1.11.15"
-description = "Typing stubs for html5lib"
-optional = false
-python-versions = "*"
-files = [
-    {file = "types-html5lib-1.1.11.15.tar.gz", hash = "sha256:80e1a2062d22a3affe5c28d97da30bffbf3a076d393c80fc6f1671216c1bd492"},
-    {file = "types_html5lib-1.1.11.15-py3-none-any.whl", hash = "sha256:16fe936d99b9f7fc210e2e21a2aed1b6bbbc554ad8242a6ef75f6f2bddb27e58"},
-]
-
-[[package]]
-name = "typing-extensions"
-version = "4.8.0"
-description = "Backported and Experimental Type Hints for Python 3.8+"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"},
-    {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"},
-]
-
 [[package]]
 name = "yarl"
 version = "1.9.3"

@@ -591,4 +555,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "9e4078c4f5eeffbc90b895528738c457dadd671a784ab8c411a5c3fe91925e34"
+content-hash = "4a667c2246018c259f23d7fe07102b3b680693beccbc685467b1e29c2c3e2db6"

pyproject.toml

@@ -12,13 +12,13 @@ aiohttp = "^3.8.6"
 lxml = "^4.9.3"
 psycopg2 = "^2.9.9"

-[tool.poetry.group.dev.dependencies]
-types-beautifulsoup4 = "^4.12.0.7"
-typing-extensions = "^4.8.0"
-
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

 [tool.poetry.scripts]
 app = "app.__main__:main"
+
+[[tool.mypy.overrides]]
+module = "aiohttp"
+ignore_missing_imports = true
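The `[[tool.mypy.overrides]]` table scopes `ignore_missing_imports` to the aiohttp module alone, which is gentler than enabling the flag globally: only missing-stub errors on `import aiohttp` are silenced, and every other dependency stays fully checked. Without the override, the per-import alternative would be an inline suppression (the exact error code varies across mypy versions):

    import aiohttp  # type: ignore[import]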