Fix all mypy warnings. (#9)

2023-12-05 12:54:12 -07:00 · 2023-12-05 12:54:12 -07:00 · 82dbef21b6
parent 8d7f1e7c4a
commit 82dbef21b6
9 changed files with 153 additions and 124 deletions
--- a/app/main.py
+++ b/app/main.py
@ -8,7 +8,7 @@ import psycopg2
 from app.chesscom import Pipeline as ChesscomPipeline
 from app.database import backup_database
 from app.lichess import Pipeline as LichessPipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of parallel extraction jobs that are run at a time.
 WORKER_COUNT = 10
--- a/app/chesscom.py
+++ b/app/chesscom.py
@ -2,15 +2,15 @@ import asyncio
 import json
 import os
 import os.path
-from typing import List, Union
+from typing import List

 import aiohttp
-from bs4 import BeautifulSoup, SoupStrainer
+from bs4 import BeautifulSoup, SoupStrainer, Tag

 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of coach listing pages we will at most iterate through. This number
 # was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
@ -25,7 +25,7 @@ class Fetcher(BaseFetcher):
    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.CHESSCOM, session=session)

-    async def scrape_usernames(self, page_no: int) -> List[str]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        if page_no > MAX_PAGES:
            return []

@ -100,15 +100,16 @@ class Fetcher(BaseFetcher):
                f.write(response)


-def _profile_filter(elem, attrs):
+def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
    if "profile-header-info" in attrs.get("class", ""):
        return True
    if "profile-card-info" in attrs.get("class", ""):
        return True
+    return False


 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: BaseFetcher, username: str):
        super().__init__(fetcher, username)

        self.profile_soup = None
@ -131,29 +132,37 @@ class Extractor(BaseExtractor):
        except FileNotFoundError:
            pass

-    def get_name(self) -> Union[str, None]:
-        try:
-            name = self.profile_soup.find("div", class_="profile-card-name")
-            return name.get_text().strip()
-        except AttributeError:
+    def get_name(self) -> str | None:
+        if self.profile_soup is None:
            return None
-
-    def get_image_url(self) -> Union[str, None]:
-        try:
-            div = self.profile_soup.find("div", class_="profile-header-avatar")
-            src = div.find("img").get("src", "")
-            if "images.chesscomfiles.com" in src:
-                return src
-        except AttributeError:
+        name = self.profile_soup.find("div", class_="profile-card-name")
+        if not isinstance(name, Tag):
            return None
+        return name.get_text().strip()

-    def get_rapid(self) -> Union[int, None]:
+    def get_image_url(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        div = self.profile_soup.find("div", class_="profile-header-avatar")
+        if not isinstance(div, Tag):
+            return None
+        img = div.find("img")
+        if not isinstance(img, Tag):
+            return None
+        src = img.get("src", "")
+        if not isinstance(src, str):
+            return None
+        if "images.chesscomfiles.com" not in src:
+            return None
+        return src
+
+    def get_rapid(self) -> int | None:
        return self.stats_json.get("rapid", {}).get("rating")

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
        return self.stats_json.get("lightning", {}).get("rating")

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
        return self.stats_json.get("bullet", {}).get("rating")


@ -161,5 +170,5 @@ class Pipeline(BasePipeline):
    def get_fetcher(self, session: aiohttp.ClientSession):
        return Fetcher(session)

-    def get_extractor(self, fetcher: Fetcher, username: str):
+    def get_extractor(self, fetcher: BaseFetcher, username: str):
        return Extractor(fetcher, username)
--- a/app/database.py
+++ b/app/database.py
@ -1,12 +1,26 @@
 import sys
 from datetime import datetime
+from typing import Literal

 from typing_extensions import TypedDict

+from app.site import Site
+
 SCHEMA_NAME = "coach_scraper"
 TABLE_NAME = "export"


+RowKey = (
+    Literal["site"]
+    | Literal["username"]
+    | Literal["name"]
+    | Literal["image_url"]
+    | Literal["rapid"]
+    | Literal["blitz"]
+    | Literal["bullet"]
+)
+
+
 class Row(TypedDict, total=False):
    """Representation of a row of the export table.

@ -14,7 +28,7 @@ class Row(TypedDict, total=False):
    """

    # Website the given coach was sourced from.
-    site: str
+    site: Site
    # Username used on the source site.
    username: str
    # Real name.
@ -110,4 +124,5 @@ def upsert_row(conn, row: Row):
        )
        conn.commit()
    finally:
-        cursor.close()
+        if cursor:
+            cursor.close()
--- a/app/lichess.py
+++ b/app/lichess.py
@ -1,15 +1,15 @@
 import asyncio
 import os
 import os.path
-from typing import List, Union
+from typing import List

 import aiohttp
-from bs4 import BeautifulSoup, SoupStrainer
+from bs4 import BeautifulSoup, SoupStrainer, Tag

 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of pages we will at most iterate through. This number was
 # determined by going to https://lichess.org/coach/all/all/alphabetical
@ -24,7 +24,7 @@ class Fetcher(BaseFetcher):
    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.LICHESS, session=session)

-    async def scrape_usernames(self, page_no: int) -> List[str]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        if page_no > MAX_PAGES:
            return []

@ -96,20 +96,22 @@ class Fetcher(BaseFetcher):
                f.write(response)


-def _profile_filter(elem, attrs):
+def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
    if "coach-widget" in attrs.get("class", ""):
        return True
+    return False


-def _stats_filter(elem, attrs):
+def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
    if "profile-side" in attrs.get("class", ""):
        return True
    if "sub-ratings" in attrs.get("class", ""):
        return True
+    return False


 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: BaseFetcher, username: str):
        super().__init__(fetcher, username)

        self.profile_soup = None
@ -132,43 +134,60 @@ class Extractor(BaseExtractor):
        except FileNotFoundError:
            pass

-    def get_name(self) -> Union[str, None]:
-        try:
-            profile_side = self.stats_soup.find("div", class_="profile-side")
-            user_infos = profile_side.find("div", class_="user-infos")
-            name = user_infos.find("strong", class_="name")
-            return name.get_text().strip()
-        except AttributeError:
+    def get_name(self) -> str | None:
+        if self.stats_soup is None:
            return None
-
-    def get_image_url(self) -> Union[str, None]:
-        try:
-            picture = self.profile_soup.find("img", class_="picture")
-            src = picture.get("src", "")
-            if "image.lichess1.org" in src:
-                return src
-        except AttributeError:
+        profile_side = self.stats_soup.find("div", class_="profile-side")
+        if not isinstance(profile_side, Tag):
            return None
+        user_infos = profile_side.find("div", class_="user-infos")
+        if not isinstance(user_infos, Tag):
+            return None
+        name = user_infos.find("strong", class_="name")
+        if not isinstance(name, Tag):
+            return None
+        return name.get_text().strip()

-    def get_rapid(self) -> Union[int, None]:
+    def get_image_url(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        picture = self.profile_soup.find("img", class_="picture")
+        if not isinstance(picture, Tag):
+            return None
+        src = picture.get("src", "")
+        if not isinstance(src, str):
+            return None
+        if "image.lichess1.org" not in src:
+            return None
+        return src
+
+    def get_rapid(self) -> int | None:
        return self._find_rating("rapid")

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
        return self._find_rating("blitz")

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
        return self._find_rating("bullet")

-    def _find_rating(self, name) -> Union[int, None]:
+    def _find_rating(self, name) -> int | None:
+        if self.stats_soup is None:
+            return None
+        a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+        if not isinstance(a, Tag):
+            return None
+        rating = a.find("rating")
+        if not isinstance(rating, Tag):
+            return None
+        strong = rating.find("strong")
+        if not isinstance(strong, Tag):
+            return None
+        value = strong.get_text()
+        if value[-1] == "?":
+            value = value[:-1]
        try:
-            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-            rating = a.find("rating")
-            strong = rating.find("strong")
-            value = strong.get_text()
-            if value[-1] == "?":
-                value = value[:-1]
            return int(value)
-        except (AttributeError, ValueError):
+        except ValueError:
            return None


@ -176,5 +195,5 @@ class Pipeline(BasePipeline):
    def get_fetcher(self, session: aiohttp.ClientSession):
        return Fetcher(session)

-    def get_extractor(self, fetcher: Fetcher, username: str):
+    def get_extractor(self, fetcher: BaseFetcher, username: str):
        return Extractor(fetcher, username)
--- a/app/pipeline.py
+++ b/app/pipeline.py
@ -1,16 +1,11 @@
 import asyncio
-import enum
 import os.path
-from typing import Any, List, Tuple, Union
+from typing import Any, List, Tuple

 import aiohttp

-from app.database import Row, upsert_row
-
-
-class Site(enum.Enum):
-    CHESSCOM = "chesscom"
-    LICHESS = "lichess"
+from app.database import Row, RowKey, upsert_row
+from app.site import Site


 class Fetcher:
@ -48,7 +43,7 @@ class Fetcher:
    def path_page_file(self, page_no: int):
        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")

-    async def fetch(self, url: str) -> Tuple[Union[str, None], int]:
+    async def fetch(self, url: str) -> Tuple[str | None, int]:
        """Make network requests using the internal session.

        @param url
@ -63,7 +58,7 @@ class Fetcher:
                return await response.text(), 200
        return None, response.status

-    async def scrape_usernames(self, page_no: int) -> Union[List[str], None]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        """Source the specified site for all coach usernames.

        All pages should be downloaded at `self.path_page_file()`. Any cached
@ -93,7 +88,7 @@ class Fetcher:
        raise NotImplementedError()


-def _insert(row: Row, key: str, value: Any):
+def _insert(row: Row, key: RowKey, value: Any):
    if value is not None:
        row[key] = value

@ -103,19 +98,19 @@ class Extractor:
        self.fetcher = fetcher
        self.username = username

-    def get_name(self) -> Union[str, None]:
+    def get_name(self) -> str | None:
        raise NotImplementedError()

-    def get_image_url(self) -> Union[str, None]:
+    def get_image_url(self) -> str | None:
        raise NotImplementedError()

-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
        raise NotImplementedError()

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
        raise NotImplementedError()

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
        raise NotImplementedError()

    def extract(self) -> Row:
@ -160,7 +155,7 @@ class Pipeline:
    async def process(self, conn, session: aiohttp.ClientSession):
        fetcher = self.get_fetcher(session)

-        queue = asyncio.Queue()
+        queue: asyncio.Queue = asyncio.Queue()

        # Create a batch of workers to process the jobs put into the queue.
        workers = []
@ -171,14 +166,11 @@ class Pipeline:
        # Begin downloading all coach usernames and files. The workers will
        # run concurrently to extract all the relevant information and write
        page_no = 1
-        usernames = [None]
-        while len(usernames):
+        usernames: List[str] | None = [""]
+        while usernames is None or len(usernames):
            usernames = await fetcher.scrape_usernames(page_no)
            page_no += 1
-            if usernames is None:
-                usernames = [None]
-                continue
-            for username in usernames:
+            for username in usernames or []:
                await fetcher._download_user_files(username)
                extractor = self.get_extractor(fetcher, username)
                queue.put_nowait((conn, extractor))
--- a/app/site.py
+++ b/app/site.py
@ -0,0 +1,6 @@
+import enum
+
+
+class Site(enum.Enum):
+    CHESSCOM = "chesscom"
+    LICHESS = "lichess"
--- a/flake.nix
+++ b/flake.nix
@ -58,6 +58,27 @@
            })
          ) pypkgs-build-requirements
        );
+
+        types = with pkgs.python311Packages; {
+          beautifulsoup4 = buildPythonPackage rec {
+            pname = "types-beautifulsoup4";
+            version = "4.12.0.7";
+            src = pkgs.fetchPypi {
+              inherit pname version;
+              sha256 = "sha256-WZgAKNKb9V0Ns1nvowW3W6zwy5Lj8/az/UCPJTHfJ0w";
+            };
+            doCheck = false;
+          };
+          psycopg2 = buildPythonPackage rec {
+            pname = "types-psycopg2";
+            version = "2.9.21.19";
+            src = pkgs.fetchPypi {
+              inherit pname version;
+              sha256 = "sha256-7DquUi3enEEUFZe8QRI7TJVftAk7H8fsbuYHeVoKCI8=";
+            };
+            doCheck = false;
+          };
+        };
      in
      {
        packages = {
@ -88,6 +109,9 @@
            pyls-isort
            python-lsp-black
            python-lsp-server
+            types.beautifulsoup4
+            types.psycopg2
+            typing-extensions
          ]);
        };
      });
--- a/poetry.lock
+++ b/poetry.lock
@ -449,42 +449,6 @@ files = [
    {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
 ]

-[[package]]
-name = "types-beautifulsoup4"
-version = "4.12.0.7"
-description = "Typing stubs for beautifulsoup4"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "types-beautifulsoup4-4.12.0.7.tar.gz", hash = "sha256:59980028d29bf55d0db359efa305b75bacf0cb92e3f3f6b3fd408f2531df274c"},
-    {file = "types_beautifulsoup4-4.12.0.7-py3-none-any.whl", hash = "sha256:8b03b054cb2e62abf82bbbeda57a07257026f4ed9010ef17d8f8eff43bb1f9b7"},
-]
-
-[package.dependencies]
-types-html5lib = "*"
-
-[[package]]
-name = "types-html5lib"
-version = "1.1.11.15"
-description = "Typing stubs for html5lib"
-optional = false
-python-versions = "*"
-files = [
-    {file = "types-html5lib-1.1.11.15.tar.gz", hash = "sha256:80e1a2062d22a3affe5c28d97da30bffbf3a076d393c80fc6f1671216c1bd492"},
-    {file = "types_html5lib-1.1.11.15-py3-none-any.whl", hash = "sha256:16fe936d99b9f7fc210e2e21a2aed1b6bbbc554ad8242a6ef75f6f2bddb27e58"},
-]
-
-[[package]]
-name = "typing-extensions"
-version = "4.8.0"
-description = "Backported and Experimental Type Hints for Python 3.8+"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"},
-    {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"},
-]
-
 [[package]]
 name = "yarl"
 version = "1.9.3"
@ -591,4 +555,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "9e4078c4f5eeffbc90b895528738c457dadd671a784ab8c411a5c3fe91925e34"
+content-hash = "4a667c2246018c259f23d7fe07102b3b680693beccbc685467b1e29c2c3e2db6"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -12,13 +12,13 @@ aiohttp = "^3.8.6"
 lxml = "^4.9.3"
 psycopg2 = "^2.9.9"

-[tool.poetry.group.dev.dependencies]
-types-beautifulsoup4 = "^4.12.0.7"
-typing-extensions = "^4.8.0"
-
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

 [tool.poetry.scripts]
 app = "app.__main__:main"
+
+[[tool.mypy.overrides]]
+module = "aiohttp"
+ignore_missing_imports = true