Fix all mypy warnings. (#9)

Joshua Potter 2023-12-05 12:54:12 -07:00 committed by GitHub
parent 8d7f1e7c4a
commit 82dbef21b6
9 changed files with 153 additions and 124 deletions

app/__main__.py

@@ -8,7 +8,7 @@ import psycopg2
 from app.chesscom import Pipeline as ChesscomPipeline
 from app.database import backup_database
 from app.lichess import Pipeline as LichessPipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of parallel extraction jobs that are run at a time.
 WORKER_COUNT = 10

app/chesscom.py

@@ -2,15 +2,15 @@ import asyncio
 import json
 import os
 import os.path
-from typing import List, Union
+from typing import List

 import aiohttp
-from bs4 import BeautifulSoup, SoupStrainer
+from bs4 import BeautifulSoup, SoupStrainer, Tag

 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of coach listing pages we will at most iterate through. This number
 # was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and

@@ -25,7 +25,7 @@ class Fetcher(BaseFetcher):
     def __init__(self, session: aiohttp.ClientSession):
         super().__init__(site=Site.CHESSCOM, session=session)

-    async def scrape_usernames(self, page_no: int) -> List[str]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         if page_no > MAX_PAGES:
             return []

@@ -100,15 +100,16 @@ class Fetcher(BaseFetcher):
             f.write(response)

-def _profile_filter(elem, attrs):
+def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "profile-header-info" in attrs.get("class", ""):
         return True
     if "profile-card-info" in attrs.get("class", ""):
         return True
+    return False


 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: BaseFetcher, username: str):
         super().__init__(fetcher, username)
         self.profile_soup = None
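A note on the two changes to `_profile_filter`: bs4 passes a callable strainer filter the tag's name and its attribute dict during parsing, which is what the `(elem, attrs)` signature reflects, and once the return type is declared `-> bool`, mypy flags the path that previously fell off the end with "Missing return statement", hence the explicit `return False`. A minimal sketch of how such a filter is presumably wired up (the HTML string is illustrative only):

    from bs4 import BeautifulSoup, SoupStrainer, Tag


    def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
        # Every path must now return a bool to satisfy the "-> bool" annotation.
        return "profile-header-info" in attrs.get("class", "")


    # Parse only the elements the filter accepts.
    html = '<div class="profile-header-info">name</div><div>skipped</div>'
    soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(_profile_filter))
    print(soup.get_text())  # -> name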
@@ -131,29 +132,37 @@ class Extractor(BaseExtractor):
         except FileNotFoundError:
             pass

-    def get_name(self) -> Union[str, None]:
-        try:
-            name = self.profile_soup.find("div", class_="profile-card-name")
-            return name.get_text().strip()
-        except AttributeError:
-            return None
+    def get_name(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        name = self.profile_soup.find("div", class_="profile-card-name")
+        if not isinstance(name, Tag):
+            return None
+        return name.get_text().strip()

-    def get_image_url(self) -> Union[str, None]:
-        try:
-            div = self.profile_soup.find("div", class_="profile-header-avatar")
-            src = div.find("img").get("src", "")
-            if "images.chesscomfiles.com" in src:
-                return src
-        except AttributeError:
-            return None
+    def get_image_url(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        div = self.profile_soup.find("div", class_="profile-header-avatar")
+        if not isinstance(div, Tag):
+            return None
+        img = div.find("img")
+        if not isinstance(img, Tag):
+            return None
+        src = img.get("src", "")
+        if not isinstance(src, str):
+            return None
+        if "images.chesscomfiles.com" not in src:
+            return None
+        return src

-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
         return self.stats_json.get("rapid", {}).get("rating")

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         return self.stats_json.get("lightning", {}).get("rating")

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         return self.stats_json.get("bullet", {}).get("rating")
@@ -161,5 +170,5 @@ class Pipeline(BasePipeline):
     def get_fetcher(self, session: aiohttp.ClientSession):
         return Fetcher(session)

-    def get_extractor(self, fetcher: Fetcher, username: str):
+    def get_extractor(self, fetcher: BaseFetcher, username: str):
         return Extractor(fetcher, username)
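Widening `fetcher: Fetcher` to `fetcher: BaseFetcher` in `get_extractor` is a Liskov substitution fix: mypy rejects an override whose parameter type is narrower than the one declared on the supertype, and the `Extractor.__init__` signature follows suit so the now-`BaseFetcher`-typed value can still be passed through. A minimal reproduction with illustrative names:

    class BaseFetcher: ...
    class ChesscomFetcher(BaseFetcher): ...


    class BasePipeline:
        def get_extractor(self, fetcher: BaseFetcher, username: str): ...


    class Pipeline(BasePipeline):
        # mypy [override], roughly: Argument 1 of "get_extractor" is
        # incompatible with supertype "BasePipeline". Parameter types may
        # only widen in an override, never narrow.
        def get_extractor(self, fetcher: ChesscomFetcher, username: str): ...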

app/database.py

@@ -1,12 +1,26 @@
 import sys
 from datetime import datetime
+from typing import Literal

 from typing_extensions import TypedDict

+from app.site import Site
+
 SCHEMA_NAME = "coach_scraper"
 TABLE_NAME = "export"

+RowKey = (
+    Literal["site"]
+    | Literal["username"]
+    | Literal["name"]
+    | Literal["image_url"]
+    | Literal["rapid"]
+    | Literal["blitz"]
+    | Literal["bullet"]
+)
+

 class Row(TypedDict, total=False):
     """Representation of a row of the export table.
@@ -14,7 +28,7 @@ class Row(TypedDict, total=False):
     """

     # Website the given coach was sourced from.
-    site: str
+    site: Site
     # Username used on the source site.
     username: str
     # Real name.
@@ -110,4 +124,5 @@ def upsert_row(conn, row: Row):
         )
         conn.commit()
     finally:
-        cursor.close()
+        if cursor:
+            cursor.close()
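The `if cursor:` guard handles a possibly-unset value in the `finally` block: assuming `upsert_row` initializes `cursor = None` before the `try` (the usual shape of this pattern), `cursor` is still `None` when `conn.cursor()` itself raises, and unconditionally calling `.close()` would both fail at runtime and be flagged by mypy. A sketch of that shape:

    def upsert_row(conn, row):
        cursor = None  # assumed initialization; the cursor is created inside the try
        try:
            cursor = conn.cursor()
            # ... execute the upsert statement ...
            conn.commit()
        finally:
            if cursor:  # still None if conn.cursor() raised
                cursor.close()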

app/lichess.py

@@ -1,15 +1,15 @@
 import asyncio
 import os
 import os.path
-from typing import List, Union
+from typing import List

 import aiohttp
-from bs4 import BeautifulSoup, SoupStrainer
+from bs4 import BeautifulSoup, SoupStrainer, Tag

 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of pages we will at most iterate through. This number was
 # determined by going to https://lichess.org/coach/all/all/alphabetical

@@ -24,7 +24,7 @@ class Fetcher(BaseFetcher):
     def __init__(self, session: aiohttp.ClientSession):
         super().__init__(site=Site.LICHESS, session=session)

-    async def scrape_usernames(self, page_no: int) -> List[str]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         if page_no > MAX_PAGES:
             return []

@@ -96,20 +96,22 @@ class Fetcher(BaseFetcher):
             f.write(response)

-def _profile_filter(elem, attrs):
+def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "coach-widget" in attrs.get("class", ""):
         return True
+    return False

-def _stats_filter(elem, attrs):
+def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "profile-side" in attrs.get("class", ""):
         return True
     if "sub-ratings" in attrs.get("class", ""):
         return True
+    return False


 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: BaseFetcher, username: str):
         super().__init__(fetcher, username)
         self.profile_soup = None
@@ -132,43 +134,60 @@ class Extractor(BaseExtractor):
         except FileNotFoundError:
             pass

-    def get_name(self) -> Union[str, None]:
-        try:
-            profile_side = self.stats_soup.find("div", class_="profile-side")
-            user_infos = profile_side.find("div", class_="user-infos")
-            name = user_infos.find("strong", class_="name")
-            return name.get_text().strip()
-        except AttributeError:
-            return None
+    def get_name(self) -> str | None:
+        if self.stats_soup is None:
+            return None
+        profile_side = self.stats_soup.find("div", class_="profile-side")
+        if not isinstance(profile_side, Tag):
+            return None
+        user_infos = profile_side.find("div", class_="user-infos")
+        if not isinstance(user_infos, Tag):
+            return None
+        name = user_infos.find("strong", class_="name")
+        if not isinstance(name, Tag):
+            return None
+        return name.get_text().strip()

-    def get_image_url(self) -> Union[str, None]:
-        try:
-            picture = self.profile_soup.find("img", class_="picture")
-            src = picture.get("src", "")
-            if "image.lichess1.org" in src:
-                return src
-        except AttributeError:
-            return None
+    def get_image_url(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        picture = self.profile_soup.find("img", class_="picture")
+        if not isinstance(picture, Tag):
+            return None
+        src = picture.get("src", "")
+        if not isinstance(src, str):
+            return None
+        if "image.lichess1.org" not in src:
+            return None
+        return src

-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
         return self._find_rating("rapid")

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         return self._find_rating("blitz")

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         return self._find_rating("bullet")

-    def _find_rating(self, name) -> Union[int, None]:
-        try:
-            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-            rating = a.find("rating")
-            strong = rating.find("strong")
-            value = strong.get_text()
-            if value[-1] == "?":
-                value = value[:-1]
-            return int(value)
-        except (AttributeError, ValueError):
-            return None
+    def _find_rating(self, name) -> int | None:
+        if self.stats_soup is None:
+            return None
+        a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+        if not isinstance(a, Tag):
+            return None
+        rating = a.find("rating")
+        if not isinstance(rating, Tag):
+            return None
+        strong = rating.find("strong")
+        if not isinstance(strong, Tag):
+            return None
+        value = strong.get_text()
+        if value[-1] == "?":
+            value = value[:-1]
+        try:
+            return int(value)
+        except ValueError:
+            return None
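The reshuffled `_find_rating` keeps the trailing-`?` handling (Lichess appends `?` to provisional ratings) but narrows the exception clause: the `isinstance` checks now cover everything the old blanket `AttributeError` did, so only the `int()` parse still needs guarding. For example:

    int("2450")   # -> 2450
    int("2450?")  # raises ValueError: invalid literal for int() with base 10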
@@ -176,5 +195,5 @@ class Pipeline(BasePipeline):
     def get_fetcher(self, session: aiohttp.ClientSession):
         return Fetcher(session)

-    def get_extractor(self, fetcher: Fetcher, username: str):
+    def get_extractor(self, fetcher: BaseFetcher, username: str):
         return Extractor(fetcher, username)

app/pipeline.py

@@ -1,16 +1,11 @@
 import asyncio
-import enum
 import os.path
-from typing import Any, List, Tuple, Union
+from typing import Any, List, Tuple

 import aiohttp

-from app.database import Row, upsert_row
+from app.database import Row, RowKey, upsert_row
+from app.site import Site

-
-class Site(enum.Enum):
-    CHESSCOM = "chesscom"
-    LICHESS = "lichess"

 class Fetcher:
@@ -48,7 +43,7 @@ class Fetcher:
     def path_page_file(self, page_no: int):
         return os.path.join(self.path_pages_dir(), f"{page_no}.txt")

-    async def fetch(self, url: str) -> Tuple[Union[str, None], int]:
+    async def fetch(self, url: str) -> Tuple[str | None, int]:
         """Make network requests using the internal session.

         @param url
@@ -63,7 +58,7 @@ class Fetcher:
                 return await response.text(), 200
             return None, response.status

-    async def scrape_usernames(self, page_no: int) -> Union[List[str], None]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         """Source the specified site for all coach usernames.

         All pages should be downloaded at `self.path_page_file()`. Any cached
@@ -93,7 +88,7 @@ class Fetcher:
         raise NotImplementedError()

-def _insert(row: Row, key: str, value: Any):
+def _insert(row: Row, key: RowKey, value: Any):
     if value is not None:
         row[key] = value
@@ -103,19 +98,19 @@ class Extractor:
         self.fetcher = fetcher
         self.username = username

-    def get_name(self) -> Union[str, None]:
+    def get_name(self) -> str | None:
         raise NotImplementedError()

-    def get_image_url(self) -> Union[str, None]:
+    def get_image_url(self) -> str | None:
         raise NotImplementedError()

-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
         raise NotImplementedError()

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         raise NotImplementedError()

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         raise NotImplementedError()

     def extract(self) -> Row:
@@ -160,7 +155,7 @@ class Pipeline:
     async def process(self, conn, session: aiohttp.ClientSession):
         fetcher = self.get_fetcher(session)
-        queue = asyncio.Queue()
+        queue: asyncio.Queue = asyncio.Queue()

         # Create a batch of workers to process the jobs put into the queue.
         workers = []
@@ -171,14 +166,11 @@ class Pipeline:
         # Begin downloading all coach usernames and files. The workers will
         # run concurrently to extract all the relvant information and write
         page_no = 1
-        usernames = [None]
-        while len(usernames):
+        usernames: List[str] | None = [""]
+        while usernames is None or len(usernames):
             usernames = await fetcher.scrape_usernames(page_no)
             page_no += 1
-            if usernames is None:
-                usernames = [None]
-                continue
-            for username in usernames:
+            for username in usernames or []:
                 await fetcher._download_user_files(username)
                 extractor = self.get_extractor(fetcher, username)
                 queue.put_nowait((conn, extractor))
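The loop rewrite gives `usernames` one consistent type. It previously started life as `[None]` (inferred by mypy as a list of `None`) and was then reassigned to `List[str]` or `None`, which cannot type-check. The new version declares `List[str] | None` up front, uses a throwaway non-empty list as a sentinel to enter the loop, treats a `None` result as "move on to the next page", and an empty list as "stop". A self-contained toy version with a stubbed scraper:

    import asyncio
    from typing import List


    async def scrape_usernames(page_no: int) -> List[str] | None:
        # Stub: page 1 yields names, page 2 fails (None), later pages are empty.
        return {1: ["alice", "bob"], 2: None}.get(page_no, [])


    async def main() -> None:
        page_no = 1
        usernames: List[str] | None = [""]  # non-empty sentinel to enter the loop
        while usernames is None or len(usernames):
            usernames = await scrape_usernames(page_no)
            page_no += 1
            for username in usernames or []:  # body skipped when the page was None
                print("processing", username)


    asyncio.run(main())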

app/site.py (new file)

@@ -0,0 +1,6 @@
+import enum
+
+
+class Site(enum.Enum):
+    CHESSCOM = "chesscom"
+    LICHESS = "lichess"
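Moving `Site` into its own leaf module is what makes the rest of the commit hang together without an import cycle: app/database.py now annotates `site: Site`, and app/pipeline.py already imports `Row`, `RowKey`, and `upsert_row` from app/database.py, so leaving `Site` in app/pipeline.py would have made the two modules import each other. Usage is unchanged:

    from app.site import Site

    assert Site.CHESSCOM.value == "chesscom"
    assert Site("lichess") is Site.LICHESS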

flake.nix

@@ -58,6 +58,27 @@
         })
       ) pypkgs-build-requirements
     );
+    types = with pkgs.python311Packages; {
+      beautifulsoup4 = buildPythonPackage rec {
+        pname = "types-beautifulsoup4";
+        version = "4.12.0.7";
+        src = pkgs.fetchPypi {
+          inherit pname version;
+          sha256 = "sha256-WZgAKNKb9V0Ns1nvowW3W6zwy5Lj8/az/UCPJTHfJ0w";
+        };
+        doCheck = false;
+      };
+      psycopg2 = buildPythonPackage rec {
+        pname = "types-psycopg2";
+        version = "2.9.21.19";
+        src = pkgs.fetchPypi {
+          inherit pname version;
+          sha256 = "sha256-7DquUi3enEEUFZe8QRI7TJVftAk7H8fsbuYHeVoKCI8=";
+        };
+        doCheck = false;
+      };
+    };
   in
   {
     packages = {

@@ -88,6 +109,9 @@
         pyls-isort
         python-lsp-black
         python-lsp-server
+        types.beautifulsoup4
+        types.psycopg2
+        typing-extensions
       ]);
     };
   });

poetry.lock (generated)

@@ -449,42 +449,6 @@ files = [
     {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
 ]

-[[package]]
-name = "types-beautifulsoup4"
-version = "4.12.0.7"
-description = "Typing stubs for beautifulsoup4"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "types-beautifulsoup4-4.12.0.7.tar.gz", hash = "sha256:59980028d29bf55d0db359efa305b75bacf0cb92e3f3f6b3fd408f2531df274c"},
-    {file = "types_beautifulsoup4-4.12.0.7-py3-none-any.whl", hash = "sha256:8b03b054cb2e62abf82bbbeda57a07257026f4ed9010ef17d8f8eff43bb1f9b7"},
-]
-
-[package.dependencies]
-types-html5lib = "*"
-
-[[package]]
-name = "types-html5lib"
-version = "1.1.11.15"
-description = "Typing stubs for html5lib"
-optional = false
-python-versions = "*"
-files = [
-    {file = "types-html5lib-1.1.11.15.tar.gz", hash = "sha256:80e1a2062d22a3affe5c28d97da30bffbf3a076d393c80fc6f1671216c1bd492"},
-    {file = "types_html5lib-1.1.11.15-py3-none-any.whl", hash = "sha256:16fe936d99b9f7fc210e2e21a2aed1b6bbbc554ad8242a6ef75f6f2bddb27e58"},
-]
-
-[[package]]
-name = "typing-extensions"
-version = "4.8.0"
-description = "Backported and Experimental Type Hints for Python 3.8+"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"},
-    {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"},
-]
-
 [[package]]
 name = "yarl"
 version = "1.9.3"

@@ -591,4 +555,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "9e4078c4f5eeffbc90b895528738c457dadd671a784ab8c411a5c3fe91925e34"
+content-hash = "4a667c2246018c259f23d7fe07102b3b680693beccbc685467b1e29c2c3e2db6"

pyproject.toml

@@ -12,13 +12,13 @@ aiohttp = "^3.8.6"
 lxml = "^4.9.3"
 psycopg2 = "^2.9.9"

-[tool.poetry.group.dev.dependencies]
-types-beautifulsoup4 = "^4.12.0.7"
-typing-extensions = "^4.8.0"
-
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

 [tool.poetry.scripts]
 app = "app.__main__:main"
+
+[[tool.mypy.overrides]]
+module = "aiohttp"
+ignore_missing_imports = true
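The `[[tool.mypy.overrides]]` table scopes `ignore_missing_imports` to the aiohttp module alone, which is gentler than enabling the flag globally: only missing-stub errors on `import aiohttp` are silenced, and every other dependency stays fully checked. Without the override, the per-import alternative would be an inline suppression (the exact error code varies across mypy versions):

    import aiohttp  # type: ignore[import]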