Fix all mypy warnings. (#9)
parent 8d7f1e7c4a
commit 82dbef21b6
app/__main__.py
@@ -8,7 +8,7 @@ import psycopg2
 from app.chesscom import Pipeline as ChesscomPipeline
 from app.database import backup_database
 from app.lichess import Pipeline as LichessPipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of parallel extraction jobs that are run at a time.
 WORKER_COUNT = 10
app/chesscom.py
@@ -2,15 +2,15 @@ import asyncio
 import json
 import os
 import os.path
-from typing import List, Union
+from typing import List

 import aiohttp
-from bs4 import BeautifulSoup, SoupStrainer
+from bs4 import BeautifulSoup, SoupStrainer, Tag

 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of coach listing pages we will at most iterate through. This number
 # was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
@@ -25,7 +25,7 @@ class Fetcher(BaseFetcher):
     def __init__(self, session: aiohttp.ClientSession):
         super().__init__(site=Site.CHESSCOM, session=session)

-    async def scrape_usernames(self, page_no: int) -> List[str]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         if page_no > MAX_PAGES:
             return []

@@ -100,15 +100,16 @@ class Fetcher(BaseFetcher):
             f.write(response)


-def _profile_filter(elem, attrs):
+def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "profile-header-info" in attrs.get("class", ""):
         return True
     if "profile-card-info" in attrs.get("class", ""):
         return True
     return False


 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: BaseFetcher, username: str):
         super().__init__(fetcher, username)

         self.profile_soup = None
@@ -131,29 +132,37 @@ class Extractor(BaseExtractor):
         except FileNotFoundError:
             pass

-    def get_name(self) -> Union[str, None]:
-        try:
-            name = self.profile_soup.find("div", class_="profile-card-name")
-            return name.get_text().strip()
-        except AttributeError:
-            return None
+    def get_name(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        name = self.profile_soup.find("div", class_="profile-card-name")
+        if not isinstance(name, Tag):
+            return None
+        return name.get_text().strip()

-    def get_image_url(self) -> Union[str, None]:
-        try:
-            div = self.profile_soup.find("div", class_="profile-header-avatar")
-            src = div.find("img").get("src", "")
-            if "images.chesscomfiles.com" in src:
-                return src
-        except AttributeError:
-            return None
+    def get_image_url(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        div = self.profile_soup.find("div", class_="profile-header-avatar")
+        if not isinstance(div, Tag):
+            return None
+        img = div.find("img")
+        if not isinstance(img, Tag):
+            return None
+        src = img.get("src", "")
+        if not isinstance(src, str):
+            return None
+        if "images.chesscomfiles.com" not in src:
+            return None
+        return src

-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
         return self.stats_json.get("rapid", {}).get("rating")

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         return self.stats_json.get("lightning", {}).get("rating")

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         return self.stats_json.get("bullet", {}).get("rating")
@@ -161,5 +170,5 @@ class Pipeline(BasePipeline):
     def get_fetcher(self, session: aiohttp.ClientSession):
         return Fetcher(session)

-    def get_extractor(self, fetcher: Fetcher, username: str):
+    def get_extractor(self, fetcher: BaseFetcher, username: str):
         return Extractor(fetcher, username)
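The chess.com extractor rewrite above swaps `try/except AttributeError` for explicit narrowing. With the types-beautifulsoup4 stubs (added to the dev environment later in this commit), `find()` is typed as returning `Tag | NavigableString | None`, so mypy rejects chained lookups and attribute access until the result is narrowed to `Tag`. A minimal self-contained sketch of the pattern, using a hypothetical `image_src` helper that is not part of the commit:

from bs4 import BeautifulSoup, Tag

def image_src(html: str) -> str | None:
    soup = BeautifulSoup(html, "lxml")
    div = soup.find("div", class_="profile-header-avatar")
    if not isinstance(div, Tag):  # find() may also yield NavigableString or None
        return None
    img = div.find("img")
    if not isinstance(img, Tag):
        return None
    src = img.get("src", "")  # attribute values are typed str | list[str] | None
    if not isinstance(src, str):
        return None
    return src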
app/database.py
@@ -1,12 +1,26 @@
 import sys
 from datetime import datetime
+from typing import Literal

 from typing_extensions import TypedDict

+from app.site import Site
+
 SCHEMA_NAME = "coach_scraper"
 TABLE_NAME = "export"


+RowKey = (
+    Literal["site"]
+    | Literal["username"]
+    | Literal["name"]
+    | Literal["image_url"]
+    | Literal["rapid"]
+    | Literal["blitz"]
+    | Literal["bullet"]
+)
+
+
 class Row(TypedDict, total=False):
     """Representation of a row of the export table.
@@ -14,7 +28,7 @@ class Row(TypedDict, total=False):
     """

     # Website the given coach was sourced from.
-    site: str
+    site: Site
     # Username used on the source site.
     username: str
     # Real name.
@@ -110,4 +124,5 @@ def upsert_row(conn, row: Row):
         )
         conn.commit()
     finally:
-        cursor.close()
+        if cursor:
+            cursor.close()
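`RowKey` enumerates the export table's columns as `Literal` types, which is what lets `_insert` (updated in app/pipeline.py further down) assign `row[key]` on a `TypedDict`: mypy rejects indexing a TypedDict with an arbitrary `str`, but accepts a union of known literals and checks the value against each possible field. The `if cursor:` guard similarly satisfies mypy in the `finally` block, presumably because `cursor` starts out as None. A condensed sketch; the multi-key `Literal[...]` spelling is equivalent to the diff's union of single-key `Literal`s, and `site` is abbreviated to `str` here where the real module uses the `Site` enum:

from typing import Any, Literal

from typing_extensions import TypedDict

RowKey = Literal["site", "username", "name", "image_url", "rapid", "blitz", "bullet"]

class Row(TypedDict, total=False):
    site: str  # Site enum in the real module
    username: str
    name: str
    image_url: str
    rapid: int
    blitz: int
    bullet: int

def _insert(row: Row, key: RowKey, value: Any) -> None:
    if value is not None:
        row[key] = value  # OK: every possible key is a known Row field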
app/lichess.py
@@ -1,15 +1,15 @@
 import asyncio
 import os
 import os.path
-from typing import List, Union
+from typing import List

 import aiohttp
-from bs4 import BeautifulSoup, SoupStrainer
+from bs4 import BeautifulSoup, SoupStrainer, Tag

 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.pipeline import Site
+from app.site import Site

 # The number of pages we will at most iterate through. This number was
 # determined by going to https://lichess.org/coach/all/all/alphabetical
@@ -24,7 +24,7 @@ class Fetcher(BaseFetcher):
     def __init__(self, session: aiohttp.ClientSession):
         super().__init__(site=Site.LICHESS, session=session)

-    async def scrape_usernames(self, page_no: int) -> List[str]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         if page_no > MAX_PAGES:
             return []

@@ -96,20 +96,22 @@ class Fetcher(BaseFetcher):
             f.write(response)


-def _profile_filter(elem, attrs):
+def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "coach-widget" in attrs.get("class", ""):
         return True
     return False


-def _stats_filter(elem, attrs):
+def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
     if "profile-side" in attrs.get("class", ""):
         return True
     if "sub-ratings" in attrs.get("class", ""):
         return True
     return False


 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: BaseFetcher, username: str):
         super().__init__(fetcher, username)

         self.profile_soup = None
@@ -132,43 +134,60 @@ class Extractor(BaseExtractor):
         except FileNotFoundError:
             pass

-    def get_name(self) -> Union[str, None]:
-        try:
-            profile_side = self.stats_soup.find("div", class_="profile-side")
-            user_infos = profile_side.find("div", class_="user-infos")
-            name = user_infos.find("strong", class_="name")
-            return name.get_text().strip()
-        except AttributeError:
-            return None
+    def get_name(self) -> str | None:
+        if self.stats_soup is None:
+            return None
+        profile_side = self.stats_soup.find("div", class_="profile-side")
+        if not isinstance(profile_side, Tag):
+            return None
+        user_infos = profile_side.find("div", class_="user-infos")
+        if not isinstance(user_infos, Tag):
+            return None
+        name = user_infos.find("strong", class_="name")
+        if not isinstance(name, Tag):
+            return None
+        return name.get_text().strip()

-    def get_image_url(self) -> Union[str, None]:
-        try:
-            picture = self.profile_soup.find("img", class_="picture")
-            src = picture.get("src", "")
-            if "image.lichess1.org" in src:
-                return src
-        except AttributeError:
-            return None
+    def get_image_url(self) -> str | None:
+        if self.profile_soup is None:
+            return None
+        picture = self.profile_soup.find("img", class_="picture")
+        if not isinstance(picture, Tag):
+            return None
+        src = picture.get("src", "")
+        if not isinstance(src, str):
+            return None
+        if "image.lichess1.org" not in src:
+            return None
+        return src

-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
         return self._find_rating("rapid")

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         return self._find_rating("blitz")

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         return self._find_rating("bullet")

-    def _find_rating(self, name) -> Union[int, None]:
-        try:
-            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-            rating = a.find("rating")
-            strong = rating.find("strong")
-            value = strong.get_text()
-            if value[-1] == "?":
-                value = value[:-1]
-            return int(value)
-        except (AttributeError, ValueError):
-            return None
+    def _find_rating(self, name) -> int | None:
+        if self.stats_soup is None:
+            return None
+        a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+        if not isinstance(a, Tag):
+            return None
+        rating = a.find("rating")
+        if not isinstance(rating, Tag):
+            return None
+        strong = rating.find("strong")
+        if not isinstance(strong, Tag):
+            return None
+        value = strong.get_text()
+        if value[-1] == "?":
+            value = value[:-1]
+        try:
+            return int(value)
+        except ValueError:
+            return None
@@ -176,5 +195,5 @@ class Pipeline(BasePipeline):
     def get_fetcher(self, session: aiohttp.ClientSession):
         return Fetcher(session)

-    def get_extractor(self, fetcher: Fetcher, username: str):
+    def get_extractor(self, fetcher: BaseFetcher, username: str):
         return Extractor(fetcher, username)
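The lichess `_find_rating` gets the same treatment: every `find()` result is narrowed to `Tag` before the next lookup, and the `try` shrinks to just the `int()` conversion, so only `ValueError` needs catching. The trailing `?` strip handles lichess's provisional-rating markup. A tiny sketch of that parse step, with a hypothetical `parse_rating` helper; `endswith` also tolerates an empty string, which the diff's `value[-1]` indexing would not:

def parse_rating(value: str) -> int | None:
    # Lichess renders provisional ratings with a trailing "?", e.g. "2150?".
    if value.endswith("?"):
        value = value[:-1]
    try:
        return int(value)
    except ValueError:
        return None

assert parse_rating("2150?") == 2150
assert parse_rating("abc") is None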
app/pipeline.py
@@ -1,16 +1,11 @@
 import asyncio
-import enum
 import os.path
-from typing import Any, List, Tuple, Union
+from typing import Any, List, Tuple

 import aiohttp

-from app.database import Row, upsert_row
-
-
-class Site(enum.Enum):
-    CHESSCOM = "chesscom"
-    LICHESS = "lichess"
+from app.database import Row, RowKey, upsert_row
+from app.site import Site


 class Fetcher:
@@ -48,7 +43,7 @@ class Fetcher:
     def path_page_file(self, page_no: int):
         return os.path.join(self.path_pages_dir(), f"{page_no}.txt")

-    async def fetch(self, url: str) -> Tuple[Union[str, None], int]:
+    async def fetch(self, url: str) -> Tuple[str | None, int]:
         """Make network requests using the internal session.

         @param url
@@ -63,7 +58,7 @@ class Fetcher:
                 return await response.text(), 200
             return None, response.status

-    async def scrape_usernames(self, page_no: int) -> Union[List[str], None]:
+    async def scrape_usernames(self, page_no: int) -> List[str] | None:
         """Source the specified site for all coach usernames.

         All pages should be downloaded at `self.path_page_file()`. Any cached
@@ -93,7 +88,7 @@ class Fetcher:
         raise NotImplementedError()


-def _insert(row: Row, key: str, value: Any):
+def _insert(row: Row, key: RowKey, value: Any):
     if value is not None:
         row[key] = value

@@ -103,19 +98,19 @@ class Extractor:
         self.fetcher = fetcher
         self.username = username

-    def get_name(self) -> Union[str, None]:
+    def get_name(self) -> str | None:
         raise NotImplementedError()

-    def get_image_url(self) -> Union[str, None]:
+    def get_image_url(self) -> str | None:
         raise NotImplementedError()

-    def get_rapid(self) -> Union[int, None]:
+    def get_rapid(self) -> int | None:
         raise NotImplementedError()

-    def get_blitz(self) -> Union[int, None]:
+    def get_blitz(self) -> int | None:
         raise NotImplementedError()

-    def get_bullet(self) -> Union[int, None]:
+    def get_bullet(self) -> int | None:
         raise NotImplementedError()

     def extract(self) -> Row:
@@ -160,7 +155,7 @@ class Pipeline:
     async def process(self, conn, session: aiohttp.ClientSession):
         fetcher = self.get_fetcher(session)

-        queue = asyncio.Queue()
+        queue: asyncio.Queue = asyncio.Queue()

         # Create a batch of workers to process the jobs put into the queue.
         workers = []
@@ -171,14 +166,11 @@ class Pipeline:
         # Begin downloading all coach usernames and files. The workers will
         # run concurrently to extract all the relvant information and write
         page_no = 1
-        usernames = [None]
-        while len(usernames):
+        usernames: List[str] | None = [""]
+        while usernames is None or len(usernames):
             usernames = await fetcher.scrape_usernames(page_no)
             page_no += 1
-            if usernames is None:
-                usernames = [None]
-                continue
-            for username in usernames:
+            for username in usernames or []:
                 await fetcher._download_user_files(username)
                 extractor = self.get_extractor(fetcher, username)
                 queue.put_nowait((conn, extractor))
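Two pipeline fixes are easy to miss. A bare `asyncio.Queue()` gives mypy nothing to infer, hence the `queue: asyncio.Queue` annotation, and the old `usernames = [None]` sentinel cannot satisfy `List[str]`, so the loop is reseeded with `[""]`, a value that is overwritten by the first `scrape_usernames` call before it is ever iterated. A runnable sketch of the resulting loop shape, with a hypothetical `scrape_page` standing in for `fetcher.scrape_usernames` and a parameterized queue (stricter than the diff's bare annotation):

import asyncio
from typing import List

async def scrape_page(page_no: int) -> List[str] | None:
    # Stand-in: usernames for the page, [] past the last page, None on failure.
    return [f"coach{page_no}"] if page_no <= 3 else []

async def drive(queue: asyncio.Queue[str]) -> None:
    page_no = 1
    usernames: List[str] | None = [""]  # non-empty seed; never itself processed
    while usernames is None or len(usernames):
        usernames = await scrape_page(page_no)
        page_no += 1
        for username in usernames or []:  # None (a failed page) enqueues nothing
            queue.put_nowait(username)

asyncio.run(drive(asyncio.Queue()))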
app/site.py
@@ -0,0 +1,6 @@
+import enum
+
+
+class Site(enum.Enum):
+    CHESSCOM = "chesscom"
+    LICHESS = "lichess"
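app/site.py is new, and extracting `Site` from app/pipeline.py is presumably what keeps the imports acyclic: app/database.py now wants `Site` for its `site: Site` field, while app/pipeline.py already imports `Row` from app/database.py, so both lean on this dependency-free leaf module instead. A trivial usage check:

from app.site import Site

assert Site.CHESSCOM.value == "chesscom"
assert Site.LICHESS.value == "lichess"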
flake.nix
@@ -58,6 +58,27 @@
           })
         ) pypkgs-build-requirements
       );

+      types = with pkgs.python311Packages; {
+        beautifulsoup4 = buildPythonPackage rec {
+          pname = "types-beautifulsoup4";
+          version = "4.12.0.7";
+          src = pkgs.fetchPypi {
+            inherit pname version;
+            sha256 = "sha256-WZgAKNKb9V0Ns1nvowW3W6zwy5Lj8/az/UCPJTHfJ0w";
+          };
+          doCheck = false;
+        };
+        psycopg2 = buildPythonPackage rec {
+          pname = "types-psycopg2";
+          version = "2.9.21.19";
+          src = pkgs.fetchPypi {
+            inherit pname version;
+            sha256 = "sha256-7DquUi3enEEUFZe8QRI7TJVftAk7H8fsbuYHeVoKCI8=";
+          };
+          doCheck = false;
+        };
+      };
     in
     {
       packages = {
@@ -88,6 +109,9 @@
             pyls-isort
             python-lsp-black
             python-lsp-server
+            types.beautifulsoup4
+            types.psycopg2
+            typing-extensions
           ]);
         };
       });
poetry.lock
@@ -449,42 +449,6 @@ files = [
     {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
 ]

-[[package]]
-name = "types-beautifulsoup4"
-version = "4.12.0.7"
-description = "Typing stubs for beautifulsoup4"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "types-beautifulsoup4-4.12.0.7.tar.gz", hash = "sha256:59980028d29bf55d0db359efa305b75bacf0cb92e3f3f6b3fd408f2531df274c"},
-    {file = "types_beautifulsoup4-4.12.0.7-py3-none-any.whl", hash = "sha256:8b03b054cb2e62abf82bbbeda57a07257026f4ed9010ef17d8f8eff43bb1f9b7"},
-]
-
-[package.dependencies]
-types-html5lib = "*"
-
-[[package]]
-name = "types-html5lib"
-version = "1.1.11.15"
-description = "Typing stubs for html5lib"
-optional = false
-python-versions = "*"
-files = [
-    {file = "types-html5lib-1.1.11.15.tar.gz", hash = "sha256:80e1a2062d22a3affe5c28d97da30bffbf3a076d393c80fc6f1671216c1bd492"},
-    {file = "types_html5lib-1.1.11.15-py3-none-any.whl", hash = "sha256:16fe936d99b9f7fc210e2e21a2aed1b6bbbc554ad8242a6ef75f6f2bddb27e58"},
-]
-
-[[package]]
-name = "typing-extensions"
-version = "4.8.0"
-description = "Backported and Experimental Type Hints for Python 3.8+"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"},
-    {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"},
-]
-
 [[package]]
 name = "yarl"
 version = "1.9.3"
@@ -591,4 +555,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "9e4078c4f5eeffbc90b895528738c457dadd671a784ab8c411a5c3fe91925e34"
+content-hash = "4a667c2246018c259f23d7fe07102b3b680693beccbc685467b1e29c2c3e2db6"
pyproject.toml
@@ -12,13 +12,13 @@ aiohttp = "^3.8.6"
 lxml = "^4.9.3"
 psycopg2 = "^2.9.9"

-[tool.poetry.group.dev.dependencies]
-types-beautifulsoup4 = "^4.12.0.7"
-typing-extensions = "^4.8.0"
-
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

 [tool.poetry.scripts]
 app = "app.__main__:main"
+
+[[tool.mypy.overrides]]
+module = "aiohttp"
+ignore_missing_imports = true