Fix all mypy warnings. (#9)

pull/10/head
Joshua Potter 2023-12-05 12:54:12 -07:00 committed by GitHub
parent 8d7f1e7c4a
commit 82dbef21b6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 153 additions and 124 deletions

View File

@ -8,7 +8,7 @@ import psycopg2
from app.chesscom import Pipeline as ChesscomPipeline
from app.database import backup_database
from app.lichess import Pipeline as LichessPipeline
from app.pipeline import Site
from app.site import Site
# The number of parallel extraction jobs that are run at a time.
WORKER_COUNT = 10

View File

@ -2,15 +2,15 @@ import asyncio
import json
import os
import os.path
from typing import List, Union
from typing import List
import aiohttp
from bs4 import BeautifulSoup, SoupStrainer
from bs4 import BeautifulSoup, SoupStrainer, Tag
from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline
from app.pipeline import Site
from app.site import Site
# The number of coach listing pages we will at most iterate through. This number
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
@ -25,7 +25,7 @@ class Fetcher(BaseFetcher):
def __init__(self, session: aiohttp.ClientSession):
super().__init__(site=Site.CHESSCOM, session=session)
async def scrape_usernames(self, page_no: int) -> List[str]:
async def scrape_usernames(self, page_no: int) -> List[str] | None:
if page_no > MAX_PAGES:
return []
@ -100,15 +100,16 @@ class Fetcher(BaseFetcher):
f.write(response)
def _profile_filter(elem, attrs):
def _profile_filter(elem: Tag | str | None, attrs=None) -> bool:
    """Decide whether an element belongs to the profile sections we keep.

    The (element, attributes) signature suggests this is a filter callback,
    presumably for the `SoupStrainer` imported by this module; the call site
    is not visible here.

    @param elem
        The element under consideration. Unused; only `attrs` is inspected.
    @param attrs
        Mapping of the element's attributes. Omitting it (or passing None)
        is treated as "no attributes".
    @return
        True if the "class" attribute mentions "profile-header-info" or
        "profile-card-info", False otherwise.
    """
    # None is the default instead of a `{}` literal so that no mutable object
    # is shared between calls.
    classes = (attrs or {}).get("class", "")
    return "profile-header-info" in classes or "profile-card-info" in classes
class Extractor(BaseExtractor):
def __init__(self, fetcher: Fetcher, username: str):
def __init__(self, fetcher: BaseFetcher, username: str):
super().__init__(fetcher, username)
self.profile_soup = None
@ -131,29 +132,37 @@ class Extractor(BaseExtractor):
except FileNotFoundError:
pass
def get_name(self) -> Union[str, None]:
try:
name = self.profile_soup.find("div", class_="profile-card-name")
return name.get_text().strip()
except AttributeError:
def get_name(self) -> str | None:
if self.profile_soup is None:
return None
def get_image_url(self) -> Union[str, None]:
try:
div = self.profile_soup.find("div", class_="profile-header-avatar")
src = div.find("img").get("src", "")
if "images.chesscomfiles.com" in src:
return src
except AttributeError:
name = self.profile_soup.find("div", class_="profile-card-name")
if not isinstance(name, Tag):
return None
return name.get_text().strip()
def get_rapid(self) -> Union[int, None]:
def get_image_url(self) -> str | None:
if self.profile_soup is None:
return None
div = self.profile_soup.find("div", class_="profile-header-avatar")
if not isinstance(div, Tag):
return None
img = div.find("img")
if not isinstance(img, Tag):
return None
src = img.get("src", "")
if not isinstance(src, str):
return None
if "images.chesscomfiles.com" not in src:
return None
return src
def get_rapid(self) -> int | None:
return self.stats_json.get("rapid", {}).get("rating")
def get_blitz(self) -> Union[int, None]:
def get_blitz(self) -> int | None:
return self.stats_json.get("lightning", {}).get("rating")
def get_bullet(self) -> Union[int, None]:
def get_bullet(self) -> int | None:
return self.stats_json.get("bullet", {}).get("rating")
@ -161,5 +170,5 @@ class Pipeline(BasePipeline):
def get_fetcher(self, session: aiohttp.ClientSession):
return Fetcher(session)
def get_extractor(self, fetcher: Fetcher, username: str):
def get_extractor(self, fetcher: BaseFetcher, username: str):
return Extractor(fetcher, username)

View File

@ -1,12 +1,26 @@
import sys
from datetime import datetime
from typing import Literal
from typing_extensions import TypedDict
from app.site import Site
SCHEMA_NAME = "coach_scraper"
TABLE_NAME = "export"
RowKey = (
Literal["site"]
| Literal["username"]
| Literal["name"]
| Literal["image_url"]
| Literal["rapid"]
| Literal["blitz"]
| Literal["bullet"]
)
class Row(TypedDict, total=False):
"""Representation of a row of the export table.
@ -14,7 +28,7 @@ class Row(TypedDict, total=False):
"""
# Website the given coach was sourced from.
site: str
site: Site
# Username used on the source site.
username: str
# Real name.
@ -110,4 +124,5 @@ def upsert_row(conn, row: Row):
)
conn.commit()
finally:
cursor.close()
if cursor:
cursor.close()

View File

@ -1,15 +1,15 @@
import asyncio
import os
import os.path
from typing import List, Union
from typing import List
import aiohttp
from bs4 import BeautifulSoup, SoupStrainer
from bs4 import BeautifulSoup, SoupStrainer, Tag
from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline
from app.pipeline import Site
from app.site import Site
# The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical
@ -24,7 +24,7 @@ class Fetcher(BaseFetcher):
def __init__(self, session: aiohttp.ClientSession):
super().__init__(site=Site.LICHESS, session=session)
async def scrape_usernames(self, page_no: int) -> List[str]:
async def scrape_usernames(self, page_no: int) -> List[str] | None:
if page_no > MAX_PAGES:
return []
@ -96,20 +96,22 @@ class Fetcher(BaseFetcher):
f.write(response)
def _profile_filter(elem, attrs):
def _profile_filter(elem: Tag | str | None, attrs=None) -> bool:
    """Decide whether an element belongs to the coach widget we keep.

    The (element, attributes) signature suggests this is a filter callback,
    presumably for the `SoupStrainer` imported by this module; the call site
    is not visible here.

    @param elem
        The element under consideration. Unused; only `attrs` is inspected.
    @param attrs
        Mapping of the element's attributes. Omitting it (or passing None)
        is treated as "no attributes".
    @return
        True if the "class" attribute mentions "coach-widget", False
        otherwise.
    """
    # None is the default instead of a `{}` literal so that no mutable object
    # is shared between calls.
    return "coach-widget" in (attrs or {}).get("class", "")
def _stats_filter(elem, attrs):
def _stats_filter(elem: Tag | str | None, attrs=None) -> bool:
    """Decide whether an element belongs to the stats sections we keep.

    The (element, attributes) signature suggests this is a filter callback,
    presumably for the `SoupStrainer` imported by this module; the call site
    is not visible here.

    @param elem
        The element under consideration. Unused; only `attrs` is inspected.
    @param attrs
        Mapping of the element's attributes. Omitting it (or passing None)
        is treated as "no attributes".
    @return
        True if the "class" attribute mentions "profile-side" or
        "sub-ratings", False otherwise.
    """
    # None is the default instead of a `{}` literal so that no mutable object
    # is shared between calls.
    classes = (attrs or {}).get("class", "")
    return "profile-side" in classes or "sub-ratings" in classes
class Extractor(BaseExtractor):
def __init__(self, fetcher: Fetcher, username: str):
def __init__(self, fetcher: BaseFetcher, username: str):
super().__init__(fetcher, username)
self.profile_soup = None
@ -132,43 +134,60 @@ class Extractor(BaseExtractor):
except FileNotFoundError:
pass
def get_name(self) -> Union[str, None]:
try:
profile_side = self.stats_soup.find("div", class_="profile-side")
user_infos = profile_side.find("div", class_="user-infos")
name = user_infos.find("strong", class_="name")
return name.get_text().strip()
except AttributeError:
def get_name(self) -> str | None:
if self.stats_soup is None:
return None
def get_image_url(self) -> Union[str, None]:
try:
picture = self.profile_soup.find("img", class_="picture")
src = picture.get("src", "")
if "image.lichess1.org" in src:
return src
except AttributeError:
profile_side = self.stats_soup.find("div", class_="profile-side")
if not isinstance(profile_side, Tag):
return None
user_infos = profile_side.find("div", class_="user-infos")
if not isinstance(user_infos, Tag):
return None
name = user_infos.find("strong", class_="name")
if not isinstance(name, Tag):
return None
return name.get_text().strip()
def get_rapid(self) -> Union[int, None]:
def get_image_url(self) -> str | None:
if self.profile_soup is None:
return None
picture = self.profile_soup.find("img", class_="picture")
if not isinstance(picture, Tag):
return None
src = picture.get("src", "")
if not isinstance(src, str):
return None
if "image.lichess1.org" not in src:
return None
return src
def get_rapid(self) -> int | None:
return self._find_rating("rapid")
def get_blitz(self) -> Union[int, None]:
def get_blitz(self) -> int | None:
return self._find_rating("blitz")
def get_bullet(self) -> Union[int, None]:
def get_bullet(self) -> int | None:
return self._find_rating("bullet")
def _find_rating(self, name) -> Union[int, None]:
def _find_rating(self, name) -> int | None:
if self.stats_soup is None:
return None
a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
if not isinstance(a, Tag):
return None
rating = a.find("rating")
if not isinstance(rating, Tag):
return None
strong = rating.find("strong")
if not isinstance(strong, Tag):
return None
value = strong.get_text()
if value[-1] == "?":
value = value[:-1]
try:
a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
rating = a.find("rating")
strong = rating.find("strong")
value = strong.get_text()
if value[-1] == "?":
value = value[:-1]
return int(value)
except (AttributeError, ValueError):
except ValueError:
return None
@ -176,5 +195,5 @@ class Pipeline(BasePipeline):
def get_fetcher(self, session: aiohttp.ClientSession):
return Fetcher(session)
def get_extractor(self, fetcher: Fetcher, username: str):
def get_extractor(self, fetcher: BaseFetcher, username: str):
return Extractor(fetcher, username)

View File

@ -1,16 +1,11 @@
import asyncio
import enum
import os.path
from typing import Any, List, Tuple, Union
from typing import Any, List, Tuple
import aiohttp
from app.database import Row, upsert_row
class Site(enum.Enum):
CHESSCOM = "chesscom"
LICHESS = "lichess"
from app.database import Row, RowKey, upsert_row
from app.site import Site
class Fetcher:
@ -48,7 +43,7 @@ class Fetcher:
def path_page_file(self, page_no: int):
return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
async def fetch(self, url: str) -> Tuple[Union[str, None], int]:
async def fetch(self, url: str) -> Tuple[str | None, int]:
"""Make network requests using the internal session.
@param url
@ -63,7 +58,7 @@ class Fetcher:
return await response.text(), 200
return None, response.status
async def scrape_usernames(self, page_no: int) -> Union[List[str], None]:
async def scrape_usernames(self, page_no: int) -> List[str] | None:
"""Source the specified site for all coach usernames.
All pages should be downloaded at `self.path_page_file()`. Any cached
@ -93,7 +88,7 @@ class Fetcher:
raise NotImplementedError()
def _insert(row: Row, key: str, value: Any):
def _insert(row: Row, key: RowKey, value: Any):
    """Store `value` under `key` in `row`, skipping missing values.

    @param row
        The row being populated; modified in place.
    @param key
        The column to set.
    @param value
        The value to store. A value of None leaves `row` untouched.
    """
    if value is None:
        return
    row[key] = value
@ -103,19 +98,19 @@ class Extractor:
self.fetcher = fetcher
self.username = username
def get_name(self) -> Union[str, None]:
def get_name(self) -> str | None:
raise NotImplementedError()
def get_image_url(self) -> Union[str, None]:
def get_image_url(self) -> str | None:
raise NotImplementedError()
def get_rapid(self) -> Union[int, None]:
def get_rapid(self) -> int | None:
raise NotImplementedError()
def get_blitz(self) -> Union[int, None]:
def get_blitz(self) -> int | None:
raise NotImplementedError()
def get_bullet(self) -> Union[int, None]:
def get_bullet(self) -> int | None:
raise NotImplementedError()
def extract(self) -> Row:
@ -160,7 +155,7 @@ class Pipeline:
async def process(self, conn, session: aiohttp.ClientSession):
fetcher = self.get_fetcher(session)
queue = asyncio.Queue()
queue: asyncio.Queue = asyncio.Queue()
# Create a batch of workers to process the jobs put into the queue.
workers = []
@ -171,14 +166,11 @@ class Pipeline:
# Begin downloading all coach usernames and files. The workers will
# run concurrently to extract all the relevant information and write
page_no = 1
usernames = [None]
while len(usernames):
usernames: List[str] | None = [""]
while usernames is None or len(usernames):
usernames = await fetcher.scrape_usernames(page_no)
page_no += 1
if usernames is None:
usernames = [None]
continue
for username in usernames:
for username in usernames or []:
await fetcher._download_user_files(username)
extractor = self.get_extractor(fetcher, username)
queue.put_nowait((conn, extractor))

6
app/site.py Normal file
View File

@ -0,0 +1,6 @@
import enum
@enum.unique
class Site(enum.Enum):
    """Websites that coaches are sourced from.

    Each member's value is the site's lowercase string identifier.
    `enum.unique` makes an accidental duplicate identifier fail at import
    time instead of silently aliasing an existing member.
    """

    CHESSCOM = "chesscom"
    LICHESS = "lichess"

View File

@ -58,6 +58,27 @@
})
) pypkgs-build-requirements
);
types = with pkgs.python311Packages; {
beautifulsoup4 = buildPythonPackage rec {
pname = "types-beautifulsoup4";
version = "4.12.0.7";
src = pkgs.fetchPypi {
inherit pname version;
sha256 = "sha256-WZgAKNKb9V0Ns1nvowW3W6zwy5Lj8/az/UCPJTHfJ0w";
};
doCheck = false;
};
psycopg2 = buildPythonPackage rec {
pname = "types-psycopg2";
version = "2.9.21.19";
src = pkgs.fetchPypi {
inherit pname version;
sha256 = "sha256-7DquUi3enEEUFZe8QRI7TJVftAk7H8fsbuYHeVoKCI8=";
};
doCheck = false;
};
};
in
{
packages = {
@ -88,6 +109,9 @@
pyls-isort
python-lsp-black
python-lsp-server
types.beautifulsoup4
types.psycopg2
typing-extensions
]);
};
});

38
poetry.lock generated
View File

@ -449,42 +449,6 @@ files = [
{file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
]
[[package]]
name = "types-beautifulsoup4"
version = "4.12.0.7"
description = "Typing stubs for beautifulsoup4"
optional = false
python-versions = ">=3.7"
files = [
{file = "types-beautifulsoup4-4.12.0.7.tar.gz", hash = "sha256:59980028d29bf55d0db359efa305b75bacf0cb92e3f3f6b3fd408f2531df274c"},
{file = "types_beautifulsoup4-4.12.0.7-py3-none-any.whl", hash = "sha256:8b03b054cb2e62abf82bbbeda57a07257026f4ed9010ef17d8f8eff43bb1f9b7"},
]
[package.dependencies]
types-html5lib = "*"
[[package]]
name = "types-html5lib"
version = "1.1.11.15"
description = "Typing stubs for html5lib"
optional = false
python-versions = "*"
files = [
{file = "types-html5lib-1.1.11.15.tar.gz", hash = "sha256:80e1a2062d22a3affe5c28d97da30bffbf3a076d393c80fc6f1671216c1bd492"},
{file = "types_html5lib-1.1.11.15-py3-none-any.whl", hash = "sha256:16fe936d99b9f7fc210e2e21a2aed1b6bbbc554ad8242a6ef75f6f2bddb27e58"},
]
[[package]]
name = "typing-extensions"
version = "4.8.0"
description = "Backported and Experimental Type Hints for Python 3.8+"
optional = false
python-versions = ">=3.8"
files = [
{file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"},
{file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"},
]
[[package]]
name = "yarl"
version = "1.9.3"
@ -591,4 +555,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "9e4078c4f5eeffbc90b895528738c457dadd671a784ab8c411a5c3fe91925e34"
content-hash = "4a667c2246018c259f23d7fe07102b3b680693beccbc685467b1e29c2c3e2db6"

View File

@ -12,13 +12,13 @@ aiohttp = "^3.8.6"
lxml = "^4.9.3"
psycopg2 = "^2.9.9"
[tool.poetry.group.dev.dependencies]
types-beautifulsoup4 = "^4.12.0.7"
typing-extensions = "^4.8.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.poetry.scripts]
app = "app.__main__:main"
[[tool.mypy.overrides]]
module = "aiohttp"
ignore_missing_imports = true