Generalize in anticipation of merging the lichess scraper. (#1)
* Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works.
pull/2/head
parent
3cc31f8f24
commit
10801b560c
|
@ -29,15 +29,12 @@ data
|
||||||
|
|
||||||
If you have nix available, run:
|
If you have nix available, run:
|
||||||
```bash
|
```bash
|
||||||
$> nix build
|
$> nix run . -- --user-agent <your-email> -s chesscom
|
||||||
$> result/bin/app --user-agent <your-email>
|
|
||||||
```
|
```
|
||||||
If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
|
If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
|
||||||
instead run the following:
|
instead run the following:
|
||||||
```bash
|
```bash
|
||||||
$> poetry install
|
$> poetry run python3 -m app -u <your-email> -s chesscom
|
||||||
$> source $(poetry env info --path)/bin/activate
|
|
||||||
$> python3 -m app
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
|
@ -1,4 +1,39 @@
|
||||||
from app import scraper
|
import aiohttp
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
from app.chesscom import Scraper as ChesscomScraper
|
||||||
|
from app.scraper import Site
|
||||||
|
|
||||||
|
|
||||||
|
async def run():
    """Parse the command line and run the scraper for the requested site.

    Two options are required: `-u/--user-agent`, which is embedded in the
    "User-Agent" header of every request, and `-s/--site`, which selects the
    scraper implementation to run.
    """
    cli = argparse.ArgumentParser(
        prog="coach-scraper",
        description="HTML scraping of chess.com coaches.",
    )
    cli.add_argument("-u", "--user-agent", required=True)
    supported_sites = [Site.CHESSCOM.value]
    cli.add_argument("-s", "--site", required=True, choices=supported_sites)
    options = cli.parse_args()

    request_headers = {
        "User-Agent": f"BoardWise coach-scraper ({options.user_agent})",
    }
    # A single session is shared by every request the scraper makes.
    async with aiohttp.ClientSession(headers=request_headers) as session:
        if options.site == Site.CHESSCOM.value:
            scraper = ChesscomScraper(session)

        await scraper.scrape()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Synchronous entrypoint that drives the `run` coroutine to completion."""
    entry_coroutine = run()
    asyncio.run(entry_coroutine)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
scraper.run()
|
main()
|
||||||
|
|
|
@ -0,0 +1,193 @@
|
||||||
|
import aiohttp
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import os.path
|
||||||
|
|
||||||
|
from app.scraper import AnsiColor, BaseScraper, Export, Site
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
|
# The number of coach listing pages we will at most iterate through. This number
|
||||||
|
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
|
||||||
|
# traversing to the last page.
|
||||||
|
MAX_PAGES = 64
|
||||||
|
|
||||||
|
# How long to wait between a batch of network requests.
|
||||||
|
SLEEP_SECS = 3
|
||||||
|
|
||||||
|
|
||||||
|
class Scraper(BaseScraper):
|
||||||
|
def __init__(self, session: aiohttp.ClientSession):
    """Initialize a scraper bound to the chess.com site.

    @param session
        The `aiohttp.ClientSession` context our requests are made from.
    """
    chesscom_site = Site.CHESSCOM.value
    super().__init__(site=chesscom_site, session=session)
|
||||||
|
|
||||||
|
async def download_usernames(self) -> List[str]:
    """Collect coach usernames from every chess.com coach listing page.

    Pages 1 through `MAX_PAGES` are visited in order. A page whose listing
    file already exists on disk is read from that file; otherwise the page
    is scraped, its usernames are written to the listing file, and we pause
    for `SLEEP_SECS` before moving on.

    @return
        The complete list of scraped usernames across every coach listing
        page.
    """
    collected = []
    for page_no in range(1, MAX_PAGES + 1):
        listing_path = self.path_page_file(page_no)
        try:
            with open(listing_path, "r") as cached:
                self.log(
                    [
                        (AnsiColor.INFO, "[INFO]"),
                        (None, ": Reading file "),
                        (AnsiColor.DATA, listing_path),
                    ]
                )
                collected += [entry.strip() for entry in cached.readlines()]
        except FileNotFoundError:
            # No cached listing for this page; fetch it over the network.
            scraped = await self._scrape_page(page_no)
            if not scraped:
                self.log(
                    [
                        (AnsiColor.ERROR, "[ERROR]"),
                        (None, ": Could not scrape page "),
                        (AnsiColor.DATA, str(page_no)),
                    ]
                )
                continue
            with open(listing_path, "w") as out:
                out.write("".join(f"{name}\n" for name in scraped))
            collected += scraped
            self.log(
                [
                    (AnsiColor.INFO, "[INFO]"),
                    (None, ": Downloaded page "),
                    (AnsiColor.DATA, listing_path),
                ]
            )
            # Throttle only after an actual network request.
            await asyncio.sleep(SLEEP_SECS)

    return collected
|
||||||
|
|
||||||
|
async def _scrape_page(self, page_no: int) -> List[str]:
    """Scan through chess.com/coaches/?page=<n> for all coaches' usernames.

    @param page_no
        The page consisting of at most 25 coaches (at the time of writing)
        whose usernames are to be scraped.
    @return
        The list of scraped usernames on the specified coach listing page.
        An empty list is returned when the page could not be downloaded.
    """
    url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
    response, status_code = await self.request(url)
    if response is None:
        self.log(
            [
                (AnsiColor.ERROR, "[ERROR]"),
                (None, ": Received status "),
                (AnsiColor.DATA, f"{status_code} "),
                (None, "when downloading page "),
                (AnsiColor.DATA, str(page_no)),
            ]
        )
        # Honor the declared return type; callers only test for emptiness.
        return []

    member_prefix = "https://www.chess.com/member/"
    usernames = []
    soup = BeautifulSoup(response, "html.parser")
    for member in soup.find_all("a", class_="members-categories-username"):
        href = member.get("href")
        # Skip anchors with no href or one that is not a member profile
        # link: slicing those would raise or yield a bogus username.
        if not isinstance(href, str) or not href.startswith(member_prefix):
            continue
        username = href[len(member_prefix) :]
        if username:
            usernames.append(username)

    return usernames
|
||||||
|
|
||||||
|
async def download_profile(self, username: str):
    """Download every coach-specific file for a single coach.

    Three downloads are issued concurrently:
    * the coach's profile,
    * the coach's recent activity,
    * the coach's stats.

    If at least one of them reported a network request we pause for
    `SLEEP_SECS` afterwards; otherwise the coach is reported as skipped.

    @param username
        The coach username corresponding to the downloaded files.
    """
    # (source URL, file name inside the coach's directory)
    targets = [
        (f"https://www.chess.com/member/{username}", f"{username}.html"),
        (
            f"https://www.chess.com/callback/member/activity/{username}?page=1",
            "activity.json",
        ),
        (f"https://www.chess.com/callback/member/stats/{username}", "stats.json"),
    ]
    pending = [
        self._download_profile_file(
            url=source,
            username=username,
            filename=self.path_coach_file(username, basename),
        )
        for source, basename in targets
    ]
    used_network = await asyncio.gather(*pending)

    if not any(used_network):
        self.log(
            [
                (AnsiColor.INFO, "[INFO]"),
                (None, ": Skipping download for coach "),
                (AnsiColor.DATA, username),
            ]
        )
        return

    self.log(
        [
            (AnsiColor.INFO, "[INFO]"),
            (None, ": Downloaded data for coach "),
            (AnsiColor.DATA, username),
        ]
    )
    await asyncio.sleep(SLEEP_SECS)
|
||||||
|
|
||||||
|
async def _download_profile_file(self, url: str, username: str, filename: str):
|
||||||
|
"""Writes the contents of url into the specified file.
|
||||||
|
|
||||||
|
@param url
|
||||||
|
The URL of the file to download.
|
||||||
|
@param username
|
||||||
|
The coach username corresponding to the downloaded file.
|
||||||
|
@param filename
|
||||||
|
The output file to write the downloaded content to.
|
||||||
|
@return:
|
||||||
|
True if we make a network request. False otherwise.
|
||||||
|
"""
|
||||||
|
if os.path.isfile(filename):
|
||||||
|
return False
|
||||||
|
|
||||||
|
response, _unused_status = await self.request(url)
|
||||||
|
if response is not None:
|
||||||
|
with open(filename, "w") as f:
|
||||||
|
f.write(response)
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _load_stats_json(self, stats: dict) -> Export:
|
||||||
|
"""Extract relevant fields from a `stats.json` file."""
|
||||||
|
export: Export = {}
|
||||||
|
for stat in stats.get("stats", []):
|
||||||
|
if stat["key"] == "rapid":
|
||||||
|
export["fide_rapid"] = stat["stats"]["rating"]
|
||||||
|
return export
|
||||||
|
|
||||||
|
async def export(self, username: str) -> Export:
    """Transform coach-specific data into uniform format.

    Every exported field starts out as None and is overwritten by whatever
    can be extracted from the coach's downloaded `stats.json`. A missing
    stats file is expected (the download may have failed) and leaves the
    defaults in place; an unparsable one is logged and likewise leaves the
    defaults in place so one bad file does not abort the whole run.

    @param username
        The coach whose downloaded files are to be exported.
    @return
        The export for the given coach.
    """
    result: Export = {
        "fide_rapid": None,
    }
    try:
        with open(self.path_coach_file(username, "stats.json"), "r") as f:
            result.update(self._load_stats_json(json.load(f)))
    except FileNotFoundError:
        # Nothing was downloaded for this coach; keep the defaults.
        pass
    except json.JSONDecodeError:
        self.log(
            [
                (AnsiColor.ERROR, "[ERROR]"),
                (None, ": Could not parse stats for coach "),
                (AnsiColor.DATA, username),
            ]
        )
    return result
|
277
app/scraper.py
277
app/scraper.py
|
@ -1,200 +1,119 @@
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import argparse
|
import enum
|
||||||
import asyncio
|
import json
|
||||||
import os
|
import os
|
||||||
import os.path
|
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from typing import List, Tuple, Union
|
||||||
|
from typing_extensions import TypedDict
|
||||||
|
|
||||||
|
|
||||||
# The root directory containing downloaded files for a coach.
|
class Site(enum.Enum):
|
||||||
DATA_COACH_DIR = "data/coach/{username}"
|
CHESSCOM = "chesscom"
|
||||||
|
LICHESS = "lichess"
|
||||||
# Where a part of coach-related data is stored.
|
|
||||||
DATA_COACH_FILE = "data/coach/{username}/{filename}"
|
|
||||||
|
|
||||||
# Where a part of all discovered coach usernames is stored.
|
|
||||||
DATA_COACH_LIST = "data/pages/{page_no}.txt"
|
|
||||||
|
|
||||||
# The "User-Agent" value set in every request to chess.com.
|
|
||||||
USER_AGENT = "BoardWise chesscom-scraper ({user_agent})"
|
|
||||||
|
|
||||||
# How long to wait between a batch of network requests.
|
|
||||||
SLEEP_SECS = 3
|
|
||||||
|
|
||||||
|
|
||||||
def ANSI_COLOR(s: str):
|
class AnsiColor(enum.Enum):
|
||||||
"""Print colored output to the console."""
|
ERROR = "\033[0;31m"
|
||||||
return f"\033[0;34m{s}\033[0m" # Blue
|
INFO = "\033[0;34m"
|
||||||
|
DATA = "\033[0;36m"
|
||||||
|
RESET = "\033[0m"
|
||||||
|
|
||||||
|
|
||||||
async def chesscom_request(session: aiohttp.ClientSession, url: str):
|
class Export(TypedDict, total=False):
|
||||||
"""Convenience function for network requests to chess.com.
|
fide_rapid: Union[int, None]
|
||||||
|
|
||||||
@param session
|
|
||||||
|
class BaseScraper:
|
||||||
|
def __init__(self, site: str, session: aiohttp.ClientSession):
|
||||||
|
"""Initialize a new web scraper and exporter.
|
||||||
|
|
||||||
|
@param site:
|
||||||
|
The site we are making requests out to.
|
||||||
|
@param session:
|
||||||
The `aiohttp.ClientSession` context our requests are made from.
|
The `aiohttp.ClientSession` context our requests are made from.
|
||||||
@param url
|
|
||||||
The URL to send a request to.
|
|
||||||
@return
|
|
||||||
The text response returned by the server at @url.
|
|
||||||
"""
|
"""
|
||||||
async with session.get(url) as response:
|
self.site = site
|
||||||
|
self.session = session
|
||||||
|
|
||||||
|
async def download_usernames(self) -> List[str]:
|
||||||
|
"""Collect all coach usernames from the specified site."""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
async def download_profile(self, username: str):
|
||||||
|
"""For each coach, download coach-specific data."""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
async def export(self, username: str) -> Export:
|
||||||
|
"""Transform coach-specific data into uniform format."""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
async def request(self, url: str) -> Tuple[Union[str, None], int]:
|
||||||
|
"""Make network requests using the internal session.
|
||||||
|
|
||||||
|
@param url
|
||||||
|
The URL to make a GET request to.
|
||||||
|
@return
|
||||||
|
Tuple containing the response body (if the request was successful)
|
||||||
|
and status code.
|
||||||
|
"""
|
||||||
|
async with self.session.get(url) as response:
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
return await response.text()
|
return await response.text(), 200
|
||||||
print(f"Encountered {response.status} when retrieving {url}.")
|
return None, response.status
|
||||||
|
|
||||||
|
async def scrape(self):
|
||||||
|
"""Main entrypoint for scraping and exporting downloaded content.
|
||||||
|
|
||||||
async def _scrape_page_coach_usernames(session: aiohttp.ClientSession, page_no: int):
|
A `Scraper` is structured to operate in the following stages:
|
||||||
"""Scan through chess.com/coaches/?page=<n> for all coaches' usernames.
|
|
||||||
|
|
||||||
@param session
|
1. Collect all coach usernames from the specified site.
|
||||||
The `aiohttp.ClientSession` context our requests are made from.
|
2. For each coach, download coach-specific data.
|
||||||
@param page_no
|
3. Transform this data and export into uniform format.
|
||||||
The page consisting of at most 25 coaches (at the time of writing)
|
|
||||||
whose usernames are to be scraped.
|
|
||||||
@return
|
|
||||||
The list of scraped usernames on the specified coach listing page.
|
|
||||||
"""
|
"""
|
||||||
url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
|
os.makedirs(self.path_coaches_dir(), exist_ok=True)
|
||||||
response = await chesscom_request(session, url)
|
os.makedirs(self.path_pages_dir(), exist_ok=True)
|
||||||
if response is None:
|
usernames = await self.download_usernames()
|
||||||
return
|
for username in usernames:
|
||||||
|
os.makedirs(self.path_coach_dir(username), exist_ok=True)
|
||||||
|
await self.download_profile(username)
|
||||||
|
|
||||||
usernames = []
|
export = await self.export(username)
|
||||||
soup = BeautifulSoup(response, "html.parser")
|
with open(self.path_coach_file(username, "export.json"), "w") as f:
|
||||||
members = soup.find_all("a", class_="members-categories-username")
|
json.dump(export, f)
|
||||||
for member in members:
|
self.log(
|
||||||
href = member.get("href")
|
[
|
||||||
username = href[len("https://www.chess.com/member/") :]
|
(AnsiColor.INFO, "[INFO]"),
|
||||||
usernames.append(username)
|
(None, ": Finished exporting "),
|
||||||
|
(AnsiColor.DATA, username),
|
||||||
return usernames
|
]
|
||||||
|
|
||||||
|
|
||||||
async def _scrape_all_coach_usernames(
|
|
||||||
session: aiohttp.ClientSession, max_pages: int = 64
|
|
||||||
):
|
|
||||||
"""Scan through chess.com/coaches for all coaches' usernames.
|
|
||||||
|
|
||||||
@param session
|
|
||||||
The `aiohttp.ClientSession` context our requests are made from.
|
|
||||||
@param max_pages
|
|
||||||
The number of pages we will at most iterate through. This number was
|
|
||||||
determined by going to chess.com/coaches?sortBy=alphabetical&page=1
|
|
||||||
and traversing to the last page.
|
|
||||||
@return
|
|
||||||
The complete list of scraped usernames across every coach listing page.
|
|
||||||
"""
|
|
||||||
usernames = []
|
|
||||||
for page_no in range(1, max_pages + 1):
|
|
||||||
filepath = DATA_COACH_LIST.format(page_no=page_no)
|
|
||||||
try:
|
|
||||||
with open(filepath, "r") as f:
|
|
||||||
usernames.extend(f.readlines())
|
|
||||||
print(f"Skipping {ANSI_COLOR(filepath)}")
|
|
||||||
except FileNotFoundError:
|
|
||||||
page_usernames = await _scrape_page_coach_usernames(session, page_no)
|
|
||||||
if not page_usernames:
|
|
||||||
print(f"Could not write {ANSI_COLOR(filepath)}")
|
|
||||||
continue
|
|
||||||
with open(filepath, "w") as f:
|
|
||||||
for username in page_usernames:
|
|
||||||
f.write(f"{username}\n")
|
|
||||||
usernames.extend(page_usernames)
|
|
||||||
print(f"Downloaded {ANSI_COLOR(filepath)}")
|
|
||||||
await asyncio.sleep(SLEEP_SECS)
|
|
||||||
|
|
||||||
return usernames
|
|
||||||
|
|
||||||
|
|
||||||
async def _download_coach_file(
|
|
||||||
session: aiohttp.ClientSession, url: str, username: str, filename: str
|
|
||||||
):
|
|
||||||
"""Writes the contents of @url into `DATA_COACH_FILE`.
|
|
||||||
|
|
||||||
@param session
|
|
||||||
The `aiohttp.ClientSession` context our requests are made from.
|
|
||||||
@param url
|
|
||||||
The URL of the file to download.
|
|
||||||
@param username
|
|
||||||
The coach username corresponding to the downloaded file.
|
|
||||||
@param filename
|
|
||||||
The output file to write the downloaded content to.
|
|
||||||
@return:
|
|
||||||
True if we make a network request. False otherwise.
|
|
||||||
"""
|
|
||||||
filepath = DATA_COACH_FILE.format(username=username, filename=filename)
|
|
||||||
if os.path.isfile(filepath):
|
|
||||||
return False
|
|
||||||
|
|
||||||
response = await chesscom_request(session, url)
|
|
||||||
if response is not None:
|
|
||||||
with open(filepath, "w") as f:
|
|
||||||
f.write(response)
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
async def _download_coach_data(session: aiohttp.ClientSession, username: str):
|
|
||||||
"""Download coach-related data to the `DATA_COACH_DIR` directory.
|
|
||||||
|
|
||||||
This sends three parallel requests for:
|
|
||||||
* the coach's profile,
|
|
||||||
* the coach's recent activity,
|
|
||||||
* the coach's stats.
|
|
||||||
|
|
||||||
@param session
|
|
||||||
The `aiohttp.ClientSession` context our requests are made from.
|
|
||||||
@param username
|
|
||||||
The coach username corresponding to the downloaded files.
|
|
||||||
"""
|
|
||||||
used_network = await asyncio.gather(
|
|
||||||
_download_coach_file(
|
|
||||||
session,
|
|
||||||
url=f"https://www.chess.com/member/{username}",
|
|
||||||
username=username,
|
|
||||||
filename=f"{username}.html",
|
|
||||||
),
|
|
||||||
_download_coach_file(
|
|
||||||
session,
|
|
||||||
url=f"https://www.chess.com/callback/member/activity/{username}?page=1",
|
|
||||||
username=username,
|
|
||||||
filename="activity.json",
|
|
||||||
),
|
|
||||||
_download_coach_file(
|
|
||||||
session,
|
|
||||||
url=f"https://www.chess.com/callback/member/stats/{username}",
|
|
||||||
username=username,
|
|
||||||
filename="stats.json",
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
if any(used_network):
|
|
||||||
print(f"Downloaded {ANSI_COLOR(username)}")
|
def path_coaches_dir(self):
|
||||||
await asyncio.sleep(SLEEP_SECS)
|
"""The root directory for all coach-related downloads."""
|
||||||
|
return os.path.join("data", self.site, "coaches")
|
||||||
|
|
||||||
|
def path_coach_dir(self, username: str):
|
||||||
|
"""The root directory for a specific coach's downloads."""
|
||||||
|
return os.path.join(self.path_coaches_dir(), username)
|
||||||
|
|
||||||
|
def path_coach_file(self, username: str, filename: str):
|
||||||
|
"""Path to a coach-specific file download."""
|
||||||
|
return os.path.join(self.path_coach_dir(username), filename)
|
||||||
|
|
||||||
|
def path_pages_dir(self):
|
||||||
|
"""The root directory for all username listing files."""
|
||||||
|
return os.path.join("data", self.site, "pages")
|
||||||
|
|
||||||
|
def path_page_file(self, page_no: int):
|
||||||
|
"""The root directory for usernames scraped from a single page."""
|
||||||
|
return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
|
||||||
|
|
||||||
|
def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
|
||||||
|
transformed = []
|
||||||
|
for k, v in msgs:
|
||||||
|
if k is None:
|
||||||
|
transformed.append(v)
|
||||||
else:
|
else:
|
||||||
print(f"Skipping {ANSI_COLOR(username)}")
|
transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
|
||||||
|
|
||||||
|
print("".join(transformed))
|
||||||
async def _scrape():
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
prog="chesscom-scraper",
|
|
||||||
description="HTML scraping of chess.com coaches.",
|
|
||||||
)
|
|
||||||
parser.add_argument("-u", "--user-agent", required=True)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
os.makedirs("data/pages", exist_ok=True)
|
|
||||||
os.makedirs("data/coach", exist_ok=True)
|
|
||||||
|
|
||||||
async with aiohttp.ClientSession(
|
|
||||||
headers={"User-Agent": USER_AGENT.format(user_agent=args.user_agent)}
|
|
||||||
) as session:
|
|
||||||
# Retrieve all coaches on the platform.
|
|
||||||
usernames = await _scrape_all_coach_usernames(session)
|
|
||||||
# For each coach, download relevant data.
|
|
||||||
for username in [u.strip() for u in usernames]:
|
|
||||||
os.makedirs(DATA_COACH_DIR.format(username=username), exist_ok=True)
|
|
||||||
await _download_coach_data(session, username)
|
|
||||||
|
|
||||||
|
|
||||||
def run():
|
|
||||||
asyncio.run(_scrape())
|
|
||||||
|
|
|
@ -345,6 +345,17 @@ files = [
|
||||||
{file = "types_html5lib-1.1.11.15-py3-none-any.whl", hash = "sha256:16fe936d99b9f7fc210e2e21a2aed1b6bbbc554ad8242a6ef75f6f2bddb27e58"},
|
{file = "types_html5lib-1.1.11.15-py3-none-any.whl", hash = "sha256:16fe936d99b9f7fc210e2e21a2aed1b6bbbc554ad8242a6ef75f6f2bddb27e58"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "typing-extensions"
|
||||||
|
version = "4.8.0"
|
||||||
|
description = "Backported and Experimental Type Hints for Python 3.8+"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"},
|
||||||
|
{file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "yarl"
|
name = "yarl"
|
||||||
version = "1.9.3"
|
version = "1.9.3"
|
||||||
|
@ -451,4 +462,4 @@ multidict = ">=4.0"
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.11"
|
python-versions = "^3.11"
|
||||||
content-hash = "987c0a45c65fc281154469d795a5dc2828af5fa55226a1688466b71bf4327e3e"
|
content-hash = "04db01ae29bbc78abf48f0ae23d60db56da274aea1b281c7aeaca0e705162114"
|
||||||
|
|
|
@ -12,10 +12,11 @@ aiohttp = "^3.8.6"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
types-beautifulsoup4 = "^4.12.0.7"
|
types-beautifulsoup4 = "^4.12.0.7"
|
||||||
|
typing-extensions = "^4.8.0"
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core"]
|
||||||
build-backend = "poetry.core.masonry.api"
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
[tool.poetry.scripts]
|
||||||
app = "app.scraper:run"
|
app = "app.__main__:main"
|
||||||
|
|
Loading…
Reference in New Issue