2023-11-30 22:15:15 +00:00
|
|
|
import argparse
|
|
|
|
import asyncio
|
2023-12-05 18:43:13 +00:00
|
|
|
from typing import List
|
2023-11-30 22:15:15 +00:00
|
|
|
|
2023-12-01 23:36:53 +00:00
|
|
|
import aiohttp
|
2023-12-05 18:43:13 +00:00
|
|
|
import psycopg2
|
2023-12-07 03:53:54 +00:00
|
|
|
from lingua import LanguageDetector, LanguageDetectorBuilder
|
2023-12-01 23:36:53 +00:00
|
|
|
|
2023-12-05 18:43:13 +00:00
|
|
|
from app.chesscom import Pipeline as ChesscomPipeline
|
2023-12-05 22:15:42 +00:00
|
|
|
from app.database import backup_database, load_languages
|
2023-12-05 18:43:13 +00:00
|
|
|
from app.lichess import Pipeline as LichessPipeline
|
2023-12-05 21:20:46 +00:00
|
|
|
from app.types import Site
|
2023-11-30 22:15:15 +00:00
|
|
|
|
2023-12-05 18:43:13 +00:00
|
|
|
# How many extraction jobs are allowed to run in parallel at any one time.
WORKER_COUNT = 10
|
2023-11-30 22:15:15 +00:00
|
|
|
|
2023-12-05 18:43:13 +00:00
|
|
|
|
2023-12-07 03:53:54 +00:00
|
|
|
async def _process(
    site: Site, conn, detector: LanguageDetector, session: aiohttp.ClientSession
):
    """Run the scraping pipeline that corresponds to `site`.

    Args:
        site: The site whose pipeline should be run.
        conn: Database connection, passed through to the pipeline.
        detector: Language detector, passed through to the pipeline.
        session: HTTP client session, passed through to the pipeline.

    Raises:
        AssertionError: If `site` is not one of the sites handled here.
    """
    if site == Site.CHESSCOM:
        await ChesscomPipeline(worker_count=WORKER_COUNT).process(
            conn, detector, session
        )
    elif site == Site.LICHESS:
        await LichessPipeline(worker_count=WORKER_COUNT).process(
            conn, detector, session
        )
    else:
        # Raise explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would turn an unknown site into a
        # silent no-op. The exception type and message are unchanged.
        raise AssertionError(f"Encountered unknown site: {site}.")
|
|
|
|
|
|
|
|
|
2023-12-07 03:53:54 +00:00
|
|
|
async def _entrypoint(
    conn, detector: LanguageDetector, user_agent: str, sites: List[Site]
):
    """Open one HTTP session and run the pipeline of every requested site.

    The session is shared by all pipelines and carries a `User-Agent` header
    built from `user_agent`. The per-site coroutines are awaited together via
    `asyncio.gather`.
    """
    request_headers = {"User-Agent": f"BoardWise coach-scraper ({user_agent})"}
    async with aiohttp.ClientSession(headers=request_headers) as session:
        pending = [_process(site, conn, detector, session) for site in sites]
        await asyncio.gather(*pending)
|
2023-12-05 18:43:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Parse command-line arguments and run the scraper for each chosen site.

    Builds the language detector, connects to the database, calls
    `backup_database` and `load_languages` on the connection, and then runs
    the async entrypoint. The connection is closed on exit, whether or not
    an error occurred.
    """
    parser = argparse.ArgumentParser(
        prog="coach-scraper",
        description="Scraping/exporting of chess coaches.",
    )

    # Database-related arguments.
    parser.add_argument("--host", required=True)
    parser.add_argument("--dbname", default="postgres")
    parser.add_argument("--user", default="postgres")
    parser.add_argument("--password", default="password")
    # `type=int` keeps a user-supplied port the same type as the default and
    # rejects non-numeric values at parse time.
    parser.add_argument("--port", type=int, default=5432)

    # Client session-related arguments.
    parser.add_argument("--user-agent", required=True)
    parser.add_argument(
        "--site",
        required=True,
        action="append",
        choices=[
            Site.CHESSCOM.value,
            Site.LICHESS.value,
        ],
    )

    args = parser.parse_args()

    detector = LanguageDetectorBuilder.from_all_languages().build()

    # `--site` may be repeated; drop duplicates while keeping the order given
    # on the command line (a plain `set` would make the order arbitrary).
    sites = [Site(value) for value in dict.fromkeys(args.site)]

    conn = None
    try:
        conn = psycopg2.connect(
            dbname=args.dbname,
            user=args.user,
            host=args.host,
            password=args.password,
            port=args.port,
        )
        backup_database(conn)
        load_languages(conn)
        asyncio.run(
            _entrypoint(
                conn=conn,
                detector=detector,
                user_agent=args.user_agent,
                sites=sites,
            )
        )
    finally:
        # `conn` is still None if `psycopg2.connect` itself raised.
        if conn is not None:
            conn.close()
|
2023-11-30 22:15:15 +00:00
|
|
|
|
2023-11-28 12:28:21 +00:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the scraper only when this file is executed as a script.
    main()
|