Transition to a CSV; Postgres can handle that better.

pull/9/head
Joshua Potter 2023-12-04 15:08:17 -07:00
parent ec94a16140
commit 63764a22c4
7 changed files with 179 additions and 94 deletions

View File

@@ -29,17 +29,19 @@ data
 ├── ...
 ```
 
-## Usage
+## Quickstart
 If you have nix available, run:
 ```bash
-$ nix run . -- --user-agent <your-email> -s <site>
+$ nix run . -- --user-agent <your-email> -s <site> [-s <site> ...]
 ```
 If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
 instead run the following:
 ```bash
-$ poetry run python3 -m app -u <your-email> -s <site>
+$ poetry run python3 -m app -u <your-email> -s <site> [-s <site> ...]
 ```
+After running (this may take several hours), a new CSV will be generated at
+`data/export.csv` containing all scraped content from the specified `<site>`s.
 
 ## Database
@@ -70,13 +72,16 @@ To load all exported coach data into a local Postgres instance, use the provided
 ```bash
 $ psql -h @scraper -f sql/init.sql
 ```
-Next, concatenate all exported content and dump into the newly created table:
+Next, dump exported data into the newly created table:
 ```bash
-$ cat data/{chesscom,lichess}/export.json > data/export.json
-$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.csv'"
 ```
-Re-running will automatically create backups and replace the coach data found
-in `coach_scraper.export`.
+Re-running the `sql/export.sql` script will create a backup of the
+`coach_scraper.export` table. It will then upsert the scraped data. You can view
+all backups from the `psql` console like so:
+```
+postgres=# \dt coach_scraper.export*
+```
 
 ### E2E
@@ -85,10 +90,8 @@ necessary to scrape coach data from our chess website and dump the results into
 the database in one fell swoop. Assuming our database is open with a socket
 connection available at `@scraper`:
 ```bash
-nix run . -- --user-agent <your-email> -s chesscom -s lichess
-cat data/{chesscom,lichess}/export.json > data/export.json
-psql -h @scraper -f sql/init.sql
-psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ nix run . -- --user-agent <your-email> -s chesscom -s lichess
+$ psql -h @scraper -f sql/init.sql -f sql/export.sql -v export="'$PWD/data/export.csv'"
 ```
 
 ## Development
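
For reference, each row of the generated `data/export.csv` follows the column order hard-coded in `app/__main__.py`: site, username, name, image_url, rapid, blitz, bullet. A purely illustrative excerpt (coach names, image URL, and ratings invented; empty fields mean nothing was scraped for that column):

```
chesscom,examplecoach,Jane Doe,https://images.chesscomfiles.com/example.jpg,1850,1900,2010
lichess,samplecoach,,,2200,,2150
```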

View File

@@ -1,5 +1,6 @@
 import argparse
 import asyncio
+import csv
 import json
 
 import aiohttp
@@ -32,6 +33,8 @@ async def run():
     async with aiohttp.ClientSession(
         headers={"User-Agent": f"BoardWise coach-scraper ({args.user_agent})"}
     ) as session:
+        with open("data/export.csv", "w") as f:
+            writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
             for site in set(args.site):
                 scraper, exporter_cls = None, None
@@ -42,15 +45,22 @@ async def run():
                     scraper = LichessScraper(session)
                     exporter_cls = LichessExporter
 
-            # Write out each coach data into NDJSON file.
-            dump = []
-
                 usernames = await scraper.scrape()
                 for username in usernames:
                     export = exporter_cls(username).export()
-                dump.append(f"{json.dumps(export)}\n")
-
-            with open(scraper.path_site_file("export.json"), "w") as f:
-                f.writelines(dump)
+                    writer.writerow(
+                        [
+                            # This should match the order data is loaded in the
+                            # sql/export.sql script.
+                            export["site"],
+                            export["username"],
+                            export.get("name", ""),
+                            export.get("image_url", ""),
+                            export.get("rapid", ""),
+                            export.get("blitz", ""),
+                            export.get("bullet", ""),
+                        ]
+                    )
 
 
 def main():
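
Two properties of the writer configuration are worth spelling out, since the SQL side depends on them: `csv.QUOTE_MINIMAL` quotes a field only when it contains the delimiter, a quote character, or a newline, and the empty strings used for absent values come out as empty, unquoted fields. A standalone sketch (not part of the commit; the row values are invented):

```python
import csv
import io

# Standalone sketch: a name containing a comma is the only field that gets
# quoted, and the missing ratings serialize as empty fields, which
# Postgres's COPY ... WITH (FORMAT CSV) later reads back as NULL.
buf = io.StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_MINIMAL)
writer.writerow(["chesscom", "examplecoach", "Doe, Jane", "", 1800, "", 2100])
print(buf.getvalue(), end="")
# chesscom,examplecoach,"Doe, Jane",,1800,,2100
```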

View File

@@ -5,7 +5,7 @@ import os.path
 from typing import List, Union
 
 import aiohttp
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer
 
 from app.exporter import BaseExporter
 from app.repo import AnsiColor, Site
@@ -172,10 +172,27 @@ class Scraper(BaseScraper):
         return True
 
 
+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "profile-header-info" in attrs.get("class", ""):
+        return True
+    if "profile-card-info" in attrs.get("class", ""):
+        return True
+
+
 class Exporter(BaseExporter):
     def __init__(self, username: str):
         super().__init__(site=Site.CHESSCOM.value, username=username)
+        self.profile_soup = None
+        try:
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
+                )
+        except FileNotFoundError:
+            pass
+
         self.stats_json = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
@@ -185,6 +202,22 @@ class Exporter(BaseExporter):
         except FileNotFoundError:
             pass
 
+    def export_name(self) -> Union[str, None]:
+        try:
+            name = self.profile_soup.find("div", class_="profile-card-name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            div = self.profile_soup.find("div", class_="profile-header-avatar")
+            src = div.find("img").get("src", "")
+            if "images.chesscomfiles.com" in src:
+                return src
+        except AttributeError:
+            return None
+
     def export_rapid(self) -> Union[int, None]:
         return self.stats_json.get("rapid", {}).get("rating")
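
The `SoupStrainer`-with-a-callable pattern used here may be unfamiliar: BeautifulSoup calls the function with each top-level tag's name and attribute dict while parsing, and only matching tags (plus their descendants) make it into the soup, which keeps memory down on large profile pages. A self-contained sketch of the same pattern (the HTML and class names are illustrative, not taken from chess.com):

```python
from bs4 import BeautifulSoup, SoupStrainer

# Illustrative sketch: only subtrees whose root tag matches the filter are
# parsed into the soup; everything else is discarded at parse time.
def _filter(elem, attrs):
    return "profile-card-info" in attrs.get("class", "")

html = """
<div class="profile-card-info"><div class="profile-card-name">Jane Doe</div></div>
<div class="sidebar">skipped entirely</div>
"""
soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(_filter))
print(soup.find("div", class_="profile-card-name").get_text())  # Jane Doe
print(soup.find("div", class_="sidebar"))                       # None
```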

View File

@ -1,4 +1,4 @@
from typing import Union from typing import Any, Union
from typing_extensions import TypedDict from typing_extensions import TypedDict
@@ -6,6 +6,13 @@ from app.repo import AnsiColor, Repo
 class Export(TypedDict, total=False):
+    # The (site, username) make up a unique key for each coach.
+    site: str
+    username: str
+    # The coach's real name.
+    name: str
+    # The profile image used on the source site.
+    image_url: str
     # The coach's rapid rating relative to the site they were sourced from.
     rapid: int
     # The coach's blitz rating relative to the site they were sourced from.
@@ -14,9 +21,8 @@ class Export(TypedDict, total=False):
     bullet: int
 
 
-def _insert(export: Export, key: str, value: any):
-    if value is None:
-        return
-    export[key] = value
+def _insert(export: Export, key: str, value: Any):
+    if value is not None:
+        export[key] = value
@@ -25,6 +31,12 @@ class BaseExporter(Repo):
         super().__init__(site)
         self.username = username
 
+    def export_name(self) -> Union[str, None]:
+        raise NotImplementedError()
+
+    def export_image_url(self) -> Union[str, None]:
+        raise NotImplementedError()
+
     def export_rapid(self) -> Union[int, None]:
         raise NotImplementedError()
@@ -40,6 +52,8 @@ class BaseExporter(Repo):
         _insert(export, "site", self.site)
         _insert(export, "username", self.username)
+        _insert(export, "name", self.export_name())
+        _insert(export, "image_url", self.export_image_url())
         _insert(export, "rapid", self.export_rapid())
         _insert(export, "blitz", self.export_blitz())
         _insert(export, "bullet", self.export_bullet())
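
The interplay between `total=False` and `_insert` is what lets `app/__main__.py` call `export.get(...)` with a string default: keys for missing values are never materialized at all. A minimal standalone sketch of the same pattern:

```python
from typing import Any

from typing_extensions import TypedDict

# Minimal sketch of the pattern above: with total=False every key is
# optional, and _insert only sets keys whose values actually exist.
class Export(TypedDict, total=False):
    site: str
    rapid: int

def _insert(export: Export, key: str, value: Any):
    if value is not None:
        export[key] = value

export: Export = {}
_insert(export, "site", "lichess")
_insert(export, "rapid", None)   # no rating scraped; key stays absent
print(export)                    # {'site': 'lichess'}
print(export.get("rapid", ""))   # '' -- what the CSV writer emits
```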

View File

@@ -1,7 +1,7 @@
 import asyncio
 import os
 import os.path
-from typing import List
+from typing import List, Union
 
 import aiohttp
 from bs4 import BeautifulSoup, SoupStrainer
@@ -97,9 +97,9 @@ class Scraper(BaseScraper):
         soup = BeautifulSoup(response, "lxml")
         members = soup.find_all("article", class_="coach-widget")
         for member in members:
-            anchor = member.find("a", class_="overlay")
-            if anchor:
-                href = anchor.get("href")
+            a = member.find("a", class_="overlay")
+            if a:
+                href = a.get("href")
                 username = href[len("/coach/") :]
                 usernames.append(username)
@@ -163,8 +163,16 @@ class Scraper(BaseScraper):
         return True
 
 
+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "coach-widget" in attrs.get("class", ""):
+        return True
+
+
 def _stats_filter(elem, attrs):
     """Includes only relevant segments of the `stats.html` file."""
+    if "profile-side" in attrs.get("class", ""):
+        return True
     if "sub-ratings" in attrs.get("class", ""):
         return True
@@ -173,43 +181,59 @@ class Exporter(BaseExporter):
     def __init__(self, username: str):
         super().__init__(site=Site.LICHESS.value, username=username)
-        self.stats_soup = None
+        self.profile_soup = None
         try:
-            with open(self.path_coach_file(username, "stats.html"), "r") as f:
-                stats_strainer = SoupStrainer(_stats_filter)
-                self.stats_soup = BeautifulSoup(
-                    f.read(), "lxml", parse_only=stats_strainer
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                 )
         except FileNotFoundError:
             pass
 
-    def export_rapid(self):
+        self.stats_soup = None
+        try:
+            with open(self.path_coach_file(username, "stats.html"), "r") as f:
+                self.stats_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_stats_filter)
+                )
+        except FileNotFoundError:
+            pass
+
+    def export_name(self) -> Union[str, None]:
+        try:
+            profile_side = self.stats_soup.find("div", class_="profile-side")
+            user_infos = profile_side.find("div", class_="user-infos")
+            name = user_infos.find("strong", class_="name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            picture = self.profile_soup.find("img", class_="picture")
+            src = picture.get("src", "")
+            if "image.lichess1.org" in src:
+                return src
+        except AttributeError:
+            return None
+
+    def export_rapid(self) -> Union[int, None]:
         return self._find_rating("rapid")
 
-    def export_blitz(self):
+    def export_blitz(self) -> Union[int, None]:
         return self._find_rating("blitz")
 
-    def export_bullet(self):
+    def export_bullet(self) -> Union[int, None]:
         return self._find_rating("bullet")
 
-    def _find_rating(self, name):
-        if self.stats_soup is None:
-            return None
-
-        anchor = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-        if anchor is None:
-            return None
-        rating = anchor.find("rating")
-        if rating is None:
-            return None
-        strong = rating.find("strong")
-        if strong is None:
-            return None
-        value = strong.get_text()
-        if value[-1] == "?":
-            value = value[:-1]
-        try:
-            return int(value)
-        except ValueError:
-            return None
+    def _find_rating(self, name) -> Union[int, None]:
+        try:
+            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+            rating = a.find("rating")
+            strong = rating.find("strong")
+            value = strong.get_text()
+            if value[-1] == "?":
+                value = value[:-1]
+            return int(value)
+        except (AttributeError, ValueError):
+            return None
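
One detail that makes the `_find_rating` rewrite safe: `find` returns `None` on a miss, and chaining any call off `None` raises `AttributeError`, so the single `except (AttributeError, ValueError)` clause subsumes all of the old explicit `None` checks, including the `stats_soup is None` guard. A standalone demonstration (selector values are made up):

```python
from bs4 import BeautifulSoup

# Standalone demo: one except clause replaces a ladder of None checks,
# since each failed find() returns None and the next call raises.
soup = BeautifulSoup("<p>no ratings here</p>", "lxml")
try:
    value = int(soup.find("a", href="/@/someone/perf/rapid").find("rating").get_text())
except (AttributeError, ValueError):
    value = None
print(value)  # None
```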

View File

@@ -7,31 +7,47 @@ DO $$
 END;
 $$ LANGUAGE plpgsql;
 
-CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
+-- This should match the order data is written in the app/__main__.py
+-- script.
+CREATE TEMPORARY TABLE pg_temp.coach_scraper_export
+( site TEXT
+, username TEXT
+, name TEXT
+, image_url TEXT
+, rapid TEXT
+, blitz TEXT
+, bullet TEXT
+);
 
 SELECT format(
-  $$COPY pg_temp.coach_scraper_export (data) from %L$$,
+  $$COPY pg_temp.coach_scraper_export FROM %L WITH (FORMAT CSV)$$,
   :export
 ) \gexec
 
 INSERT INTO coach_scraper.export
-( username
-, site
+( site
+, username
+, name
+, image_url
 , rapid
 , blitz
 , bullet
 )
 SELECT
-  data->>'username',
-  data->>'site',
-  (data->>'rapid')::INT,
-  (data->>'blitz')::INT,
-  (data->>'bullet')::INT
+  site,
+  username,
+  name,
+  image_url,
+  rapid::INT,
+  blitz::INT,
+  bullet::INT
 FROM
   pg_temp.coach_scraper_export
 ON CONFLICT
   (site, username)
 DO UPDATE SET
+  name = EXCLUDED.name,
+  image_url = EXCLUDED.image_url,
   rapid = EXCLUDED.rapid,
   blitz = EXCLUDED.blitz,
   bullet = EXCLUDED.bullet;
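
A few pieces of psql machinery carry this script: `-v export=...` on the command line defines the `:export` variable, `format`'s `%L` escapes it as a SQL literal, and `\gexec` executes each cell of the query result as its own statement. Just as important, `COPY ... WITH (FORMAT CSV)` reads an unquoted empty field as NULL, which is what keeps the `rapid::INT`-style casts from failing on coaches with no rating. A hypothetical illustration (row values invented):

```sql
-- Hypothetical illustration, not part of the commit.
-- An input row like:  lichess,samplecoach,,,2200,,2150
-- lands in the temp table as
--   ('lichess', 'samplecoach', NULL, NULL, '2200', NULL, '2150'),
-- and casting NULL succeeds where an empty string would not:
SELECT NULL::INT;    -- NULL
-- SELECT ''::INT;   -- ERROR: invalid input syntax for type integer: ""
```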

View File

@@ -1,9 +1,13 @@
 CREATE SCHEMA IF NOT EXISTS coach_scraper;
 
-CREATE TABLE IF NOT EXISTS coach_scraper.export
+DROP TABLE IF EXISTS coach_scraper.export;
+CREATE TABLE coach_scraper.export
 ( id SERIAL PRIMARY KEY
 , site VARCHAR(16) NOT NULL
 , username VARCHAR(255) NOT NULL
+, name VARCHAR(255)
+, image_url TEXT
 , rapid INT
 , blitz INT
 , bullet INT
@@ -15,22 +19,3 @@ ON
   coach_scraper.export
 USING
   BTREE (site, username);
-
-DO $$
-BEGIN
-  IF NOT EXISTS (
-    SELECT 1
-    FROM information_schema.constraint_column_usage
-    WHERE table_schema = 'coach_scraper'
-    AND table_name = 'export'
-    AND constraint_name = 'site_username_unique'
-  ) THEN
-    EXECUTE 'ALTER TABLE
-      coach_scraper.export
-    ADD CONSTRAINT
-      site_username_unique
-    UNIQUE USING INDEX
-      site_username_unique';
-  END IF;
-END;
-$$ LANGUAGE plpgsql;
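
A closing note on the deleted DO block: it existed to promote the unique index into a named constraint, but `ON CONFLICT (site, username)` in `sql/export.sql` only needs a unique index covering those columns, which the index kept just above this hunk still provides; and since the table is now dropped and recreated on every init, there is no in-place schema left to migrate. A minimal standalone demonstration (throwaway names) that index-only inference works:

```sql
-- Standalone demo: ON CONFLICT can infer a plain unique index;
-- no named table constraint is required.
CREATE TABLE demo (site TEXT, username TEXT);
CREATE UNIQUE INDEX demo_site_username ON demo (site, username);
INSERT INTO demo VALUES ('lichess', 'samplecoach');
INSERT INTO demo VALUES ('lichess', 'samplecoach')
ON CONFLICT (site, username) DO NOTHING;  -- resolved via the index
```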