Initial commit.
commit
1710e1aefa
|
@ -0,0 +1,7 @@
|
||||||
|
#!/usr/bin/env bash

# Point git at the hooks bundled in `.githooks/`, but only when git is
# installed and this directory is currently on a git branch.
if command -v git > /dev/null && on_git_branch
then
  git config --local core.hooksPath .githooks/
fi

# Load the development environment described by the flake.
use flake
|
|
@ -0,0 +1,13 @@
|
||||||
|
#!/usr/bin/env bash
# Pre-commit hook: run `black` over every staged Python file and stage the
# formatted result.
#
# NOTE: `git add` stages the whole working-tree file, so any unstaged changes
# in a formatted file are included in the commit as well.

# Abort on the first failing command, including a failure of `git diff` on the
# left-hand side of the pipeline below.
set -eo pipefail

# `--name-only -z` emits NUL-terminated paths so names containing whitespace
# are handled intact. `--diff-filter=d` drops deleted files, and for renames
# git reports the new path. The `*.py` pathspec restricts output to Python
# sources.
git --no-pager diff --cached --name-only --no-color --diff-filter=d -z -- '*.py' |
  while IFS= read -r -d '' path
  do
    black --quiet "$path"
    git add "$path"
  done
|
|
@ -0,0 +1,5 @@
|
||||||
|
# Directory used by `direnv` to hold `use flake`-generated profiles.
|
||||||
|
.direnv/
|
||||||
|
|
||||||
|
# A symlink produced by default when running `nix build`.
|
||||||
|
result
|
|
@ -0,0 +1,29 @@
|
||||||
|
# Poetry Flake Template
|
||||||
|
|
||||||
|
This is a template for constructing a working environment for Python (version
|
||||||
|
3.11.6) development. Packaging and dependency management relies on [poetry](https://python-poetry.org/)
|
||||||
|
(version 1.7.0). [direnv](https://direnv.net/) can be used to launch a dev
|
||||||
|
shell upon entering this directory (refer to `.envrc`). Otherwise run via:
|
||||||
|
```bash
|
||||||
|
$> nix develop
|
||||||
|
```
|
||||||
|
|
||||||
|
## Language Server
|
||||||
|
|
||||||
|
The [python-lsp-server](https://github.com/python-lsp/python-lsp-server)
|
||||||
|
(version v1.9.0) is included in this flake, along with the [python-lsp-black](https://github.com/python-lsp/python-lsp-black)
|
||||||
|
plugin for formatting purposes. `pylsp` is expected to be configured to use
|
||||||
|
[McCabe](https://github.com/PyCQA/mccabe), [pycodestyle](https://pycodestyle.pycqa.org/en/latest/),
|
||||||
|
and [pyflakes](https://github.com/PyCQA/pyflakes). Refer to your editor for
|
||||||
|
configuration details.
|
||||||
|
|
||||||
|
## Formatting
|
||||||
|
|
||||||
|
Formatting depends on the [black](https://black.readthedocs.io/en/stable/index.html)
|
||||||
|
(version 23.9.1) tool. A `pre-commit` hook is included in `.githooks` that can
|
||||||
|
be used to format all staged `*.py` files prior to commit. Install via:
|
||||||
|
```bash
|
||||||
|
$> git config --local core.hooksPath .githooks/
|
||||||
|
```
|
||||||
|
If running [direnv](https://direnv.net/), this hook is installed automatically
|
||||||
|
when entering the directory.
|
|
@ -0,0 +1,10 @@
|
||||||
|
(import
|
||||||
|
(
|
||||||
|
let lock = builtins.fromJSON (builtins.readFile ./flake.lock); in
|
||||||
|
fetchTarball {
|
||||||
|
url = lock.nodes.flake-compat.locked.url or "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz";
|
||||||
|
sha256 = lock.nodes.flake-compat.locked.narHash;
|
||||||
|
}
|
||||||
|
)
|
||||||
|
{ src = ./.; }
|
||||||
|
).defaultNix
|
|
@ -0,0 +1,190 @@
|
||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"flake-compat": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1696426674,
|
||||||
|
"narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
|
||||||
|
"rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
|
||||||
|
"revCount": 57,
|
||||||
|
"type": "tarball",
|
||||||
|
"url": "https://api.flakehub.com/f/pinned/edolstra/flake-compat/1.0.1/018afb31-abd1-7bff-a5e4-cff7e18efb7a/source.tar.gz"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"type": "tarball",
|
||||||
|
"url": "https://flakehub.com/f/edolstra/flake-compat/1.tar.gz"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"flake-utils": {
|
||||||
|
"inputs": {
|
||||||
|
"systems": "systems"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1694529238,
|
||||||
|
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"flake-utils_2": {
|
||||||
|
"inputs": {
|
||||||
|
"systems": "systems_2"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1694529238,
|
||||||
|
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nix-github-actions": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"poetry2nix",
|
||||||
|
"nixpkgs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1698974481,
|
||||||
|
"narHash": "sha256-yPncV9Ohdz1zPZxYHQf47S8S0VrnhV7nNhCawY46hDA=",
|
||||||
|
"owner": "nix-community",
|
||||||
|
"repo": "nix-github-actions",
|
||||||
|
"rev": "4bb5e752616262457bc7ca5882192a564c0472d2",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-community",
|
||||||
|
"repo": "nix-github-actions",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1700794826,
|
||||||
|
"narHash": "sha256-RyJTnTNKhO0yqRpDISk03I/4A67/dp96YRxc86YOPgU=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "5a09cb4b393d58f9ed0d9ca1555016a8543c2ac8",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixos-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"poetry2nix": {
|
||||||
|
"inputs": {
|
||||||
|
"flake-utils": "flake-utils_2",
|
||||||
|
"nix-github-actions": "nix-github-actions",
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
],
|
||||||
|
"systems": "systems_3",
|
||||||
|
"treefmt-nix": "treefmt-nix"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1701035916,
|
||||||
|
"narHash": "sha256-Cbe/3H9/z7vIXAeMr9m9iXs1gRxpE3w1mrx9aaxWtGU=",
|
||||||
|
"owner": "nix-community",
|
||||||
|
"repo": "poetry2nix",
|
||||||
|
"rev": "2bee2516bc054458b0cbca10b18e2ec63cea8726",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-community",
|
||||||
|
"repo": "poetry2nix",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"flake-compat": "flake-compat",
|
||||||
|
"flake-utils": "flake-utils",
|
||||||
|
"nixpkgs": "nixpkgs",
|
||||||
|
"poetry2nix": "poetry2nix"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systems": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1681028828,
|
||||||
|
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systems_2": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1681028828,
|
||||||
|
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systems_3": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1681028828,
|
||||||
|
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"id": "systems",
|
||||||
|
"type": "indirect"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"treefmt-nix": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"poetry2nix",
|
||||||
|
"nixpkgs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1699786194,
|
||||||
|
"narHash": "sha256-3h3EH1FXQkIeAuzaWB+nK0XK54uSD46pp+dMD3gAcB4=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "treefmt-nix",
|
||||||
|
"rev": "e82f32aa7f06bbbd56d7b12186d555223dc399d1",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "treefmt-nix",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
|
@ -0,0 +1,89 @@
|
||||||
|
{
|
||||||
|
description = ''
|
||||||
|
An opinionated poetry flake.
|
||||||
|
|
||||||
|
This flake has been adapted from the `app` template found in:
|
||||||
|
https://github.com/nix-community/poetry2nix
|
||||||
|
|
||||||
|
To generate a copy of this template elsewhere, run:
|
||||||
|
$> bootstrap poetry
|
||||||
|
'';
|
||||||
|
|
||||||
|
inputs = {
|
||||||
|
flake-compat.url = "https://flakehub.com/f/edolstra/flake-compat/1.tar.gz";
|
||||||
|
flake-utils.url = "github:numtide/flake-utils";
|
||||||
|
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||||
|
poetry2nix = {
|
||||||
|
url = "github:nix-community/poetry2nix";
|
||||||
|
inputs.nixpkgs.follows = "nixpkgs";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
outputs = { self, nixpkgs, flake-utils, poetry2nix, ... }:
|
||||||
|
flake-utils.lib.eachDefaultSystem (system:
|
||||||
|
let
|
||||||
|
# See https://github.com/nix-community/poetry2nix/tree/master#api for
|
||||||
|
# more functions and examples.
|
||||||
|
pkgs = nixpkgs.legacyPackages.${system};
|
||||||
|
|
||||||
|
inherit
|
||||||
|
(poetry2nix.lib.mkPoetry2Nix { inherit pkgs; })
|
||||||
|
mkPoetryApplication
|
||||||
|
defaultPoetryOverrides;
|
||||||
|
|
||||||
|
# https://github.com/nix-community/poetry2nix/blob/ec4364021900f8e0d425d901b6e6ff03cf201efb/docs/edgecases.md
|
||||||
|
# `poetry2nix`, by default, prefers building from source. To build
|
||||||
|
# certain dependencies, we need to augment its build dependencies by
|
||||||
|
# adding the corresponding build backend (e.g. `setuptools`).
|
||||||
|
#
|
||||||
|
# For example, you can write:
|
||||||
|
# ```nix
|
||||||
|
# pypkgs-build-requirements = {
|
||||||
|
# ...
|
||||||
|
# coolname = [ "setuptools" ];
|
||||||
|
# ...
|
||||||
|
# };
|
||||||
|
# ```
|
||||||
|
# after encountering a build error like:
|
||||||
|
#
|
||||||
|
# > ModuleNotFoundError: No module named 'setuptools'
|
||||||
|
pypkgs-build-requirements = {};
|
||||||
|
poetry2nix-overrides = defaultPoetryOverrides.extend (self: super:
|
||||||
|
builtins.mapAttrs (package: build-requirements:
|
||||||
|
(builtins.getAttr package super).overridePythonAttrs (old: {
|
||||||
|
buildInputs =
|
||||||
|
(old.buildInputs or []) ++
|
||||||
|
(builtins.map (pkg:
|
||||||
|
if builtins.isString pkg then
|
||||||
|
builtins.getAttr pkg super
|
||||||
|
else
|
||||||
|
pkg) build-requirements);
|
||||||
|
})
|
||||||
|
) pypkgs-build-requirements
|
||||||
|
);
|
||||||
|
in
|
||||||
|
{
|
||||||
|
packages = {
|
||||||
|
tmpl-app = mkPoetryApplication {
|
||||||
|
projectDir = ./.;
|
||||||
|
overrides = poetry2nix-overrides;
|
||||||
|
};
|
||||||
|
default = self.packages.${system}.tmpl-app;
|
||||||
|
};
|
||||||
|
|
||||||
|
devShells.default = pkgs.mkShell {
|
||||||
|
inputsFrom = [ self.packages.${system}.tmpl-app ];
|
||||||
|
packages = [
|
||||||
|
pkgs.poetry
|
||||||
|
] ++ (with pkgs.python311Packages; [
|
||||||
|
black
|
||||||
|
debugpy
|
||||||
|
mccabe
|
||||||
|
pycodestyle
|
||||||
|
pyflakes
|
||||||
|
python-lsp-black
|
||||||
|
python-lsp-server
|
||||||
|
]);
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
|
@ -0,0 +1,99 @@
|
||||||
|
import aiohttp
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import os.path
|
||||||
|
import random
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
# References to paths we use to save any scraped content.
|
||||||
|
DATA_COACH_LINKS = "data/coach_links.txt"
|
||||||
|
DATA_COACH_DIR = "data/coach/{}/{}"
|
||||||
|
|
||||||
|
|
||||||
|
async def scrape_coach_links(page_no):
    """Scrape a single coach page listing.

    @param page_no: Page number substituted into the `page` query parameter
    of the coach listing URL.
    @return: A list holding the `href` of every anchor tagged with the
    `members-categories-username` class. The list is empty if the page could
    not be retrieved.
    """
    href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
    async with aiohttp.ClientSession() as session:
        async with session.get(href) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {href}")
                # Return an empty list (not `None`) so callers can always
                # iterate over the result.
                return []
            html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    members = soup.find_all("a", class_="members-categories-username")
    return [member.get("href") for member in members]
|
||||||
|
|
||||||
|
|
||||||
|
async def scrape_all_coach_links(max_pages=62):
    """Scan through chess.com/coaches for all member links.

    Links are appended to `DATA_COACH_LINKS` as they are scraped. If that file
    already exists, its contents are returned and nothing is scraped.

    @param max_pages: Number of listing pages to walk, starting at page 1.
    @return: A list of every member link found. Entries read back from
    `DATA_COACH_LINKS` keep their trailing newline; freshly scraped entries
    do not.
    """
    if os.path.isfile(DATA_COACH_LINKS):
        with open(DATA_COACH_LINKS, "r") as f:
            return f.readlines()

    all_links = []
    for i in range(1, max_pages + 1):
        # Tolerate a failed page (`None` or empty) instead of crashing midway.
        links = await scrape_coach_links(i) or []
        all_links.extend(links)
        # Reopen the file for every page so each batch is written out as soon
        # as it is scraped, which lets the file be followed with `tail -f`.
        with open(DATA_COACH_LINKS, "a") as f:
            f.writelines(f"{link}\n" for link in links)
        # Pause between pages to avoid hammering the server.
        await asyncio.sleep(random.randint(2, 5))

    # Return the links from every page, not just the last one scraped.
    return all_links
|
||||||
|
|
||||||
|
|
||||||
|
async def download_member_info(member_name, filename, href):
    """Download member-specific content.

    The response body is saved to `DATA_COACH_DIR` formatted with
    `member_name` and `filename`.

    @param member_name: Name of the member; used as the output directory name.
    @param filename: Name of the file to write inside the member's directory.
    @param href: URL to download.
    @return: True if we downloaded content. False if the results already
    existed locally or the request did not return a 200 status.
    """
    target = DATA_COACH_DIR.format(member_name, filename)
    if os.path.isfile(target):
        return False

    # Fetch before touching the filesystem so a failed request does not leave
    # behind an empty file that would be mistaken for a finished download.
    async with aiohttp.ClientSession() as session:
        async with session.get(href) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {href}")
                return False
            content = await response.text()

    # The per-member directory is not created anywhere else.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "w", encoding="utf-8") as f:
        f.write(content)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Download the profile page, activity feed, and stats of every coach."""
    prefix = "https://www.chess.com/member/"
    for raw_link in await scrape_all_coach_links():
        href = raw_link.strip()
        member_name = href[len(prefix) :]

        # (filename, url) pairs to fetch concurrently for this member.
        sources = (
            (f"{member_name}.html", href),
            (
                "activity.json",
                f"https://www.chess.com/callback/member/activity/{member_name}?page=1",
            ),
            (
                "stats.json",
                f"https://www.chess.com/callback/member/stats/{member_name}",
            ),
        )
        downloaded = await asyncio.gather(
            *(
                download_member_info(member_name, filename, url)
                for filename, url in sources
            )
        )

        if not any(downloaded):
            print(f"Skipping {member_name}")
            continue

        # Throttle only when at least one file was freshly downloaded.
        await asyncio.sleep(random.randint(2, 5))
        print(f"Downloaded {member_name}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Make sure the root output directory exists before any scraping starts.
    os.makedirs("data/coach", exist_ok=True)

    # Drive the whole scrape on a fresh event loop.
    asyncio.run(main())
|
|
@ -0,0 +1,7 @@
|
||||||
|
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
|
||||||
|
package = []
|
||||||
|
|
||||||
|
[metadata]
|
||||||
|
lock-version = "2.0"
|
||||||
|
python-versions = "^3.11"
|
||||||
|
content-hash = "81b2fa642d7f2d1219cf80112ace12d689d053d81be7f7addb98144d56fc0fb2"
|
|
@ -0,0 +1,15 @@
|
||||||
|
[tool.poetry]
|
||||||
|
name = "chesscom-scraper"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = ""
|
||||||
|
authors = ["jrpotter2112@gmail.com"]
|
||||||
|
readme = "README.md"
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.11"
|
||||||
|
beautifulsoup4 = "^4.12.2"
|
||||||
|
aiohttp = "^3.8.6"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
Loading…
Reference in New Issue