Initial commit.
commit
1710e1aefa
|
@ -0,0 +1,7 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# When `git` is on PATH and `on_git_branch` succeeds (presumably the direnv
# stdlib helper -- confirm), point this checkout at the hooks tracked in
# `.githooks/` so they are installed automatically.
if command -v git > /dev/null && on_git_branch; then
|
||||
git config --local core.hooksPath .githooks/
|
||||
fi
|
||||
|
||||
# Load the development environment through direnv's `use flake`.
use flake
|
|
@ -0,0 +1,13 @@
|
|||
#!/usr/bin/env bash
# Pre-commit hook: run `black` over every staged Python file and re-stage the
# formatted result.
set -e
set -o pipefail

# `--diff-filter=d` skips deleted files, and the `*.py` pathspec is matched
# against the path as it will exist after the commit (so a rename INTO a `.py`
# name is formatted too). `-z` emits NUL-terminated paths, which keeps names
# containing spaces or other shell-special characters intact.
git --no-pager diff --cached --name-only --no-color --diff-filter=d -z -- '*.py' |
while IFS= read -r -d '' path
do
  black --quiet "$path"
  git add "$path"
done
|
|
@ -0,0 +1,5 @@
|
|||
# Directory used by `direnv` to hold `use flake`-generated profiles.
|
||||
.direnv/
|
||||
|
||||
# A symlink produced by default when running `nix build`.
|
||||
result
|
|
@ -0,0 +1,29 @@
|
|||
# Poetry Flake Template
|
||||
|
||||
This is a template for constructing a working environment for Python (version
|
||||
3.11.6) development. Packaging and dependency management relies on [poetry](https://python-poetry.org/)
|
||||
(version 1.7.0). [direnv](https://direnv.net/) can be used to launch a dev
|
||||
shell upon entering this directory (refer to `.envrc`). Otherwise run via:
|
||||
```bash
|
||||
$> nix develop
|
||||
```
|
||||
|
||||
## Language Server
|
||||
|
||||
The [python-lsp-server](https://github.com/python-lsp/python-lsp-server)
|
||||
(version v1.9.0) is included in this flake, along with the [python-lsp-black](https://github.com/python-lsp/python-lsp-black)
|
||||
plugin for formatting purposes. `pylsp` is expected to be configured to use
|
||||
[McCabe](https://github.com/PyCQA/mccabe), [pycodestyle](https://pycodestyle.pycqa.org/en/latest/),
|
||||
and [pyflakes](https://github.com/PyCQA/pyflakes). Refer to your editor for
|
||||
configuration details.
|
||||
|
||||
## Formatting
|
||||
|
||||
Formatting depends on the [black](https://black.readthedocs.io/en/stable/index.html)
|
||||
(version 23.9.1) tool. A `pre-commit` hook is included in `.githooks` that can
|
||||
be used to format all staged `*.py` files prior to commit. Install via:
|
||||
```bash
|
||||
$> git config --local core.hooksPath .githooks/
|
||||
```
|
||||
If running [direnv](https://direnv.net/), this hook is installed automatically
|
||||
when entering the directory.
|
|
@ -0,0 +1,10 @@
|
|||
(import
|
||||
(
|
||||
let lock = builtins.fromJSON (builtins.readFile ./flake.lock); in
|
||||
fetchTarball {
|
||||
url = lock.nodes.flake-compat.locked.url or "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz";
|
||||
sha256 = lock.nodes.flake-compat.locked.narHash;
|
||||
}
|
||||
)
|
||||
{ src = ./.; }
|
||||
).defaultNix
|
|
@ -0,0 +1,190 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-compat": {
|
||||
"locked": {
|
||||
"lastModified": 1696426674,
|
||||
"narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
|
||||
"rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
|
||||
"revCount": 57,
|
||||
"type": "tarball",
|
||||
"url": "https://api.flakehub.com/f/pinned/edolstra/flake-compat/1.0.1/018afb31-abd1-7bff-a5e4-cff7e18efb7a/source.tar.gz"
|
||||
},
|
||||
"original": {
|
||||
"type": "tarball",
|
||||
"url": "https://flakehub.com/f/edolstra/flake-compat/1.tar.gz"
|
||||
}
|
||||
},
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1694529238,
|
||||
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-utils_2": {
|
||||
"inputs": {
|
||||
"systems": "systems_2"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1694529238,
|
||||
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nix-github-actions": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"poetry2nix",
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1698974481,
|
||||
"narHash": "sha256-yPncV9Ohdz1zPZxYHQf47S8S0VrnhV7nNhCawY46hDA=",
|
||||
"owner": "nix-community",
|
||||
"repo": "nix-github-actions",
|
||||
"rev": "4bb5e752616262457bc7ca5882192a564c0472d2",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-community",
|
||||
"repo": "nix-github-actions",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1700794826,
|
||||
"narHash": "sha256-RyJTnTNKhO0yqRpDISk03I/4A67/dp96YRxc86YOPgU=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "5a09cb4b393d58f9ed0d9ca1555016a8543c2ac8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"poetry2nix": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils_2",
|
||||
"nix-github-actions": "nix-github-actions",
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
],
|
||||
"systems": "systems_3",
|
||||
"treefmt-nix": "treefmt-nix"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1701035916,
|
||||
"narHash": "sha256-Cbe/3H9/z7vIXAeMr9m9iXs1gRxpE3w1mrx9aaxWtGU=",
|
||||
"owner": "nix-community",
|
||||
"repo": "poetry2nix",
|
||||
"rev": "2bee2516bc054458b0cbca10b18e2ec63cea8726",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-community",
|
||||
"repo": "poetry2nix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-compat": "flake-compat",
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"poetry2nix": "poetry2nix"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems_2": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems_3": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"id": "systems",
|
||||
"type": "indirect"
|
||||
}
|
||||
},
|
||||
"treefmt-nix": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"poetry2nix",
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1699786194,
|
||||
"narHash": "sha256-3h3EH1FXQkIeAuzaWB+nK0XK54uSD46pp+dMD3gAcB4=",
|
||||
"owner": "numtide",
|
||||
"repo": "treefmt-nix",
|
||||
"rev": "e82f32aa7f06bbbd56d7b12186d555223dc399d1",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "treefmt-nix",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
|
@ -0,0 +1,89 @@
|
|||
{
|
||||
description = ''
|
||||
An opinionated poetry flake.
|
||||
|
||||
This flake has been adapted from the `app` template found in:
|
||||
https://github.com/nix-community/poetry2nix
|
||||
|
||||
To generate a copy of this template elsewhere, run:
|
||||
$> bootstrap poetry
|
||||
'';
|
||||
|
||||
inputs = {
|
||||
flake-compat.url = "https://flakehub.com/f/edolstra/flake-compat/1.tar.gz";
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||
poetry2nix = {
|
||||
url = "github:nix-community/poetry2nix";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
};
|
||||
|
||||
outputs = { self, nixpkgs, flake-utils, poetry2nix, ... }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
# See https://github.com/nix-community/poetry2nix/tree/master#api for
|
||||
# more functions and examples.
|
||||
pkgs = nixpkgs.legacyPackages.${system};
|
||||
|
||||
inherit
|
||||
(poetry2nix.lib.mkPoetry2Nix { inherit pkgs; })
|
||||
mkPoetryApplication
|
||||
defaultPoetryOverrides;
|
||||
|
||||
# https://github.com/nix-community/poetry2nix/blob/ec4364021900f8e0d425d901b6e6ff03cf201efb/docs/edgecases.md
|
||||
# `poetry2nix`, by default, prefers building from source. To build
|
||||
# certain dependencies, we need to augment its build dependencies by
|
||||
# adding the corresponding build backend (e.g. `setuptools`).
|
||||
#
|
||||
# For example, you can write:
|
||||
# ```nix
|
||||
# pypkgs-build-requirements = {
|
||||
# ...
|
||||
# coolname = [ "setuptools" ];
|
||||
# ...
|
||||
# };
|
||||
# ```
|
||||
# after encountering a build error like:
|
||||
#
|
||||
# > ModuleNotFoundError: No module named 'setuptools'
|
||||
pypkgs-build-requirements = {};
|
||||
poetry2nix-overrides = defaultPoetryOverrides.extend (self: super:
|
||||
builtins.mapAttrs (package: build-requirements:
|
||||
(builtins.getAttr package super).overridePythonAttrs (old: {
|
||||
buildInputs =
|
||||
(old.buildInputs or []) ++
|
||||
(builtins.map (pkg:
|
||||
if builtins.isString pkg then
|
||||
builtins.getAttr pkg super
|
||||
else
|
||||
pkg) build-requirements);
|
||||
})
|
||||
) pypkgs-build-requirements
|
||||
);
|
||||
in
|
||||
{
|
||||
packages = {
|
||||
tmpl-app = mkPoetryApplication {
|
||||
projectDir = ./.;
|
||||
overrides = poetry2nix-overrides;
|
||||
};
|
||||
default = self.packages.${system}.tmpl-app;
|
||||
};
|
||||
|
||||
devShells.default = pkgs.mkShell {
|
||||
inputsFrom = [ self.packages.${system}.tmpl-app ];
|
||||
packages = [
|
||||
pkgs.poetry
|
||||
] ++ (with pkgs.python311Packages; [
|
||||
black
|
||||
debugpy
|
||||
mccabe
|
||||
pycodestyle
|
||||
pyflakes
|
||||
python-lsp-black
|
||||
python-lsp-server
|
||||
]);
|
||||
};
|
||||
});
|
||||
}
|
|
@ -0,0 +1,99 @@
|
|||
import aiohttp
|
||||
import asyncio
|
||||
import os
|
||||
import os.path
|
||||
import random
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# References to paths we use to save any scraped content.
|
||||
# Text file holding the scraped coach links, written one link per line.
DATA_COACH_LINKS = "data/coach_links.txt"
|
||||
# Per-member output path template; formatted with (member name, file name).
DATA_COACH_DIR = "data/coach/{}/{}"
|
||||
|
||||
|
||||
async def scrape_coach_links(page_no):
    """Scrape a single coach page listing.

    @param page_no: Page number substituted into the `page` query parameter
                    of the alphabetical coach listing.
    @return: List of the `href` values of every anchor tagged with the
             `members-categories-username` class. The list is empty when the
             page could not be retrieved.
    """
    href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
    async with aiohttp.ClientSession() as session:
        async with session.get(href) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {href}")
                # Callers iterate over the result, so hand back an empty list
                # instead of an implicit `None`.
                return []
            html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    members = soup.find_all("a", class_="members-categories-username")
    return [member.get("href") for member in members]
|
||||
|
||||
|
||||
async def scrape_all_coach_links(max_pages=62):
    """Scan through chess.com/coaches for all member links.

    The links are cached in `DATA_COACH_LINKS`. When that file already exists
    its lines are returned as-is (each still carrying its trailing newline)
    and nothing is fetched.

    @param max_pages: Number of listing pages to walk, starting from page 1.
    @return: List of every link found across all scraped pages.
    """
    if os.path.isfile(DATA_COACH_LINKS):
        with open(DATA_COACH_LINKS, "r") as f:
            return f.readlines()

    all_links = []
    for i in range(1, max_pages + 1):
        # A page that failed to load contributes no links.
        links = await scrape_coach_links(i) or []
        # Nest the file context manager here so I can `tail -f` the file.
        with open(DATA_COACH_LINKS, "a") as f:
            for link in links:
                f.write(f"{link}\n")
        # Accumulate across pages; returning `links` alone would only expose
        # the final page.
        all_links.extend(links)
        await asyncio.sleep(random.randint(2, 5))
    return all_links
|
||||
|
||||
|
||||
async def download_member_info(member_name, filename, href):
    """Download member-specific content.

    @param member_name: Member whose directory the content is stored under.
    @param filename: Name of the file to write inside that directory.
    @param href: URL to fetch the content from.
    @return: True if we downloaded content. False if the results already
             existed locally or the request did not return a 200 status.
    """
    target = DATA_COACH_DIR.format(member_name, filename)
    if os.path.isfile(target):
        return False

    # Fetch before touching the filesystem so a failed request does not leave
    # behind an empty file that later runs would mistake for a finished
    # download.
    async with aiohttp.ClientSession() as session:
        async with session.get(href) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {href}")
                return False
            content = await response.text()

    # The per-member directory is not created anywhere else.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "w") as f:
        f.write(content)
    return True
|
||||
|
||||
|
||||
async def main():
    """Fetch the profile page, activity and stats for every coach link.

    The three downloads for a member run concurrently. A random pause
    follows only when at least one of them actually hit the network.
    """
    prefix_len = len("https://www.chess.com/member/")
    for raw_link in await scrape_all_coach_links():
        href = raw_link.strip()
        member_name = href[prefix_len:]
        # (file name, source URL) pairs, in the order they are requested.
        targets = (
            (f"{member_name}.html", href),
            (
                "activity.json",
                f"https://www.chess.com/callback/member/activity/{member_name}?page=1",
            ),
            (
                "stats.json",
                f"https://www.chess.com/callback/member/stats/{member_name}",
            ),
        )
        results = await asyncio.gather(
            *(
                download_member_info(member_name, name, url)
                for name, url in targets
            )
        )
        # Nothing was fetched, so there is no need to throttle.
        if not any(results):
            print(f"Skipping {member_name}")
            continue
        await asyncio.sleep(random.randint(2, 5))
        print(f"Downloaded {member_name}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Make sure the output root exists before any scraping starts.
    os.makedirs("data/coach", exist_ok=True)
    asyncio.run(main())
|
|
@ -0,0 +1,7 @@
|
|||
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
|
||||
package = []
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.11"
|
||||
content-hash = "81b2fa642d7f2d1219cf80112ace12d689d053d81be7f7addb98144d56fc0fb2"
|
|
@ -0,0 +1,15 @@
|
|||
[tool.poetry]
|
||||
name = "chesscom-scraper"
|
||||
version = "0.1.0"
|
||||
description = ""
|
||||
authors = ["jrpotter2112@gmail.com"]
|
||||
readme = "README.md"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.11"
|
||||
beautifulsoup4 = "^4.12.2"
|
||||
aiohttp = "^3.8.6"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
Loading…
Reference in New Issue