Initial commit.

pull/2/head
Joshua Potter 2023-11-27 13:09:40 -07:00
commit 1710e1aefa
10 changed files with 464 additions and 0 deletions

7
.envrc Normal file
View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
# direnv configuration for this repository.
#
# NOTE(review): `on_git_branch` and `use flake` are not defined in this file;
# presumably both come from direnv's stdlib -- confirm the installed direnv
# version provides them.

# If git is installed and we are on a branch, point git at the hooks shipped
# in `.githooks/` so the pre-commit formatter is installed automatically.
if command -v git > /dev/null && on_git_branch; then
git config --local core.hooksPath .githooks/
fi
# Load the development shell defined by this directory's flake.
use flake

13
.githooks/pre-commit Executable file
View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Pre-commit hook: run `black` on every staged Python file and re-stage the
# formatted result.
#
# `pipefail` makes a failing `git diff` abort the commit instead of being
# masked by the loop on the right-hand side of the pipe.
set -eo pipefail

# List staged paths that still exist (`--diff-filter=d` drops deletions) and
# end in `.py`. For a rename, `--name-only` reports the new path, so the
# suffix check and the path handed to `black` always refer to the same file.
# `-z` emits NUL-terminated names so paths containing whitespace survive.
git --no-pager diff --cached --name-only --no-color --diff-filter=d -z -- '*.py' |
  while IFS= read -r -d '' path; do
    black --quiet "$path"
    # NOTE: this stages the whole file, including any hunks that were left
    # unstaged on purpose.
    git add "$path"
  done

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
# Directory used by `direnv` to hold `use flake`-generated profiles.
.direnv/
# A symlink produced by default when running `nix build`.
result

29
README.md Normal file
View File

@ -0,0 +1,29 @@
# Poetry Flake Template
This is a template for constructing a working environment for Python (version
3.11.6) development. Packaging and dependency management relies on [poetry](https://python-poetry.org/)
(version 1.7.0). [direnv](https://direnv.net/) can be used to launch a dev
shell upon entering this directory (refer to `.envrc`). Otherwise run via:
```bash
$> nix develop
```
## Language Server
The [python-lsp-server](https://github.com/python-lsp/python-lsp-server)
(version v1.9.0) is included in this flake, along with the [python-lsp-black](https://github.com/python-lsp/python-lsp-black)
plugin for formatting purposes. `pylsp` is expected to be configured to use
[McCabe](https://github.com/PyCQA/mccabe), [pycodestyle](https://pycodestyle.pycqa.org/en/latest/),
and [pyflakes](https://github.com/PyCQA/pyflakes). Refer to your editor for
configuration details.
## Formatting
Formatting depends on the [black](https://black.readthedocs.io/en/stable/index.html)
(version 23.9.1) tool. A `pre-commit` hook is included in `.githooks` that can
be used to format all `*.py` files prior to commit. Install via:
```bash
$> git config --local core.hooksPath .githooks/
```
If running [direnv](https://direnv.net/), this hook is installed automatically
when entering the directory.

10
default.nix Normal file
View File

@ -0,0 +1,10 @@
# Compatibility shim for non-flake Nix commands: fetch the `flake-compat`
# revision pinned in `flake.lock` and use it to evaluate this flake.
let
  lock = builtins.fromJSON (builtins.readFile ./flake.lock);
  pinned = lock.nodes.flake-compat.locked;
  # Prefer the URL recorded in the lock file; otherwise fall back to the
  # GitHub archive for the locked revision.
  flake-compat = fetchTarball {
    url = pinned.url or "https://github.com/edolstra/flake-compat/archive/${pinned.rev}.tar.gz";
    sha256 = pinned.narHash;
  };
in
(import flake-compat { src = ./.; }).defaultNix

190
flake.lock Normal file
View File

@ -0,0 +1,190 @@
{
"nodes": {
"flake-compat": {
"locked": {
"lastModified": 1696426674,
"narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
"rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
"revCount": 57,
"type": "tarball",
"url": "https://api.flakehub.com/f/pinned/edolstra/flake-compat/1.0.1/018afb31-abd1-7bff-a5e4-cff7e18efb7a/source.tar.gz"
},
"original": {
"type": "tarball",
"url": "https://flakehub.com/f/edolstra/flake-compat/1.tar.gz"
}
},
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1694529238,
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"flake-utils_2": {
"inputs": {
"systems": "systems_2"
},
"locked": {
"lastModified": 1694529238,
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nix-github-actions": {
"inputs": {
"nixpkgs": [
"poetry2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1698974481,
"narHash": "sha256-yPncV9Ohdz1zPZxYHQf47S8S0VrnhV7nNhCawY46hDA=",
"owner": "nix-community",
"repo": "nix-github-actions",
"rev": "4bb5e752616262457bc7ca5882192a564c0472d2",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "nix-github-actions",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1700794826,
"narHash": "sha256-RyJTnTNKhO0yqRpDISk03I/4A67/dp96YRxc86YOPgU=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "5a09cb4b393d58f9ed0d9ca1555016a8543c2ac8",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"poetry2nix": {
"inputs": {
"flake-utils": "flake-utils_2",
"nix-github-actions": "nix-github-actions",
"nixpkgs": [
"nixpkgs"
],
"systems": "systems_3",
"treefmt-nix": "treefmt-nix"
},
"locked": {
"lastModified": 1701035916,
"narHash": "sha256-Cbe/3H9/z7vIXAeMr9m9iXs1gRxpE3w1mrx9aaxWtGU=",
"owner": "nix-community",
"repo": "poetry2nix",
"rev": "2bee2516bc054458b0cbca10b18e2ec63cea8726",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "poetry2nix",
"type": "github"
}
},
"root": {
"inputs": {
"flake-compat": "flake-compat",
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs",
"poetry2nix": "poetry2nix"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
},
"systems_2": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
},
"systems_3": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"id": "systems",
"type": "indirect"
}
},
"treefmt-nix": {
"inputs": {
"nixpkgs": [
"poetry2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1699786194,
"narHash": "sha256-3h3EH1FXQkIeAuzaWB+nK0XK54uSD46pp+dMD3gAcB4=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "e82f32aa7f06bbbd56d7b12186d555223dc399d1",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "treefmt-nix",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

89
flake.nix Normal file
View File

@ -0,0 +1,89 @@
{
description = ''
An opinionated poetry flake.
This flake has been adapted from the `app` template found in:
https://github.com/nix-community/poetry2nix
To generate a copy of this template elsewhere, run:
$> bootstrap poetry
'';
inputs = {
flake-compat.url = "https://flakehub.com/f/edolstra/flake-compat/1.tar.gz";
flake-utils.url = "github:numtide/flake-utils";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
poetry2nix = {
url = "github:nix-community/poetry2nix";
# Have poetry2nix use this flake's `nixpkgs` input instead of its own.
inputs.nixpkgs.follows = "nixpkgs";
};
};
# Produce the outputs below once per system enumerated by
# `flake-utils.lib.eachDefaultSystem`.
outputs = { self, nixpkgs, flake-utils, poetry2nix, ... }:
flake-utils.lib.eachDefaultSystem (system:
let
# See https://github.com/nix-community/poetry2nix/tree/master#api for
# more functions and examples.
pkgs = nixpkgs.legacyPackages.${system};
inherit
(poetry2nix.lib.mkPoetry2Nix { inherit pkgs; })
mkPoetryApplication
defaultPoetryOverrides;
# https://github.com/nix-community/poetry2nix/blob/ec4364021900f8e0d425d901b6e6ff03cf201efb/docs/edgecases.md
# `poetry2nix`, by default, prefers building from source. To build
# certain dependencies, we need to augment its build dependencies by
# adding the corresponding build backend (e.g. `setuptools`).
#
# For example, you can write:
# ```nix
# pypkgs-build-requirements = {
# ...
# coolname = [ "setuptools" ];
# ...
# };
# ```
# after encountering a build error like:
#
# > ModuleNotFoundError: No module named 'setuptools'
pypkgs-build-requirements = {};
# Extend the default overrides: for every package named in
# `pypkgs-build-requirements`, append the listed requirements to that
# package's `buildInputs`. A string entry is looked up by name in the
# previous package set (`super`); any other value is used as-is.
# Note that `self` here is the overlay's own argument and shadows the
# flake-level `self` inside this function only.
poetry2nix-overrides = defaultPoetryOverrides.extend (self: super:
builtins.mapAttrs (package: build-requirements:
(builtins.getAttr package super).overridePythonAttrs (old: {
buildInputs =
(old.buildInputs or []) ++
(builtins.map (pkg:
if builtins.isString pkg then
builtins.getAttr pkg super
else
pkg) build-requirements);
})
) pypkgs-build-requirements
);
in
{
packages = {
# The application itself, built from the poetry project in this
# directory with the overrides defined above.
tmpl-app = mkPoetryApplication {
projectDir = ./.;
overrides = poetry2nix-overrides;
};
default = self.packages.${system}.tmpl-app;
};
# Development shell: takes its inputs from the application package and
# adds poetry plus the formatting, linting, debugging and language
# server tooling listed below.
devShells.default = pkgs.mkShell {
inputsFrom = [ self.packages.${system}.tmpl-app ];
packages = [
pkgs.poetry
] ++ (with pkgs.python311Packages; [
black
debugpy
mccabe
pycodestyle
pyflakes
python-lsp-black
python-lsp-server
]);
};
});
}

99
main.py Normal file
View File

@ -0,0 +1,99 @@
import aiohttp
import asyncio
import os
import os.path
import random
from bs4 import BeautifulSoup
# Locations on disk where scraped content is stored.
#
# Template for a single downloaded file; filled in with the member name and
# the file name, in that order.
DATA_COACH_DIR = "data/coach/{}/{}"
# Text file holding the scraped coach links, one per line.
DATA_COACH_LINKS = "data/coach_links.txt"
async def scrape_coach_links(page_no):
    """Scrape a single coach page listing.

    @param page_no: Value of the `page` query parameter of the alphabetical
    coach listing to fetch.
    @return: A list with the `href` of every member link found on the page.
    The list is empty when the page could not be retrieved.
    """
    href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
    async with aiohttp.ClientSession() as session:
        async with session.get(href) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {href}")
                # Return an empty list rather than `None` so that callers can
                # always iterate over the result.
                return []
            html = await response.text()

    # Parsing does not need the connection, so do it after releasing it.
    soup = BeautifulSoup(html, "html.parser")
    members = soup.find_all("a", class_="members-categories-username")
    return [member.get("href") for member in members]
async def scrape_all_coach_links(max_pages=62):
    """Scan through chess.com/coaches for all member links.

    The links are cached in `DATA_COACH_LINKS`. If that file already exists,
    its lines are returned and nothing is scraped.

    @param max_pages: Number of listing pages to walk, starting at page 1.
    @return: A list of every coach link found across all pages. Links read
    back from the cache file keep their trailing newline; freshly scraped
    ones do not.
    """
    # NOTE(review): a run interrupted part-way leaves a partial file behind,
    # which this check then treats as a complete cache.
    if os.path.isfile(DATA_COACH_LINKS):
        with open(DATA_COACH_LINKS, "r") as f:
            return f.readlines()

    all_links = []
    for i in range(1, max_pages + 1):
        # A page that failed to download contributes no links.
        links = await scrape_coach_links(i) or []
        # Nest the file context manager here so I can `tail -f` the file.
        with open(DATA_COACH_LINKS, "a") as f:
            for link in links:
                f.write(f"{link}\n")
        # Accumulate across pages; returning `links` alone would only report
        # the final page.
        all_links.extend(links)
        await asyncio.sleep(random.randint(2, 5))
    return all_links
async def download_member_info(member_name, filename, href):
    """Download member-specific content.

    @param member_name: Name of the member; used as the directory name under
    which the content is stored.
    @param filename: Name of the file to write inside the member's directory.
    @param href: URL to download.
    @return: True if we downloaded content. False if the results already
    existed locally or the download failed.
    """
    target = DATA_COACH_DIR.format(member_name, filename)
    if os.path.isfile(target):
        return False

    # Fetch before touching the filesystem: a failed request must not leave
    # an empty file behind, or the next run would skip it as "downloaded".
    async with aiohttp.ClientSession() as session:
        async with session.get(href) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {href}")
                return False
            content = await response.text()

    # The per-member directory is not created anywhere else.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "w") as f:
        f.write(content)
    return True
async def main():
    """Download the profile page, activity and stats of every coach.

    The three downloads for one coach are issued concurrently. Coaches are
    processed one after another, pausing between them only when something
    was actually downloaded.
    """
    member_prefix = "https://www.chess.com/member/"
    for raw_link in await scrape_all_coach_links():
        href = raw_link.strip()
        member_name = href[len(member_prefix) :]
        # (file name, URL) for each piece of content saved per coach.
        targets = (
            (f"{member_name}.html", href),
            (
                "activity.json",
                f"https://www.chess.com/callback/member/activity/{member_name}?page=1",
            ),
            (
                "stats.json",
                f"https://www.chess.com/callback/member/stats/{member_name}",
            ),
        )
        downloaded = await asyncio.gather(
            *(
                download_member_info(member_name, filename, url)
                for filename, url in targets
            )
        )
        # Only want to sleep if the files didn't already exist.
        if not any(downloaded):
            print(f"Skipping {member_name}")
            continue
        await asyncio.sleep(random.randint(2, 5))
        print(f"Downloaded {member_name}")


if __name__ == "__main__":
    # Ensure the root download directory exists before scraping starts.
    os.makedirs("data/coach", exist_ok=True)
    asyncio.run(main())

7
poetry.lock generated Normal file
View File

@ -0,0 +1,7 @@
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
package = []
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "81b2fa642d7f2d1219cf80112ace12d689d053d81be7f7addb98144d56fc0fb2"

15
pyproject.toml Normal file
View File

@ -0,0 +1,15 @@
[tool.poetry]
name = "chesscom-scraper"
version = "0.1.0"
description = ""
# Poetry expects each author in the form "Name <email>"; a bare email
# address does not match that format.
authors = ["Joshua Potter <jrpotter2112@gmail.com>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
beautifulsoup4 = "^4.12.2"
aiohttp = "^3.8.6"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"