Break coach listing download per-page.
parent 1710e1aefa
commit 23aa622126
@@ -3,3 +3,6 @@
 
 # A symlink produced by default when running `nix build`.
 result
+
+# The results of the `chess.com` coach scraping.
+data/
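The new `data/` entry keeps all scraped output out of version control. For orientation, a sketch of the layout the scraper is expected to produce under that directory, inferred from the `DATA_COACH_LIST` and `DATA_COACH_DIR` constants in `main.py`; the member directory and file names below are placeholders, not values from this commit:

data/
├── pages/
│   └── 1.txt              # one coach profile URL per line, one file per listing page
└── coach/
    └── <member_name>/
        └── <filename>     # raw response text saved by download_member_info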
main.py (46 changed lines)
@@ -8,66 +8,70 @@ from bs4 import BeautifulSoup
 
 
 # References to paths we use to save any scraped content.
-DATA_COACH_LINKS = "data/coach_links.txt"
+DATA_COACH_LIST = "data/pages/{}.txt"
 DATA_COACH_DIR = "data/coach/{}/{}"
 
 
 async def scrape_coach_links(page_no):
     """Scrape a single coach page listing."""
     links = []
-    async with aiohttp.ClientSession() as session:
-        href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
+    href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
+    async with aiohttp.ClientSession() as session:
         async with session.get(href) as response:
             if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {href}")
+                print(f"Encountered {response.status} when retrieving {href}.")
                 return
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             members = soup.find_all("a", class_="members-categories-username")
             for member in members:
                 links.append(member.get("href"))
 
     return links
 
 
-async def scrape_all_coach_links(max_pages=62):
-    """Scans through chess.com/coaches for all member links."""
-    if os.path.isfile(DATA_COACH_LINKS):
-        with open(DATA_COACH_LINKS, "r") as f:
-            return f.readlines()
+async def scrape_all_coach_links(max_pages=64):
+    """Scan through https://www.chess.com/coaches for all member links."""
+    links = []
     for i in range(1, max_pages + 1):
-        # Nest the file context manager here so I can `tail -f` the file.
-        with open(DATA_COACH_LINKS, "a") as f:
-            links = await scrape_coach_links(i)
-            for link in links:
-                f.write(f"{link}\n")
-        await asyncio.sleep(random.randint(2, 5))
+        filepath = DATA_COACH_LIST.format(i)
+        if os.path.isfile(filepath):
+            with open(filepath, "r") as f:
+                links = f.readlines()
+            print(f"{filepath} already exists.")
+        else:
+            links = await scrape_coach_links(i)
+            with open(filepath, "w") as f:
+                for link in links:
+                    f.write(f"{link}\n")
+            print(f"Downloaded page {i} of coach list.")
+            await asyncio.sleep(random.randint(3, 7))
 
     return links
 
 
 async def download_member_info(member_name, filename, href):
     """Download member-specific content.
 
-    @return: True if we downloaded content. False if the results already
-    existed locally.
+    @return: True if we downloaded content. False if the download already
+    exists locally.
     """
     target = DATA_COACH_DIR.format(member_name, filename)
     if os.path.isfile(target):
         return False
-    with open(target, "w") as f:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(href) as response:
-                if response.status != 200:
-                    print(f"Encountered {response.status} when retrieving {href}")
-                    return
+    async with aiohttp.ClientSession() as session:
+        async with session.get(href) as response:
+            if response.status != 200:
+                print(f"Encountered {response.status} when retrieving {href}")
+                return
+            with open(target, "w") as f:
                 f.write(await response.text())
     return True
 
 
 async def main():
     links = await scrape_all_coach_links()
-    for link in links:
-        href = link.strip()
+    for href in [link.strip() for link in links]:
         member_name = href[len("https://www.chess.com/member/") :]
         downloaded = await asyncio.gather(
             download_member_info(
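The main change in this hunk is that each listing page is now cached in its own file under `data/pages/`, so an interrupted run can pick up where it left off instead of refetching every page. A condensed sketch of that check-then-scrape pattern for a single page; it assumes `scrape_coach_links` is defined as in the diff above, and `fetch_page_cached` itself is a hypothetical helper used only for illustration:

import asyncio
import os
import random

DATA_COACH_LIST = "data/pages/{}.txt"


async def fetch_page_cached(page_no):
    """Return the coach links for one listing page, reusing its cache file if present."""
    filepath = DATA_COACH_LIST.format(page_no)
    if os.path.isfile(filepath):
        # Cache hit: reuse the page downloaded on a previous run.
        with open(filepath, "r") as f:
            return [line.strip() for line in f]
    # Cache miss: scrape the page, persist it, then back off briefly.
    links = await scrape_coach_links(page_no)
    with open(filepath, "w") as f:
        for link in links:
            f.write(f"{link}\n")
    await asyncio.sleep(random.randint(3, 7))
    return links

Deleting a single file under `data/pages/` then forces just that page to be refetched on the next run, which is the per-page behaviour the commit message describes.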
@@ -86,14 +90,14 @@ async def main():
                 f"https://www.chess.com/callback/member/stats/{member_name}",
             ),
         )
-        # Only want to sleep if the files didn't already exist.
         if any(downloaded):
-            await asyncio.sleep(random.randint(2, 5))
-            print(f"Downloaded {member_name}")
+            await asyncio.sleep(random.randint(3, 7))
+            print(f"Downloaded {member_name} info.")
         else:
-            print(f"Skipping {member_name}")
+            print(f"Skipping {member_name} download.")
 
 
 if __name__ == "__main__":
+    os.makedirs("data/pages", exist_ok=True)
     os.makedirs("data/coach", exist_ok=True)
     asyncio.run(main())
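In `main()`, the per-member `download_member_info` calls are awaited together via `asyncio.gather`, and the random sleep only happens when at least one request actually hit the network, so members whose files already exist are skipped without throttling. A minimal sketch of that pattern, assuming `download_member_info` behaves as in the diff above (True for a fresh download, False when the target file already existed); the filename argument is a placeholder, since the full `asyncio.gather(...)` argument list falls outside this diff:

import asyncio
import random


async def process_member(href):
    """Illustrative per-member step mirroring the loop body in main()."""
    member_name = href[len("https://www.chess.com/member/") :]
    downloaded = await asyncio.gather(
        download_member_info(
            member_name,
            "stats",  # placeholder filename, not taken from this commit
            f"https://www.chess.com/callback/member/stats/{member_name}",
        ),
    )
    if any(downloaded):
        # Throttle only when something new was actually downloaded.
        await asyncio.sleep(random.randint(3, 7))
        print(f"Downloaded {member_name} info.")
    else:
        print(f"Skipping {member_name} download.")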