Have requests actually use the `chesscom_request` function.

pull/2/head
Joshua Potter 2023-11-27 14:28:10 -07:00
parent 97cdb3b9cd
commit 7308c47eb5
1 changed file with 29 additions and 32 deletions

main.py

@@ -12,31 +12,30 @@ DATA_COACH_LIST = "data/pages/{page_no}.txt"
 DATA_COACH_DIR = "data/coach/{member_name}"
 DATA_COACH_FILE = "data/coach/{member_name}/{filename}"
+USER_AGENT = "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"
 
 
-async def chesscom_requeset(url):
-    async with aiohttp.ClientSession() as session:
+async def chesscom_request(url):
+    body = None
+    async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
         async with session.get(url) as response:
             if response.status != 200:
                 print(f"Encountered {response.status} when retrieving {url}.")
-                return response.text()
-            pass
+            else:
+                body = await response.text()
+    return body
 
 
 async def scrape_coach_links(page_no):
     """Scrape a single coach page listing."""
-    links = []
     url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
-    async with aiohttp.ClientSession(
-        headers={
-            "User-Agent": "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)",
-        }
-    ) as session:
-        async with session.get(url) as response:
-            if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {url}.")
-                return
-            html = await response.text()
-            soup = BeautifulSoup(html, "html.parser")
-            members = soup.find_all("a", class_="members-categories-username")
-            for member in members:
-                links.append(member.get("href"))
+    response = await chesscom_request(url)
+    if response is None:
+        return
+
+    links = []
+    soup = BeautifulSoup(response, "html.parser")
+    members = soup.find_all("a", class_="members-categories-username")
+    for member in members:
+        links.append(member.get("href"))
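Taken on its own, the new helper centralizes the User-Agent header and the non-200 handling that every call site previously duplicated. A minimal standalone sketch of how it behaves (the demo wrapper and example URL are illustrative, not part of the commit):

import asyncio

import aiohttp

USER_AGENT = "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"


async def chesscom_request(url):
    # Returns the response body on HTTP 200; logs and returns None otherwise.
    body = None
    async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Encountered {response.status} when retrieving {url}.")
            else:
                body = await response.text()
    return body


async def demo():
    # Illustrative call; any chess.com URL goes through the same path.
    html = await chesscom_request("https://www.chess.com/coaches?page=1")
    print("got body" if html is not None else "request failed")


if __name__ == "__main__":
    asyncio.run(demo())

Note that the old chesscom_requeset had two bugs besides the name typo: it returned a body only on a non-200 status (falling through to pass, i.e. None, on success), and even then it returned the unawaited coroutine response.text() rather than the text itself.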
@@ -59,7 +58,7 @@ async def scrape_all_coach_links(max_pages=64):
             for link in links:
                 f.write(f"{link}\n")
         print(f"Downloaded page {i} of coach list.")
-        await asyncio.sleep(random.randint(3, 7))
+        await asyncio.sleep(random.randint(10, 15))
 
     return links
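This hunk also widens the politeness delay between page fetches from 3-7 to 10-15 seconds. If the window needs tuning again, one option (not part of this commit; the helper name is hypothetical) is to define the jitter once instead of per call site:

import asyncio
import random


async def polite_sleep(lo=10, hi=15):
    # Hypothetical helper mirroring the random.randint(10, 15) sleeps in main.py.
    await asyncio.sleep(random.randint(lo, hi))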
@@ -67,19 +66,17 @@ async def scrape_all_coach_links(max_pages=64):
 async def download_member_info(member_name, filename, url):
     """Download member-specific content.
 
-    @return: True if we downloaded content. False if the download already
-        exists locally.
+    @return: True if we make a network request. False otherwise.
     """
     filepath = DATA_COACH_FILE.format(member_name=member_name, filename=filename)
     if os.path.isfile(filepath):
         return False
-    async with aiohttp.ClientSession() as session:
-        async with session.get(url) as response:
-            if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {url}")
-                return
-            with open(filepath, "w") as f:
-                f.write(await response.text())
+
+    response = await chesscom_request(url)
+    if response is not None:
+        with open(filepath, "w") as f:
+            f.write(response)
+
     return True
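With the cache check up front, download_member_info now returns False without touching the network when the file already exists, and True whenever a request was made, including a failed request that wrote nothing. A sketch of calling it directly (assumes main.py is importable as a module; the member name and URL are example values, not from the commit):

import asyncio

from main import download_member_info

made_request = asyncio.run(
    download_member_info(
        "example-member",                               # illustrative member name
        "example-member.html",                          # cached under data/coach/...
        "https://www.chess.com/member/example-member",  # profile page URL
    )
)
print("hit the network" if made_request else "already cached")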
@@ -88,7 +85,7 @@ async def main():
     for url in [link.strip() for link in links]:
         member_name = url[len("https://www.chess.com/member/") :]
         os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True)
-        downloaded = await asyncio.gather(
+        made_network_request = await asyncio.gather(
             download_member_info(
                 member_name,
                 f"{member_name}.html",
@@ -105,8 +102,8 @@ async def main():
                 f"https://www.chess.com/callback/member/stats/{member_name}",
             ),
         )
-        if any(downloaded):
-            await asyncio.sleep(random.randint(3, 7))
+        if any(made_network_request):
+            await asyncio.sleep(random.randint(10, 15))
             print(f"Downloaded {member_name} info.")
         else:
             print(f"Skipping {member_name} download.")