Add User-Agent to requests.

Branch: pull/2/head
Author: Joshua Potter
Date: 2023-11-27 14:13:56 -07:00
parent 1a4199e95d
commit 97cdb3b9cd
1 changed file with 23 additions and 10 deletions

main.py

@@ -13,14 +13,27 @@ DATA_COACH_DIR = "data/coach/{member_name}"
 DATA_COACH_FILE = "data/coach/{member_name}/{filename}"
 
 
+async def chesscom_requeset(url):
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            if response.status != 200:
+                print(f"Encountered {response.status} when retrieving {url}.")
+            return response.text()
+    pass
+
+
 async def scrape_coach_links(page_no):
     """Scrape a single coach page listing."""
     links = []
-    href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
-    async with aiohttp.ClientSession() as session:
-        async with session.get(href) as response:
+    url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
+    async with aiohttp.ClientSession(
+        headers={
+            "User-Agent": "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)",
+        }
+    ) as session:
+        async with session.get(url) as response:
             if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {href}.")
+                print(f"Encountered {response.status} when retrieving {url}.")
                 return
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
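
Editor's note on the new helper: as committed, `chesscom_requeset` returns `response.text()` without awaiting it, so the caller receives a coroutine after the session context has already closed and the body can no longer be read; it also falls through to that return even on a non-200 status, and the trailing `pass` is dead code (the name itself looks like a typo for `chesscom_request`). A minimal corrected sketch, assuming the intent is to return the body on success and `None` otherwise; the early return, the shared `USER_AGENT` constant, and the corrected name are illustrative additions, not part of this commit:

    import aiohttp

    USER_AGENT = "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"


    async def chesscom_request(url):
        # Send the scraper's User-Agent on every request, mirroring the
        # header this commit adds to scrape_coach_links.
        async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
            async with session.get(url) as response:
                if response.status != 200:
                    print(f"Encountered {response.status} when retrieving {url}.")
                    return None
                # Await the body while the session is still open; the committed
                # version hands back an un-awaited coroutine instead.
                return await response.text()
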
@@ -51,7 +64,7 @@ async def scrape_all_coach_links(max_pages=64):
     return links
 
 
-async def download_member_info(member_name, filename, href):
+async def download_member_info(member_name, filename, url):
     """Download member-specific content.
 
     @return: True if we downloaded content. False if the download already
@@ -61,9 +74,9 @@ async def download_member_info(member_name, filename, href):
     if os.path.isfile(filepath):
         return False
     async with aiohttp.ClientSession() as session:
-        async with session.get(href) as response:
+        async with session.get(url) as response:
             if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {href}")
+                print(f"Encountered {response.status} when retrieving {url}")
                 return
             with open(filepath, "w") as f:
                 f.write(await response.text())
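
Note that this hunk only renames `href` to `url`: `download_member_info` still opens a bare `aiohttp.ClientSession()`, so member-page downloads keep going out with aiohttp's default User-Agent rather than the one the commit adds elsewhere. A hypothetical follow-up (not part of this commit) would pass the same headers there:

    # Hypothetical follow-up, not in this commit: identify the scraper on
    # member-page downloads as well.
    async with aiohttp.ClientSession(
        headers={"User-Agent": "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"}
    ) as session:
        async with session.get(url) as response:
            ...
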
@@ -72,14 +85,14 @@ async def download_member_info(member_name, filename, href):
 
 async def main():
     links = await scrape_all_coach_links()
-    for href in [link.strip() for link in links]:
-        member_name = href[len("https://www.chess.com/member/") :]
+    for url in [link.strip() for link in links]:
+        member_name = url[len("https://www.chess.com/member/") :]
         os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True)
         downloaded = await asyncio.gather(
             download_member_info(
                 member_name,
                 f"{member_name}.html",
-                href,
+                url,
             ),
             download_member_info(
                 member_name,
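
The diff view truncates this final hunk mid-call. For orientation only: a script with an async `main()` like this is normally driven by a standard asyncio entry point; the guard below is an assumption about code outside the shown hunks, not part of the diff:

    # Assumed entry point (not shown in this diff): run the async main().
    if __name__ == "__main__":
        import asyncio

        asyncio.run(main())
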