Add User-Agent to requests.
parent
1a4199e95d
commit
97cdb3b9cd
33
main.py
33
main.py
|
@ -13,14 +13,27 @@ DATA_COACH_DIR = "data/coach/{member_name}"
|
||||||
DATA_COACH_FILE = "data/coach/{member_name}/{filename}"
|
DATA_COACH_FILE = "data/coach/{member_name}/{filename}"
|
||||||
|
|
||||||
|
|
||||||
|
async def chesscom_requeset(url):
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
async with session.get(url) as response:
|
||||||
|
if response.status != 200:
|
||||||
|
print(f"Encountered {response.status} when retrieving {url}.")
|
||||||
|
return response.text()
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
async def scrape_coach_links(page_no):
|
async def scrape_coach_links(page_no):
|
||||||
"""Scrape a single coach page listing."""
|
"""Scrape a single coach page listing."""
|
||||||
links = []
|
links = []
|
||||||
href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
|
url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(
|
||||||
async with session.get(href) as response:
|
headers={
|
||||||
|
"User-Agent": "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)",
|
||||||
|
}
|
||||||
|
) as session:
|
||||||
|
async with session.get(url) as response:
|
||||||
if response.status != 200:
|
if response.status != 200:
|
||||||
print(f"Encountered {response.status} when retrieving {href}.")
|
print(f"Encountered {response.status} when retrieving {url}.")
|
||||||
return
|
return
|
||||||
html = await response.text()
|
html = await response.text()
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
@ -51,7 +64,7 @@ async def scrape_all_coach_links(max_pages=64):
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
async def download_member_info(member_name, filename, href):
|
async def download_member_info(member_name, filename, url):
|
||||||
"""Download member-specific content.
|
"""Download member-specific content.
|
||||||
|
|
||||||
@return: True if we downloaded content. False if the download already
|
@return: True if we downloaded content. False if the download already
|
||||||
|
@ -61,9 +74,9 @@ async def download_member_info(member_name, filename, href):
|
||||||
if os.path.isfile(filepath):
|
if os.path.isfile(filepath):
|
||||||
return False
|
return False
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
async with session.get(href) as response:
|
async with session.get(url) as response:
|
||||||
if response.status != 200:
|
if response.status != 200:
|
||||||
print(f"Encountered {response.status} when retrieving {href}")
|
print(f"Encountered {response.status} when retrieving {url}")
|
||||||
return
|
return
|
||||||
with open(filepath, "w") as f:
|
with open(filepath, "w") as f:
|
||||||
f.write(await response.text())
|
f.write(await response.text())
|
||||||
|
@ -72,14 +85,14 @@ async def download_member_info(member_name, filename, href):
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
links = await scrape_all_coach_links()
|
links = await scrape_all_coach_links()
|
||||||
for href in [link.strip() for link in links]:
|
for url in [link.strip() for link in links]:
|
||||||
member_name = href[len("https://www.chess.com/member/") :]
|
member_name = url[len("https://www.chess.com/member/") :]
|
||||||
os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True)
|
os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True)
|
||||||
downloaded = await asyncio.gather(
|
downloaded = await asyncio.gather(
|
||||||
download_member_info(
|
download_member_info(
|
||||||
member_name,
|
member_name,
|
||||||
f"{member_name}.html",
|
f"{member_name}.html",
|
||||||
href,
|
url,
|
||||||
),
|
),
|
||||||
download_member_info(
|
download_member_info(
|
||||||
member_name,
|
member_name,
|
||||||
|
|
Loading…
Reference in New Issue