Spaces:
Runtime error
Runtime error
import requests | |
import re | |
from bs4 import BeautifulSoup | |
class PhysicsAQA: | |
def __init__(self) -> None: | |
self.url = "https://www.savemyexams.co.uk/a-level/physics/aqa/-/pages/topic-questions-pdf/" | |
response= requests.get(self.url).text | |
self.soup = BeautifulSoup(response,features='lxml')# | |
def collectdata(self): | |
physicsaqa = {} | |
slice_indexes = [] | |
data = [] | |
data = self.soup.find_all(["td"])[2:] | |
for ind,td in enumerate(data): | |
if "." in td.text or "Section" in td.text: | |
#print(td.text,ind) | |
slice_indexes.append(ind) | |
#print(len(slice_indexes)) | |
for ind in range(len(slice_indexes)+1): | |
if ind == len(slice_indexes) -1: | |
break | |
sliceone = slice_indexes[ind] | |
slicetwo = slice_indexes[ind + 1] | |
chapterdata = data[sliceone:slicetwo] | |
chapternum = data[sliceone:slicetwo][0].text.replace('\n','').replace('\xa0','') | |
#print(sliceone,slicetwo) | |
#print(data[sliceone:slicetwo]) | |
physicsaqa[chapternum] = {} | |
for chapter in chapterdata: | |
#print(chapter) | |
if chapter.find("a",href=True) != None: | |
#print(chapter) | |
physicsaqa[chapternum][chapter.find("a",href=True).text.replace('\xa0','').replace('\n','').replace("\u200b","")] = chapter.find("a",href=True)["href"] | |
return physicsaqa | |
if __name__ == "__main__": | |
data = PhysicsAQA().collectdata() | |
print(data) | |