RevB-Backend / physicsaqa.py
CaesarCloudSync
CaesarAI Change Card Improve 1
211843b
raw
history blame
1.62 kB
import requests
import re
from bs4 import BeautifulSoup
class PhysicsAQA:
    """Scraper for the Save My Exams AQA A-level Physics topic-question page.

    The constructor downloads the page at ``self.url`` and parses it with
    BeautifulSoup (``lxml`` parser) into ``self.soup``. ``collectdata`` then
    groups the links found in the page's ``<td>`` cells by chapter.
    """

    def __init__(self, timeout: float = 30) -> None:
        """Fetch and parse the topic-question index page.

        Args:
            timeout: Seconds ``requests`` waits for the server before
                raising. Without a timeout ``requests.get`` can block
                indefinitely on an unresponsive host.
        """
        self.url = "https://www.savemyexams.co.uk/a-level/physics/aqa/-/pages/topic-questions-pdf/"
        response = requests.get(self.url, timeout=timeout).text
        self.soup = BeautifulSoup(response, features='lxml')

    def collectdata(self):
        """Return the page's links grouped by chapter.

        A ``<td>`` cell whose text contains ``"."`` or ``"Section"`` is
        treated as the start of a chapter; the chapter runs up to the next
        such cell, or to the end of the table for the last chapter.

        Returns:
            A dict mapping each cleaned chapter heading to a dict of
            ``{link text: href}`` for every cell in that chapter that
            contains an ``<a href=...>``. Empty if no chapter headings are
            found.
        """
        # The first two cells are skipped -- presumably table header cells;
        # TODO confirm against the live page.
        cells = self.soup.find_all(["td"])[2:]

        starts = [
            position
            for position, cell in enumerate(cells)
            if "." in cell.text or "Section" in cell.text
        ]

        physicsaqa = {}
        # Pair every chapter start with the next one; the final chapter is
        # closed by the end of the cell list so that it is not dropped.
        for begin, end in zip(starts, starts[1:] + [len(cells)]):
            section = cells[begin:end]
            heading = section[0].text.replace('\n', '').replace('\xa0', '')

            links = {}
            for cell in section:
                anchor = cell.find("a", href=True)
                if anchor is None:
                    continue
                # Strip non-breaking spaces, newlines and zero-width spaces
                # from the visible link text before using it as a key.
                label = (
                    anchor.text
                    .replace('\xa0', '')
                    .replace('\n', '')
                    .replace("\u200b", "")
                )
                links[label] = anchor["href"]

            physicsaqa[heading] = links
        return physicsaqa
if __name__ == "__main__":
    # Manual smoke run: scrape the live page and dump the chapter -> links map.
    scraper = PhysicsAQA()
    print(scraper.collectdata())