# VQArt/src/data/wiki_scrape.py
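"""Scrape raw Wikipedia articles for a list of entities, clean them, and save
one plain-text file per entity."""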
import os

import wikipedia

def get_raw_wikipedia_article(entity):
    """Return the raw content of the best-matching Wikipedia page, or None."""
    try:
        results = wikipedia.search(entity)
        if not results:
            print(f"No Wikipedia search results for '{entity}'.")
            return None
        best_result = results[0]
        page = wikipedia.page(best_result, auto_suggest=False)
        return page.content
    except wikipedia.exceptions.DisambiguationError:
        # The search term can't be disambiguated, so try again with a more
        # specific term by appending ' (arts)'. Retry at most once to avoid
        # unbounded recursion.
        if entity.endswith(' (arts)'):
            return None
        return get_raw_wikipedia_article(entity + ' (arts)')
    except wikipedia.exceptions.PageError:
        # The selected page does not exist on Wikipedia.
        print(f"The requested page for '{entity}' does not exist on Wikipedia.")
        return None

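# Minimal usage sketch (the entity name here is illustrative, not taken from
# entities.txt):
#
#   text = get_raw_wikipedia_article('Mona Lisa')
#   if text is not None:
#       print(text[:200])
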
def clean_article(raw_article):
    """Drop headings, blank lines, and everything from See also/References on."""
    lines = raw_article.split('\n')
    clean_lines = []
    for line in lines:
        # The See also and References sections (and anything after them)
        # carry no article prose, so stop there.
        if line.startswith('== See also') or line.startswith('== References'):
            break
        # Skip section headings and blank lines.
        if line.startswith('='):
            continue
        if len(line.strip()) == 0:
            continue
        clean_lines.append(line.strip())
    return '\n'.join(clean_lines)

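# Illustrative example of the cleaning (made-up input, not real page content):
#
#   raw = 'Intro.\n== History ==\nSome history.\n== See also ==\nIgnored.'
#   clean_article(raw)  # -> 'Intro.\nSome history.'
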
def save_article(content, path):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

def load_entities(entities_path):
    # One entity name per line; skip blank lines so they don't become
    # empty entity names.
    with open(entities_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

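# entities.txt is assumed to hold one entity name per line, e.g.:
#
#   Mona Lisa
#   The Starry Night
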
def scrape(entities_path, save_path):
    entities = load_entities(entities_path)
    os.makedirs(save_path, exist_ok=True)  # ensure the output directory exists
    for entity in entities:
        raw_article = get_raw_wikipedia_article(entity)
        if raw_article is None:
            print(f'Article on Wikipedia not found for entity {entity} :(')
            continue
        cleaned_article = clean_article(raw_article)
        save_article(cleaned_article, os.path.join(save_path, f'{entity}.txt'))

if __name__ == '__main__':
    scrape('src/data/entities.txt', 'data/wiki_articles')