|
import wikipedia |
|
import os |
|
|
|
def get_raw_wikipedia_article(entity, _depth=0):
    """Fetch the raw plain-text content of the Wikipedia article for *entity*.

    Searches Wikipedia for the entity, takes the top search hit, and returns
    the page content as a string. Returns ``None`` when no article can be
    resolved (no search results, nonexistent page, or too many
    disambiguation hops).

    Args:
        entity: Name of the entity to look up.
        _depth: Internal recursion counter used to bound disambiguation
            retries; callers should not pass it.

    Returns:
        The article content as a string, or ``None`` if not found.
    """
    # Bound the disambiguation recursion so a pathological chain of
    # disambiguation pages cannot recurse forever.
    if _depth > 3:
        print("The requested page does not exist on Wikipedia.")
        return None
    try:
        results = wikipedia.search(entity)
        if not results:
            # Original code would raise IndexError here on results[0].
            print("The requested page does not exist on Wikipedia.")
            return None
        best_result = results[0]
        page = wikipedia.page(best_result, auto_suggest=False)
        return page.content
    except wikipedia.exceptions.DisambiguationError as e:
        # Retry with the first concrete option the disambiguation page
        # offers, rather than guessing a hard-coded ' (arts)' suffix.
        return get_raw_wikipedia_article(e.options[0], _depth + 1)
    except wikipedia.exceptions.PageError:
        print("The requested page does not exist on Wikipedia.")
        return None
|
|
|
def clean_article(raw_article):
    """Clean raw Wikipedia article text for downstream use.

    Drops section headers (lines starting with '='), drops blank lines,
    strips surrounding whitespace from kept lines, and truncates the
    article at the first 'See also' or 'References' section.

    Args:
        raw_article: Full article text as returned by the wikipedia API.

    Returns:
        The cleaned article body as a single newline-joined string.
    """
    clean_lines = []
    for line in raw_article.split('\n'):
        # Everything from 'See also' / 'References' onward is boilerplate.
        if line.startswith(('== See also', '== References')):
            break
        # Skip any remaining section headers.
        if line.startswith('='):
            continue
        stripped = line.strip()
        if not stripped:
            continue
        clean_lines.append(stripped)
    return '\n'.join(clean_lines)
|
|
|
def save_article(content, path):
    """Write *content* to *path* as UTF-8 text, replacing any existing file.

    Args:
        content: Text to persist.
        path: Destination file path.
    """
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(content)
|
|
|
def load_entities(entities_path):
    """Load entity names from a text file, one entity per line.

    Lines are stripped of surrounding whitespace; blank lines are skipped
    so that trailing newlines or spacer lines in the file do not produce
    empty entity names (which would otherwise be scraped and saved as
    '.txt').

    Args:
        entities_path: Path to the UTF-8 entities file.

    Returns:
        A list of non-empty entity name strings.
    """
    with open(entities_path, 'r', encoding='utf-8') as f:
        # Iterate the file directly instead of materializing readlines().
        return [line.strip() for line in f if line.strip()]
|
|
|
def scrape(entities_path, save_path):
    """Scrape, clean, and save Wikipedia articles for a list of entities.

    For each entity listed in *entities_path*, fetches its Wikipedia
    article, cleans it, and writes it to ``<save_path>/<entity>.txt``.
    Entities whose article cannot be found are reported and skipped.

    Args:
        entities_path: Path to the entities file (one entity per line).
        save_path: Directory where cleaned articles are written.
    """
    # Ensure the output directory exists so save_article doesn't fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(save_path, exist_ok=True)
    entities = load_entities(entities_path)
    for entity in entities:
        raw_article = get_raw_wikipedia_article(entity)
        # Identity check: None is a singleton, '== None' is unidiomatic.
        if raw_article is None:
            print(f'Article on Wikipedia not found for entity {entity} :(')
            continue

        cleaned_article = clean_article(raw_article)
        save_article(cleaned_article, os.path.join(save_path, f'{entity}.txt'))
|
|
|
if __name__ == '__main__':

    # Entry point: read entities from src/data/entities.txt and write
    # cleaned articles into data/wiki_articles (paths relative to CWD).
    scrape('src/data/entities.txt', 'data/wiki_articles')