# %%
# Import all libraries
from collections import defaultdict
import json
import re

import bibtexparser
import mwparserfromhell
import requests
import tldextract
from bs4 import BeautifulSoup
from dateutil import parser


# --- Given the DOI, PMID, or PMC number, fetch the journal's metadata ---

def get_metainfo_doi(doi):
    """Look up journal name and publication date for a DOI via dx.doi.org.

    Args:
        doi: the DOI identifier string (e.g. "10.1038/nature12373").

    Returns:
        (journal, time_published): journal is a str or None; time_published
        is a datetime or None when the record cannot be parsed.
    """
    res = requests.get(
        "http://dx.doi.org/" + doi,
        headers={"Accept": "application/x-bibtex"},
    )
    entries = bibtexparser.loads(res.content.decode("utf-8")).entries
    if not entries:
        return None, None
    entry = entries[0]
    journal = entry["journal"].strip() if "journal" in entry else None
    # Assemble whatever date parts the BibTeX record provides into one string.
    time_published = ""
    if "year" in entry:
        time_published += entry["year"]
    if "month" in entry:
        time_published += " " + entry["month"]
    if "day" in entry:
        time_published += " " + entry["day"]
    time_published = parser.parse(time_published) if time_published else None
    return journal, time_published


def _get_metainfo_eutils(db, uid):
    """Shared NCBI esummary lookup used for both PMC and PubMed identifiers.

    Args:
        db: esummary database name ("pmc" or "pubmed").
        uid: the identifier string.

    Returns:
        (journal, time_published), or (None, None) for an error record.
    """
    res = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db="
        + db + "&id=" + uid + "&retmode=json"
    )
    data = json.loads(res.content.decode("utf-8"))["result"][uid]
    if "error" in data:
        return None, None
    return data["fulljournalname"].strip(), parser.parse(data["pubdate"])


def get_metainfo_pmc(pmc):
    """Return (journal, date) for a PMC identifier, or (None, None)."""
    return _get_metainfo_eutils("pmc", pmc)


def get_metainfo_pmid(pmid):
    """Return (journal, date) for a PubMed identifier, or (None, None)."""
    return _get_metainfo_eutils("pubmed", pmid)


def parse_html(page_url):
    """Parse citation metadata from a Wikipedia page's rendered HTML.

    Input: page_url -- URL of the Wikipedia article.
    Output: dict mapping each citation's text to
        {"external_link": str | None, "type": str, "html_tag": Tag,
         "publisher": str | None, "date": datetime | None}
    """
    citation_types = {'web', 'journal', 'book', 'conference', 'news'}
    all_parsed_citations = defaultdict(dict)
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect every <li> inside the page's reference lists.
    all_ref = []
    for ordered_lst in soup.find_all("ol", {"class": "references"}):
        all_ref += ordered_lst.find_all("li")

    for ele in all_ref:
        # Skip list items that carry no reference text at all.
        ref = ele.find("span", {"class": "reference-text"})
        if not ref:
            continue

        source_type = "other"  # default when no recognised cite class found
        citation_key = ref.get_text()
        hyperlink = ref.find("a", {"class": "external text"})
        external_link = hyperlink["href"] if hyperlink else None

        # Source type comes from the <cite> tag's CSS classes.
        cite_tag = ref.find("cite")
        if cite_tag:
            for class_tag in cite_tag["class"]:
                if class_tag in citation_types:
                    source_type = class_tag
                    break

        # Publisher: journal name (via DOI/PMC/PMID lookup) for scholarly
        # sources, website domain for everything else.
        if source_type in {'journal', 'conference', 'other'}:
            has_doi = ref.find("a", {"title": "Doi (identifier)"})
            has_pmc = ref.find("a", {"title": "PMC (identifier)"})
            has_pmid = ref.find("a", {"title": "PMID (identifier)"})
            journal, date = None, None
            if has_doi:
                doi = has_doi.find_next("a", {"class": "external text"})
                journal, date = get_metainfo_doi(doi.text)
            elif has_pmc:
                pmc = has_pmc.find_next("a", {"class": "external text"})
                journal, date = get_metainfo_pmc(pmc.text)
            elif has_pmid:
                pmid = has_pmid.find_next("a", {"class": "external text"})
                journal, date = get_metainfo_pmid(pmid.text)
            all_parsed_citations[citation_key] = {
                "external_link": external_link,
                "type": source_type,
                "html_tag": ele,
                "publisher": journal,
                "date": date,
            }
        # FIX: 'book' was missing from both branches, so book citations were
        # silently dropped; 'other' is already handled above.
        elif source_type in {'news', 'web', 'book'}:
            publisher = (tldextract.extract(external_link).domain
                         if external_link else None)
            all_parsed_citations[citation_key] = {
                "external_link": external_link,
                "type": source_type,
                "html_tag": ele,
                "publisher": publisher,
                "date": None,
            }
    return all_parsed_citations


# After finishing parsing the HTML tags, we fetch the wikitext version of the
# page and match it against the HTML citations to extract more information.
# %%
def parse_match_wikitext(wiki_url):
    """Enrich HTML-parsed citations with data from the page's wikitext.

    Matches each {{cite ...}} template (first by URL, then by title) against
    the citations from parse_html() and fills in publisher/date fields that
    could not be extracted from the HTML.

    Input: wiki_url
    Output: same dict schema as parse_html(), more fully populated.
    """
    parsed_citation = parse_html(wiki_url)
    print("ALL citation", len(parsed_citation))

    wiki_page = wiki_url.split("wiki/")[1]
    url = ("https://en.wikipedia.org/w/index.php?title="
           + wiki_page + "&action=raw")
    wikicode = mwparserfromhell.parse(requests.get(url).text)

    # Citations still missing a publisher or a date.
    not_fully_parsed = defaultdict(dict)
    for key, val in parsed_citation.items():
        if not val["publisher"] or not val["date"]:
            not_fully_parsed[key] = val

    for tpl in wikicode.filter_templates(matches=r"\{\{cite"):
        found_match = None

        # Match on external link first.
        if tpl.has_param("url"):
            # FIX: read Parameter.value instead of split("=")[1], which
            # truncated any URL containing "=" (e.g. query strings).
            external_url = str(tpl.get("url").value).strip()
            for key, val in not_fully_parsed.items():
                if val["external_link"] and \
                        val["external_link"].strip() == external_url:
                    found_match = key
                    break

        # Fall back to a formatting-stripped title substring match.
        if not found_match and tpl.has_param("title"):
            title = str(tpl.get("title").value).strip()
            title = re.sub('[^A-Za-z0-9 ]+', '', title)  # drop wiki markup
            for key in not_fully_parsed:
                if title in key:
                    found_match = key
                    break

        if not found_match:
            continue

        # Publisher: journal name for journals, publisher for web/books,
        # work for news/magazines.
        if not parsed_citation[found_match]["publisher"]:
            publisher = None
            if tpl.has_param("journal"):
                publisher = str(tpl.get("journal").value)
            elif tpl.has_param("publisher"):  # FIX: was misspelled "publishder"
                publisher = str(tpl.get("publisher").value)
            elif tpl.has_param("work"):
                publisher = str(tpl.get("work").value)
            if publisher:
                publisher = re.sub('[^A-Za-z0-9 ]+', '', publisher).strip()
            parsed_citation[found_match]["publisher"] = publisher

        # Publication date from the wikitext, if the HTML lacked one.
        if not parsed_citation[found_match]["date"]:
            date = None
            if tpl.has_param("date"):
                raw = str(tpl.get("date").value).strip()
                if len(raw) >= 4:  # at least 4 digits for year, or yy-mm form
                    try:
                        date = parser.parse(raw)
                    except (ValueError, OverflowError):
                        date = None  # unparseable free-form date: leave unset
            parsed_citation[found_match]["date"] = date
    return parsed_citation


def eval_scholarly_sources(citation):
    """Evaluate the tag for a scholarly source (journal/conference/other).

    Args:
        citation: dict with keys external_link, type, html_tag,
            publisher, date.

    Returns:
        "red", "yellow", "green", or "unknown".
    """
    # Read the dictionaries of flags from the json file.
    with open("scholarly_flags.json", "r") as f:
        all_flags = json.load(f)

    # First check the domain of the external link...
    if citation["external_link"]:
        domain = tldextract.extract(citation["external_link"]).domain
        if domain in all_flags["red_scholarly_reverse"]:
            return "red"
        if domain in all_flags["yellow_scholarly_reverse"]:
            return "yellow"
        if domain in all_flags["green_scholarly_reverse"]:
            return "green"

    # ...then fall back to the publisher-name dictionaries.
    if citation["publisher"] in all_flags["red_scholarly"]:
        return "red"
    if citation["publisher"] in all_flags["yellow_scholarly"]:
        return "yellow"
    if citation["publisher"] in all_flags["green_scholarly"]:
        return "green"
    return "unknown"


def eval_non_scholarly_sources(citation, citation_val):
    """Evaluate the tag for a non-scholarly source (web/book/news).

    Args:
        citation: the citation's text key.
        citation_val: dict with keys external_link, type, html_tag,
            publisher, date.

    Returns:
        The flag colour key from non_scholarly_flags.json, or "unknown".
    """
    with open("non_scholarly_flags.json", "r") as f:
        non_scholarly_flags = json.load(f)

    # FIX: external_link may be None; `source in None` raised TypeError.
    external_link = citation_val["external_link"] or ""
    # A source matches if it appears in the external link or citation text.
    for flag, sources in non_scholarly_flags.items():
        for source in sources:
            if source in external_link or source in citation:
                return flag
    return "unknown"


def check_source_quality(wiki_url):
    """Classify every parsed citation as red / yellow / green / unknown.

    Args:
        wiki_url: URL of the Wikipedia article to audit.

    Returns:
        (red, yellow, green, unknown) lists of (citation_text, publisher)
        tuples, one list per flag category.
    """
    parsed = parse_match_wikitext(wiki_url)
    red_flag_list, yellow_flag_list = [], []
    green_flag_list, unknown_list = [], []
    for citation, val in parsed.items():
        verdict = None  # renamed from `eval` to avoid shadowing the builtin
        # Scholarly types go through the journal/DOI flag lists.
        if val["type"] in {"journal", "conference", "other"}:
            verdict = eval_scholarly_sources(val)
        elif val["type"] in {"web", "book", "news"}:
            verdict = eval_non_scholarly_sources(citation, val)
        entry = (citation, val["publisher"])
        if verdict == "red":
            red_flag_list.append(entry)
        elif verdict == "yellow":
            yellow_flag_list.append(entry)
        elif verdict == "green":
            green_flag_list.append(entry)
        elif verdict == "unknown":
            unknown_list.append(entry)
    return red_flag_list, yellow_flag_list, green_flag_list, unknown_list


# TEST -- guarded so importing this module no longer fires network requests.
if __name__ == "__main__":
    a = check_source_quality(
        "https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
    print("Red flag source:", a[0])
    print("Yellow flag source: ", a[1])
    print("Green source: ", a[2])
    print("Undetermined sources: ", a[3])