# %%
# Import all libraries
from bs4 import BeautifulSoup
import bibtexparser
from dateutil import parser
import json
import requests
import tldextract
from collections import defaultdict
import re
import mwparserfromhell
# Given the DOI, PMID, or PMC number, fetch the journal's metadata
def get_metainfo_doi(doi):
"""Input: doi string
Output: the journal name and date published of the article. Return None for each value if the can't parsed
"""
res = requests.get("http://dx.doi.org/"+ doi, headers={"Accept": "application/x-bibtex"})
res = res.content.decode('utf-8')
bibtext = bibtexparser.loads(res).entries
    if len(bibtext) > 0:
journal = bibtext[0]["journal"].strip() if "journal" in bibtext[0] else None
time_published = ""
if "year" in bibtext[0]:
time_published += bibtext[0]["year"]
if "month" in bibtext[0]:
time_published += " " + bibtext[0]["month"]
if "day" in bibtext[0]:
time_published += " " + bibtext[0]["day"]
if len(time_published) > 0:
time_published = parser.parse(time_published)
else:
time_published = None
return journal, time_published
else:
return None, None
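# A quick usage sketch for the DOI helper (live network call to the DOI resolver;
# the DOI below is a placeholder, so substitute a real one):
#   journal, date = get_metainfo_doi("10.xxxx/xxxxx")
#   print(journal, date)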
def get_metainfo_pmc(pmc):
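    """Input: PMC number as a string (the numeric uid only, without the "PMC" prefix,
    since the esummary result dictionary is keyed by that uid).
    Output: the journal name and publication date from NCBI esummary, or (None, None) if the lookup returns an error.
    """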
res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pmc&id="+pmc+"&retmode=json")
res = res.content.decode("utf-8")
res = json.loads(res)
data = res["result"][pmc]
journal, time_published = None, None
if "error" in data:
return None, None
else:
journal = data["fulljournalname"].strip()
time_published = parser.parse(data["pubdate"])
return journal, time_published
def get_metainfo_pmid(pmid):
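    """Input: PMID as a string.
    Output: the journal name and publication date from NCBI esummary, or (None, None) if the lookup returns an error.
    """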
res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id="+pmid+"&retmode=json")
res = res.content.decode("utf-8")
res = json.loads(res)
data = res["result"][pmid]
journal, time_published = None, None
if "error" in data:
return None, None
else:
journal = data["fulljournalname"].strip()
time_published = parser.parse(data["pubdate"])
return journal, time_published
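# Both esummary helpers behave the same way; a quick usage sketch (live NCBI call,
# and the ID below is a placeholder):
#   journal, date = get_metainfo_pmid("12345678")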
def parse_html(page_url):
""" This function parse metadata of citations from HTML tag.
Input: wiki_url
Output: a parsed citation list from HTML. Each citation has format key: value
key: the text version of all citation
value: a dictionary with schema
{"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}"""
citation_types = {'web', 'journal', 'book', 'conference', 'news'}
all_parsed_citations = defaultdict(dict)
response = requests.get(page_url)
soup = BeautifulSoup(response.content, 'html.parser')
# Get all the references
all_ref = []
ordered_ref_lst = soup.find_all("ol", {"class": "references"})
for each in ordered_ref_lst:
refs = each.find_all("li")
all_ref += refs
for ele in all_ref:
        # Check if it has <span class="reference-text">
        ref = ele.find("span", {"class": "reference-text"})
        source_type = "other"  # default value for source_type
if ref:
# TASK: get all essential information from citation tag
citation_key = ref.get_text()
hyperlink = ref.find("a", {"class": "external text"})
external_link = hyperlink["href"] if hyperlink else None
# TASK: find source type, ie whether it's 'web', 'journal', 'book', 'conference', 'news'
cite_tag = ref.find("cite")
        if cite_tag:
            for class_tag in cite_tag.get("class", []):  # guard against cite tags without a class attribute
                if class_tag in citation_types:
                    source_type = class_tag
                    break
            # TASK: get publisher (journal name for journals/conferences, website domain for web sources, 'work' for news)
            # For journal, conference, and other sources, look for a DOI, PMC, or PMID identifier
            if source_type in {'journal', 'conference', 'other'}:
                has_doi = ref.find("a", {"title": "Doi (identifier)"})
                has_pmc = ref.find("a", {"title": "PMC (identifier)"})
                has_pmid = ref.find("a", {"title": "PMID (identifier)"})
                journal, date = None, None
                if has_doi:
                    doi = has_doi.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_doi(doi.text)
                elif has_pmc:
                    pmc = has_pmc.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_pmc(pmc.text)
                elif has_pmid:
                    pmid = has_pmid.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_pmid(pmid.text)
                all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": journal, "date": date}
            # For news, web, book, and any remaining sources, the publisher is the domain of the website
            else:
                publisher = tldextract.extract(external_link).domain if external_link else None
                all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": publisher, "date": None}
return all_parsed_citations
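# A quick usage sketch (makes live HTTP requests to Wikipedia and, for scholarly
# citations, to the DOI/NCBI APIs):
#   citations = parse_html("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
#   for text, info in citations.items():
#       print(info["type"], info["publisher"], info["external_link"])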
# After parsing the HTML tags, we fetch the wikitext version of the page and match it against the HTML citations to extract more information
# %%
def parse_match_wikitext(wiki_url):
"""
This function parse wikitext version of the citations, match it with the HTML version,
and extract more information, such as publisher and date that weren't extracted on the HTML.
Input: wiki_url
Output: a fully parsed citation list. Each citation has format key: value
key: the text version of all citation
value: a dictionary with schema
{"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
"""
parsed_citation = parse_html(wiki_url)
print("ALL citation", len(parsed_citation))
wiki_page = wiki_url.split("wiki/")[1]
url = "https://en.wikipedia.org/w/index.php?title=" + wiki_page +"&action=raw"
response = requests.get(url)
text = response.text
wikicode = mwparserfromhell.parse(text)
    # Collect the citations that aren't fully parsed, i.e. ones that lack a publisher or a date
not_fully_parsed = defaultdict(dict)
for key, val in parsed_citation.items():
if not val["publisher"] or not val["date"]:
not_fully_parsed[key] = val
for tpl in wikicode.filter_templates(matches="{{cite"):
        # tpl is a {{cite ...}} template in the wikitext
found_match = None
# Match on external link:
if tpl.has_param("url"):
external_url = tpl.get("url").split("=")[1]
for key, val in not_fully_parsed.items():
if val["external_link"]:
if val["external_link"].strip() == external_url.strip():
found_match = key
break
        # If no match was found by URL, match by title
        if not found_match and tpl.has_param("title"):
            # Get the title of the citation without formatting text
            title = str(tpl.get("title").value)
            title = re.sub('[^A-Za-z0-9 ]+', '', title)  # filter out extra formatting
            for key in not_fully_parsed.keys():
                if title in key:
                    found_match = key
                    break
        if found_match:
            # Fetch the publisher / journal name from wikitext
            if not parsed_citation[found_match]["publisher"]:
                publisher = None
                if tpl.has_param("journal"):  # for journal name
                    publisher = str(tpl.get("journal").value)
                elif tpl.has_param("publisher"):  # for website or book publisher
                    publisher = str(tpl.get("publisher").value)
                elif tpl.has_param("work"):  # for news / magazine name
                    publisher = str(tpl.get("work").value)
                if publisher:
                    publisher = re.sub('[^A-Za-z0-9 ]+', '', publisher)
                    parsed_citation[found_match]["publisher"] = publisher
            # Fetch the publication date from wikitext
            if not parsed_citation[found_match]["date"]:
                if tpl.has_param("date"):
                    date = str(tpl.get("date").value).strip()
                    if len(date) >= 4:  # at least 4 characters for the year
                        try:
                            parsed_citation[found_match]["date"] = parser.parse(date)
                        except (ValueError, OverflowError):
                            pass  # leave the date unset if it can't be parsed
return parsed_citation
def eval_scholarly_sources(citation):
"""
This function evaluates the tag for a scholarly souces (journal, conference, or other type)
Input:
the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
Output:
the tag for citation (red, green, yellow, unknown)
"""
# read the dictionaries of flags from the json file
with open("scholarly_flags.json", "r") as f:
all_flags = json.load(f)
# Check on the domain of external link
if citation["external_link"]:
domain = tldextract.extract(citation["external_link"]).domain
if domain in all_flags["red_scholarly_reverse"]:
return "red"
elif domain in all_flags["yellow_scholarly_reverse"]:
return "yellow"
elif domain in all_flags["green_scholarly_reverse"]:
return "green"
    # Check the publisher name against the name-keyed flag lists
if citation["publisher"] in all_flags["red_scholarly"]:
return "red"
elif citation["publisher"] in all_flags["yellow_scholarly"]:
return "yellow"
elif citation["publisher"] in all_flags["green_scholarly"]:
return "green"
return "unknown"
def eval_non_scholarly_sources(citation, citation_val):
"""
This function evaluates the tag for a non-scholarly scholarly souces (journal, conference, or other type)
Input:
the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
Output:
the tag for citation (red, green, yellow, unknown)
"""
with open("non_scholarly_flags.json", "r") as f:
non_scholarly_flags = json.load(f)
    # Check if a flagged source appears in either the citation text or the external link
    for key, val in non_scholarly_flags.items():
        for source in val:
            if citation_val["external_link"] and source in citation_val["external_link"]:
                return key
            elif source in citation:
                return key
return "unknown"
def check_source_quality(wiki_url):
"""
Go through each parsed citation, check them through the red-yellow-green tag
Return: red, yellow, green lists that include the citations belong to each category
"""
parsed = parse_match_wikitext(wiki_url)
red_flag_list = []
yellow_flag_list = []
green_flag_list = []
unknown_list = []
    for citation, val in parsed.items():
        tag = None  # renamed from `eval` to avoid shadowing the builtin
        # Check journals / conferences / other as scholarly sources
        if val["type"] in {"journal", "conference", "other"}:
            tag = eval_scholarly_sources(val)
        # Check web / book / news as non-scholarly sources
        elif val["type"] in {"web", "book", "news"}:
            tag = eval_non_scholarly_sources(citation, val)
        if tag == "red":
            red_flag_list.append((citation, val["publisher"]))
        elif tag == "yellow":
            yellow_flag_list.append((citation, val["publisher"]))
        elif tag == "green":
            green_flag_list.append((citation, val["publisher"]))
        elif tag == "unknown":
            unknown_list.append((citation, val["publisher"]))
return red_flag_list, yellow_flag_list, green_flag_list, unknown_list
# %%
# TEST
a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
print("Red flag sources:", a[0])
print("Yellow flag sources:", a[1])
print("Green sources:", a[2])
print("Undetermined sources:", a[3])