# %%
# Import all libraries
from bs4 import BeautifulSoup
import bibtexparser
from dateutil import parser
import json
import requests
import tldextract
from collections import defaultdict
import re
import mwparserfromhell
# Given the DOI, PMID, or PMC number, fetch the journal's metadata
def get_metainfo_doi(doi):
"""Input: doi string
Output: the journal name and date published of the article. Return None for each value if the can't parsed
"""
res = requests.get("http://dx.doi.org/"+ doi, headers={"Accept": "application/x-bibtex"})
res = res.content.decode('utf-8')
bibtext = bibtexparser.loads(res).entries
    if len(bibtext) > 0:
journal = bibtext[0]["journal"].strip() if "journal" in bibtext[0] else None
time_published = ""
if "year" in bibtext[0]:
time_published += bibtext[0]["year"]
if "month" in bibtext[0]:
time_published += " " + bibtext[0]["month"]
if "day" in bibtext[0]:
time_published += " " + bibtext[0]["day"]
if len(time_published) > 0:
time_published = parser.parse(time_published)
else:
time_published = None
return journal, time_published
else:
return None, None
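# A quick usage sketch for the DOI helper (live network call to the DOI resolver;
# the DOI below is a placeholder, so substitute a real one):
#   journal, date = get_metainfo_doi("10.xxxx/xxxxx")
#   print(journal, date)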
def get_metainfo_pmc(pmc):
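    """Input: PMC number as a string (the numeric uid only, without the "PMC" prefix,
    since the esummary result dictionary is keyed by that uid).
    Output: the journal name and publication date from NCBI esummary, or (None, None) if the lookup returns an error.
    """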
res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pmc&id="+pmc+"&retmode=json")
res = res.content.decode("utf-8")
res = json.loads(res)
data = res["result"][pmc]
journal, time_published = None, None
if "error" in data:
return None, None
else:
journal = data["fulljournalname"].strip()
time_published = parser.parse(data["pubdate"])
return journal, time_published
def get_metainfo_pmid(pmid):
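    """Input: PMID as a string.
    Output: the journal name and publication date from NCBI esummary, or (None, None) if the lookup returns an error.
    """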
res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id="+pmid+"&retmode=json")
res = res.content.decode("utf-8")
res = json.loads(res)
data = res["result"][pmid]
journal, time_published = None, None
if "error" in data:
return None, None
else:
journal = data["fulljournalname"].strip()
time_published = parser.parse(data["pubdate"])
return journal, time_published
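# Both esummary helpers behave the same way; a quick usage sketch (live NCBI call,
# and the ID below is a placeholder):
#   journal, date = get_metainfo_pmid("12345678")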
def parse_html(page_url):
""" This function parse metadata of citations from HTML tag.
Input: wiki_url
Output: a parsed citation list from HTML. Each citation has format key: value
key: the text version of all citation
value: a dictionary with schema
{"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}"""
citation_types = {'web', 'journal', 'book', 'conference', 'news'}
all_parsed_citations = defaultdict(dict)
response = requests.get(page_url)
soup = BeautifulSoup(response.content, 'html.parser')
# Get all the references
all_ref = []
ordered_ref_lst = soup.find_all("ol", {"class": "references"})
for each in ordered_ref_lst:
refs = each.find_all("li")
all_ref += refs
for ele in all_ref:
        # Check if it has <span class="reference-text">
        ref = ele.find("span", {"class": "reference-text"})
        source_type = "other"  # default value for source_type
if ref:
# TASK: get all essential information from citation tag
citation_key = ref.get_text()
hyperlink = ref.find("a", {"class": "external text"})
external_link = hyperlink["href"] if hyperlink else None
# TASK: find source type, ie whether it's 'web', 'journal', 'book', 'conference', 'news'
cite_tag = ref.find("cite")
        if cite_tag:
            for class_tag in cite_tag.get("class", []):  # guard against cite tags without a class attribute
                if class_tag in citation_types:
                    source_type = class_tag
                    break
            # TASK: get publisher (journal name for journals/conferences, website domain for web sources, 'work' for news)
            # For journal, conference, and other sources, look for a DOI, PMC, or PMID identifier
            if source_type in {'journal', 'conference', 'other'}:
                has_doi = ref.find("a", {"title": "Doi (identifier)"})
                has_pmc = ref.find("a", {"title": "PMC (identifier)"})
                has_pmid = ref.find("a", {"title": "PMID (identifier)"})
                journal, date = None, None
                if has_doi:
                    doi = has_doi.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_doi(doi.text)
                elif has_pmc:
                    pmc = has_pmc.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_pmc(pmc.text)
                elif has_pmid:
                    pmid = has_pmid.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_pmid(pmid.text)
                all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": journal, "date": date}
            # For news, web, book, and any remaining sources, the publisher is the domain of the website
            else:
                publisher = tldextract.extract(external_link).domain if external_link else None
                all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": publisher, "date": None}
return all_parsed_citations
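# A quick usage sketch (makes live HTTP requests to Wikipedia and, for scholarly
# citations, to the DOI/NCBI APIs):
#   citations = parse_html("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
#   for text, info in citations.items():
#       print(info["type"], info["publisher"], info["external_link"])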
# After parsing the HTML tags, we fetch the wikitext version of the page and match it against the HTML citations to extract more information
# %%
def parse_match_wikitext(wiki_url):
"""
This function parse wikitext version of the citations, match it with the HTML version,
and extract more information, such as publisher and date that weren't extracted on the HTML.
Input: wiki_url
Output: a fully parsed citation list. Each citation has format key: value
key: the text version of all citation
value: a dictionary with schema
{"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
"""
parsed_citation = parse_html(wiki_url)
print("ALL citation", len(parsed_citation))
wiki_page = wiki_url.split("wiki/")[1]
url = "https://en.wikipedia.org/w/index.php?title=" + wiki_page +"&action=raw"
response = requests.get(url)
text = response.text
wikicode = mwparserfromhell.parse(text)
    # Collect the citations that aren't fully parsed, i.e. ones that lack a publisher or a date
not_fully_parsed = defaultdict(dict)
for key, val in parsed_citation.items():
if not val["publisher"] or not val["date"]:
not_fully_parsed[key] = val
for tpl in wikicode.filter_templates(matches="{{cite"):
        # tpl is a {{cite ...}} template in the wikitext
found_match = None
# Match on external link:
if tpl.has_param("url"):
external_url = tpl.get("url").split("=")[1]
for key, val in not_fully_parsed.items():
if val["external_link"]:
if val["external_link"].strip() == external_url.strip():
found_match = key
break
        # If no match was found by URL, match by title
        if not found_match and tpl.has_param("title"):
            # Get the title of the citation without formatting text
            title = str(tpl.get("title").value)
            title = re.sub('[^A-Za-z0-9 ]+', '', title)  # filter out extra formatting
            for key in not_fully_parsed.keys():
                if title in key:
                    found_match = key
                    break
        if found_match:
            # Fetch the publisher / journal name from wikitext
            if not parsed_citation[found_match]["publisher"]:
                publisher = None
                if tpl.has_param("journal"):  # for journal name
                    publisher = str(tpl.get("journal").value)
                elif tpl.has_param("publisher"):  # for website or book publisher
                    publisher = str(tpl.get("publisher").value)
                elif tpl.has_param("work"):  # for news / magazine name
                    publisher = str(tpl.get("work").value)
                if publisher:
                    publisher = re.sub('[^A-Za-z0-9 ]+', '', publisher)
                    parsed_citation[found_match]["publisher"] = publisher
            # Fetch the publication date from wikitext
            if not parsed_citation[found_match]["date"]:
                if tpl.has_param("date"):
                    date = str(tpl.get("date").value).strip()
                    if len(date) >= 4:  # at least 4 characters for the year
                        try:
                            parsed_citation[found_match]["date"] = parser.parse(date)
                        except (ValueError, OverflowError):
                            pass  # leave the date unset if it can't be parsed
return parsed_citation
def eval_scholarly_sources(citation):
"""
This function evaluates the tag for a scholarly souces (journal, conference, or other type)
Input:
the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
Output:
the tag for citation (red, green, yellow, unknown)
"""
# read the dictionaries of flags from the json file
with open("scholarly_flags.json", "r") as f:
all_flags = json.load(f)
# Check on the domain of external link
if citation["external_link"]:
domain = tldextract.extract(citation["external_link"]).domain
if domain in all_flags["red_scholarly_reverse"]:
return "red"
elif domain in all_flags["yellow_scholarly_reverse"]:
return "yellow"
elif domain in all_flags["green_scholarly_reverse"]:
return "green"
    # Check the publisher name against the name-keyed flag lists
if citation["publisher"] in all_flags["red_scholarly"]:
return "red"
elif citation["publisher"] in all_flags["yellow_scholarly"]:
return "yellow"
elif citation["publisher"] in all_flags["green_scholarly"]:
return "green"
return "unknown"
def eval_non_scholarly_sources(citation, citation_val):
"""
This function evaluates the tag for a non-scholarly scholarly souces (journal, conference, or other type)
Input:
the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
Output:
the tag for citation (red, green, yellow, unknown)
"""
with open("non_scholarly_flags.json", "r") as f:
non_scholarly_flags = json.load(f)
    # Check if a flagged source appears in either the citation text or the external link
    for key, val in non_scholarly_flags.items():
        for source in val:
            if citation_val["external_link"] and source in citation_val["external_link"]:
                return key
            elif source in citation:
                return key
return "unknown"
def check_source_quality(wiki_url):
"""
Go through each parsed citation, check them through the red-yellow-green tag
Return: red, yellow, green lists that include the citations belong to each category
"""
parsed = parse_match_wikitext(wiki_url)
red_flag_list = []
yellow_flag_list = []
green_flag_list = []
unknown_list = []
    for citation, val in parsed.items():
        tag = None  # renamed from `eval` to avoid shadowing the builtin
        # Check journals / conferences / other as scholarly sources
        if val["type"] in {"journal", "conference", "other"}:
            tag = eval_scholarly_sources(val)
        # Check web / book / news as non-scholarly sources
        elif val["type"] in {"web", "book", "news"}:
            tag = eval_non_scholarly_sources(citation, val)
        if tag == "red":
            red_flag_list.append((citation, val["publisher"]))
        elif tag == "yellow":
            yellow_flag_list.append((citation, val["publisher"]))
        elif tag == "green":
            green_flag_list.append((citation, val["publisher"]))
        elif tag == "unknown":
            unknown_list.append((citation, val["publisher"]))
return red_flag_list, yellow_flag_list, green_flag_list, unknown_list
# %%
# TEST
a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
print("Red flag sources:", a[0])
print("Yellow flag sources:", a[1])
print("Green sources:", a[2])
print("Undetermined sources:", a[3])