File size: 12,369 Bytes
6d80354
a3fcd7c
 
 
 
 
 
 
 
 
 
 
 
 
 
6d80354
a3fcd7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d80354
a3fcd7c
6d80354
a3fcd7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7be836b
a3fcd7c
6d80354
a3fcd7c
 
 
 
 
 
 
 
 
7be836b
 
 
a3fcd7c
 
 
 
 
7be836b
 
a3fcd7c
7be836b
a3fcd7c
 
6d80354
a3fcd7c
 
 
 
 
 
 
6d80354
 
 
 
 
 
 
 
 
 
 
a3fcd7c
 
 
 
 
 
 
 
 
6d80354
a3fcd7c
 
 
 
 
 
 
 
 
6d80354
a3fcd7c
 
 
 
 
6d80354
 
a3fcd7c
 
 
6d80354
a3fcd7c
 
 
6d80354
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# %%
# Import all libraries

from bs4 import BeautifulSoup
import bibtexparser
from dateutil import parser
import json
import requests
import tldextract
from collections import defaultdict
import re
import mwparserfromhell


# Given the DOI, PMID, PMC number, fetch journal's meta data

def get_metainfo_doi(doi):
    """Fetch journal metadata for an article identified by DOI.

    Input: doi string
    Output: (journal_name, time_published) for the article. Each value is
        None if it cannot be parsed from the resolver's BibTeX record.
    """
    # Ask the DOI resolver for a BibTeX record of the article.
    # BUG FIX: requests.get without a timeout can hang indefinitely on an
    # unresponsive resolver; bound the wait.
    res = requests.get(
        "http://dx.doi.org/" + doi,
        headers={"Accept": "application/x-bibtex"},
        timeout=30,
    )
    entries = bibtexparser.loads(res.content.decode('utf-8')).entries
    if not entries:
        return None, None

    entry = entries[0]
    journal = entry["journal"].strip() if "journal" in entry else None

    # Assemble whatever date parts are present ("year[ month[ day]]") and
    # let dateutil interpret the combined string.
    parts = [entry[k] for k in ("year", "month", "day") if k in entry]
    time_published = parser.parse(" ".join(parts)) if parts else None

    return journal, time_published


def get_metainfo_pmc(pmc):
    """Look up journal name and publication date for a PMC id via NCBI esummary."""
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pmc&id="
           + pmc + "&retmode=json")
    payload = json.loads(requests.get(url).content.decode("utf-8"))
    record = payload["result"][pmc]
    # NCBI reports unknown ids with an "error" field instead of an HTTP error.
    if "error" in record:
        return None, None
    return record["fulljournalname"].strip(), parser.parse(record["pubdate"])


def get_metainfo_pmid(pmid):
    """Look up journal name and publication date for a PMID via NCBI esummary."""
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id="
           + pmid + "&retmode=json")
    payload = json.loads(requests.get(url).content.decode("utf-8"))
    record = payload["result"][pmid]
    # NCBI reports unknown ids with an "error" field instead of an HTTP error.
    if "error" in record:
        return None, None
    return record["fulljournalname"].strip(), parser.parse(record["pubdate"])


def parse_html(page_url):
    """Parse citation metadata out of a rendered Wikipedia page's HTML.

    Input: page_url — URL of the wiki page.
    Output: a parsed citation dict. Each citation has format key: value
        key: the text version of the citation
        value: a dictionary with schema
            {"external_link": str | None, "type": str, "html_tag": HTMLElement,
             "publisher": str | None, "date": datetime | None}
    """
    citation_types = {'web', 'journal', 'book', 'conference', 'news'}
    all_parsed_citations = defaultdict(dict)
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect every <li> inside the page's reference lists.
    all_ref = []
    for ref_list in soup.find_all("ol", {"class": "references"}):
        all_ref += ref_list.find_all("li")

    for ele in all_ref:
        # Only items carrying a <span class="reference-text"> are citations.
        ref = ele.find("span", {"class": "reference-text"})
        if not ref:
            continue

        source_type = "other"  # default when no <cite> class matches

        citation_key = ref.get_text()
        hyperlink = ref.find("a", {"class": "external text"})
        external_link = hyperlink["href"] if hyperlink else None

        # The source type (web/journal/book/conference/news) comes from the
        # CSS classes on the <cite> tag.
        cite_tag = ref.find("cite")
        if cite_tag:
            for class_tag in cite_tag["class"]:
                if class_tag in citation_types:
                    source_type = class_tag
                    break

        journal, date = None, None
        if source_type in {'journal', 'conference', 'other'}:
            # Scholarly sources: resolve publisher/date through DOI, PMC or
            # PMID identifiers when present.
            has_doi = ref.find("a", {"title": "Doi (identifier)"})
            has_pmc = ref.find("a", {"title": "PMC (identifier)"})
            has_pmid = ref.find("a", {"title": "PMID (identifier)"})
            if has_doi:
                doi = has_doi.find_next("a", {"class": "external text"})
                journal, date = get_metainfo_doi(doi.text)
            elif has_pmc:
                pmc = has_pmc.find_next("a", {"class": "external text"})
                journal, date = get_metainfo_pmc(pmc.text)
            elif has_pmid:
                pmid = has_pmid.find_next("a", {"class": "external text"})
                journal, date = get_metainfo_pmid(pmid.text)

        # BUG FIX: 'book' citations were previously dropped entirely (handled
        # by neither branch), and 'other' citations with no identifier kept
        # publisher=None even when a link domain was available — the old elif
        # could never see 'other' because the first branch claimed it.
        # Fall back to the link's domain for anything still unresolved.
        if journal is None and source_type in {'news', 'web', 'book', 'other'}:
            journal = tldextract.extract(external_link).domain if external_link else None

        all_parsed_citations[citation_key] = {
            "external_link": external_link,
            "type": source_type,
            "html_tag": ele,
            "publisher": journal,
            "date": date,
        }

    return all_parsed_citations


# After finish parsing with HTML tag, we fetch the wikitext version of the page, match it with the HTML tag to extract more information about the citation

# %%
def parse_match_wikitext(wiki_url):
    """
    Parse the wikitext version of the citations, match each one against the
    HTML-parsed citation list, and fill in publisher/date fields that the
    HTML pass could not extract.
    Input: wiki_url
    Output: a fully parsed citation dict. Each citation has format key: value
        key: the text version of the citation
        value: a dictionary with schema
            {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    """
    parsed_citation = parse_html(wiki_url)
    print("ALL citation", len(parsed_citation))
    wiki_page = wiki_url.split("wiki/")[1]
    url = "https://en.wikipedia.org/w/index.php?title=" + wiki_page + "&action=raw"
    response = requests.get(url)
    wikicode = mwparserfromhell.parse(response.text)

    def _param(tpl, name):
        # BUG FIX: the original used str(...).split("=")[1], which truncates
        # any parameter value containing '=' (common in URLs). Read the
        # parameter's .value node instead.
        return str(tpl.get(name).value).strip()

    # Citations lacking a publisher or a date get a second pass below.
    not_fully_parsed = {
        key: val
        for key, val in parsed_citation.items()
        if not val["publisher"] or not val["date"]
    }

    for tpl in wikicode.filter_templates(matches="{{cite"):
        found_match = None

        # First try to match the template to a citation by external URL.
        if tpl.has_param("url"):
            external_url = _param(tpl, "url")
            for key, val in not_fully_parsed.items():
                link = val["external_link"]
                if link and link.strip() == external_url:
                    found_match = key
                    break

        # If no URL match, fall back to matching on the citation title
        # (with formatting characters stripped).
        if not found_match and tpl.has_param("title"):
            title = re.sub('[^A-Za-z0-9 ]+', '', _param(tpl, "title"))
            for key in not_fully_parsed:
                if title in key:
                    found_match = key
                    break

        if not found_match:
            continue

        # Fetch publisher / journal name from the wikitext template.
        if not parsed_citation[found_match]["publisher"]:
            publisher = None
            if tpl.has_param("journal"):  # journal name
                publisher = _param(tpl, "journal")
            elif tpl.has_param("publisher"):  # website or book publisher
                # BUG FIX: the original tested has_param("publishder") (typo),
                # so this branch could never fire.
                publisher = _param(tpl, "publisher")
            elif tpl.has_param("work"):  # news / magazine name
                publisher = _param(tpl, "work")

            if publisher:
                publisher = re.sub('[^A-Za-z0-9 ]+', '', publisher)
            parsed_citation[found_match]["publisher"] = publisher

        # Fetch the publication date from the wikitext template.
        if not parsed_citation[found_match]["date"]:
            date = None
            if tpl.has_param("date"):
                raw = _param(tpl, "date")
                if len(raw) >= 4:  # at least 4 chars for a year, or yy-mm
                    try:
                        date = parser.parse(raw)
                    except (ValueError, OverflowError):
                        # BUG FIX: the original could leave a raw string (or
                        # raise) in a field documented as datetime | None.
                        date = None
            parsed_citation[found_match]["date"] = date

    return parsed_citation


def eval_scholarly_sources(citation):
    """
    Evaluate the tag for a scholarly source (journal, conference, or other type).
    Input:
        the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    Output:
        the tag for the citation (red, green, yellow, unknown)
    """
    # Read the dictionaries of flags from the json file.
    with open("scholarly_flags.json", "r") as f:
        all_flags = json.load(f)

    # Check the domain of the external link first.
    if citation["external_link"]:
        domain = tldextract.extract(citation["external_link"]).domain
        if domain in all_flags["red_scholarly_reverse"]:
            return "red"
        elif domain in all_flags["yellow_scholarly_reverse"]:
            return "yellow"
        elif domain in all_flags["green_scholarly_reverse"]:
            return "green"

    # Then check the publisher-name dictionaries.
    # BUG FIX: these checks were nested inside the external_link branch, so
    # citations without a link were never checked by name and always came
    # back "unknown".
    if citation["publisher"] in all_flags["red_scholarly"]:
        return "red"
    elif citation["publisher"] in all_flags["yellow_scholarly"]:
        return "yellow"
    elif citation["publisher"] in all_flags["green_scholarly"]:
        return "green"

    return "unknown"


def eval_non_scholarly_sources(citation, citation_val):
    """
    Evaluate the tag for a non-scholarly source (web, book, or news type).
    Input:
        citation: the text of the citation (the key of the parsed-citation dict)
        citation_val: the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    Output:
        the tag for the citation (red, green, yellow, unknown)
    """
    with open("non_scholarly_flags.json", "r") as f:
        non_scholarly_flags = json.load(f)

    external_link = citation_val["external_link"]

    # A flagged source may show up either inside the external link or in the
    # citation text itself.
    for flag, sources in non_scholarly_flags.items():
        for source in sources:
            # BUG FIX: the original tested `source in citation_val["external_link"]`
            # without a None guard, raising TypeError for citations with no link.
            if external_link and source in external_link:
                return flag
            elif source in citation:
                return flag
    return "unknown"

def check_source_quality(wiki_url):
    """
    Go through each parsed citation and classify it with a red / yellow /
    green / unknown tag.
    Input: wiki_url — URL of the wiki page to audit.
    Return: (red, yellow, green, unknown) lists, each holding
        (citation text, publisher) tuples for that category.
    """
    parsed = parse_match_wikitext(wiki_url)
    red_flag_list = []
    yellow_flag_list = []
    green_flag_list = []
    unknown_list = []

    # Map each verdict to the output bucket it belongs in.
    buckets = {
        "red": red_flag_list,
        "yellow": yellow_flag_list,
        "green": green_flag_list,
        "unknown": unknown_list,
    }

    for citation, val in parsed.items():
        # Renamed from `eval`, which shadowed the builtin.
        verdict = None
        # Journals / conferences / 'other' go through the scholarly check.
        if val["type"] in {"journal", "conference", "other"}:
            verdict = eval_scholarly_sources(val)
        # Everything else is non-scholarly. (The dead 'other' entry is
        # removed here: the first branch always claims it.)
        elif val["type"] in {"web", "book", "news"}:
            verdict = eval_non_scholarly_sources(citation, val)

        if verdict in buckets:
            buckets[verdict].append((citation, val["publisher"]))

    return red_flag_list, yellow_flag_list, green_flag_list, unknown_list

# TEST
if __name__ == "__main__":
    # Guard the smoke test so importing this module does not trigger the
    # (slow, network-dependent) full pipeline.
    a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
    print("Red flag source:" , a[0])
    print("Yellow flag source: ", a[1])
    print("Green source: ", a[2])
    print("Undetermined sources: ", a[3])