[email protected] committed on
Commit
6d80354
·
1 Parent(s): 73b44db

add evaluation for non-scholarly sources

Browse files
non_scholarly_flags.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"red": ["112 Ukraine", "Ad Fontes Media", "Advameg", "AlterNet", "Amazon", "Anadolu Agency ", "Ancestry.com", "ANNA News", "Answers.com", "Antiwar.com", "arXiv", "Baidu Baike", "bestgore.com", "Bild", "Blaze Media", "Blogger", "Breitbart News", "BroadwayWorld", "The California Globe", "The Canary", "CelebrityNetWorth", "Centre for Research on Globalization", "CESNUR", "China Global Television Network", "CNET ", "CoinDesk", "Consortium News", "CounterPunch", "Cracked.com", "Crunchbase", "The Daily Caller", "Daily Express", "Daily Kos", "Daily Mail", "Daily Sabah", "Daily Star", "The Daily Wire", "Discogs", "The Electronic Intifada", "Encyclopaedia Metallum", "The Epoch Times", "Examiner.com", "Facebook", "FamilySearch", "Famous Birthdays", "Fandom", "The Federalist", "Find a Grave", "Findmypast", "Flags of the World", "Flickr", "Forbes.com contributors", "Fox News ", "FrontPage Magazine", "The Gateway Pundit", "Gawker", "Geni.com", "gnis-class", "gns-class", "Global Times", "GlobalSecurity.org", "Goodreads", "The Grayzone", "Guido Fawkes", "Heat Street", "HispanTV", "History", "HuffPost contributors", "IMDb", "Independent Media Center", "InfoWars", "Inquisitr", "International Business Times", "Investopedia", "Jewish Virtual Library", "Jihad Watch", "Joshua Project", "Know Your Meme", "Last.fm", "Lenta.ru", "LifeSiteNews", "LinkedIn", "LiveJournal", "LiveLeak", "Lulu.com", "The Mail on Sunday", "Marquis Who's Who", "Mashable sponsored content", "Media Bias/Fact Check", "Media Research Center", "Medium", "metal-experience", "Metro", "MintPress News", "MyLife", "National Enquirer", "Natural News", "The New American", "New Eastern Outlook", "New York Post", "News Break", "NewsBlaze", "News of the World", "Newsmax", "NNDB", "Occupy Democrats", "One America News Network", "The Onion", "OpIndia", "Our Campaigns", "PanAm Post", "Patheos", "Peerage websites", "An Phoblacht", "The Points Guy", "The Points Guy ", "The Post Millennial", "PR Newswire", "Press TV", "Project 
Veritas", "Quadrant", "Quillette", "Quora", "Rate Your Music", "The Raw Story", "Reddit", "RedState", "ResearchGate", "Republic TV", "Rolling Stone ", "Rolling Stone ", "Royal Central", "RT", "Scribd", "Scriptural texts", "Sixth Tone ", "The Skwawkbox", "SourceWatch", "Spirit of Metal", "Sportskeeda", "Sputnik", "Stack Exchange", "starsunfolded.com", "The Sun", "Swarajya", "Taki's Magazine", "TASS", "Telesur", "The Truth About Guns", "TV.com", "TV Tropes", "Twitter", "The Unz Review", "Urban Dictionary", "VDARE", "Venezuelanalysis", "Veterans Today", "VGChartz", "VoC", "Voltaire Network", "Weather2Travel", "The Western Journal", "We Got This Covered", "WhatCulture", "Who's Who ", "WhoSampled", "Wikidata", "WikiLeaks", "Wikinews", "Wikipedia", "WordPress.com", "WorldNetDaily", "Worldometer", "YouTube", "Zero Hedge", "ZoomInfo", "70 News", "ABCnews.com.co", "American News", "banned.video", "Before It's News", "bients.com", "Bipartisan Report", "bizstandardnews.com", "Bloomberg.ma", "The Boston Tribune", "Breaking-CNN.com", "BVA News", "Cairns News", "Celebtricity", "CBSnews.com.co", "cnn-trending.com", "Conservative 101", "Conservative Frontline", "CountyNewsroom.info", "Daily USA Update", "Disclose.tv", "DrudgeReport.com.co", "Empire Herald", "Empire News", "Empire Sports", "The Expos\u00e9", "Fox-news24.com", "The Gateway Pundit", "Global Associated News", "Globalresearch.ca", "Gossip Mill Mzansi", "The Grayzone", "Guerilla News", "Gummy Post", "Houston Chronicle TV", "Huzlers", "InfoWars", "Judicial Watch", "\u039a\u0392\u039f\u03992.com", "KMT 11 News", "The Last Line of Defense", "Law Enforcement Today", "Liberal Society", "Liberty Writers News", "LinkBeef", "MV-media", "Naha Daily", "National Insider Politics", "NationalReport.net", "Natural News", "NBCNews.com.co", "News Breaks Here", "NewsBuzzDaily", "News Examiner", "News Hound", "The News Nerd", "NewsPunch", "NewsWatch33", "The New York Evening", "Next News Network", "Now 8 News", "Oneworld.press", 
"OpIndia", "Palmer Report", "Peace Data", "Postcard News", "The Predicted", "Prntly", "React 365", "The Reporterz", "Snoopack", "Spin Zone", "St George Gazette", "Stuppid", "Super Station 95", "TruNews", "TrueTrumpers.com", "UConservative", "UndergroundNewsReport.com", "The Unhived Mind", "United Media Publishing", "USA Daily Info", "usatoday.com.co", "US Postman", "washingtonpost.com.co", "WorldNetDaily", "World News Daily Report"], "green": ["ABC News", "The Age", "Agence France-Presse", "Al Jazeera", "Amnesty International", "Anti-Defamation League", "Aon", "Ars Technica", "Associated Press", "The Atlantic", "The Australian", "The A.V. Club", "AVN", "Axios", "BBC", "Behind the Voice Actors", "Bellingcat", "Bloomberg", "Burke's Peerage", "BuzzFeed News", "The Christian Science Monitor", "Climate Feedback", "CNET ", "CNN", "Coda Media", "Common Sense Media", "The Conversation", "The Daily Telegraph", "Deadline Hollywood", "Debrett's", "Deseret News", "Deutsche Welle", "Digital Spy", "The Diplomat", "The Economist", "Encyclop\u00e6dia Iranica", "Engadget", "Entertainment Weekly", "Financial Times", "Forbes", "Fox News", "Game Developer", "Game Informer", "Gazeta Wyborcza", "gnis-coord", "Gizmodo", "The Globe and Mail", "The Guardian", "Haaretz", "The Hill", "The Hindu", "The Hollywood Reporter", "HuffPost", "Idolator", "IGN", "The Independent", "The Indian Express", "Insider culture", "Inter Press Service", "The Intercept", "International Fact-Checking Network", "Jacobin", "JAMA", "The Jewish Chronicle", "Kirkus Reviews", "Kommersant", "Los Angeles Times", "Mail & Guardian", "The Mary Sue", "Metacritic", "Le Monde diplomatique", "Mother Jones", "MSNBC", "The Nation", "National Geographic", "NBC News", "The New Republic", "New York", "New York Daily News", "The New York Times", "The New Yorker", "The New Zealand Herald", "Newslaundry", "Newsweek ", "NPR", "People", "Pew Research Center", "People Make Games", "PinkNews", "Playboy", "Politico", "PolitiFact", 
"Polygon", "ProPublica", "Quartz", "Radio Free Asia", "Rappler", "Reason", "The Register", "Religion News Service", "Reuters", "Rolling Stone", "Rotten Tomatoes", "Science-Based Medicine", "Scientific American", "SCOTUSblog", "Sky News UK", "Snopes", "SCMP", "Southern Poverty Law Center", "Space.com", "Der Spiegel", "The Sydney Morning Herald", "TheWrap", "Time", "The Times", "TorrentFreak", "TV Guide", "U.S. News & World Report", "USA Today", "Vanity Fair", "Variety", "VentureBeat", "The Verge", "Vogue", "Voice of America", "Vox", "The Wall Street Journal", "The Washington Post", "The Weekly Standard", "Wired", "Yahoo News", "ZDNet"], "yellow": ["Alexa Internet", "AllSides", "The American Conservative", "Anadolu Agency", "Apple Daily", "Arab News", "Asian News International", "AskMen", "Asian News International", "Ballotpedia", "Biography.com", "Bloomberg profiles", "Boing Boing", "Bustle", "BuzzFeed", "Cato Institute", "Center for Economic and Policy Research", "China Daily", "CliffsNotes", "CNET ", "Cosmopolitan", "The Daily Beast", "The Daily Dot", "Daily Mirror", "Daily NK", "Democracy Now!", "Dotdash", "Encyclop\u00e6dia Britannica", "Entrepreneur", "Evening Standard", "Fairness and Accuracy in Reporting", "Fox News ", "Genius", "gns-coord", "Google Maps", "The Green Papers", "The Guardian blogs", "Guinness World Records", "Hansard", "Heavy.com", "Hope not Hate", "HuffPost ", "Human Events", "Independent Journal Review", "Insider", "IslamQA.info", "Jezebel", "Mashable", "MDPI", "Media Matters for America", "Mediaite", "MetalSucks", "Middle East Media Research Institute", "Mondoweiss", "Morning Star", "National Review", "The Needle Drop", "Newsweek ", "The Next Web", "Pride.com", "Quackwatch", "RealClearPolitics", "RhythmOne", "RIA Novosti", "Salon", "ScienceBlogs", "Screen Rant", "Sherdog", "Sixth Tone", "The Skeptic's Dictionary", "Sky News Australia", "SparkNotes", "The Spectator", "The Straits Times", "TechCrunch", "ThinkProgress", "The Times of India", 
"TMZ", "Townhall", "TRT World", "Us Weekly", "Vice Media", "Washington Examiner", "The Washington Times", "Wikidata transcluded statements", "World Socialist Web Site", "XBIZ", "Xinhua News Agency"]}
flags.json → scholarly_flags.json RENAMED
File without changes
source_eval_model.py CHANGED
@@ -1,4 +1,4 @@
1
-
2
  # Import all libraries
3
 
4
  from bs4 import BeautifulSoup
@@ -13,6 +13,7 @@ import mwparserfromhell
13
 
14
 
15
  # Given the DOI, PMID, PMC number, fetch journal's meta data
 
16
  def get_metainfo_doi(doi):
17
  """Input: doi string
18
  Output: the journal name and publication date of the article. Returns None for each value that can't be parsed
@@ -136,10 +137,9 @@ def parse_html(page_url):
136
  return all_parsed_citations
137
 
138
 
139
- # After finish parsing with HTML tag, we fetch the wikitext version of the page,
140
- # then match it with the HTML tag to extract more information about the citation
141
-
142
 
 
143
  def parse_match_wikitext(wiki_url):
144
  """
145
  This function parses the wikitext version of the citations, matches it with the HTML version,
@@ -223,12 +223,10 @@ def eval_scholarly_sources(citation):
223
  the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
224
  Output:
225
  the tag for citation (red, green, yellow, unknown)
226
- """
227
- # TODO: find check for green, both in domain check and name check
228
-
229
 
230
  # read the dictionaries of flags from the json file
231
- with open("flags.json", "r") as f:
232
  all_flags = json.load(f)
233
 
234
  found_tag = False
@@ -257,7 +255,7 @@ def eval_scholarly_sources(citation):
257
  return "unknown"
258
 
259
 
260
- def eval_non_scholarly_sources(citation):
261
  """
262
  This function evaluates the tag for a non-scholarly source (web, book, news, or other type)
263
  Input:
@@ -265,8 +263,17 @@ def eval_non_scholarly_sources(citation):
265
  Output:
266
  the tag for citation (red, green, yellow, unknown)
267
  """
268
- pass
269
-
 
 
 
 
 
 
 
 
 
270
 
271
  def check_source_quality(wiki_url):
272
  """
@@ -276,6 +283,7 @@ def check_source_quality(wiki_url):
276
  parsed = parse_match_wikitext(wiki_url)
277
  red_flag_list = []
278
  yellow_flag_list = []
 
279
  unknown_list = []
280
 
281
  for citation, val in parsed.items():
@@ -285,22 +293,22 @@ def check_source_quality(wiki_url):
285
  eval = eval_scholarly_sources(val)
286
 
287
  elif val["type"] in {"web", "book", "news", "other"}:
288
- eval = eval_non_scholarly_sources(val)
289
 
290
  if eval == "red":
291
  red_flag_list.append((citation, val["publisher"]))
292
  elif eval == "yellow":
293
  yellow_flag_list.append((citation, val["publisher"]))
294
- # TODO: tag for green as well
 
295
  elif eval == "unknown":
296
  unknown_list.append((citation, val["publisher"]))
297
 
298
- return red_flag_list, yellow_flag_list, unknown_list
299
 
300
  # TEST
301
  a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
302
- print("Red flag scholarly source:" , a[0])
303
- print("Yellow flag scholarly source: ", a[1])
304
- print("Undetermined scholarly sources: ", a[2])
305
-
306
-
 
1
+ # %%
2
  # Import all libraries
3
 
4
  from bs4 import BeautifulSoup
 
13
 
14
 
15
  # Given the DOI, PMID, PMC number, fetch journal's meta data
16
+
17
  def get_metainfo_doi(doi):
18
  """Input: doi string
19
  Output: the journal name and publication date of the article. Returns None for each value that can't be parsed
 
137
  return all_parsed_citations
138
 
139
 
140
+ # After parsing the HTML tags, fetch the wikitext version of the page and match it against the HTML tags to extract more information about each citation
 
 
141
 
142
+ # %%
143
  def parse_match_wikitext(wiki_url):
144
  """
145
  This function parses the wikitext version of the citations, matches it with the HTML version,
 
223
  the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
224
  Output:
225
  the tag for citation (red, green, yellow, unknown)
226
+ """
 
 
227
 
228
  # read the dictionaries of flags from the json file
229
+ with open("scholarly_flags.json", "r") as f:
230
  all_flags = json.load(f)
231
 
232
  found_tag = False
 
255
  return "unknown"
256
 
257
 
258
+ def eval_non_scholarly_sources(citation, citation_val):
259
  """
260
  This function evaluates the tag for a non-scholarly source (web, book, news, or other type)
261
  Input:
 
263
  Output:
264
  the tag for citation (red, green, yellow, unknown)
265
  """
266
+ with open("non_scholarly_flags.json", "r") as f:
267
+ non_scholarly_flags = json.load(f)
268
+
269
+ # Check whether a listed source name appears in the citation's external_link or in the citation key
270
+ for key, val in non_scholarly_flags.items():
271
+ for source in val:
272
+ if source in citation_val["external_link"]:
273
+ return key
274
+ elif source in citation:
275
+ return key
276
+ return "unknown"
277
 
278
  def check_source_quality(wiki_url):
279
  """
 
283
  parsed = parse_match_wikitext(wiki_url)
284
  red_flag_list = []
285
  yellow_flag_list = []
286
+ green_flag_list = []
287
  unknown_list = []
288
 
289
  for citation, val in parsed.items():
 
293
  eval = eval_scholarly_sources(val)
294
 
295
  elif val["type"] in {"web", "book", "news", "other"}:
296
+ eval = eval_non_scholarly_sources(citation, val)
297
 
298
  if eval == "red":
299
  red_flag_list.append((citation, val["publisher"]))
300
  elif eval == "yellow":
301
  yellow_flag_list.append((citation, val["publisher"]))
302
+ elif eval == "green":
303
+ green_flag_list.append((citation, val["publisher"]))
304
  elif eval == "unknown":
305
  unknown_list.append((citation, val["publisher"]))
306
 
307
+ return red_flag_list, yellow_flag_list, green_flag_list, unknown_list
308
 
309
  # TEST
310
  a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
311
+ print("Red flag source:" , a[0])
312
+ print("Yellow flag source: ", a[1])
313
+ print("Green source: ", a[2])
314
+ print("Undetermined sources: ", a[3])