Spaces:

trangdoan982
/

wikicredibility

Sleeping

App Files Files Community

[email protected] committed on Apr 1, 2023

Commit

6d80354

1 Parent(s): 73b44db

add evaluation for non-scholarly sources

Browse files

Files changed (3) hide show

non_scholarly_flags.json +1 -0
flags.json → scholarly_flags.json +0 -0
source_eval_model.py +27 -19

non_scholarly_flags.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"red": ["112 Ukraine", "Ad Fontes Media", "Advameg", "AlterNet", "Amazon", "Anadolu Agency ", "Ancestry.com", "ANNA News", "Answers.com", "Antiwar.com", "arXiv", "Baidu Baike", "bestgore.com", "Bild", "Blaze Media", "Blogger", "Breitbart News", "BroadwayWorld", "The California Globe", "The Canary", "CelebrityNetWorth", "Centre for Research on Globalization", "CESNUR", "China Global Television Network", "CNET ", "CoinDesk", "Consortium News", "CounterPunch", "Cracked.com", "Crunchbase", "The Daily Caller", "Daily Express", "Daily Kos", "Daily Mail", "Daily Sabah", "Daily Star", "The Daily Wire", "Discogs", "The Electronic Intifada", "Encyclopaedia Metallum", "The Epoch Times", "Examiner.com", "Facebook", "FamilySearch", "Famous Birthdays", "Fandom", "The Federalist", "Find a Grave", "Findmypast", "Flags of the World", "Flickr", "Forbes.com contributors", "Fox News ", "FrontPage Magazine", "The Gateway Pundit", "Gawker", "Geni.com", "gnis-class", "gns-class", "Global Times", "GlobalSecurity.org", "Goodreads", "The Grayzone", "Guido Fawkes", "Heat Street", "HispanTV", "History", "HuffPost contributors", "IMDb", "Independent Media Center", "InfoWars", "Inquisitr", "International Business Times", "Investopedia", "Jewish Virtual Library", "Jihad Watch", "Joshua Project", "Know Your Meme", "Last.fm", "Lenta.ru", "LifeSiteNews", "LinkedIn", "LiveJournal", "LiveLeak", "Lulu.com", "The Mail on Sunday", "Marquis Who's Who", "Mashable sponsored content", "Media Bias/Fact Check", "Media Research Center", "Medium", "metal-experience", "Metro", "MintPress News", "MyLife", "National Enquirer", "Natural News", "The New American", "New Eastern Outlook", "New York Post", "News Break", "NewsBlaze", "News of the World", "Newsmax", "NNDB", "Occupy Democrats", "One America News Network", "The Onion", "OpIndia", "Our Campaigns", "PanAm Post", "Patheos", "Peerage websites", "An Phoblacht", "The Points Guy", "The Points Guy ", "The Post Millennial", "PR Newswire", "Press TV", "Project 
Veritas", "Quadrant", "Quillette", "Quora", "Rate Your Music", "The Raw Story", "Reddit", "RedState", "ResearchGate", "Republic TV", "Rolling Stone ", "Rolling Stone ", "Royal Central", "RT", "Scribd", "Scriptural texts", "Sixth Tone ", "The Skwawkbox", "SourceWatch", "Spirit of Metal", "Sportskeeda", "Sputnik", "Stack Exchange", "starsunfolded.com", "The Sun", "Swarajya", "Taki's Magazine", "TASS", "Telesur", "The Truth About Guns", "TV.com", "TV Tropes", "Twitter", "The Unz Review", "Urban Dictionary", "VDARE", "Venezuelanalysis", "Veterans Today", "VGChartz", "VoC", "Voltaire Network", "Weather2Travel", "The Western Journal", "We Got This Covered", "WhatCulture", "Who's Who ", "WhoSampled", "Wikidata", "WikiLeaks", "Wikinews", "Wikipedia", "WordPress.com", "WorldNetDaily", "Worldometer", "YouTube", "Zero Hedge", "ZoomInfo", "70 News", "ABCnews.com.co", "American News", "banned.video", "Before It's News", "bients.com", "Bipartisan Report", "bizstandardnews.com", "Bloomberg.ma", "The Boston Tribune", "Breaking-CNN.com", "BVA News", "Cairns News", "Celebtricity", "CBSnews.com.co", "cnn-trending.com", "Conservative 101", "Conservative Frontline", "CountyNewsroom.info", "Daily USA Update", "Disclose.tv", "DrudgeReport.com.co", "Empire Herald", "Empire News", "Empire Sports", "The Expos\u00e9", "Fox-news24.com", "The Gateway Pundit", "Global Associated News", "Globalresearch.ca", "Gossip Mill Mzansi", "The Grayzone", "Guerilla News", "Gummy Post", "Houston Chronicle TV", "Huzlers", "InfoWars", "Judicial Watch", "\u039a\u0392\u039f\u03992.com", "KMT 11 News", "The Last Line of Defense", "Law Enforcement Today", "Liberal Society", "Liberty Writers News", "LinkBeef", "MV-media", "Naha Daily", "National Insider Politics", "NationalReport.net", "Natural News", "NBCNews.com.co", "News Breaks Here", "NewsBuzzDaily", "News Examiner", "News Hound", "The News Nerd", "NewsPunch", "NewsWatch33", "The New York Evening", "Next News Network", "Now 8 News", "Oneworld.press", 
"OpIndia", "Palmer Report", "Peace Data", "Postcard News", "The Predicted", "Prntly", "React 365", "The Reporterz", "Snoopack", "Spin Zone", "St George Gazette", "Stuppid", "Super Station 95", "TruNews", "TrueTrumpers.com", "UConservative", "UndergroundNewsReport.com", "The Unhived Mind", "United Media Publishing", "USA Daily Info", "usatoday.com.co", "US Postman", "washingtonpost.com.co", "WorldNetDaily", "World News Daily Report"], "green": ["ABC News", "The Age", "Agence France-Presse", "Al Jazeera", "Amnesty International", "Anti-Defamation League", "Aon", "Ars Technica", "Associated Press", "The Atlantic", "The Australian", "The A.V. Club", "AVN", "Axios", "BBC", "Behind the Voice Actors", "Bellingcat", "Bloomberg", "Burke's Peerage", "BuzzFeed News", "The Christian Science Monitor", "Climate Feedback", "CNET ", "CNN", "Coda Media", "Common Sense Media", "The Conversation", "The Daily Telegraph", "Deadline Hollywood", "Debrett's", "Deseret News", "Deutsche Welle", "Digital Spy", "The Diplomat", "The Economist", "Encyclop\u00e6dia Iranica", "Engadget", "Entertainment Weekly", "Financial Times", "Forbes", "Fox News", "Game Developer", "Game Informer", "Gazeta Wyborcza", "gnis-coord", "Gizmodo", "The Globe and Mail", "The Guardian", "Haaretz", "The Hill", "The Hindu", "The Hollywood Reporter", "HuffPost", "Idolator", "IGN", "The Independent", "The Indian Express", "Insider culture", "Inter Press Service", "The Intercept", "International Fact-Checking Network", "Jacobin", "JAMA", "The Jewish Chronicle", "Kirkus Reviews", "Kommersant", "Los Angeles Times", "Mail & Guardian", "The Mary Sue", "Metacritic", "Le Monde diplomatique", "Mother Jones", "MSNBC", "The Nation", "National Geographic", "NBC News", "The New Republic", "New York", "New York Daily News", "The New York Times", "The New Yorker", "The New Zealand Herald", "Newslaundry", "Newsweek ", "NPR", "People", "Pew Research Center", "People Make Games", "PinkNews", "Playboy", "Politico", "PolitiFact", 
"Polygon", "ProPublica", "Quartz", "Radio Free Asia", "Rappler", "Reason", "The Register", "Religion News Service", "Reuters", "Rolling Stone", "Rotten Tomatoes", "Science-Based Medicine", "Scientific American", "SCOTUSblog", "Sky News UK", "Snopes", "SCMP", "Southern Poverty Law Center", "Space.com", "Der Spiegel", "The Sydney Morning Herald", "TheWrap", "Time", "The Times", "TorrentFreak", "TV Guide", "U.S. News & World Report", "USA Today", "Vanity Fair", "Variety", "VentureBeat", "The Verge", "Vogue", "Voice of America", "Vox", "The Wall Street Journal", "The Washington Post", "The Weekly Standard", "Wired", "Yahoo News", "ZDNet"], "yellow": ["Alexa Internet", "AllSides", "The American Conservative", "Anadolu Agency", "Apple Daily", "Arab News", "Asian News International", "AskMen", "Asian News International", "Ballotpedia", "Biography.com", "Bloomberg profiles", "Boing Boing", "Bustle", "BuzzFeed", "Cato Institute", "Center for Economic and Policy Research", "China Daily", "CliffsNotes", "CNET ", "Cosmopolitan", "The Daily Beast", "The Daily Dot", "Daily Mirror", "Daily NK", "Democracy Now!", "Dotdash", "Encyclop\u00e6dia Britannica", "Entrepreneur", "Evening Standard", "Fairness and Accuracy in Reporting", "Fox News ", "Genius", "gns-coord", "Google Maps", "The Green Papers", "The Guardian blogs", "Guinness World Records", "Hansard", "Heavy.com", "Hope not Hate", "HuffPost ", "Human Events", "Independent Journal Review", "Insider", "IslamQA.info", "Jezebel", "Mashable", "MDPI", "Media Matters for America", "Mediaite", "MetalSucks", "Middle East Media Research Institute", "Mondoweiss", "Morning Star", "National Review", "The Needle Drop", "Newsweek ", "The Next Web", "Pride.com", "Quackwatch", "RealClearPolitics", "RhythmOne", "RIA Novosti", "Salon", "ScienceBlogs", "Screen Rant", "Sherdog", "Sixth Tone", "The Skeptic's Dictionary", "Sky News Australia", "SparkNotes", "The Spectator", "The Straits Times", "TechCrunch", "ThinkProgress", "The Times of India", 
"TMZ", "Townhall", "TRT World", "Us Weekly", "Vice Media", "Washington Examiner", "The Washington Times", "Wikidata transcluded statements", "World Socialist Web Site", "XBIZ", "Xinhua News Agency"]}

flags.json → scholarly_flags.json RENAMED Viewed

File without changes

source_eval_model.py CHANGED Viewed

@@ -1,4 +1,4 @@
 # Import all libraries
 from bs4 import BeautifulSoup
@@ -13,6 +13,7 @@ import mwparserfromhell
 # Given the DOI, PMID, PMC number, fetch journal's meta data
 def get_metainfo_doi(doi):
     """Input: doi string
     Output: the journal name and publication date of the article. Returns None for each value that can't be parsed
@@ -136,10 +137,9 @@ def parse_html(page_url):
   return all_parsed_citations
-# After finish parsing with HTML tag, we fetch the wikitext version of the page,
-# then match it with the HTML tag to extract more information about the citation
 def parse_match_wikitext(wiki_url):
     """
     This function parses the wikitext version of the citations, matches it with the HTML version,
@@ -223,12 +223,10 @@ def eval_scholarly_sources(citation):
         the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
     Output:
         the tag for citation (red, green, yellow, unknown)
-    """
-    # TODO: find check for green, both in domain check and name check
     # read the dictionaries of flags from the json file
-    with open("flags.json", "r") as f:
         all_flags = json.load(f)
     found_tag = False
@@ -257,7 +255,7 @@ def eval_scholarly_sources(citation):
         return "unknown"
-def eval_non_scholarly_sources(citation):
     """
     This function evaluates the tag for a non-scholarly source (web, book, news, or other type)
     Input:
@@ -265,8 +263,17 @@ def eval_non_scholarly_sources(citation):
     Output:
         the tag for citation (red, green, yellow, unknown)
     """
-    pass
 def check_source_quality(wiki_url):
     """
@@ -276,6 +283,7 @@ def check_source_quality(wiki_url):
     parsed = parse_match_wikitext(wiki_url)
     red_flag_list = []
     yellow_flag_list = []
     unknown_list = []
     for citation, val in parsed.items():
@@ -285,22 +293,22 @@ def check_source_quality(wiki_url):
            eval = eval_scholarly_sources(val)
         elif val["type"] in {"web", "book", "news", "other"}:
-            eval = eval_non_scholarly_sources(val)
         if eval == "red":
             red_flag_list.append((citation, val["publisher"]))
         elif eval == "yellow":
             yellow_flag_list.append((citation, val["publisher"]))
-        # TODO: tag for green as well
         elif eval == "unknown":
             unknown_list.append((citation, val["publisher"]))
-    return red_flag_list, yellow_flag_list, unknown_list
 # TEST
 a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
-print("Red flag scholarly source:" , a[0])
-print("Yellow flag scholarly source: ", a[1])
-print("Undetermined scholarly sources: ", a[2])

+# %%
 # Import all libraries
 from bs4 import BeautifulSoup
 # Given the DOI, PMID, PMC number, fetch journal's meta data
 def get_metainfo_doi(doi):
     """Input: doi string
     Output: the journal name and publication date of the article. Returns None for each value that can't be parsed
   return all_parsed_citations
+# After parsing the HTML tags, we fetch the wikitext version of the page and match it with the HTML tags to extract more information about each citation
+# %%
 def parse_match_wikitext(wiki_url):
     """
     This function parses the wikitext version of the citations, matches it with the HTML version,
         the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
     Output:
         the tag for citation (red, green, yellow, unknown)
+    """
     # read the dictionaries of flags from the json file
+    with open("scholarly_flags.json", "r") as f:
         all_flags = json.load(f)
     found_tag = False
         return "unknown"
def eval_non_scholarly_sources(citation, citation_val):
    """
    This function evaluates the tag for a non-scholarly source (web, book, news, or other type)
    Input:
        citation: the key identifying the citation; it is only used for substring
            tests (`name in citation`), so it is presumably the citation text -
            TODO confirm against parse_match_wikitext
        citation_val: the citation dictionary; only its "external_link" entry is read here
    Output:
        the tag for citation (red, green, yellow, unknown)
    """
    flags_by_tag = _load_non_scholarly_flags()

    # A citation without a link (missing key or None) is still matched by name
    # instead of crashing on `source in None`.
    link = citation_val.get("external_link") or ""
    text = citation if citation is not None else ""

    # NOTE(review): this is plain, case-sensitive substring matching, so short
    # names in the flag file such as "RT" or "Time" can match unrelated text.
    # Tags are tried in the order they appear in the JSON file and the first
    # hit wins.
    for tag, sources in flags_by_tag.items():
        for source in sources:
            if source in link or source in text:
                return tag
    return "unknown"


# Parsed contents of the flag file, keyed by path, so the file is read only once.
_NON_SCHOLARLY_FLAGS_CACHE = {}


def _load_non_scholarly_flags(path="non_scholarly_flags.json"):
    """Return the parsed flag file at `path`, reading it from disk on first use only."""
    if path not in _NON_SCHOLARLY_FLAGS_CACHE:
        with open(path, "r", encoding="utf-8") as f:
            _NON_SCHOLARLY_FLAGS_CACHE[path] = json.load(f)
    return _NON_SCHOLARLY_FLAGS_CACHE[path]
 def check_source_quality(wiki_url):
     """
     parsed = parse_match_wikitext(wiki_url)
     red_flag_list = []
     yellow_flag_list = []
+    green_flag_list = []
     unknown_list = []
     for citation, val in parsed.items():
            eval = eval_scholarly_sources(val)
         elif val["type"] in {"web", "book", "news", "other"}:
+            eval = eval_non_scholarly_sources(citation, val)
         if eval == "red":
             red_flag_list.append((citation, val["publisher"]))
         elif eval == "yellow":
             yellow_flag_list.append((citation, val["publisher"]))
+        elif eval == "green":
+            green_flag_list.append((citation, val["publisher"]))
         elif eval == "unknown":
             unknown_list.append((citation, val["publisher"]))
+    return red_flag_list, yellow_flag_list, green_flag_list, unknown_list
 # TEST
 a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
+print("Red flag source:" , a[0])
+print("Yellow flag source: ", a[1])
+print("Green source: ", a[2])
+print("Undetermined sources: ", a[3])