Zeamays3427 committed on
Commit
65ad974
·
verified ·
1 Parent(s): 495e52e

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. LICENSE +18 -0
  3. WELFake_Dataset.csv +3 -0
  4. app.py +164 -0
  5. requirements.txt +8 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ WELFake_Dataset.csv filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 DengPeng
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
18
+ THE SOFTWARE.
WELFake_Dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:665331424230fc452e9482c3547a6a199a2c29745ade8d236950d1d105223773
3
+ size 245086152
app.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
+ import torch
4
+ import openai
5
+ import os
6
+ import spacy
7
+ import subprocess
8
+ import sys
9
+ import pandas as pd
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+
12
# Set OpenAI API key from environment variables.
# os.getenv returns None when OPENAI_API_KEY is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the tokenizer and the pretrained classification model
tokenizer = AutoTokenizer.from_pretrained("hamzab/roberta-fake-news-classification")
model = AutoModelForSequenceClassification.from_pretrained("hamzab/roberta-fake-news-classification")

# Load spaCy model for keyword extraction
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Catching only that (instead of a bare ``except``) keeps KeyboardInterrupt
    # and genuine bugs visible. ``check=True`` makes a failed download raise
    # CalledProcessError here rather than a confusing second load failure.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load('en_core_web_sm')
26
+
27
+
28
# Load the WELFake dataset and extract a 500-term keyword vocabulary
def load_data():
    """Build the keyword vocabulary from the WELFake dataset.

    Reads ``WELFake_Dataset.csv`` from the current working directory, drops
    rows whose ``text`` is missing, and fits a ``TfidfVectorizer`` limited to
    500 features with English stop words removed.

    Returns:
        The fitted vectorizer's feature names (an array of strings), as
        returned by ``get_feature_names_out()``.

    Note:
        ``max_features=500`` keeps the 500 terms with the highest term
        frequency across the corpus; it does not rank terms by TF-IDF score.
    """
    # Only the 'text' column is used, so skip parsing the other columns of
    # this large CSV.
    wel_fake_data = pd.read_csv('WELFake_Dataset.csv', usecols=['text'])
    texts = wel_fake_data['text'].dropna()  # Remove rows with missing 'text'

    # Only the learned vocabulary is needed, so fit without keeping the
    # transformed document-term matrix.
    vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
    vectorizer.fit(texts)

    return vectorizer.get_feature_names_out()


# Load the keyword vocabulary from the WELFake dataset once, at import time
top_keywords = load_data()
45
+
46
+
47
# Function to extract keywords using spaCy and matching them with TF-IDF keywords
def extract_keywords(text):
    """Extract keywords from ``text``.

    Two sources are combined:

    * spaCy tokens that are alphabetic, not stop words, and tagged as
      ``NOUN`` or ``PROPN``;
    * entries of the module-level ``top_keywords`` vocabulary that occur in
      ``text`` as whole words.

    Args:
        text: The news content to analyse. ``None`` or an empty string yields
            an empty list.

    Returns:
        A list of keyword strings without case-insensitive duplicates, in a
        deterministic order: spaCy keywords in document order first, then
        vocabulary matches in vocabulary order.
    """
    import re  # local import so this block does not depend on a file-level import

    if not text:
        return []

    # Use spaCy to extract keywords (nouns and proper nouns)
    doc = nlp(text)
    spacy_keywords = [
        token.text
        for token in doc
        if token.is_alpha and not token.is_stop and token.pos_ in ('NOUN', 'PROPN')
    ]

    # Match vocabulary terms against whole words, not substrings: a substring
    # test would report "art" for a text that only contains "party". The
    # pattern mirrors TfidfVectorizer's default token pattern (2+ word chars).
    # The word set is built once instead of lower-casing the text per keyword.
    words_in_text = set(re.findall(r"\b\w\w+\b", text.lower()))
    tfidf_keywords = [str(kw) for kw in top_keywords if kw.lower() in words_in_text]

    # Merge both sources, dropping duplicates that differ only by case
    # (spaCy keeps the original casing, e.g. "Trump" vs. "trump"), while
    # preserving first-seen order so the output is stable.
    all_keywords = []
    seen = set()
    for keyword in spacy_keywords + tfidf_keywords:
        key = keyword.lower()
        if key not in seen:
            seen.add(key)
            all_keywords.append(keyword)

    return all_keywords
61
+
62
+
63
# Function to predict whether the news is real or fake using the classification model
def predict(title, text):
    """Classify a news item as fake or real and extract its keywords.

    Args:
        title: The news headline. ``None`` is treated as an empty string.
        text: The news body. ``None`` is treated as an empty string.

    Returns:
        A ``(label, keywords)`` tuple where ``label`` is ``'Fake'`` (class
        index 0) or ``'Real'`` (any other index) and ``keywords`` is the
        result of ``extract_keywords(text)``.
    """
    title = title or ""
    text = text or ""

    # NOTE(review): the hamzab/roberta-fake-news-classification model card
    # documents the input layout "<title>TITLE<content>CONTENT<end>", so the
    # markers are used instead of a plain space join - confirm against the card.
    input_text = "<title>" + title + "<content>" + text + "<end>"

    # Tokenize the input and prepare it for the model. A single sequence needs
    # no padding, so avoid padding every request to the full 512 tokens.
    inputs = tokenizer(
        input_text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )

    # Set the model to evaluation mode
    model.eval()

    # Perform the prediction; argmax over logits selects the same class as
    # argmax over softmax probabilities.
    with torch.no_grad():
        logits = model(**inputs).logits
    prediction_value = torch.argmax(logits, dim=1).item()

    # Map the model's output to 'Fake' or 'Real'
    label = 'Fake' if prediction_value == 0 else 'Real'

    # Extract keywords from the input text
    keywords = extract_keywords(text)

    return label, keywords
98
+
99
+
100
# Function to generate fact-checking suggestions using OpenAI's completion API
def generate_suggestions(title, text, keywords):
    """Ask the OpenAI completions API for fact-checking suggestions.

    Args:
        title: The news headline.
        text: The news body.
        keywords: Iterable of keyword strings extracted from the text.

    Returns:
        The generated suggestion text with surrounding whitespace stripped,
        or ``"Unable to generate suggestions at this time."`` if the request
        fails for any reason (the error is printed).
    """
    # Construct the prompt based on the title, text, and keywords
    prompt = (
        "\n"
        "You are a specialist in fact-checking. Based on the title, text, and keywords of the fake news, "
        "please suggest some ways to know more about the facts. "
        "Please give recommendations that are easy to accept.\n"
        "\n"
        f"Keywords: {', '.join(keywords)}\n"
        f"Title: {title}\n"
        f"Text: {text}\n"
    )
    # "text-davinci-003" has been retired; "gpt-3.5-turbo-instruct" is the
    # documented replacement on the completions endpoint.
    request = {
        "model": "gpt-3.5-turbo-instruct",
        "prompt": prompt,
        "max_tokens": 150,
        "temperature": 0.7,
    }
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0 removed openai.Completion; the module-level client
            # uses the openai.api_key configured at import time.
            response = openai.completions.create(**request)
        else:
            # Legacy openai<1.0 interface.
            response = openai.Completion.create(**request)
        suggestions = response.choices[0].text.strip()
    except Exception as e:
        # Best-effort feature: never let a failed API call break the prediction.
        suggestions = "Unable to generate suggestions at this time."
        print(f"Error generating suggestions: {e}")
    return suggestions
123
+
124
+
125
# Main function that predicts and explains the results
def predict_and_explain(title, text):
    """Run the classifier and format the outcome as Markdown.

    Args:
        title: The news headline entered by the user.
        text: The news body entered by the user.

    Returns:
        A Markdown string with the prediction and the extracted keywords.
        When the item is classified as fake, fact-checking suggestions from
        ``generate_suggestions`` are appended.
    """
    verdict, found_keywords = predict(title, text)
    keyword_line = ', '.join(found_keywords)

    # Real news: only the prediction and the keywords are shown.
    if verdict != 'Fake':
        return (
            "\n"
            "**Prediction**: Real News\n"
            "\n"
            f"**Keywords**: {keyword_line}\n"
        )

    # Fake news: additionally ask for suggestions on how to verify the facts.
    advice = generate_suggestions(title, text, found_keywords)
    return (
        "\n"
        "**Prediction**: Fake News\n"
        "\n"
        f"**Keywords**: {keyword_line}\n"
        "\n"
        "**Suggestions**:\n"
        f"{advice}\n"
    )
148
+
149
+
150
# Gradio interface setup: two text inputs feed predict_and_explain, whose
# Markdown result is rendered as the single output.
_title_input = gr.Textbox(label="Title")  # news title
_text_input = gr.Textbox(label="Text", lines=10)  # news content

iface = gr.Interface(
    fn=predict_and_explain,
    inputs=[_title_input, _text_input],
    outputs="markdown",
    title="Fake News Detector with Suggestions",
    description="Enter the news title and content to check if it's fake. If fake, get suggestions on how to know more about the facts.",
)

# Launch the Gradio app
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ gradio
4
+ openai
5
+ spacy
6
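+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl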
7
+ pandas
8
+ scikit-learn