Zeamays3427 committed on
Commit
c8e5a0b
·
verified ·
1 Parent(s): e96ec01

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -164
app.py CHANGED
@@ -1,164 +1,166 @@
1
- import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
- import torch
4
- import openai
5
- import os
6
- import spacy
7
- import subprocess
8
- import sys
9
- import pandas as pd
10
- from sklearn.feature_extraction.text import TfidfVectorizer
11
-
12
- # Set OpenAI API key from environment variables
13
- openai.api_key = os.getenv("OPENAI_API_KEY")
14
-
15
- # Load the tokenizer and the pretrained classification model
16
- tokenizer = AutoTokenizer.from_pretrained("hamzab/roberta-fake-news-classification")
17
- model = AutoModelForSequenceClassification.from_pretrained("hamzab/roberta-fake-news-classification")
18
-
19
- # Load spaCy model for keyword extraction
20
- try:
21
- nlp = spacy.load('en_core_web_sm')
22
- except:
23
- # If spaCy model is not available, download it
24
- subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
25
- nlp = spacy.load('en_core_web_sm')
26
-
27
-
28
- # Load the WELFake dataset and extract top 500 TF-IDF keywords
29
- def load_data():
30
- # Load WELFake dataset from CSV file
31
- wel_fake_data = pd.read_csv('WELFake_Dataset.csv')
32
- wel_fake_data.dropna(subset=['text'], inplace=True) # Remove rows with missing 'text'
33
-
34
- # Create a TF-IDF vectorizer and fit it on the dataset's text column
35
- vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
36
- X = vectorizer.fit_transform(wel_fake_data['text'])
37
-
38
- # Get the top 500 keywords from the dataset
39
- top_keywords = vectorizer.get_feature_names_out()
40
- return top_keywords
41
-
42
-
43
- # Load top TF-IDF keywords from the WELFake dataset
44
- top_keywords = load_data()
45
-
46
-
47
- # Function to extract keywords using spaCy and matching them with TF-IDF keywords
48
- def extract_keywords(text):
49
- # Use spaCy to extract keywords (nouns and proper nouns)
50
- doc = nlp(text)
51
- spacy_keywords = [token.text for token in doc if
52
- token.is_alpha and not token.is_stop and token.pos_ in ['NOUN', 'PROPN']]
53
-
54
- # Use TF-IDF to match keywords in the input text with the top keywords from the dataset
55
- tfidf_keywords = [kw for kw in top_keywords if kw.lower() in text.lower()]
56
-
57
- # Combine the keywords from both sources and remove duplicates
58
- all_keywords = list(set(spacy_keywords + tfidf_keywords))
59
-
60
- return all_keywords
61
-
62
-
63
- # Function to predict whether the news is real or fake using the classification model
64
- def predict(title, text):
65
- # Combine the title and text as input to the model
66
- input_text = title + " " + text
67
-
68
- # Tokenize the input and prepare it for the model
69
- inputs = tokenizer.encode_plus(
70
- input_text,
71
- add_special_tokens=True,
72
- max_length=512,
73
- truncation=True,
74
- padding='max_length',
75
- return_tensors="pt"
76
- )
77
-
78
- # Set the model to evaluation mode
79
- model.eval()
80
-
81
- # Perform the prediction using the model
82
- with torch.no_grad():
83
- outputs = model(**inputs)
84
- logits = outputs.logits
85
- probabilities = torch.softmax(logits, dim=1)
86
- prediction_value = torch.argmax(probabilities, dim=1).item()
87
-
88
- # Map the model's output to 'Fake' or 'Real'
89
- if prediction_value == 0:
90
- label = 'Fake'
91
- else:
92
- label = 'Real'
93
-
94
- # Extract keywords from the input text
95
- keywords = extract_keywords(text)
96
-
97
- return label, keywords
98
-
99
-
100
- # Function to generate fact-checking suggestions using OpenAI's GPT model
101
- def generate_suggestions(title, text, keywords):
102
- # Construct the prompt for GPT based on the title, text, and keywords
103
- prompt = f"""
104
- You are a specialist in fact-checking. Based on the title, text, and keywords of the fake news, please suggest some ways to know more about the facts. Please give recommendations that are easy to accept.
105
-
106
- Keywords: {', '.join(keywords)}
107
- Title: {title}
108
- Text: {text}
109
- """
110
- try:
111
- # Call the OpenAI API to generate suggestions
112
- response = openai.Completion.create(
113
- engine="gpt-4-2024-08-06",
114
- prompt=prompt,
115
- max_tokens=1000,
116
- temperature=0.7,
117
- )
118
- suggestions = response.choices[0].text.strip()
119
- except Exception as e:
120
- suggestions = "Unable to generate suggestions at this time."
121
- print(f"Error generating suggestions: {e}")
122
- return suggestions
123
-
124
-
125
- # Main function that predicts and explains the results
126
- def predict_and_explain(title, text):
127
- # Predict whether the news is real or fake, and extract keywords
128
- label, keywords = predict(title, text)
129
-
130
- # If the news is classified as fake, generate suggestions
131
- if label == 'Fake':
132
- suggestions = generate_suggestions(title, text, keywords)
133
- return f"""
134
- **Prediction**: Fake News
135
-
136
- **Keywords**: {', '.join(keywords)}
137
-
138
- **Suggestions**:
139
- {suggestions}
140
- """
141
- else:
142
- # If the news is real, just show the prediction and keywords
143
- return f"""
144
- **Prediction**: Real News
145
-
146
- **Keywords**: {', '.join(keywords)}
147
- """
148
-
149
-
150
- # Gradio interface setup
151
- iface = gr.Interface(
152
- fn=predict_and_explain, # The function to handle user input and return predictions
153
- inputs=[
154
- gr.Textbox(label="Title"), # Textbox for the news title
155
- gr.Textbox(label="Text", lines=10) # Textbox for the news content
156
- ],
157
- outputs="markdown", # Output format is markdown
158
- title="Fake News Detector with Suggestions", # Title of the Gradio app
159
- description="Enter the news title and content to check if it's fake. If fake, get suggestions on how to know more about the facts.",
160
- # Description of the app
161
- )
162
-
163
- # Launch the Gradio app
164
- iface.launch()
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
+ import torch
4
+ import openai
5
+ import os
6
+ import spacy
7
+ import subprocess
8
+ import sys
9
+ import pandas as pd
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+
12
+ # Set OpenAI API key from environment variables
13
+ openai.api_key = os.getenv("OPENAI_API_KEY")
14
+
15
+ # Load the tokenizer and the pretrained classification model
16
+ tokenizer = AutoTokenizer.from_pretrained("hamzab/roberta-fake-news-classification")
17
+ model = AutoModelForSequenceClassification.from_pretrained("hamzab/roberta-fake-news-classification")
18
+
19
+ # Load spaCy model for keyword extraction
20
+ import spacy.cli
21
+
22
+ try:
23
+ nlp = spacy.load('en_core_web_sm')
24
+ except OSError:
25
+ # If spaCy model is not available, download it
26
+ spacy.cli.download("en_core_web_sm")
27
+ nlp = spacy.load('en_core_web_sm')
28
+
29
+
30
+ # Load the WELFake dataset and extract top 500 TF-IDF keywords
31
+ def load_data():
32
+ # Load WELFake dataset from CSV file
33
+ wel_fake_data = pd.read_csv('WELFake_Dataset.csv')
34
+ wel_fake_data.dropna(subset=['text'], inplace=True) # Remove rows with missing 'text'
35
+
36
+ # Create a TF-IDF vectorizer and fit it on the dataset's text column
37
+ vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
38
+ X = vectorizer.fit_transform(wel_fake_data['text'])
39
+
40
+ # Get the top 500 keywords from the dataset
41
+ top_keywords = vectorizer.get_feature_names_out()
42
+ return top_keywords
43
+
44
+
45
+ # Load top TF-IDF keywords from the WELFake dataset
46
+ top_keywords = load_data()
47
+
48
+
49
+ # Function to extract keywords using spaCy and matching them with TF-IDF keywords
50
+ def extract_keywords(text):
51
+ # Use spaCy to extract keywords (nouns and proper nouns)
52
+ doc = nlp(text)
53
+ spacy_keywords = [token.text for token in doc if
54
+ token.is_alpha and not token.is_stop and token.pos_ in ['NOUN', 'PROPN']]
55
+
56
+ # Use TF-IDF to match keywords in the input text with the top keywords from the dataset
57
+ tfidf_keywords = [kw for kw in top_keywords if kw.lower() in text.lower()]
58
+
59
+ # Combine the keywords from both sources and remove duplicates
60
+ all_keywords = list(set(spacy_keywords + tfidf_keywords))
61
+
62
+ return all_keywords
63
+
64
+
65
+ # Function to predict whether the news is real or fake using the classification model
66
+ def predict(title, text):
67
+ # Combine the title and text as input to the model
68
+ input_text = title + " " + text
69
+
70
+ # Tokenize the input and prepare it for the model
71
+ inputs = tokenizer.encode_plus(
72
+ input_text,
73
+ add_special_tokens=True,
74
+ max_length=512,
75
+ truncation=True,
76
+ padding='max_length',
77
+ return_tensors="pt"
78
+ )
79
+
80
+ # Set the model to evaluation mode
81
+ model.eval()
82
+
83
+ # Perform the prediction using the model
84
+ with torch.no_grad():
85
+ outputs = model(**inputs)
86
+ logits = outputs.logits
87
+ probabilities = torch.softmax(logits, dim=1)
88
+ prediction_value = torch.argmax(probabilities, dim=1).item()
89
+
90
+ # Map the model's output to 'Fake' or 'Real'
91
+ if prediction_value == 0:
92
+ label = 'Fake'
93
+ else:
94
+ label = 'Real'
95
+
96
+ # Extract keywords from the input text
97
+ keywords = extract_keywords(text)
98
+
99
+ return label, keywords
100
+
101
+
102
+ # Function to generate fact-checking suggestions using OpenAI's GPT model
103
+ def generate_suggestions(title, text, keywords):
104
+ # Construct the prompt for GPT based on the title, text, and keywords
105
+ prompt = f"""
106
+ You are a specialist in fact-checking. Based on the title, text, and keywords of the fake news, please suggest some ways to know more about the facts. Please give recommendations that are easy to accept.
107
+
108
+ Keywords: {', '.join(keywords)}
109
+ Title: {title}
110
+ Text: {text}
111
+ """
112
+ try:
113
+ # Call the OpenAI API to generate suggestions
114
+ response = openai.Completion.create(
115
+ engine="gpt-4-2024-08-06",
116
+ prompt=prompt,
117
+ max_tokens=1000,
118
+ temperature=0.7,
119
+ )
120
+ suggestions = response.choices[0].text.strip()
121
+ except Exception as e:
122
+ suggestions = "Unable to generate suggestions at this time."
123
+ print(f"Error generating suggestions: {e}")
124
+ return suggestions
125
+
126
+
127
+ # Main function that predicts and explains the results
128
+ def predict_and_explain(title, text):
129
+ # Predict whether the news is real or fake, and extract keywords
130
+ label, keywords = predict(title, text)
131
+
132
+ # If the news is classified as fake, generate suggestions
133
+ if label == 'Fake':
134
+ suggestions = generate_suggestions(title, text, keywords)
135
+ return f"""
136
+ **Prediction**: Fake News
137
+
138
+ **Keywords**: {', '.join(keywords)}
139
+
140
+ **Suggestions**:
141
+ {suggestions}
142
+ """
143
+ else:
144
+ # If the news is real, just show the prediction and keywords
145
+ return f"""
146
+ **Prediction**: Real News
147
+
148
+ **Keywords**: {', '.join(keywords)}
149
+ """
150
+
151
+
152
+ # Gradio interface setup
153
+ iface = gr.Interface(
154
+ fn=predict_and_explain, # The function to handle user input and return predictions
155
+ inputs=[
156
+ gr.Textbox(label="Title"), # Textbox for the news title
157
+ gr.Textbox(label="Text", lines=10) # Textbox for the news content
158
+ ],
159
+ outputs="markdown", # Output format is markdown
160
+ title="Fake News Detector with Suggestions", # Title of the Gradio app
161
+ description="Enter the news title and content to check if it's fake. If fake, get suggestions on how to know more about the facts.",
162
+ # Description of the app
163
+ )
164
+
165
+ # Launch the Gradio app
166
+ iface.launch()