Spaces:
Running
Running
import gradio as gr | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
import torch | |
import openai | |
import os | |
import spacy | |
import subprocess | |
import sys | |
import pandas as pd | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
# Create the OpenAI client. The API key is read from the OPENAI_API_KEY
# environment variable (os.getenv returns None when it is unset).
# The file only does `import openai`, so the class is referenced through the
# module; a bare `OpenAI(...)` would raise NameError at start-up.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load the tokenizer and the pretrained classification model from the same
# checkpoint; the name is kept in one place so the two cannot drift apart.
_CLASSIFIER_CHECKPOINT = "hamzab/roberta-fake-news-classification"
tokenizer = AutoTokenizer.from_pretrained(_CLASSIFIER_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_CLASSIFIER_CHECKPOINT)

# Load spaCy model for keyword extraction
import spacy.cli

try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # If spaCy model is not available, download it and load it again
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')
# Load the WELFake dataset and extract its keyword vocabulary
def load_data(path='WELFake_Dataset.csv', max_features=500):
    """Return the keyword vocabulary learned from the WELFake dataset.

    Reads the CSV at *path*, discards rows whose 'text' value is missing, and
    fits a TfidfVectorizer (English stop words removed) whose vocabulary is
    capped at *max_features* terms.

    Args:
        path: CSV file that contains a 'text' column.
        max_features: Upper bound on the vocabulary size. Per the scikit-learn
            documentation, the terms kept are those with the highest term
            frequency across the corpus.

    Returns:
        The vocabulary terms, as returned by
        ``TfidfVectorizer.get_feature_names_out()``.
    """
    wel_fake_data = pd.read_csv(path)
    # Drop missing texts without mutating the DataFrame in place.
    texts = wel_fake_data['text'].dropna()

    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    # Only the learned vocabulary is used, so fit() is sufficient; the TF-IDF
    # matrix that fit_transform() would also build was never read.
    vectorizer.fit(texts)
    return vectorizer.get_feature_names_out()


# Load the keyword vocabulary from the WELFake dataset once, at import time
top_keywords = load_data()
# Function to extract keywords using spaCy and matching them with the dataset keywords
def extract_keywords(text):
    """Return the unique keywords found in *text*.

    Two sources are combined:

    * spaCy tokens that are alphabetic, not stop words, and tagged NOUN or
      PROPN (original spelling kept);
    * entries of the module-level ``top_keywords`` vocabulary that occur in
      *text* as a whole token, compared case-insensitively.

    Args:
        text: The article body to analyse.

    Returns:
        A list of keywords without exact duplicates, spaCy keywords first,
        each group in first-seen order.
    """
    doc = nlp(text)
    spacy_keywords = [
        token.text
        for token in doc
        if token.is_alpha and not token.is_stop and token.pos_ in ('NOUN', 'PROPN')
    ]

    # Match dataset keywords against whole tokens rather than substrings of the
    # raw text: a substring test reports "war" for "toward" or "use" for
    # "because". Building the lowercase token set once also avoids lowering the
    # full text again for every keyword.
    tokens_in_text = {token.text.lower() for token in doc}
    tfidf_keywords = [kw for kw in top_keywords if kw.lower() in tokens_in_text]

    # dict.fromkeys removes duplicates while keeping insertion order, so the
    # result is stable between runs (iteration order of a set of strings is not).
    return list(dict.fromkeys(spacy_keywords + tfidf_keywords))
# Function to predict whether the news is real or fake using the classification model
def predict(title, text):
    """Classify an article and collect its keywords.

    Args:
        title: Headline of the article.
        text: Body of the article.

    Returns:
        A ``(label, keywords)`` tuple. ``label`` is 'Fake' when the model's
        highest-probability class index is 0 and 'Real' otherwise;
        ``keywords`` comes from ``extract_keywords(text)`` (the title is not
        included in keyword extraction).
    """
    # The model sees the title and the body joined by a single space.
    # NOTE(review): the checkpoint's model card may prescribe a specific
    # title/content input template - confirm plain concatenation matches it.
    combined = title + " " + text

    # Truncate or pad to exactly 512 tokens and return PyTorch tensors.
    encoded = tokenizer.encode_plus(
        combined,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors="pt",
    )

    # Inference only: evaluation mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = torch.softmax(logits, dim=1)
        predicted_class = torch.argmax(class_probs, dim=1).item()

    label = 'Fake' if predicted_class == 0 else 'Real'
    return label, extract_keywords(text)
def generate_suggestions(title, text, keywords):
    """Ask the OpenAI chat model for fact-checking advice about an article.

    Args:
        title: Headline of the article.
        text: Body of the article.
        keywords: Iterable of keyword strings to include in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped, or the fixed
        message "Unable to generate suggestions at this time." when the
        request fails or the reply carries no text.
    """
    fallback = "Unable to generate suggestions at this time."

    # Construct the prompt for GPT based on the title, text, and keywords
    prompt = (
        "\n"
        "You are a specialist in fact-checking. Based on the title, text, and keywords of the fake news,\n"
        "please suggest some ways to know more about the facts. Please give recommendations that are easy to accept.\n"
        f"Keywords: {', '.join(keywords)}\n"
        f"Title: {title}\n"
        f"Text: {text}\n"
    )

    try:
        # NOTE(review): the prompt plus max_tokens must fit the model's context
        # window; very long articles may be rejected by the API - confirm.
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant specialized in fact-checking."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=4000,
            temperature=0.7,  # Controls the randomness in the generated text
        )
        # The client returns typed objects, so the reply text is an attribute.
        # Subscripting (message["content"]) raises TypeError, which the broad
        # handler below silently turned into the fallback on every call.
        content = response.choices[0].message.content
    except Exception as e:
        # Deliberately best-effort: a failed suggestion request must not break
        # the prediction shown to the user. Log the cause for debugging.
        print(f"Error generating suggestions: {e}")
        return fallback

    # The reply text can be absent; report the fallback instead of failing.
    if content is None:
        return fallback
    return content.strip()
# Main function that predicts and explains the results
def predict_and_explain(title, text):
    """Build the markdown report shown to the user.

    Args:
        title: Headline of the article.
        text: Body of the article.

    Returns:
        A markdown string with the prediction and the keywords; when the
        article is classified as 'Fake', generated suggestions are appended.
    """
    label, keywords = predict(title, text)
    keyword_line = ', '.join(keywords)

    # Real news: prediction and keywords only, no call to the suggestion model.
    if label != 'Fake':
        return (
            "\n"
            "**Prediction**: Real News\n"
            f"**Keywords**: {keyword_line}\n"
        )

    # Fake news: add suggestions on how to learn more about the facts.
    suggestions = generate_suggestions(title, text, keywords)
    return (
        "\n"
        "**Prediction**: Fake News\n"
        f"**Keywords**: {keyword_line}\n"
        "**Suggestions**:\n"
        f"{suggestions}\n"
    )
# Gradio interface setup: two text inputs feed predict_and_explain, whose
# markdown string is rendered as the output.
_title_box = gr.Textbox(label="Title")  # news title
_text_box = gr.Textbox(label="Text", lines=10)  # news content

iface = gr.Interface(
    fn=predict_and_explain,
    inputs=[_title_box, _text_box],
    outputs="markdown",
    title="Fake News Detector with Suggestions",
    description="Enter the news title and content to check if it's fake. If fake, get suggestions on how to know more about the facts.",
)

# Launch the Gradio app
iface.launch()