Spaces:

Zeamays3427
/

fake-news-debunker

Running

File size: 6,015 Bytes

import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import openai
import os
import spacy
import subprocess
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the OpenAI client; the API key is read from the OPENAI_API_KEY
# environment variable. The class is referenced through the `openai` module
# because this file imports the module, not the `OpenAI` name itself.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Tokenizer and sequence-classification model are both loaded from the same
# pretrained checkpoint, so the identifier is spelled out only once.
MODEL_CHECKPOINT = "hamzab/roberta-fake-news-classification"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

# spaCy pipeline used for keyword extraction. If loading fails with OSError
# (pipeline not installed), download it and load it again.
import spacy.cli

SPACY_MODEL = 'en_core_web_sm'

try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)


# Build the keyword vocabulary from the WELFake dataset
def load_data(csv_path='WELFake_Dataset.csv', max_features=500):
    """Fit a TF-IDF vectorizer on the dataset and return its vocabulary.

    Args:
        csv_path: Path to a CSV file that has a 'text' column.
        max_features: Upper bound on the vocabulary size. scikit-learn keeps
            the terms with the highest term frequency across the corpus.

    Returns:
        The array of feature names produced by
        ``TfidfVectorizer.get_feature_names_out()`` (English stop words
        excluded).
    """
    # Only the 'text' column is used, so skip parsing every other column.
    wel_fake_data = pd.read_csv(csv_path, usecols=['text'])
    texts = wel_fake_data['text'].dropna()  # Drop rows with missing 'text'

    # Only the learned vocabulary is needed, so fit without keeping the
    # transformed document-term matrix.
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    vectorizer.fit(texts)

    return vectorizer.get_feature_names_out()


# Keyword vocabulary returned by load_data(), computed once when the module is
# loaded; extract_keywords() reads this global on every call.
top_keywords = load_data()


# Function to extract keywords using spaCy and matching them with TF-IDF keywords
def extract_keywords(text):
    """Return the keywords found in *text*, without duplicates.

    Two sources are combined:

    * spaCy: alphabetic, non-stop-word tokens tagged NOUN or PROPN, kept in
      their original casing;
    * the module-level ``top_keywords`` vocabulary: every entry that occurs
      in *text* as a whole word (case-insensitive).

    Args:
        text: The news body to analyse.

    Returns:
        A list of keyword strings in first-seen order (spaCy matches first,
        then vocabulary matches). Entries that differ only by case are
        reported once, using the first spelling encountered.
    """
    import re  # local import: the file's import header does not provide it

    doc = nlp(text)
    spacy_keywords = [
        token.text
        for token in doc
        if token.is_alpha and not token.is_stop and token.pos_ in ('NOUN', 'PROPN')
    ]

    # Match vocabulary entries against whole words rather than substrings, so a
    # short vocabulary term is not reported just because it appears inside a
    # longer word. The pattern (two or more word characters) mirrors
    # TfidfVectorizer's default token pattern. The text is lowercased once
    # instead of once per vocabulary entry.
    words_in_text = set(re.findall(r"\b\w\w+\b", text.lower()))
    tfidf_keywords = [kw for kw in top_keywords if kw.lower() in words_in_text]

    # Deduplicate case-insensitively while keeping a deterministic order
    # (a plain set() would return the keywords in arbitrary order).
    unique = {}
    for keyword in spacy_keywords + tfidf_keywords:
        unique.setdefault(keyword.lower(), keyword)

    return list(unique.values())


# Function to predict whether the news is real or fake using the classification model
def predict(title, text):
    """Classify a news item and extract its keywords.

    Args:
        title: Headline of the news item; ``None`` is treated as empty.
        text: Body of the news item; ``None`` is treated as empty.

    Returns:
        A ``(label, keywords)`` tuple. ``label`` is ``'Fake'`` when the model's
        highest-scoring class index is 0 and ``'Real'`` otherwise; ``keywords``
        is the result of ``extract_keywords(text)`` (the title is not used for
        keyword extraction).
    """
    # Guard against missing values so the concatenation below cannot fail.
    title = title or ""
    text = text or ""

    # NOTE(review): title and body are joined with a single space. Confirm
    # against the checkpoint's model card whether it expects a specific input
    # template (e.g. marker strings around title and content).
    input_text = title + " " + text

    # Truncate to 512 tokens. No padding is requested: a single sequence needs
    # none, and padding to the maximum length only adds masked positions for
    # the model to process.
    inputs = tokenizer(
        input_text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )

    # Set the model to evaluation mode
    model.eval()

    # Inference only, so gradients are not tracked. The arg-max of the logits
    # equals the arg-max of their softmax, so no softmax is needed for a label.
    with torch.no_grad():
        logits = model(**inputs).logits
    prediction_value = torch.argmax(logits, dim=1).item()

    # Map the model's output to 'Fake' or 'Real'
    label = 'Fake' if prediction_value == 0 else 'Real'

    # Extract keywords from the input text
    keywords = extract_keywords(text)

    return label, keywords


def generate_suggestions(title, text, keywords):
    """Ask the OpenAI chat model for fact-checking suggestions.

    Args:
        title: Headline of the news item.
        text: Body of the news item.
        keywords: Iterable of keyword strings included in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped, or the
        message "Unable to generate suggestions at this time." when the
        request fails or the reply is empty. This function does not raise;
        failures are printed to the console.
    """
    # Construct the prompt for GPT based on the title, text, and keywords
    prompt = f"""
    You are a specialist in fact-checking. Based on the title, text, and keywords of the fake news, 
    please suggest some ways to know more about the facts. Please give recommendations that are easy to accept.

    Keywords: {', '.join(keywords)}
    Title: {title}
    Text: {text}
    """

    fallback = "Unable to generate suggestions at this time."

    try:
        # NOTE(review): the prompt length plus max_tokens has to fit the
        # model's context window; very long articles may be rejected by the
        # API. Confirm the limits for the chosen model.
        response = client.chat.completions.create(
            model="gpt-4",  # Using the GPT-4 model
            messages=[
                {"role": "system", "content": "You are a helpful assistant specialized in fact-checking."},  # System role definition
                {"role": "user", "content": prompt}  # User input (the constructed prompt)
            ],
            max_tokens=4000,  # Set the maximum token limit to 4000
            temperature=0.7,  # Controls the randomness in the generated text
        )

        # The client returns message objects, not dicts: the reply text is the
        # `content` attribute (subscripting the message raises TypeError).
        content = response.choices[0].message.content

    except Exception as e:
        # Broad catch on purpose: this is the UI boundary, and any failure
        # should degrade to the fallback message instead of crashing the app.
        print(f"Error generating suggestions: {e}")  # Debug: print the error details to the console
        return fallback

    # `content` may be None or blank; show the fallback rather than nothing.
    suggestions = (content or "").strip()
    return suggestions or fallback


# Entry point for the UI: classify the news item and format the result as markdown
def predict_and_explain(title, text):
    """Return a markdown report for the given news title and text.

    The report always contains the prediction and the extracted keywords.
    When the prediction is 'Fake', it also contains the suggestions produced
    by ``generate_suggestions``.
    """
    label, keywords = predict(title, text)
    keyword_line = ', '.join(keywords)

    # Anything other than 'Fake' is reported as real news, without suggestions
    if label != 'Fake':
        return (
            "\n**Prediction**: Real News\n\n"
            f"**Keywords**: {keyword_line}\n"
        )

    advice = generate_suggestions(title, text, keywords)
    return (
        "\n**Prediction**: Fake News\n\n"
        f"**Keywords**: {keyword_line}\n\n"
        f"**Suggestions**:\n{advice}\n"
    )


# Gradio UI: two text inputs (title and body) are passed to predict_and_explain,
# and its return value is rendered as markdown.
title_input = gr.Textbox(label="Title")
text_input = gr.Textbox(label="Text", lines=10)

iface = gr.Interface(
    fn=predict_and_explain,
    inputs=[title_input, text_input],
    outputs="markdown",
    title="Fake News Detector with Suggestions",
    description="Enter the news title and content to check if it's fake. If fake, get suggestions on how to know more about the facts.",
)

# Start serving the app
iface.launch()