#%%writefile debias_app.py
import streamlit as st
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    pipeline as tf_pipeline,
)
import transformers
import torch
import pandas as pd


# Define the BiasPipeline class with text processing methods
class BiasPipeline:
    def __init__(self):
        # Load models and tokenizers
        self.load_resources()

    def load_resources(self):
        """Load models and tokenizers."""
        self.classifier_tokenizer = AutoTokenizer.from_pretrained("newsmediabias/UnBIAS-classification-bert")
        self.classifier_model = AutoModelForSequenceClassification.from_pretrained("newsmediabias/UnBIAS-classification-bert")
        # self.ner_tokenizer = AutoTokenizer.from_pretrained("newsmediabias/UnBIAS-Named-Entity-Recognition")
        # self.ner_model = AutoModelForTokenClassification.from_pretrained("newsmediabias/UnBIAS-Named-Entity-Recognition")
        self.ner_tokenizer = AutoTokenizer.from_pretrained("newsmediabias/UnBIAS-NER")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("newsmediabias/UnBIAS-NER")
        self.classifier = tf_pipeline("text-classification", model=self.classifier_model, tokenizer=self.classifier_tokenizer)
        self.ner = tf_pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)

    def clean_text(self, text):
        """Clean up the text by removing any redundant spaces."""
        return ' '.join(text.split())

    def process(self, texts):
        """Process texts to classify them and find named entities."""
        classification_results = self.classifier(texts)
        ner_results = self.ner(texts)
        return classification_results, ner_results


# Model setup for debiasing
debias_model = "newsmediabias/UnBIAS-LLama2-Debiaser-Chat-QLoRA"
debias_tokenizer = AutoTokenizer.from_pretrained(debias_model)
debias_pipeline = transformers.pipeline(
    "text-generation",
    model=debias_model,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Instruction for debiasing
instruction = ("Instruction: As a helpful, respectful and trustworthy debiasing assistant, your "
               "task is to receive a text and return its unbiased version, without adding any unrelated content "
               "or additional outputs.")


def get_debiased_sequence(prompt):
    """Generate a debiased version of the provided text using the debiasing pipeline."""
    # Wrap the instruction in Llama-2 chat-style system tags and the user text in [INST] ... [/INST]
    instruction_prefix = " <<SYS>> {instruction} <</SYS>> [INST]".format(instruction=instruction)
    instruction_suffix = "[/INST]"
    full_input_text = f"{instruction_prefix}{prompt}{instruction_suffix}"

    # Tokenize the full input text to calculate its length in tokens
    input_tokens = debias_tokenizer.encode(full_input_text)
    # Ensure max_length is greater than the number of input tokens
    max_length = len(input_tokens) + 50  # Add a buffer to accommodate generation without truncation

    try:
        sequences = debias_pipeline(
            full_input_text,
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            eos_token_id=debias_tokenizer.eos_token_id,
            max_length=max_length,  # Use the calculated max_length
        )
        if sequences:
            res = sequences[0]['generated_text']
            # The generated text echoes the prompt up to the [/INST] tag; keep only what follows it
            result_part = res.split('[/INST]')[-1]
            clean_result = ''.join(c for c in result_part if c.isprintable())
            return clean_result.strip()
    except RuntimeError as e:
        if 'CUDA out of memory' in str(e):
            torch.cuda.empty_cache()  # Try clearing the cache to free up memory
            return "Error: Out of memory. Please try again with shorter input or less complex instructions."
        else:
            raise e  # Re-raise the exception if it's not a memory error
    return "No output generated. Check model configuration or input."
# Initialize the BiasPipeline
pipeline = BiasPipeline()

# Streamlit interface
st.title('UnBIAS App')

example_sentences = [
    "Women are just too emotional to be leaders.",
    "All young people are lazy and addicted to their phones.",
    "People from that country are always dishonest and corrupt.",
    "Men are inherently better at science and math than women.",
    "The elderly are just burdens to society; they contribute nothing.",
    "All poor people are lazy and don't want to work.",
    "People who follow that religion are all terrorists.",
    "Immigrants are taking all our jobs and ruining the country.",
    "All wealthy people achieved success only through manipulation and deceit.",
    "People with tattoos are not professional.",
    "All politicians are liars and cannot be trusted.",
    "Blue-collar workers don't have any real skills.",
    "Anyone who doesn't attend college is unintelligent.",
    "Only people from the city are sophisticated.",
    "Rural folks have backward mindsets and don't understand progress.",
    "All artists are always broke and unreliable.",
    "Anyone from that region is uneducated and narrow-minded.",
    "People without jobs are simply not trying hard enough.",
]

selected_sentence = st.selectbox("Choose an example or type your own below:", [""] + example_sentences)
input_text = st.text_area("Enter text:", selected_sentence, height=150)

if st.button("Process Text"):
    if input_text:
        cleaned_text = pipeline.clean_text(input_text)
        classification_results, ner_results = pipeline.process(cleaned_text)
        label = classification_results[0]['label']
        score = classification_results[0]['score']
        st.write(f"**Classification:** {label} (Confidence: {score:.2f})")
        biased_words = [result['word'] for result in ner_results if result['entity'].startswith('B-BIAS')]
        st.write("**Biased Words Identified:**", ", ".join(biased_words))
        # Assumption: also display the debiased rewrite, since the debiasing pipeline defined above is otherwise unused
        debiased_text = get_debiased_sequence(cleaned_text)
        st.write("**Debiased Text:**", debiased_text)
    else:
        st.write("Please enter some text to process.")

# Disclaimer
st.info("Disclaimer: Please note that while this tool aims to identify and highlight biased language, "
        "no automated system is perfect. The detection of bias depends on various factors, including the context, "
        "the training data used for the models, and the inherent limitations of natural language processing "
        "technologies. As such, some biases may not be detected, and all results should be reviewed critically by human users.")