import os
from io import StringIO

import pandas as pd
import streamlit as st
import torch
from huggingface_hub import HfFolder
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    pipeline,
)

# Note: accelerate must be installed for device_map='auto' below, but none of
# its symbols need to be imported directly.

# Access the Hugging Face API token from environment variables
hf_token = os.getenv('HF_API_TOKEN')
if not hf_token:
    raise ValueError("Hugging Face API token is not set. Please set the HF_API_TOKEN environment variable.")
HfFolder.save_token(hf_token)

# Disable TensorFlow's oneDNN optimizations to avoid small floating-point
# round-off differences in logged values
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# Load the GPT-2 tokenizer and model (used only to expand the user's description)
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained('gpt2')
model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

# Create a pipeline for text generation using GPT-2
text_generator = pipeline("text-generation", model=model_gpt2, tokenizer=tokenizer_gpt2)

# Load the Llama-3 model and tokenizer once during startup
tokenizer_llama = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B", token=hf_token)
model_llama = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B",
    torch_dtype='auto',
    device_map='auto',
    token=hf_token
)
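# If the 8B model does not fit in available GPU memory, a 4-bit quantized load
# is one alternative (a minimal sketch, assuming the bitsandbytes package is
# installed; this is not part of the original setup):
#
#   from transformers import BitsAndBytesConfig
#
#   quant_config = BitsAndBytesConfig(load_in_4bit=True)
#   model_llama = AutoModelForCausalLM.from_pretrained(
#       "meta-llama/Meta-Llama-3.1-8B",
#       quantization_config=quant_config,
#       device_map='auto',
#       token=hf_token,
#   )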
# Prompt template for the Llama generation step
prompt_template = """\
You are an expert in generating synthetic data for machine learning models.
Your task is to generate a synthetic tabular dataset based on the description provided below.

Description: {description}

The dataset should include the following columns: {columns}

Please provide the data in CSV format with a minimum of 100 rows per generation.
Ensure that the data is realistic, does not contain any duplicate rows, and follows any specific conditions mentioned.

Example Description:
Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'

Example Output:
Size,Location,Number of Bedrooms,Price
1200,Suburban,3,250000
900,Urban,2,200000
1500,Rural,4,300000
...

Description: {description}
Columns: {columns}
Output:
"""


def preprocess_user_prompt(user_prompt):
    # Expand the user's short description with GPT-2 before inserting it into the template
    generated_text = text_generator(user_prompt, max_length=60, num_return_sequences=1, truncation=True)[0]["generated_text"]
    return generated_text


def format_prompt(description, columns):
    processed_description = preprocess_user_prompt(description)
    prompt = prompt_template.format(description=processed_description, columns=",".join(columns))
    return prompt


generation_params = {
    "top_p": 0.90,
    "temperature": 0.8,
    "max_new_tokens": 512,
}


# Generate one batch of synthetic data
def generate_synthetic_data(description, columns):
    try:
        formatted_prompt = format_prompt(description, columns)

        # Tokenize the prompt with truncation enabled
        inputs = tokenizer_llama(formatted_prompt, return_tensors="pt", truncation=True, max_length=512)

        # Move inputs to the same device as the model
        inputs = {k: v.to(model_llama.device) for k, v in inputs.items()}

        # Generate synthetic data. max_new_tokens (rather than max_length) leaves
        # room for output beyond the prompt, and do_sample=True is required for
        # top_p/temperature to have any effect.
        with torch.no_grad():
            outputs = model_llama.generate(
                **inputs,
                max_new_tokens=generation_params["max_new_tokens"],
                do_sample=True,
                top_p=generation_params["top_p"],
                temperature=generation_params["temperature"],
                num_return_sequences=1,
            )

        # Guard against meta tensors, which cannot be decoded
        if outputs.is_meta:
            raise ValueError("Output tensor is in meta state; check model and input.")

        # Decode only the newly generated tokens, skipping the echoed prompt so
        # the result is parseable as CSV
        prompt_length = inputs["input_ids"].shape[1]
        generated_text = tokenizer_llama.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        return generated_text
    except Exception as e:
        return f"Error: {e}"


def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
    data_frames = []
    num_iterations = num_rows // rows_per_generation

    # Create a progress bar
    progress_bar = st.progress(0)

    for i in tqdm(range(num_iterations)):
        generated_data = generate_synthetic_data(description, columns)
        print("Generated Data:\n", generated_data)
        if generated_data.startswith("Error"):
            return generated_data
        df_synthetic = process_generated_data(generated_data)
        data_frames.append(df_synthetic)

        # Update the progress bar
        progress_bar.progress((i + 1) / num_iterations)

    # Batches are generated independently, so drop any duplicate rows across them
    return pd.concat(data_frames, ignore_index=True).drop_duplicates().reset_index(drop=True)


def process_generated_data(csv_data):
    try:
        # Reject empty generations before attempting to parse
        if not csv_data.strip():
            raise ValueError("Generated data is empty.")

        data = StringIO(csv_data)
        df = pd.read_csv(data)

        print("DataFrame Shape:", df.shape)
        print("DataFrame Head:\n", df.head())

        if df.empty:
            raise ValueError("Generated DataFrame is empty.")

        return df
    except Exception as e:
        st.error(f"Error processing generated data: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on error


# Streamlit app interface
st.title("Synthetic Data Generator")

# Use placeholders rather than default values so the example text is never
# submitted as the actual input
description = st.text_input("Description", placeholder="e.g., Generate a dataset for predicting students' grades")
columns = st.text_input("Columns (comma-separated)", placeholder="e.g., name, age, course, grade")

if st.button("Generate"):
    description = description.strip()
    columns = [col.strip() for col in columns.split(',') if col.strip()]
    if not description or not columns:
        st.warning("Please provide both a description and at least one column name.")
    else:
        df_synthetic = generate_large_synthetic_data(description, columns)
        if isinstance(df_synthetic, str) and df_synthetic.startswith("Error"):
            st.error(df_synthetic)  # Display the error message from generation
        else:
            st.success("Synthetic Data Generated!")
            st.dataframe(df_synthetic)  # Display the generated DataFrame
            st.download_button(
                label="Download CSV",
                data=df_synthetic.to_csv(index=False),
                file_name="synthetic_data.csv",
                mime="text/csv"
            )
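# --- Usage (a sketch; assumes this script is saved as app.py and that the
# token is exported in the shell, neither of which is specified above) ---
#
#   export HF_API_TOKEN=<your token>
#   streamlit run app.py
#
# To exercise one generation round outside the UI (e.g., while tuning the
# prompt in a plain Python session), something like this works:
#
#   raw = generate_synthetic_data(
#       "Generate a dataset for predicting students' grades",
#       ["name", "age", "course", "grade"],
#   )
#   df = process_generated_data(raw)
#   print(df.head())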