"""Streamlit app: solve math word problems with WizardMath-7B loaded in 4-bit."""

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "WizardLM/WizardMath-7B-V1.1"

# 4-bit NF4 quantization (bitsandbytes) so the 7B model fits on a single GPU.
# Note: 4-bit quantized inference requires a CUDA GPU; it will not run on CPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)


@st.cache_resource
def load_model():
    """Load the tokenizer and quantized model once and reuse them across Streamlit reruns."""
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
    )
    return tokenizer, model


tokenizer, model_4bit = load_model()

# Example question: "Sum of two numbers is 20 and difference is 4. What are the numbers?"
text = st.text_area("Enter question")

if text:
    # Tokenize the question and move the tensors to the model's device.
    inputs = tokenizer(text, return_tensors="pt").to(model_4bit.device)
    outputs = model_4bit.generate(**inputs, max_new_tokens=512)
    st.write(tokenizer.decode(outputs[0], skip_special_tokens=True))
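# Usage (a minimal sketch, assuming the script is saved as app.py -- the
# filename is illustrative, not part of the original source):
#     streamlit run app.py
# Streamlit reruns the script on every widget interaction; @st.cache_resource
# above keeps the model in memory instead of reloading it each time.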