import gradio as gr
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle
import sys

def greet(name):
    """Return the greeting ``"Hello <name>!!"`` for the given name string."""
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)

# load the CountVectorizer from disk
# NOTE: unpickling can execute arbitrary code; these files must come from a
# trusted source. The `with` blocks close each file handle once it is loaded.
with open('countVectTrain.pkl', 'rb') as cv_file:
    cv = pickle.load(cv_file)

# load the model from disk
filename = 'corona_pred.pkl'
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

# default k-mer word length (6 = hexamer words)
kmer_size = 6


def getKmers(sequence, size=kmer_size):
    """Split a sequence into overlapping, lower-cased k-mer words.

    A window of ``size`` characters is slid over ``sequence`` one position
    at a time; each window is lower-cased and collected in order.

    Args:
        sequence: the sequence string to split.
        size: length of each k-mer word (defaults to ``kmer_size``).

    Returns:
        A list of k-mer strings; empty when the sequence is shorter than
        ``size``.
    """
    window_count = len(sequence) - size + 1
    kmers = []
    for start in range(window_count):
        stop = start + size
        kmers.append(sequence[start:stop].lower())
    return kmers

# prediction function served by the Gradio interface
def classify_sequence(sequence):
    """Predict the class of a sequence using the loaded vectorizer and model.

    The sequence is split into overlapping k-mer words with ``getKmers``,
    the words are joined into one space-separated string, that string is
    vectorized with the module-level ``cv`` and scored with the module-level
    ``model``.

    Args:
        sequence: the sequence string to classify.

    Returns:
        A dict with two keys:
        ``'predicted_class'`` - the first label returned by ``model.predict``;
        ``'probability'`` - the largest value returned by
        ``model.predict_proba``, multiplied by 100.
        Both values are converted to built-in Python types when they are
        NumPy scalars.
    """
    # convert the input sequence into k-mer words
    words = getKmers(sequence)
    # join the k-mer words into a single space-separated string
    text = ' '.join(words)
    # vectorize the text using Count Vectorization
    X = cv.transform([text])
    # make predictions using the pre-trained model
    pred_label = model.predict(X)[0]
    pred_prob_percentage = model.predict_proba(X).max() * 100
    # Convert NumPy scalars to plain Python values: NumPy >= 2 renders scalars
    # as e.g. "np.float64(99.9)" in repr(), which leaks into the output if the
    # dict is shown as text (presumably via str() - confirm against Gradio).
    if hasattr(pred_label, 'item'):
        pred_label = pred_label.item()
    pred_prob_percentage = float(pred_prob_percentage)
    # return the predicted class and probability
    return {'predicted_class': pred_label, 'probability': pred_prob_percentage}

# Build the web UI: one text input passed to classify_sequence, one text output.
iface = gr.Interface(
    fn=classify_sequence,
    inputs="text",
    outputs=["text"],
    title="Coronavirus Sequence Classifier",
    description="Enter a coronavirus sequence to predict its class and probability.",
)

# Start serving the interface.
iface.launch()