Engineer / app.py
atifsial123's picture
Update app.py
71db3e6 verified
raw
history blame
3.78 kB
# Install necessary libraries
import functools
import importlib.metadata
import os
import subprocess
import sys
def install(package):
    """Install *package* with pip unless that distribution is already present.

    Args:
        package: Distribution name as it would be given to ``pip install``
            (for example ``"scikit-learn"``, not the import name).

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits
            with a non-zero status.
    """
    try:
        # Query installed distribution metadata first so that a package that
        # is already present does not cost a pip subprocess on every start.
        importlib.metadata.version(package)
    except importlib.metadata.PackageNotFoundError:
        # Use the documented ``sys.executable`` (rather than reaching through
        # ``os.sys``) so pip runs under the interpreter executing this script.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
# Make sure each third-party package used by this script is installed before
# the import statements that follow are executed.
# NOTE(review): pd.read_excel on .xlsx files presumably also needs openpyxl,
# which is not in this list - confirm against the deployment environment.
for _package in ("transformers", "torch", "pandas", "scikit-learn", "gradio"):
    install(_package)
import os
import pandas as pd
import gradio as gr
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.model_selection import train_test_split
def list_to_dataframe(data_list):
    """Build a pandas DataFrame from *data_list*.

    Args:
        data_list: Row data handed straight to the ``pd.DataFrame``
            constructor (the caller in this file passes a list of dicts).

    Returns:
        The DataFrame constructed from ``data_list``.
    """
    return pd.DataFrame(data_list)
def load_dataset(file_path=None):
    """Load the dataset stored at *file_path* into a DataFrame.

    Args:
        file_path: Path of the dataset file. ``None`` selects the default
            ``/content/Valid-part-2.xlsx``. A path ending in ``.csv`` is read
            with ``pd.read_csv``; any other path is read with
            ``pd.read_excel``.

    Returns:
        The loaded DataFrame. When the path does not exist, a small built-in
        example DataFrame with ``text`` and ``label`` columns is returned
        instead. When the file exists but cannot be read, ``None``.
    """
    if file_path is None:
        # Default location used when the file was uploaded manually to Colab.
        file_path = '/content/Valid-part-2.xlsx'

    # Missing file: fall back to a tiny in-memory example data set.
    if file_path and not os.path.exists(file_path):
        print(f"File not found at '{file_path}', using default list data...")
        default_data = [
            {'text': 'Example sentence 1', 'label': 'label1'},
            {'text': 'Example sentence 2', 'label': 'label2'},
            # Add more example data as needed
        ]
        return list_to_dataframe(default_data)

    # Choose the reader from the file extension so CSV exports load too,
    # instead of always failing inside the Excel parser.
    extension = os.path.splitext(file_path)[1].lower()
    reader = pd.read_csv if extension == ".csv" else pd.read_excel

    try:
        df = reader(file_path)
    except Exception as e:
        # Best-effort boundary: callers treat ``None`` as "could not load".
        print(f"Error loading dataset: {e}")
        return None
    else:
        print("Columns in the dataset:", df.columns.tolist())
        return df
def preprocess_data(df):
    """Return *df* unchanged; placeholder hook for preprocessing steps.

    Args:
        df: The dataset to preprocess.

    Returns:
        The very same object that was passed in.
    """
    # No cleaning or tokenization is implemented yet; add those steps here.
    return df
def train_model(df):
    """Split *df* and load the pre-trained checkpoint (training placeholder).

    Args:
        df: Dataset passed to ``train_test_split``.

    Returns:
        The ``AutoModel`` loaded from ``Alibaba-NLP/gte-multilingual-base``.
    """
    checkpoint = "Alibaba-NLP/gte-multilingual-base"

    # 80/20 split with a fixed seed; neither part is consumed yet.
    train_part, test_part = train_test_split(df, test_size=0.2, random_state=42)

    # Tokenizer and model are fetched from Hugging Face for the checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    pretrained = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)

    # NOTE(review): no training step exists here, so the returned model is
    # the checkpoint exactly as loaded. Add the training code at this point.
    return pretrained
@functools.lru_cache(maxsize=1)
def _load_embedding_model():
    """Load the tokenizer and model once and reuse them on later calls."""
    checkpoint = "Alibaba-NLP/gte-multilingual-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)
    return tokenizer, model


def predict(input_text):
    """Run the pre-trained model on *input_text* (Gradio callback).

    Args:
        input_text: Text passed to the tokenizer.

    Returns:
        ``outputs.last_hidden_state`` from the model's forward pass.
    """
    # Loading from the hub is expensive, so the pair is cached instead of
    # being rebuilt for every request.
    tokenizer, model = _load_embedding_model()

    inputs = tokenizer(input_text, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Process the outputs further here if a different result is wanted.
    return outputs.last_hidden_state
def build_interface(file_path=None):
    """Load the data, run the pipeline steps and build the Gradio interface.

    Args:
        file_path: Dataset path forwarded to ``load_dataset``; ``None`` lets
            ``load_dataset`` pick its default.

    Returns:
        A ``gr.Interface`` wired to ``predict``, or ``None`` when the dataset
        could not be loaded.
    """
    df = load_dataset(file_path)
    if df is None:
        return None

    df = preprocess_data(df)
    # The returned model is not kept: ``predict`` obtains its own model.
    train_model(df)

    return gr.Interface(
        fn=predict,
        # ``gr.Textbox`` is the top-level component; the ``gr.inputs``
        # namespace is not available in current Gradio releases.
        inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
        outputs="text",
    )
# Script entry point: build the interface and launch it.
if __name__ == "__main__":
    # Set this to a specific dataset path if needed; None is passed through
    # to build_interface unchanged.
    file_path = None
    iface = build_interface(file_path=file_path)
    if not iface:
        print("Failed to build the Gradio interface. Please check the dataset and model.")
    else:
        iface.launch()