# Spaces: Sleeping  (hosting-page status text captured with the source; not program code)
# Install necessary libraries | |
import functools
import importlib.metadata
import os
import subprocess
import sys
def install(package):
    """Install *package* with pip unless a distribution of that name is already present.

    Args:
        package: Distribution name as it would be passed to ``pip install``
            (for example ``"scikit-learn"``).

    Raises:
        subprocess.CalledProcessError: If the pip subprocess exits with a
            non-zero status.
    """
    try:
        # Look the distribution up by its pip name, which can differ from the
        # import name (``scikit-learn`` is imported as ``sklearn``).
        importlib.metadata.version(package)
    except importlib.metadata.PackageNotFoundError:
        # Run pip through the current interpreter so the package lands in the
        # environment this script is executing in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
# Make sure every third-party dependency is present before it gets imported.
for _package in ("transformers", "torch", "pandas", "scikit-learn", "gradio"):
    install(_package)
import os | |
import pandas as pd | |
import gradio as gr | |
from transformers import AutoModel, AutoTokenizer | |
import torch | |
from sklearn.model_selection import train_test_split | |
def list_to_dataframe(data_list):
    """Build a pandas DataFrame from *data_list*.

    Args:
        data_list: Row data handed straight to ``pd.DataFrame`` (presumably a
            list of dicts or tuples; nothing here enforces that).

    Returns:
        The ``pd.DataFrame`` constructed from ``data_list``.
    """
    return pd.DataFrame(data_list)
def load_dataset(file_path=None):
    """Load the dataset from an Excel file, with a built-in fallback sample.

    Args:
        file_path: Path to an Excel file. ``None`` or an empty string selects
            the default location ``/content/Valid-part-2.xlsx`` (where the
            file ends up when uploaded manually to Colab).

    Returns:
        A ``pd.DataFrame`` read from the file; a small example DataFrame when
        the path does not exist; or ``None`` when the file exists but cannot
        be read.
    """
    # Treat "" like None: an empty path used to skip the existence check and
    # fail inside read_excel instead of taking the documented fallback.
    if not file_path:
        file_path = '/content/Valid-part-2.xlsx'

    if not os.path.exists(file_path):
        print(f"File not found at '{file_path}', using default list data...")
        # Fallback sample so the rest of the pipeline still has rows to use.
        default_data = [
            {'text': 'Example sentence 1', 'label': 'label1'},
            {'text': 'Example sentence 2', 'label': 'label2'},
            # Add more example data as needed
        ]
        return list_to_dataframe(default_data)

    try:
        df = pd.read_excel(file_path)
    except Exception as e:
        # Deliberately broad: callers treat None as "dataset unavailable".
        print(f"Error loading dataset: {e}")
        return None

    print("Columns in the dataset:", df.columns.tolist())
    return df
def preprocess_data(df):
    """Placeholder preprocessing hook: currently hands *df* back unchanged.

    Args:
        df: Dataset to preprocess.

    Returns:
        The same object that was passed in.
    """
    # TODO: add preprocessing steps here (cleaning, tokenization, etc.).
    processed = df
    return processed
def train_model(df):
    """Split *df* and load the pre-trained checkpoint (no training happens yet).

    Args:
        df: Dataset passed to ``train_test_split``.

    Returns:
        The ``AutoModel`` loaded from ``Alibaba-NLP/gte-multilingual-base``.
    """
    checkpoint = "Alibaba-NLP/gte-multilingual-base"

    # 80/20 split with a fixed seed; neither part is consumed yet.
    train_part, test_part = train_test_split(df, random_state=42, test_size=0.2)

    # Tokenizer first, then the model, both from the same Hugging Face checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    pretrained = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)

    # TODO: training code goes here (tokenize the data and feed it to the model).
    return pretrained
@functools.lru_cache(maxsize=1)
def _load_model_and_tokenizer(model_name="Alibaba-NLP/gte-multilingual-base"):
    """Load the tokenizer and model for *model_name* once and reuse them afterwards."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    return tokenizer, model


def predict(input_text):
    """Run *input_text* through the model and return its last hidden state.

    Args:
        input_text: Text handed to the tokenizer.

    Returns:
        ``outputs.last_hidden_state`` from the model's forward pass.
    """
    # Cached loader: re-reading the checkpoint on every request would repeat
    # the same work for each prediction.
    tokenizer, model = _load_model_and_tokenizer()

    # Tokenize to PyTorch tensors and run a gradient-free forward pass.
    inputs = tokenizer(input_text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Process the outputs as needed (e.g., extracting relevant information).
    return outputs.last_hidden_state
def build_interface(file_path=None):
    """Load the dataset, run the training step, and build the Gradio interface.

    Args:
        file_path: Optional dataset path forwarded to ``load_dataset``.

    Returns:
        A ``gr.Interface`` wrapping ``predict``, or ``None`` when
        ``load_dataset`` returned ``None``.
    """
    df = load_dataset(file_path)
    if df is None:
        return None

    df = preprocess_data(df)
    # The returned model is not used here: ``predict`` loads the checkpoint
    # itself, so only the call is kept.
    train_model(df)

    return gr.Interface(
        fn=predict,
        # ``gr.inputs.Textbox`` was removed in Gradio 4; the top-level
        # component accepts the same ``lines``/``placeholder`` arguments.
        inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
        outputs="text",
    )
# Script entry point: build the interface and launch it.
if __name__ == "__main__":
    # Leave as None to use the default dataset location, or set a specific path.
    file_path = None
    iface = build_interface(file_path=file_path)
    if not iface:
        print("Failed to build the Gradio interface. Please check the dataset and model.")
    else:
        iface.launch()