File size: 3,782 Bytes
823ded0
 
 
 
 
 
 
 
 
 
 
 
 
 
a78f83f
f13a3ca
 
a78f83f
c12ca9b
 
01df9cf
a21cc8f
 
 
 
 
f13a3ca
a21cc8f
 
 
 
01df9cf
 
a21cc8f
 
 
 
 
 
 
 
 
68b29f4
a78f83f
 
 
 
 
 
 
e403126
c12ca9b
 
01df9cf
 
c12ca9b
 
 
 
 
 
a78f83f
c12ca9b
71db3e6
 
c12ca9b
01df9cf
 
c12ca9b
 
 
 
 
71db3e6
 
c12ca9b
 
 
 
 
 
 
 
9e57aa8
c12ca9b
a21cc8f
 
a78f83f
 
c12ca9b
 
 
a78f83f
e403126
c12ca9b
 
 
e403126
 
 
c12ca9b
e403126
a21cc8f
 
 
c12ca9b
 
a78f83f
c12ca9b
68b29f4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Install necessary libraries
import os
import subprocess

# Install a package with pip, using the interpreter that runs this script
def install(package):
    """Install *package* into the running interpreter's environment via pip.

    pip is invoked unconditionally; skipping a requirement that is already
    satisfied is left to pip itself.

    Args:
        package: Requirement specifier passed verbatim to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Imported locally so the helper does not depend on a later import block.
    import sys

    # ``sys.executable`` is the documented interpreter path; ``os.sys`` only
    # works because ``os`` happens to import ``sys`` internally.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Ensure the necessary packages are installed
# Installed one by one, in this order, before the third-party imports run.
for _package in (
    "transformers",
    "torch",
    "pandas",
    "scikit-learn",
    "gradio",
):
    install(_package)
import functools
import os

import gradio as gr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer

# Function to convert a list to a DataFrame
def list_to_dataframe(data_list):
    """Build a pandas DataFrame from ``data_list``.

    The argument is handed straight to the ``pd.DataFrame`` constructor, so
    anything that constructor accepts works (the caller in this file passes a
    list of dicts).

    Args:
        data_list: Row data to wrap.

    Returns:
        The resulting ``pd.DataFrame``.
    """
    return pd.DataFrame(data_list)

# Load your dataset from a file
def load_dataset(file_path=None):
    """Load the dataset as a DataFrame.

    Args:
        file_path: Path to an Excel file. ``None`` selects the default
            ``/content/Valid-part-2.xlsx`` (a manually uploaded Colab file).

    Returns:
        The DataFrame read from the file; a two-row example DataFrame when a
        non-empty path does not exist on disk; ``None`` when reading the file
        raises.
    """
    path = '/content/Valid-part-2.xlsx' if file_path is None else file_path

    # An empty path skips the existence check and goes straight to the reader.
    missing = bool(path) and not os.path.exists(path)
    if missing:
        print(f"File not found at '{path}', using default list data...")
        # Placeholder rows so the rest of the pipeline still has data to use.
        placeholder_rows = [
            {'text': 'Example sentence 1', 'label': 'label1'},
            {'text': 'Example sentence 2', 'label': 'label2'},
            # Add more example data as needed
        ]
        return list_to_dataframe(placeholder_rows)

    try:
        frame = pd.read_excel(path)
        print("Columns in the dataset:", frame.columns.tolist())
    except Exception as err:
        # Any read failure is reported and signalled to the caller as None.
        print(f"Error loading dataset: {err}")
        return None
    return frame

# Preprocess the data
def preprocess_data(df):
    """Placeholder preprocessing hook; currently returns ``df`` untouched.

    Cleaning, tokenization or similar steps are meant to be added here.

    Args:
        df: The loaded dataset.

    Returns:
        The very same object that was passed in.
    """
    processed = df  # no transformation implemented yet
    return processed

# Train your model
def train_model(df):
    """Split ``df`` and load the pre-trained checkpoint (training is a stub).

    No training happens yet: the split and the tokenizer are created but not
    consumed, and the model is returned exactly as loaded.

    Args:
        df: Dataset to split into train and test parts.

    Returns:
        The ``AutoModel`` loaded from ``Alibaba-NLP/gte-multilingual-base``.
    """
    checkpoint = "Alibaba-NLP/gte-multilingual-base"

    # 80/20 split with a fixed seed so repeated runs split identically.
    train_part, test_part = train_test_split(df, random_state=42, test_size=0.2)

    # NOTE(review): trust_remote_code=True allows code shipped with the
    # checkpoint to run locally - confirm this is acceptable for deployment.
    tok = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    encoder = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)

    # Add your training code here
    # This may involve tokenizing the data and feeding it into the model
    return encoder

@functools.lru_cache(maxsize=1)
def _load_model_and_tokenizer():
    """Load the tokenizer and model once; later calls reuse the same pair."""
    tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
    model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
    return tokenizer, model


# Define the Gradio interface function
def predict(input_text):
    """Run the embedding model on ``input_text``.

    The tokenizer and model are obtained from a cached loader, so they are
    loaded on the first request only instead of on every call.

    Args:
        input_text: Text entered in the Gradio textbox.

    Returns:
        ``outputs.last_hidden_state`` of the model's forward pass
        (presumably a tensor of per-token hidden states - TODO confirm shape).
    """
    tokenizer, model = _load_model_and_tokenizer()

    # Tokenize input and make predictions
    inputs = tokenizer(input_text, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    # Process the outputs as needed (e.g., extracting relevant information)
    return outputs.last_hidden_state

# Build the Gradio interface
def build_interface(file_path=None):
    """Load and prepare the dataset, then build the Gradio interface.

    Args:
        file_path: Dataset path forwarded to ``load_dataset``; ``None`` lets
            that function pick its default.

    Returns:
        A ``gr.Interface`` that calls ``predict`` on a two-line textbox, or
        ``None`` when the dataset could not be loaded.
    """
    df = load_dataset(file_path)  # Load your dataset
    if df is None:
        return None

    df = preprocess_data(df)  # Preprocess the dataset
    # The returned model is not used here: ``predict`` obtains its own model.
    # The call is kept so the pipeline runs exactly as before.
    train_model(df)  # Train your model

    # ``gr.inputs.Textbox`` is the deprecated namespace (removed in Gradio 4);
    # the component is available directly as ``gr.Textbox``.
    iface = gr.Interface(
        fn=predict,
        inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
        outputs="text",
    )
    return iface

# Run the Gradio interface
if __name__ == "__main__":
    # Set this to a concrete dataset path if needed; None leaves the choice
    # of path to load_dataset's built-in default.
    file_path = None
    iface = build_interface(file_path=file_path)
    if not iface:
        # build_interface returns None when the dataset could not be loaded.
        print("Failed to build the Gradio interface. Please check the dataset and model.")
    else:
        iface.launch()