"""Semantic search demo: embed one CSV text column and query it via Gradio."""

import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Columns shown for each hit; any other column in the row is hidden.
DISPLAY_COLUMNS = frozenset({
    'ชื่อกระทรวง',
    'งบประมาณปี68',
    'ชื่อสำนักงาน',
    'งบประมาณปี68_สำนักงาน',
    'ชื่อโครงการ',
    'งบประมาณ_โครงการ',
})

# Step 1: Read the CSV file.
# Ensure the CSV file is uploaded to Hugging Face alongside this script.
df = pd.read_csv('./all_combine_main.csv')

# Step 2: Extract the text column for embeddings and keep the full rows.
# Replace with the name of your own text column if the dataset differs.
text_column = 'detail_โครงการ'
texts = df[text_column].astype(str).tolist()

# Keep the entire DataFrame rows as a list of dictionaries so a hit can be
# mapped back to all of its columns by position.
rows = df.to_dict('records')

# Step 3: Load the pre-trained model.
model = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
)

# Step 4: Generate embeddings for all texts.
embeddings = model.encode(texts, show_progress_bar=True)

# Optional: persist the embeddings with np.save('embeddings.npy', embeddings)
# and restore them with np.load('embeddings.npy') to avoid recomputing them
# on every start.


# Step 5: Define the semantic search function.
def semantic_search(query, embeddings, texts, rows, top_n=5):
    """Return the ``top_n`` rows whose embeddings are most similar to *query*.

    Args:
        query: Search text; it is encoded with the module-level ``model``.
        embeddings: Corpus embeddings, one per entry of ``rows`` and in the
            same order.
        texts: Unused; kept so existing callers do not break.
        rows: Row dictionaries aligned by index with ``embeddings``.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(row, similarity)`` tuples ordered from most to least
        similar. Empty when ``top_n`` is not positive or there is nothing to
        search.
    """
    # A negative top_n would turn the slice below into "all but the last k",
    # and an empty corpus makes cosine_similarity raise; both mean "no hits".
    if top_n <= 0 or len(embeddings) == 0:
        return []

    query_embedding = model.encode([query])
    # cosine_similarity returns a (1, n_rows) matrix; take the single row.
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    # argsort is ascending, so reverse it before taking the best top_n.
    top_indices = np.argsort(similarities)[::-1][:top_n]
    return [(rows[idx], similarities[idx]) for idx in top_indices]


def _format_result(row, score):
    """Render one search hit as a Markdown fragment ending with a rule."""
    parts = [f"**Score:** {score:.4f}\n\n"]
    # Only show the configured columns, skipping missing (NaN) values.
    parts.extend(
        f"**{key}:** {value}\n\n"
        for key, value in row.items()
        if key in DISPLAY_COLUMNS and not pd.isna(value)
    )
    # When the project name or project budget is missing, fall back to the
    # raw detail text so the hit is still identifiable.
    if pd.isna(row.get('ชื่อโครงการ')) or pd.isna(row.get('งบประมาณ_โครงการ')):
        parts.append(f"**{text_column}:** {row.get(text_column)}\n\n")
    parts.append("---\n\n")
    return "".join(parts)


# Step 6: Create the Gradio interface.
def search_interface(query):
    """Run a semantic search for *query* and return the hits as Markdown.

    Args:
        query: Text typed into the Gradio textbox.

    Returns:
        A Markdown string with one section per hit (score, the configured
        display columns, and the detail text when project fields are
        missing), or a short hint when the query is blank.
    """
    # Embedding an empty string yields meaningless neighbours; ask for input.
    if not query or not query.strip():
        return "Please enter a search query."

    results = semantic_search(query, embeddings, texts, rows)
    return "".join(_format_result(row, score) for row, score in results)


iface = gr.Interface(
    fn=search_interface,
    inputs=gr.Textbox(lines=2, placeholder='Enter your search query here...'),
    outputs="markdown",
    title="Semantic Search Application",
    description="Enter a search query to find the most relevant entries from the dataset.",
)

if __name__ == "__main__":
    iface.launch(share=True)