import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# Step 1: Load the CSV file
df = pd.read_csv('./White-Stride-Red-68.csv')

# Step 2: Filter out rows where the 'detail_โครงการ' column is NaN or an empty string
text_column = 'detail_โครงการ'
df_filtered = df[df[text_column].notna() & df[text_column].str.strip().ne('')]

# Reset the index so each row gets a unique identifier ('index' becomes a column)
df_filtered = df_filtered.reset_index()

# Step 3: Extract the text column for embeddings
texts = df_filtered[text_column].astype(str).tolist()

# Keep the full DataFrame rows as a list of dictionaries
rows = df_filtered.to_dict('records')
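
# Note: to_dict('records') keeps missing values as NaN, which is why the display
# code below checks pd.isna before printing a field.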

# Step 4: Split each text into chunks of up to 500 characters,
# keeping a reference to the originating row for every chunk
chunks = []
chunk_rows = []
for idx, text in enumerate(texts):
    text_chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
    for chunk in text_chunks:
        chunks.append(chunk)
        chunk_rows.append(rows[idx])
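
# Optional sketch: overlapping windows reduce the chance that a hard 500-character
# cut splits a relevant phrase across two chunks. A drop-in replacement for the
# loop above; the 100-character overlap is an assumed value, not tuned for this data.
# for idx, text in enumerate(texts):
#     for i in range(0, len(text), 400):  # 500-char window, 100-char overlap
#         chunks.append(text[i:i + 500])
#         chunk_rows.append(rows[idx])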

# Step 5: Load the pre-trained multilingual embedding model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Step 6: Generate embeddings for all text chunks
embeddings = model.encode(chunks, show_progress_bar=True)
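
# Optional sketch: persist the embeddings so a restart skips the expensive encode
# step. A drop-in replacement for the encode call above; 'embeddings.npy' is an
# assumed filename, not part of the dataset.
# import os
# if os.path.exists('embeddings.npy'):
#     embeddings = np.load('embeddings.npy')
# else:
#     embeddings = model.encode(chunks, show_progress_bar=True)
#     np.save('embeddings.npy', embeddings)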

# Step 7: Define the semantic search function
def semantic_search(query, embeddings, chunks, chunk_rows, top_n=50):
    # Generate an embedding for the query with the same model used for the corpus
    query_embedding = model.encode([query])
    # Compute cosine similarities between the query and every chunk
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    # Indices of the chunks sorted by similarity, highest first
    sorted_indices = np.argsort(similarities)[::-1]
    # Collect the top_n results, de-duplicated by original row
    results = []
    seen_row_ids = set()
    for idx in sorted_indices:
        row = chunk_rows[idx]
        row_id = row['index']  # Unique identifier for the row
        if row_id not in seen_row_ids:
            seen_row_ids.add(row_id)
            results.append((row, similarities[idx]))
            if len(results) >= top_n:
                break
    return results
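
# Quick sanity check with a hypothetical Thai query ('โครงการด้านการศึกษา',
# "education projects"); any text in the dataset's language works:
# for row, score in semantic_search('โครงการด้านการศึกษา', embeddings, chunks, chunk_rows, top_n=3):
#     print(f"{score:.4f}", row.get('ชื่อโครงการ'))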

# Step 8: Create the Gradio interface
def search_interface(query):
    # Perform the search
    results = semantic_search(query, embeddings, chunks, chunk_rows)

    # Columns to display in the results
    columns_to_display = [
        'ชื่อกระทรวง', 'งบประมาณปี68', 'ชื่อสำนักงาน',
        'งบประมาณปี68_สำนักงาน', 'ชื่อโครงการ', 'งบประมาณ_โครงการ',
    ]

    # Build the Markdown output
    output = ""
    for row, score in results:
        output += f"**Score:** {score:.4f}\n\n"
        # Show only the specified columns, skipping NaN values
        for key, value in row.items():
            if key in columns_to_display and not pd.isna(value):
                output += f"**{key}:** {value}\n\n"
        # Fall back to 'detail_โครงการ' if 'ชื่อโครงการ' or 'งบประมาณ_โครงการ' is NaN
        if pd.isna(row.get('ชื่อโครงการ')) or pd.isna(row.get('งบประมาณ_โครงการ')):
            output += f"**detail_โครงการ:** {row.get('detail_โครงการ')}\n\n"
        output += "---\n\n"
    return output

iface = gr.Interface(
    fn=search_interface,
    inputs=gr.Textbox(lines=2, placeholder='Enter your search query here...'),
    outputs="markdown",
    title="Semantic Search Application",
    description="Enter a search query to find the most relevant entries from the dataset.",
)
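
# Note: when this runs on Hugging Face Spaces the app is served directly, so
# share=True should be unnecessary there; it matters mainly for getting a public
# link when running the script locally.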

if __name__ == "__main__":
    iface.launch(share=True)