import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# Step 1: Load the CSV file
df = pd.read_csv('./White-Stride-Red-68.csv')

# Step 2: Filter out rows where the 'detail_โครงการ' column is NaN or an empty string
text_column = 'detail_โครงการ'
df_filtered = df[df[text_column].notna() & df[text_column].str.strip().ne('')]

# Reset index to ensure we have a unique identifier for each row
df_filtered = df_filtered.reset_index()  # the old index becomes an 'index' column, used later as a row ID

# Step 3: Extract the text column for embeddings
texts = df_filtered[text_column].astype(str).tolist()

# Keep the entire DataFrame rows as a list of dictionaries
rows = df_filtered.to_dict('records')

# Step 3b: Split each text into chunks of up to 500 characters
chunks = []
chunk_rows = []

for idx, text in enumerate(texts):
    # Split text into chunks of up to 500 characters
    text_chunks = [text[i:i+500] for i in range(0, len(text), 500)]
    # For each chunk, store the chunk and its corresponding row
    for chunk in text_chunks:
        chunks.append(chunk)
        chunk_rows.append(rows[idx])
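
# A possible refinement (sketch only, not part of the original script): stepping the
# slice window by less than the chunk size produces overlapping chunks, which lowers
# the chance that a relevant phrase is cut exactly at a 500-character boundary. The
# 100-character overlap below is an assumed value, not something tuned here:
#
#     text_chunks = [text[i:i + 500] for i in range(0, len(text), 400)]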

# Step 4: Load the pre-trained model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Step 5: Generate embeddings for all text chunks
embeddings = model.encode(chunks, show_progress_bar=True)
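
# Optional caching (sketch only; 'embeddings.npy' is an assumed filename): encoding
# every chunk is the slowest part of startup, so the matrix could be persisted with
# numpy and reloaded on later runs. The cache file must be deleted whenever the CSV
# or the model changes:
#
#     import os
#     if os.path.exists('embeddings.npy'):
#         embeddings = np.load('embeddings.npy')
#     else:
#         embeddings = model.encode(chunks, show_progress_bar=True)
#         np.save('embeddings.npy', embeddings)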

# Step 6: Define the semantic search function
def semantic_search(query, embeddings, chunks, chunk_rows, top_n=50):
    # Generate embedding for the query
    query_embedding = model.encode([query])
    
    # Compute cosine similarities
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    
    # Get the indices of the chunks sorted by similarity
    sorted_indices = np.argsort(similarities)[::-1]
    
    # Collect top_n unique results based on the original row
    results = []
    seen_row_ids = set()
    for idx in sorted_indices:
        row = chunk_rows[idx]
        row_id = row['index']  # Unique identifier for the row
        if row_id not in seen_row_ids:
            seen_row_ids.add(row_id)
            results.append((row, similarities[idx]))
            if len(results) >= top_n:
                break
    return results
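
# Quick sanity check (sketch only; the query string is a made-up example): the function
# can be called directly, before wiring up the UI, to confirm that scores come back in
# descending order and that chunks from the same row are deduplicated:
#
#     for row, score in semantic_search('โครงการน้ำ', embeddings, chunks, chunk_rows, top_n=3):
#         print(f"{score:.4f}", row.get('ชื่อโครงการ'))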

# Step 7: Create the Gradio interface
def search_interface(query):
    # Perform the search
    results = semantic_search(query, embeddings, chunks, chunk_rows)
    
    # Specify the columns to display
    columns_to_display = ['ชื่อกระทรวง', 'งบประมาณปี68', 'ชื่อสำนักงาน', 'งบประมาณปี68_สำนักงาน', 'ชื่อโครงการ', 'งบประมาณ_โครงการ']
    
    # Prepare the output
    output = ""
    for row, score in results:
        output += f"**Score:** {score:.4f}\n\n"
        
        # Display the specified columns in their listed order, skipping NaNs
        for key in columns_to_display:
            value = row.get(key)
            if not pd.isna(value):
                output += f"**{key}:** {value}\n\n"
                
        # Display 'detail_โครงการ' if 'ชื่อโครงการ' or 'งบประมาณ_โครงการ' is NaN
        if pd.isna(row.get('ชื่อโครงการ')) or pd.isna(row.get('งบประมาณ_โครงการ')):
            output += f"**detail_โครงการ:** {row.get('detail_โครงการ')}\n\n"
        output += "---\n\n"
    
    return output

iface = gr.Interface(
    fn=search_interface,
    inputs=gr.Textbox(lines=2, placeholder='Enter your search query here...'),
    outputs="markdown",
    title="Semantic Search Application",
    description="Enter a search query to find the most relevant entries from the dataset.",
)

if __name__ == "__main__":
    # share=True additionally serves the app through a temporary public Gradio link
    iface.launch(share=True)