import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# Step 1: Load the CSV file
df = pd.read_csv('./White-Stride-Red-68.csv')

# Step 2: Filter out rows where the 'detail_โครงการ' column is NaN or an empty string
text_column = 'detail_โครงการ'
df_filtered = df[df[text_column].notna() & df[text_column].str.strip().ne('')]

# Reset index to ensure we have a unique identifier for each row
df_filtered = df_filtered.reset_index()  # 'index' becomes a column now

# Step 3: Extract the text column for embeddings
texts = df_filtered[text_column].astype(str).tolist()

# Keep the entire DataFrame rows as a list of dictionaries
rows = df_filtered.to_dict('records')

# New step: Split texts into chunks of up to 500 characters
chunks = []
chunk_rows = []
for idx, text in enumerate(texts):
    # Split text into chunks of up to 500 characters
    text_chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
    # For each chunk, store the chunk and its corresponding row
    for chunk in text_chunks:
        chunks.append(chunk)
        chunk_rows.append(rows[idx])

# Step 4: Load the pre-trained model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Step 5: Generate embeddings for all text chunks
embeddings = model.encode(chunks, show_progress_bar=True)

# Step 6: Define the semantic search function
def semantic_search(query, embeddings, chunks, chunk_rows, top_n=50):
    # Generate embedding for the query
    query_embedding = model.encode([query])

    # Compute cosine similarities between the query and every chunk
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Get the indices of the chunks sorted by similarity (descending)
    sorted_indices = np.argsort(similarities)[::-1]

    # Collect top_n unique results, deduplicated by original row
    results = []
    seen_row_ids = set()
    for idx in sorted_indices:
        row = chunk_rows[idx]
        row_id = row['index']  # Unique identifier for the row
        if row_id not in seen_row_ids:
            seen_row_ids.add(row_id)
            results.append((row, similarities[idx]))
            if len(results) >= top_n:
                break
    return results

# Step 7: Create the Gradio interface
def search_interface(query):
    # Perform the search
    results = semantic_search(query, embeddings, chunks, chunk_rows)

    # Specify the columns to display
    columns_to_display = ['ชื่อกระทรวง', 'งบประมาณปี68', 'ชื่อสำนักงาน',
                          'งบประมาณปี68_สำนักงาน', 'ชื่อโครงการ', 'งบประมาณ_โครงการ']

    # Prepare the Markdown output
    output = ""
    for row, score in results:
        output += f"**Score:** {score:.4f}\n\n"
        # Display only specified columns and skip NaNs
        for key, value in row.items():
            if key in columns_to_display and not pd.isna(value):
                output += f"**{key}:** {value}\n\n"
        # Fall back to 'detail_โครงการ' if 'ชื่อโครงการ' or 'งบประมาณ_โครงการ' is NaN
        if pd.isna(row.get('ชื่อโครงการ')) or pd.isna(row.get('งบประมาณ_โครงการ')):
            output += f"**detail_โครงการ:** {row.get('detail_โครงการ')}\n\n"
        output += "---\n\n"
    return output

iface = gr.Interface(
    fn=search_interface,
    inputs=gr.Textbox(lines=2, placeholder='Enter your search query here...'),
    outputs="markdown",
    title="Semantic Search Application",
    description="Enter a search query to find the most relevant entries from the dataset.",
)

if __name__ == "__main__":
    iface.launch(share=True)
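
# --- Optional sanity check (a minimal sketch, commented out so it does not
# affect the app). The query string and top_n value below are illustrative
# placeholders, not part of the original script; any query in a language the
# multilingual model covers should work the same way. ---
#
# for row, score in semantic_search('งบประมาณ', embeddings, chunks, chunk_rows, top_n=3):
#     print(f"{score:.4f}", row.get('ชื่อโครงการ'))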