Teera committed on
Commit d16656d · verified · 1 Parent(s): 63509b4

Upload 3 files

Files changed (3)
  1. all_combine_main.csv +0 -0
  2. app.py +79 -0
  3. requirements.txt +5 -0
all_combine_main.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,79 @@
+ import pandas as pd
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import gradio as gr
+
+ # Step 1: Read the CSV file
+ df = pd.read_csv('./all_combine_main.csv')  # Ensure the CSV file is uploaded to Hugging Face
+
+ # Step 2: Extract the text column for embeddings and keep the entire DataFrame rows
+ text_column = 'detail_โครงการ'  # The column whose text is embedded for search
+ texts = df[text_column].astype(str).tolist()
+
+ # Keep the entire DataFrame rows as a list of dictionaries
+ rows = df.to_dict('records')
+
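+ # Each element of rows is a dict mapping column names to that row's values,
+ # e.g. {'ชื่อกระทรวง': ..., 'ชื่อโครงการ': ..., 'detail_โครงการ': ...}.
+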
+ # Step 3: Load the pre-trained model
+ model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+
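+ # Note: this multilingual MiniLM model lists Thai among its supported
+ # languages, which matches the Thai-language text in this dataset.
+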
+ # Step 4: Generate embeddings for all texts
+ embeddings = model.encode(texts, show_progress_bar=True)
+
+ # Optional: Save embeddings to disk to avoid recomputing in future runs
+ # np.save('embeddings.npy', embeddings)
+ # If you have saved embeddings before, you can load them directly
+ # embeddings = np.load('embeddings.npy')
+
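+ # A minimal sketch of that save/load caching pattern, assuming the working
+ # directory is writable ('embeddings.npy' as in the comments above):
+ # import os
+ # if os.path.exists('embeddings.npy'):
+ #     embeddings = np.load('embeddings.npy')
+ # else:
+ #     embeddings = model.encode(texts, show_progress_bar=True)
+ #     np.save('embeddings.npy', embeddings)
+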
+ # Step 5: Define the semantic search function
+ def semantic_search(query, embeddings, texts, rows, top_n=5):
+     # Generate embedding for the query
+     query_embedding = model.encode([query])
+
+     # Compute cosine similarities
+     similarities = cosine_similarity(query_embedding, embeddings)[0]
+
+     # Get the indices of the top_n most similar texts
+     top_indices = np.argsort(similarities)[::-1][:top_n]
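+     # e.g. similarities [0.2, 0.9, 0.5] with top_n=2: argsort -> [0, 2, 1],
+     # reversed -> [1, 2, 0], sliced -> [1, 2] (highest scores first)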
+
+     # Return the top_n most similar rows and their similarity scores
+     results = [(rows[idx], similarities[idx]) for idx in top_indices]
+     return results
+
+ # Step 6: Create the Gradio interface
+ def search_interface(query):
+     # Perform the search
+     results = semantic_search(query, embeddings, texts, rows)
+
+     # Specify the columns to display
+     columns_to_display = ['ชื่อกระทรวง', 'งบประมาณปี68', 'ชื่อสำนักงาน', 'งบประมาณปี68_สำนักงาน', 'ชื่อโครงการ', 'งบประมาณ_โครงการ']
+
+     # Prepare the Markdown output
+     output = ""
+     for row, score in results:
+         output += f"**Score:** {score:.4f}\n\n"
+
+         # Display only the specified columns, skipping NaN values
+         for key, value in row.items():
+             if key in columns_to_display and not pd.isna(value):
+                 output += f"**{key}:** {value}\n\n"
+
+         # If 'ชื่อโครงการ' or 'งบประมาณ_โครงการ' is NaN, fall back to the full detail text
+         if pd.isna(row.get('ชื่อโครงการ')) or pd.isna(row.get('งบประมาณ_โครงการ')):
+             output += f"**detail_โครงการ:** {row.get('detail_โครงการ')}\n\n"
+         output += "---\n\n"
+
+     return output
+
+ iface = gr.Interface(
+     fn=search_interface,
+     inputs=gr.Textbox(lines=2, placeholder='Enter your search query here...'),
+     outputs="markdown",
+     title="Semantic Search Application",
+     description="Enter a search query to find the most relevant entries from the dataset.",
+ )
+
+ if __name__ == "__main__":
+     iface.launch(share=True)
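
Once the setup at the top of app.py has run, the search function can also be exercised directly, without the Gradio UI. A minimal sketch; the query string is only an illustrative example, not taken from the dataset:

    results = semantic_search('โครงการพัฒนาแหล่งน้ำ', embeddings, texts, rows, top_n=3)
    for row, score in results:
        print(f"{score:.4f}  {row.get('ชื่อโครงการ')}")
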
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ pandas
+ numpy
+ sentence-transformers
+ scikit-learn
+ gradio
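
The dependencies are left unpinned, so each Space rebuild installs the latest releases. If reproducibility matters, versions can be pinned; the versions below are illustrative examples only, not ones this commit was tested against:

    pandas==2.2.2
    numpy==1.26.4
    sentence-transformers==2.7.0
    scikit-learn==1.4.2
    gradio==4.29.0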