Upload 3 files

- .gitattributes +1 -0
- White-Stride-Red-68.csv +3 -0
- app.py +99 -0
- requirements.txt +5 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+White-Stride-Red-68.csv filter=lfs diff=lfs merge=lfs -text
White-Stride-Red-68.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6020f6eae629050bf0c90ad99e6d4e0f338ce72add1a4aaaf2c27da598bd48cc
+size 15401258
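(Not part of the commit: a downloaded copy of the CSV can be checked against the LFS pointer above. A minimal sketch, assuming the file sits in the working directory; the size and sha256 come straight from the pointer.)

import hashlib
import os

path = 'White-Stride-Red-68.csv'
# Both expected values are taken from the LFS pointer in this commit
assert os.path.getsize(path) == 15401258
digest = hashlib.sha256(open(path, 'rb').read()).hexdigest()
assert digest == '6020f6eae629050bf0c90ad99e6d4e0f338ce72add1a4aaaf2c27da598bd48cc'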
app.py ADDED
@@ -0,0 +1,99 @@
+import pandas as pd
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import gradio as gr
+
+# Step 1: Load the CSV file
+df = pd.read_csv('./White-Stride-Red-68.csv')
+
+# Step 2: Filter out rows where the 'detail_โครงการ' column is NaN or an empty string
+text_column = 'detail_โครงการ'
+df_filtered = df[df[text_column].notna() & df[text_column].str.strip().ne('')]
+
+# Reset index to ensure we have a unique identifier for each row
+df_filtered = df_filtered.reset_index()  # 'index' becomes a column now
+
+# Step 3: Extract the text column for embeddings
+texts = df_filtered[text_column].astype(str).tolist()
+
+# Keep the entire DataFrame rows as a list of dictionaries
+rows = df_filtered.to_dict('records')
+
+# **New Step**: Split texts into chunks of up to 500 characters
+chunks = []
+chunk_rows = []
+
+for idx, text in enumerate(texts):
+    # Split text into chunks of up to 500 characters
+    text_chunks = [text[i:i+500] for i in range(0, len(text), 500)]
+    # For each chunk, store the chunk and its corresponding row
+    for chunk in text_chunks:
+        chunks.append(chunk)
+        chunk_rows.append(rows[idx])
+
+# Step 4: Load the pre-trained model
+model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+
+# Step 5: Generate embeddings for all text chunks
+embeddings = model.encode(chunks, show_progress_bar=True)
+
+# Step 6: Define the semantic search function
+def semantic_search(query, embeddings, chunks, chunk_rows, top_n=50):
+    # Generate embedding for the query
+    query_embedding = model.encode([query])
+
+    # Compute cosine similarities
+    similarities = cosine_similarity(query_embedding, embeddings)[0]
+
+    # Get the indices of the chunks sorted by similarity (descending)
+    sorted_indices = np.argsort(similarities)[::-1]
+
+    # Collect top_n unique results based on the original row
+    results = []
+    seen_row_ids = set()
+    for idx in sorted_indices:
+        row = chunk_rows[idx]
+        row_id = row['index']  # Unique identifier for the row
+        if row_id not in seen_row_ids:
+            seen_row_ids.add(row_id)
+            results.append((row, similarities[idx]))
+            if len(results) >= top_n:
+                break
+    return results
+
+# Step 7: Create the Gradio interface
+def search_interface(query):
+    # Perform the search
+    results = semantic_search(query, embeddings, chunks, chunk_rows)
+
+    # Specify the columns to display
+    columns_to_display = ['ชื่อกระทรวง', 'งบประมาณปี68', 'ชื่อสำนักงาน', 'งบประมาณปี68_สำนักงาน', 'ชื่อโครงการ', 'งบประมาณ_โครงการ']
+
+    # Prepare the output
+    output = ""
+    for row, score in results:
+        output += f"**Score:** {score:.4f}\n\n"
+
+        # Display only specified columns and skip NaNs
+        for key, value in row.items():
+            if key in columns_to_display and not pd.isna(value):
+                output += f"**{key}:** {value}\n\n"
+
+        # Display 'detail_โครงการ' if 'ชื่อโครงการ' or 'งบประมาณ_โครงการ' is NaN
+        if pd.isna(row.get('ชื่อโครงการ')) or pd.isna(row.get('งบประมาณ_โครงการ')):
+            output += f"**detail_โครงการ:** {row.get('detail_โครงการ')}\n\n"
+        output += "---\n\n"
+
+    return output
+
+iface = gr.Interface(
+    fn=search_interface,
+    inputs=gr.Textbox(lines=2, placeholder='Enter your search query here...'),
+    outputs="markdown",
+    title="Semantic Search Application",
+    description="Enter a search query to find the most relevant entries from the dataset.",
+)
+
+if __name__ == "__main__":
+    iface.launch(share=True)
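(For reference, a minimal sketch of exercising the search function above without the Gradio UI. It assumes the module-level setup in app.py has already run, so model, embeddings, chunks, and chunk_rows exist; the query string is a made-up example, not from the dataset.)

# Hypothetical smoke test; run after app.py's module-level setup
results = semantic_search('โครงการพัฒนาแหล่งน้ำ', embeddings, chunks, chunk_rows, top_n=5)
for row, score in results:
    # Each result is one original CSV row paired with its best chunk similarity
    print(f"{score:.4f}  {row.get('ชื่อโครงการ')}")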
requirements.txt ADDED
@@ -0,0 +1,5 @@
+pandas
+numpy
+sentence-transformers
+scikit-learn
+gradio
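(One design note on the commit: app.py encodes every chunk at import time, so each restart of this sleeping Space pays the full embedding cost for the ~15 MB CSV. A hedged sketch of caching the embeddings to disk; the cache filename is an assumption, not part of this commit.)

import os
import numpy as np

CACHE = 'embeddings.npy'  # hypothetical cache file
if os.path.exists(CACHE):
    # Reuse embeddings computed on a previous run
    embeddings = np.load(CACHE)
else:
    embeddings = model.encode(chunks, show_progress_bar=True)
    np.save(CACHE, embeddings)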