Teera committed on
Commit d16656d · verified · 1 Parent(s): 63509b4

Upload 3 files

Files changed (3)
  1. all_combine_main.csv +0 -0
  2. app.py +79 -0
  3. requirements.txt +5 -0
all_combine_main.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,79 @@
+ import pandas as pd
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import gradio as gr
+
+ # Step 1: Read the CSV file
+ df = pd.read_csv('./all_combine_main.csv')  # Ensure the CSV file is uploaded to Hugging Face
+
+ # Step 2: Extract the text column for embeddings and keep the entire DataFrame rows
+ text_column = 'detail_โครงการ'  # The column whose text is embedded for search
+ texts = df[text_column].astype(str).tolist()
+
+ # Keep the entire DataFrame rows as a list of dictionaries
+ rows = df.to_dict('records')
+
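+ # Each element of rows is a dict mapping column names to that row's values,
+ # e.g. {'ชื่อกระทรวง': ..., 'ชื่อโครงการ': ..., 'detail_โครงการ': ...}.
+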
+ # Step 3: Load the pre-trained model
+ model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+
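+ # Note: this multilingual MiniLM model lists Thai among its supported
+ # languages, which matches the Thai-language text in this dataset.
+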
+ # Step 4: Generate embeddings for all texts
+ embeddings = model.encode(texts, show_progress_bar=True)
+
+ # Optional: Save embeddings to disk to avoid recomputing in future runs
+ # np.save('embeddings.npy', embeddings)
+ # If you have saved embeddings before, you can load them directly
+ # embeddings = np.load('embeddings.npy')
+
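+ # A minimal sketch of that save/load caching pattern, assuming the working
+ # directory is writable ('embeddings.npy' as in the comments above):
+ # import os
+ # if os.path.exists('embeddings.npy'):
+ #     embeddings = np.load('embeddings.npy')
+ # else:
+ #     embeddings = model.encode(texts, show_progress_bar=True)
+ #     np.save('embeddings.npy', embeddings)
+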
+ # Step 5: Define the semantic search function
+ def semantic_search(query, embeddings, texts, rows, top_n=5):
+     # Generate embedding for the query
+     query_embedding = model.encode([query])
+
+     # Compute cosine similarities
+     similarities = cosine_similarity(query_embedding, embeddings)[0]
+
+     # Get the indices of the top_n most similar texts
+     top_indices = np.argsort(similarities)[::-1][:top_n]
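+     # e.g. similarities [0.2, 0.9, 0.5] with top_n=2: argsort -> [0, 2, 1],
+     # reversed -> [1, 2, 0], sliced -> [1, 2] (highest scores first)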
+
+     # Return the top_n most similar rows and their similarity scores
+     results = [(rows[idx], similarities[idx]) for idx in top_indices]
+     return results
+
+ # Step 6: Create the Gradio interface
+ def search_interface(query):
+     # Perform the search
+     results = semantic_search(query, embeddings, texts, rows)
+
+     # Specify the columns to display
+     columns_to_display = ['ชื่อกระทรวง', 'งบประมาณปี68', 'ชื่อสำนักงาน', 'งบประมาณปี68_สำนักงาน', 'ชื่อโครงการ', 'งบประมาณ_โครงการ']
+
+     # Prepare the Markdown output
+     output = ""
+     for row, score in results:
+         output += f"**Score:** {score:.4f}\n\n"
+
+         # Display only the specified columns, skipping NaN values
+         for key, value in row.items():
+             if key in columns_to_display and not pd.isna(value):
+                 output += f"**{key}:** {value}\n\n"
+
+         # If 'ชื่อโครงการ' or 'งบประมาณ_โครงการ' is NaN, fall back to the full detail text
+         if pd.isna(row.get('ชื่อโครงการ')) or pd.isna(row.get('งบประมาณ_โครงการ')):
+             output += f"**detail_โครงการ:** {row.get('detail_โครงการ')}\n\n"
+         output += "---\n\n"
+
+     return output
+
+ iface = gr.Interface(
+     fn=search_interface,
+     inputs=gr.Textbox(lines=2, placeholder='Enter your search query here...'),
+     outputs="markdown",
+     title="Semantic Search Application",
+     description="Enter a search query to find the most relevant entries from the dataset.",
+ )
+
+ if __name__ == "__main__":
+     iface.launch(share=True)
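
Once the setup at the top of app.py has run, the search function can also be exercised directly, without the Gradio UI. A minimal sketch; the query string is only an illustrative example, not taken from the dataset:

    results = semantic_search('โครงการพัฒนาแหล่งน้ำ', embeddings, texts, rows, top_n=3)
    for row, score in results:
        print(f"{score:.4f}  {row.get('ชื่อโครงการ')}")
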
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ pandas
+ numpy
+ sentence-transformers
+ scikit-learn
+ gradio
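
The dependencies are left unpinned, so each Space rebuild installs the latest releases. If reproducibility matters, versions can be pinned; the versions below are illustrative examples only, not ones this commit was tested against:

    pandas==2.2.2
    numpy==1.26.4
    sentence-transformers==2.7.0
    scikit-learn==1.4.2
    gradio==4.29.0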