Teera committed on
Commit
b9a685a
·
verified ·
1 Parent(s): e29602b

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. White-Stride-Red-68.csv +3 -0
  3. app.py +99 -0
  4. requirements.txt +5 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ White-Stride-Red-68.csv filter=lfs diff=lfs merge=lfs -text
White-Stride-Red-68.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6020f6eae629050bf0c90ad99e6d4e0f338ce72add1a4aaaf2c27da598bd48cc
3
+ size 15401258
app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ import gradio as gr
6
+
7
# Step 1: Read the source table from the CSV that sits next to this script.
df = pd.read_csv('./White-Stride-Red-68.csv')

# Step 2: Keep only rows whose 'detail_โครงการ' cell is present and is not
# blank after stripping whitespace.
text_column = 'detail_โครงการ'
detail_series = df[text_column]
has_text = detail_series.notna() & detail_series.str.strip().ne('')
df_filtered = df[has_text]

# reset_index() moves the previous index into a column named 'index', which
# the search code later uses as the per-row identifier.
df_filtered = df_filtered.reset_index()

# Step 3: Text to embed, plus every surviving row as a plain dict
# (both lists are in the same order).
texts = df_filtered[text_column].astype(str).tolist()
rows = df_filtered.to_dict('records')

# Cut each text into consecutive pieces of at most CHUNK_SIZE characters.
# chunks[i] is a piece of text and chunk_rows[i] is the row it came from.
CHUNK_SIZE = 500
chunks = []
chunk_rows = []

for source_row, full_text in zip(rows, texts):
    pieces = [
        full_text[start:start + CHUNK_SIZE]
        for start in range(0, len(full_text), CHUNK_SIZE)
    ]
    chunks.extend(pieces)
    # Every piece points at the same row dict object.
    chunk_rows.extend([source_row] * len(pieces))

# Step 4: Load the pre-trained sentence-embedding model.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Step 5: Embed every chunk once, at start-up.
embeddings = model.encode(chunks, show_progress_bar=True)
40
+
41
# Step 6: Define the semantic search function
def semantic_search(query, embeddings, chunks, chunk_rows, top_n=50):
    """Return the rows whose text chunks are most similar to ``query``.

    The query is embedded with the module-level ``model``, compared with every
    chunk embedding by cosine similarity, and the best-scoring chunks are
    walked in descending order. Each source row is reported at most once,
    using the score of its best-matching chunk.

    Args:
        query: Search text to embed.
        embeddings: Chunk embeddings, one per entry of ``chunk_rows``.
        chunks: Chunk texts. Not read by this function; kept so existing
            callers that pass it positionally keep working.
        chunk_rows: For each chunk, the dict of the row it came from. Each
            dict must have an ``'index'`` key identifying the row.
        top_n: Maximum number of distinct rows to return.

    Returns:
        A list of ``(row, similarity)`` tuples ordered from most to least
        similar, containing at most ``top_n`` entries. Empty when ``top_n``
        is not positive or there are no chunks to search.
    """
    # Nothing can be returned in these cases, so skip the model call.
    # Previously top_n <= 0 still produced one result, because the limit
    # was only checked after the first hit had been appended.
    if top_n <= 0 or len(chunk_rows) == 0:
        return []

    # Generate embedding for the query
    query_embedding = model.encode([query])

    # One similarity score per chunk
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Chunk indices, best match first
    sorted_indices = np.argsort(similarities)[::-1]

    # Collect up to top_n results, keeping only the best chunk of each row
    results = []
    seen_row_ids = set()
    for idx in sorted_indices:
        row = chunk_rows[idx]
        row_id = row['index']  # Unique identifier for the row
        if row_id in seen_row_ids:
            continue
        seen_row_ids.add(row_id)
        results.append((row, similarities[idx]))
        if len(results) >= top_n:
            break
    return results
64
+
65
# Step 7: Create the Gradio interface
def search_interface(query):
    """Run a semantic search for ``query`` and format the hits as Markdown.

    Args:
        query: Text typed by the user.

    Returns:
        A Markdown string. For each hit it contains the similarity score and
        the non-missing values of the selected budget columns; the full
        'detail_โครงการ' text is appended when the project name or project
        budget is missing. Hits are separated by a horizontal rule. A blank
        or missing query returns a short prompt instead of searching.
    """
    # An empty query has no meaning to embed; without this guard the search
    # would still return up to 50 arbitrary rows.
    if not query or not query.strip():
        return "Please enter a search query."

    # Perform the search against the module-level index
    results = semantic_search(query, embeddings, chunks, chunk_rows)

    # Specify the columns to display
    columns_to_display = (
        'ชื่อกระทรวง',
        'งบประมาณปี68',
        'ชื่อสำนักงาน',
        'งบประมาณปี68_สำนักงาน',
        'ชื่อโครงการ',
        'งบประมาณ_โครงการ',
    )

    # Collect fragments and join once instead of repeated string +=
    parts = []
    for row, score in results:
        parts.append(f"**Score:** {score:.4f}\n\n")

        # Display only specified columns and skip NaNs
        for key, value in row.items():
            if key in columns_to_display and not pd.isna(value):
                parts.append(f"**{key}:** {value}\n\n")

        # Display 'detail_โครงการ' if 'ชื่อโครงการ' or 'งบประมาณ_โครงการ' is NaN
        if pd.isna(row.get('ชื่อโครงการ')) or pd.isna(row.get('งบประมาณ_โครงการ')):
            parts.append(f"**detail_โครงการ:** {row.get('detail_โครงการ')}\n\n")
        parts.append("---\n\n")

    return "".join(parts)
89
+
90
# Multi-line text box in which the user types the query.
query_box = gr.Textbox(lines=2, placeholder='Enter your search query here...')

# Connect the text box to search_interface; the returned string is rendered
# as Markdown.
iface = gr.Interface(
    fn=search_interface,
    inputs=query_box,
    outputs="markdown",
    title="Semantic Search Application",
    description="Enter a search query to find the most relevant entries from the dataset.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ sentence-transformers
4
+ scikit-learn
5
+ gradio