kritsadaK committed on
Commit
d49af1e
1 Parent(s): 30fec51

Initial commit

Browse files
Files changed (2) hide show
  1. app.py +110 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ import torchvision
3
+ import torch
4
+ import pandas as pd
5
+ from transformers.pipelines import pipeline
6
+ from transformers import AutoTokenizer, AutoModel
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ import streamlit as st
9
+
10
# Suppress torchvision's beta-transforms warning and any other UserWarning
# raised from the torchvision package.
torchvision.disable_beta_transforms_warning()
warnings.filterwarnings("ignore", category=UserWarning, module="torchvision")

# Checkpoint shared by the fill-mask pipeline and the embedding encoder.
MODEL_NAME = "airesearch/wangchanberta-base-att-spm-uncased"


@st.cache_resource
def _load_models():
    """Build the fill-mask pipeline, tokenizer and encoder for ``MODEL_NAME``.

    Streamlit re-executes this script from the top on every widget
    interaction. Wrapping the loading in ``st.cache_resource`` means the
    checkpoint is read once and the same objects are reused on later reruns
    instead of being reloaded each time.

    Returns:
        A ``(fill_mask_pipeline, tokenizer, encoder)`` tuple.
    """
    fill_mask = pipeline("fill-mask", model=MODEL_NAME, framework="pt", use_fast=False)
    slow_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
    encoder = AutoModel.from_pretrained(MODEL_NAME)
    return fill_mask, slow_tokenizer, encoder


# Module-level names used by the rest of the script.
pipe, tokenizer, model = _load_models()
18
+
19
+ # Function to generate embeddings for full sentences
20
def get_embedding(text):
    """Return a sentence embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model`` with gradient tracking disabled, and the hidden
    state at sequence position 0 is returned.

    Args:
        text: Sentence to embed.

    Returns:
        NumPy array holding ``last_hidden_state[:, 0, :]``; presumably of
        shape ``(1, hidden_size)`` for a single input string (confirm against
        the model config).
    """
    # truncation=True clips the input to the tokenizer's configured maximum
    # length, so an overlong sentence is shortened rather than handed to the
    # encoder at a length it may not support.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Position 0 is the first token of the sequence (presumably the
    # sentence-start token for this RoBERTa-style checkpoint).
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()
25
+
26
# Streamlit app: page title, usage explanation, input box, and the
# prediction/similarity results for the sentence the user entered.
st.title("Thai Full Sentence Similarity App")

# Explanation section (rendered as Markdown)
st.write("""
### How This App Works
This app uses a mask-filling model to predict possible words or phrases that could fill in the `<mask>` token in a given sentence. It then calculates the similarity of each prediction with the original sentence to determine the most contextually appropriate completion.

### Example Sentence
In this example, we have the following sentence in Thai with a `<mask>` token:
- **Input**: `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน <mask> เพื่อสัมผัสธรรมชาติ"`
- **Translation**: "Many tourists choose to visit `<mask>` to experience nature."

The `<mask>` token represents a location popular for its natural beauty.

### Potential Predictions
Here are some possible predictions the model might generate for `<mask>`:
1. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เชียงใหม่ เพื่อสัมผัสธรรมชาติ"` - Chiang Mai
2. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เขาใหญ่ เพื่อสัมผัสธรรมชาติ"` - Khao Yai
3. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เกาะสมุย เพื่อสัมผัสธรรมชาติ"` - Koh Samui
4. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน ภูเก็ต เพื่อสัมผัสธรรมชาติ"` - Phuket

### Results Table
For each prediction, the app calculates:
- **Similarity Score**: Indicates how similar the predicted sentence is to the original input.
- **Model Score**: Represents the model's confidence in the predicted word for `<mask>`.

### Most Similar Prediction
The app will display the most contextually similar prediction based on the similarity score. For example:
- **Most Similar Prediction**: `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เชียงใหม่ เพื่อสัมผัสธรรมชาติ"`
- **Similarity Score**: 0.89
- **Model Score**: 0.16

Feel free to enter your own sentence with `<mask>` and explore the predictions!
""")

# Placeholder the fill-mask pipeline replaces with its predictions.
MASK_TOKEN = "<mask>"

# User input box
st.subheader("Input Text")
input_text = st.text_input("Enter a sentence with `<mask>` to find similar predictions:", "ผู้ใช้งานท่าอากาศยานนานาชาติ <mask> มีกว่าสามล้านคน")

# Blank input must be rejected BEFORE the mask is auto-appended; otherwise the
# appended token makes the text non-empty and a bare "<mask>" gets processed.
if not input_text.strip():
    st.info("Enter a sentence to see predictions.")
    st.stop()

mask_count = input_text.count(MASK_TOKEN)
if mask_count == 0:
    # Ensure the input includes a `<mask>`
    input_text += " <mask>"
    st.warning("`<mask>` token was missing in your input. It has been added automatically.")
elif mask_count > 1:
    # With several masks the pipeline returns one list of candidates per mask
    # (a nested list), which the flat loop below cannot consume.
    st.error("Please use exactly one `<mask>` token in your sentence.")
    st.stop()

# Display input text
st.write(f"Input Text: {input_text}")

# Baseline for comparison: the input sentence with the mask token removed
baseline_text = input_text.replace(MASK_TOKEN, "")
input_embedding = get_embedding(baseline_text)

# Generate mask predictions and score each completed sentence against the baseline
similarity_results = []
result = pipe(input_text)

for candidate in result:
    # Full sentence with the mask replaced by this candidate
    prediction_text = candidate["sequence"]

    # Cosine similarity between the baseline and the completed sentence
    prediction_embedding = get_embedding(prediction_text)
    similarity = cosine_similarity(input_embedding, prediction_embedding)[0][0]

    similarity_results.append({
        "Prediction": prediction_text,
        "Similarity Score": similarity,
        "Model Score": candidate["score"],
    })

# Convert results to a DataFrame for easy sorting and display
df_results = pd.DataFrame(similarity_results).sort_values(by="Similarity Score", ascending=False)

# Display all predictions sorted by similarity score
st.subheader("All Predictions Sorted by Similarity")
st.dataframe(df_results)

# The first row after sorting is the most similar prediction
most_similar = df_results.iloc[0]
st.subheader("Most Similar Prediction")
st.write(f"**Prediction**: {most_similar['Prediction']}")
st.write(f"**Similarity Score**: {most_similar['Similarity Score']:.4f}")
st.write(f"**Model Score**: {most_similar['Model Score']:.4f}")
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ streamlit
4
+ pandas
5
+ scikit-learn
6
+ torchvision