|
import warnings |
|
import torchvision |
|
import torch |
|
import pandas as pd |
|
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import streamlit as st |
|
|
|
|
|
# Quiet torchvision at start-up: first call the library's own switch for the
# beta-transforms notice, then install a filter that ignores any UserWarning
# attributed to a torchvision module.
torchvision.disable_beta_transforms_warning()
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="torchvision",
)
|
|
|
|
|
# Checkpoint tried first, and the one used when the first cannot be loaded.
_PRIMARY_CHECKPOINT = "airesearch/wangchanberta-base-att-spm-uncased"
_FALLBACK_CHECKPOINT = "xlm-roberta-base"


def _load_checkpoint(name):
    """Return ``(tokenizer, masked_lm_model)`` for the checkpoint ``name``.

    The tokenizer is requested with ``use_fast=False``, as in the rest of
    this app.
    """
    return (
        AutoTokenizer.from_pretrained(name, use_fast=False),
        AutoModelForMaskedLM.from_pretrained(name),
    )


@st.cache_resource
def _load_fill_mask_stack():
    """Load tokenizer, model and fill-mask pipeline once per server process.

    Streamlit re-executes the whole script on every widget interaction;
    caching the loaded objects avoids re-reading the checkpoint each time.
    The cached objects are shared between reruns and sessions, so they must
    only be used for inference and never mutated.

    Returns:
        A tuple ``(tokenizer, model, pipe, used_fallback)`` where
        ``used_fallback`` is True when the primary checkpoint failed to load
        and the fallback checkpoint was loaded instead.
    """
    import logging

    try:
        tok, mlm = _load_checkpoint(_PRIMARY_CHECKPOINT)
        used_fallback = False
    except Exception:
        # Deliberately broad: any failure to load the primary checkpoint
        # triggers the fallback. Record the cause so it is not lost.
        logging.getLogger(__name__).warning(
            "Could not load %s; falling back to %s",
            _PRIMARY_CHECKPOINT,
            _FALLBACK_CHECKPOINT,
            exc_info=True,
        )
        tok, mlm = _load_checkpoint(_FALLBACK_CHECKPOINT)
        used_fallback = True

    fill_mask = pipeline("fill-mask", model=mlm, tokenizer=tok, framework="pt")
    return tok, mlm, fill_mask, used_fallback


# Module-level names used by the rest of the script.
tokenizer, model, pipe, _used_fallback = _load_fill_mask_stack()

# Emitted outside the cached loader so the notice is shown on every rerun,
# not only on the run that actually performed the load.
if _used_fallback:
    st.warning("Switching to xlm-roberta-base model due to compatibility issues.")
|
|
|
|
|
def get_embedding(text):
    """Turn ``text`` into a 2-D NumPy array using the module-level model.

    The text is tokenized into PyTorch tensors with the module-level
    ``tokenizer``, passed through ``model`` with gradient tracking disabled,
    and the model's ``logits`` at sequence index 0 are returned.

    Args:
        text: Input handed to ``tokenizer``; this script passes one sentence
            string per call.

    Returns:
        ``logits[:, 0, :]`` moved to the CPU and converted to a NumPy array.
    """
    encoded = tokenizer(text, return_tensors="pt")

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        # NOTE(review): these are the masked-LM output logits at the first
        # position, not hidden states; presumably used as a stand-in
        # sentence vector - confirm this is intended.
        first_position_logits = model(**encoded).logits[:, 0, :]

    return first_position_logits.cpu().numpy()
|
|
|
|
|
# Static page header and usage guide. The markdown is kept in named constants
# so the rendering calls below stay short.
_SUBTITLE_MD = """
## using Thai Law nlp dataset"""

_USAGE_GUIDE_MD = """
### How This App Works
This app uses a mask-filling model to predict possible words or phrases that could fill in the `<mask>` token in a given sentence. It then calculates the similarity of each prediction with the original sentence to determine the most contextually appropriate completion.

### Example Sentence
In this example, we have the following sentence in Thai with a `<mask>` token:
- **Input**: `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน <mask> เพื่อสัมผัสธรรมชาติ"`
- **Translation**: "Many tourists choose to visit `<mask>` to experience nature."

The `<mask>` token represents a location popular for its natural beauty.

### Potential Predictions
Here are some possible predictions the model might generate for `<mask>`:
1. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เชียงใหม่ เพื่อสัมผัสธรรมชาติ"` - Chiang Mai
2. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เขาใหญ่ เพื่อสัมผัสธรรมชาติ"` - Khao Yai
3. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เกาะสมุย เพื่อสัมผัสธรรมชาติ"` - Koh Samui
4. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน ภูเก็ต เพื่อสัมผัสธรรมชาติ"` - Phuket

### Results Table
For each prediction, the app calculates:
- **Similarity Score**: Indicates how similar the predicted sentence is to the original input.
- **Model Score**: Represents the model's confidence in the predicted word for `<mask>`.

### Most Similar Prediction
The app will display the most contextually similar prediction based on the similarity score. For example:
- **Most Similar Prediction**: `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เชียงใหม่ เพื่อสัมผัสธรรมชาติ"`
- **Similarity Score**: 0.89
- **Model Score**: 0.16

Feel free to enter your own sentence with `<mask>` and explore the predictions!
"""

st.title("Thai Full Sentence Similarity App")
st.write(_SUBTITLE_MD)
st.write(_USAGE_GUIDE_MD)
|
|
|
|
|
# Placeholder the user marks in the sentence for the fill-mask pipeline.
_MASK_TOKEN = "<mask>"


def _rank_predictions(sentence):
    """Build one result row per fill-mask candidate for ``sentence``.

    The baseline vector is the embedding of ``sentence`` with the mask
    placeholder removed; every candidate sentence returned by ``pipe`` is
    embedded and compared to it with cosine similarity.

    Args:
        sentence: Text containing exactly one ``<mask>`` placeholder.

    Returns:
        A list of dicts with the keys "Prediction", "Similarity Score" and
        "Model Score", in the order the pipeline returned the candidates.
        Candidates without a non-empty "sequence" are skipped.

    Raises:
        KeyError: If a candidate has no "score" entry.
    """
    baseline_embedding = get_embedding(sentence.replace(_MASK_TOKEN, ""))

    rows = []
    for candidate in pipe(sentence):
        filled_sentence = candidate.get("sequence", "")
        if not filled_sentence:
            continue
        similarity = cosine_similarity(
            baseline_embedding, get_embedding(filled_sentence)
        )[0][0]
        rows.append(
            {
                "Prediction": filled_sentence,
                "Similarity Score": similarity,
                "Model Score": candidate["score"],
            }
        )
    return rows


def _show_results(rows):
    """Render the sorted predictions table and the best match.

    Args:
        rows: Result rows as produced by ``_rank_predictions``. An empty list
            produces a warning instead of a table.
    """
    if not rows:
        # Without this guard, sorting an empty frame by a missing column
        # raises KeyError and is reported as a malformed model output.
        st.warning("The model returned no predictions for this sentence.")
        return

    df_results = pd.DataFrame(rows).sort_values(
        by="Similarity Score", ascending=False
    )

    st.subheader("All Predictions Sorted by Similarity")
    st.dataframe(df_results)

    most_similar = df_results.iloc[0]
    st.subheader("Most Similar Prediction")
    st.write(f"**Prediction**: {most_similar['Prediction']}")
    st.write(f"**Similarity Score**: {most_similar['Similarity Score']:.4f}")
    st.write(f"**Model Score**: {most_similar['Model Score']:.4f}")


st.subheader("Input Text")
input_text = st.text_input(
    "Enter a sentence with `<mask>` to find similar predictions:",
    "เมนูโปรดของฉันคือ <mask> ที่ทำจากวัตถุดิบสดใหม่",
)

similarity_results = []

# Test for empty input BEFORE appending the mask: once " <mask>" has been
# added the text is never empty, which made the emptiness guard unreachable.
if input_text.strip():
    if _MASK_TOKEN not in input_text:
        input_text += " <mask>"
        st.warning("`<mask>` token was missing in your input. It has been added automatically.")

    st.write(f"Input Text: {input_text}")

    if input_text.count(_MASK_TOKEN) > 1:
        # With several masks the pipeline returns one candidate list per
        # mask (a nested list), which the row-building code cannot consume.
        st.warning(
            "Only one `<mask>` token per sentence is supported. "
            "Please remove the extra `<mask>` tokens."
        )
    else:
        try:
            similarity_results = _rank_predictions(input_text)
        except KeyError:
            st.error("Unexpected model output structure; unable to retrieve predictions.")
        else:
            _show_results(similarity_results)
|
|