import gradio as gr
import os
from transformers import pipeline
import re
import matplotlib.pyplot as plt

def preprocess_text(text):
    """Return *text* with every character outside U+4E00–U+9FFF removed.

    Only code points in the basic CJK Unified Ideographs block survive;
    whitespace, punctuation, digits, Latin letters and ideographs from
    other blocks are all dropped.
    """
    non_ideograph_pattern = r'[^\u4e00-\u9fff]'
    return re.sub(non_ideograph_pattern, '', text)

# Forward the access token stored in the ``Century_Test`` environment variable
# to ``HF_TOKEN``, the variable the Hugging Face Hub client reads its token
# from. A missing ``Century_Test`` is tolerated instead of raising KeyError at
# import time, so the app can still start without the variable.
# NOTE(review): if the model repository requires authentication, the download
# below will then fail with the Hub's own error -- confirm that is acceptable.
_century_token = os.environ.get('Century_Test')
if _century_token is not None:
    os.environ['HF_TOKEN'] = _century_token

# Text-classification pipeline, built once at start-up and shared by all calls.
nlp = pipeline('text-classification', model='bdsl/HanmunRoBERTa')

def predict_century(text):
    """Classify *text* by century and format the per-century confidence scores.

    The input is cleaned with ``preprocess_text`` and the result is passed to
    the module-level ``nlp`` pipeline.

    Args:
        text: Raw user input. ``None`` is treated like an empty string.

    Returns:
        A ``(processed_text, scores_text)`` tuple of strings. ``scores_text``
        holds one ``"<N>th century: <score>%"`` line for each of the 15th to
        19th centuries. If nothing is left after cleaning, the model is not
        called and ``scores_text`` is an explanatory message instead.
    """
    preprocessed_input = preprocess_text(text or "")

    # Nothing to classify: skip the model rather than report scores that
    # would be unrelated to any actual input text.
    if not preprocessed_input:
        return preprocessed_input, "No Sinitic characters found in the input."

    # top_k=None requests a score for every label, not just the best one.
    result = nlp(preprocessed_input, top_k=None)
    # Highest score first; this only affects the position of labels outside
    # the pre-seeded 15-19 range, which get appended in iteration order.
    result.sort(key=lambda x: x['score'], reverse=True)

    # Pre-seed the expected centuries so the output order is fixed and any
    # century the pipeline did not report is shown as 0.
    scores = {f"{i}th century": 0 for i in range(15, 20)}
    for item in result:
        scores[f"{item['label']}th century"] = item['score']

    scores_text = "\n".join(
        f"{century}: {score*100:.2f}%" for century, score in scores.items()
    )

    return preprocessed_input, scores_text

# Assemble the web UI: one input text box, two output text boxes (the cleaned
# text and the formatted confidence scores), wired to ``predict_century``.
_input_box = gr.Textbox(label="Enter your text here:")
_output_boxes = [
    gr.Textbox(label="Processed Text"),
    gr.Textbox(label="Confidence Scores"),
]
# Adjacent literals are concatenated into a single description string.
_app_description = (
    "This Gradio web app uses the March 2024 version of the HanmunRoBERTa model to estimate the century in which the provided text was written. "
    "HanmunRoBERTa is a transformer-based model trained exclusively on texts in literary Sinitic authored by Koreans before the 20th century. "
    "This version is an early prototype, optimized with data from the Veritable Records and the Diary of the Royal Secretariat. "
    "As such, it is prone to overfitting and requires further adjustments and refinement for improved performance. "
    "The app automatically removes all non-Sinitic characters and special symbols, including punctuation."
)
iface = gr.Interface(
    fn=predict_century,
    inputs=_input_box,
    outputs=_output_boxes,
    description=_app_description,
)
# Start serving the interface as soon as the module is executed.
iface.launch()