"""Gradio web app that estimates the century of a literary Sinitic text.

The text is stripped down to characters in the range U+4E00-U+9FFF and passed
to the ``bdsl/HanmunRoBERTa`` text-classification pipeline; the per-label
scores are shown as percentages.
"""

import os
import re

import gradio as gr
import matplotlib.pyplot as plt  # unused in this script; kept so the import set is unchanged
from transformers import pipeline

# Matches every character OUTSIDE U+4E00-U+9FFF; compiled once at import time.
_NON_SINITIC = re.compile(r'[^\u4e00-\u9fff]')

# Shown instead of scores when nothing is left to classify after preprocessing.
_NO_INPUT_MESSAGE = "No Sinitic characters found in the input; nothing to classify."

_DESCRIPTION = (
    "This Gradio web app uses the March 2024 version of the HanmunRoBERTa model "
    "to estimate the century in which the provided text was written. "
    "HanmunRoBERTa is a transformer-based model trained exclusively on texts in "
    "literary Sinitic authored by Koreans before the 20th century. "
    "This version is an early prototype, optimized with data from the Veritable "
    "Records and the Diary of the Royal Secretariat. "
    "As such, it is prone to overfitting and requires further adjustments and "
    "refinement for improved performance. "
    "The app automatically removes all non-Sinitic characters and special "
    "symbols, including punctuation."
)


def preprocess_text(text: str) -> str:
    """Return *text* with every character outside U+4E00-U+9FFF removed.

    Punctuation, whitespace, digits and Latin letters all fall outside that
    range and are dropped. ``None`` or an empty string yields ``""``.
    """
    if not text:
        return ""
    return _NON_SINITIC.sub('', text)


# Copy the token stored under 'Century_Test' to the 'HF_TOKEN' variable.
# NOTE: this raises KeyError at start-up if 'Century_Test' is not set.
os.environ['HF_TOKEN'] = os.environ['Century_Test']

nlp = pipeline('text-classification', model='bdsl/HanmunRoBERTa')


def predict_century(text):
    """Classify *text* and return ``(processed_text, scores_text)``.

    ``processed_text`` is the input after ``preprocess_text``.
    ``scores_text`` has one ``"<label>th century: <pct>%"`` line per entry,
    listing the 15th-19th centuries first (defaulting to 0 when the model
    returns no score for them).

    If preprocessing leaves nothing to classify, the model is not called and
    ``scores_text`` is an explanatory message instead of scores.
    """
    preprocessed_input = preprocess_text(text)

    # Running the classifier on an empty string would display scores for text
    # that does not exist, so bail out early.
    if not preprocessed_input:
        return preprocessed_input, _NO_INPUT_MESSAGE

    result = nlp(preprocessed_input, top_k=None)

    # Seed the expected centuries so they always appear, in chronological order.
    scores = {f"{i}th century": 0 for i in range(15, 20)}

    # Walk the results from highest to lowest score: any label that is not one
    # of the seeded centuries is appended after them in that order.
    for item in sorted(result, key=lambda x: x['score'], reverse=True):
        scores[f"{item['label']}th century"] = item['score']

    scores_text = "\n".join(
        f"{century}: {score*100:.2f}%" for century, score in scores.items()
    )
    return preprocessed_input, scores_text


iface = gr.Interface(
    fn=predict_century,
    inputs=gr.Textbox(label="Enter your text here:"),
    outputs=[
        gr.Textbox(label="Processed Text"),
        gr.Textbox(label="Confidence Scores"),
    ],
    description=_DESCRIPTION,
)

iface.launch()