Spaces:

javiercha
/

HanmunRoBERTa_century

Runtime error

App Files Files Community

HanmunRoBERTa_century / app.py

javiercha

Update app.py

b0ec70c verified over 1 year ago

raw

history blame contribute delete

1.76 kB

	import gradio as gr
	import os
	from transformers import pipeline
	import re
	import matplotlib.pyplot as plt

	def preprocess_text(text):
	    """Return *text* with every character outside U+4E00-U+9FFF removed.

	    Punctuation, whitespace, digits, Latin letters and any other symbol
	    outside that range are dropped; the surviving characters keep their
	    original order.
	    """
	    outside_kept_range = re.compile(r'[^\u4e00-\u9fff]')
	    return outside_kept_range.sub('', text)

	# Forward the access token stored in the "Century_Test" environment
	# variable to HF_TOKEN. The variable is read with .get() so that a missing
	# value does not abort start-up with a bare KeyError; if the model really
	# needs authentication, the download below reports that instead.
	_hf_token = os.environ.get('Century_Test')
	if _hf_token:
	    os.environ['HF_TOKEN'] = _hf_token

	# Text-classification pipeline shared by every request.
	nlp = pipeline('text-classification', model='bdsl/HanmunRoBERTa')

	def predict_century(text):
	    """Classify *text* by century and format the per-century confidences.

	    Args:
	        text: Raw user input. It is cleaned with ``preprocess_text`` before
	            being passed to the classifier; ``None`` is treated as empty.

	    Returns:
	        A ``(processed_text, scores_text)`` tuple. ``processed_text`` is the
	        cleaned input; ``scores_text`` holds one "<label>th century: <pct>%"
	        line per century, or a short notice when nothing is left to classify.
	    """
	    preprocessed_input = preprocess_text(text or "")

	    # After cleaning there may be nothing left (e.g. Latin-only input);
	    # running the classifier on an empty string would only yield
	    # meaningless percentages.
	    if not preprocessed_input:
	        return "", "No Sinitic characters found in the input."

	    # top_k=None requests a score for every label. truncation=True asks the
	    # tokenizer to cut over-long inputs to its maximum length rather than
	    # letting them overflow the model.
	    result = nlp(preprocessed_input, top_k=None, truncation=True)

	    # Highest score first. This only affects the display order of labels
	    # that are not among the preset centuries below.
	    result = sorted(result, key=lambda item: item['score'], reverse=True)

	    # Preset 15th-19th centuries to 0 so each one is always listed, in
	    # chronological order.
	    # NOTE(review): assumes labels are bare century numbers such as "15";
	    # confirm against the model's label mapping.
	    scores = {f"{i}th century": 0 for i in range(15, 20)}
	    for item in result:
	        scores[f"{item['label']}th century"] = item['score']

	    scores_text = "\n".join(
	        f"{century}: {score*100:.2f}%" for century, score in scores.items()
	    )

	    return preprocessed_input, scores_text

	# Gradio front end: one input textbox wired to predict_century, with two
	# output textboxes for the cleaned text and the formatted confidence scores.
	_input_box = gr.Textbox(label="Enter your text here:")
	_output_boxes = [
	    gr.Textbox(label="Processed Text"),
	    gr.Textbox(label="Confidence Scores"),
	]
	_description = "This Gradio web app uses the March 2024 version of the HanmunRoBERTa model to estimate the century in which the provided text was written. HanmunRoBERTa is a transformer-based model trained exclusively on texts in literary Sinitic authored by Koreans before the 20th century. This version is an early prototype, optimized with data from the Veritable Records and the Diary of the Royal Secretariat. As such, it is prone to overfitting and requires further adjustments and refinement for improved performance. The app automatically removes all non-Sinitic characters and special symbols, including punctuation."

	iface = gr.Interface(
	    fn=predict_century,
	    inputs=_input_box,
	    outputs=_output_boxes,
	    description=_description,
	)
	# Start serving the app.
	iface.launch()