File size: 3,802 Bytes
93d3903 2a8aa8e 93d3903 7c2d88d 93d3903 2a8aa8e 93d3903 d61d898 93d3903 d61d898 93d3903 d61d898 93d3903 d61d898 93d3903 d61d898 2a8aa8e 93d3903 2a8aa8e a5bb67d 93d3903 d61d898 93d3903 a5bb67d 93d3903 2a8aa8e b789f36 2a8aa8e 93d3903 5dd6b29 93d3903 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import streamlit as st
from transformers import AutoTokenizer, T5ForConditionalGeneration
import post_ocr
# Sidebar information
# Markdown source for the sidebar's "About" section; it is handed to
# `st.markdown`, so the link syntax and the `:clock2:` emoji shortcode are
# rendered rather than shown literally.
info = '''Welcome to the demo of the [swedish-ocr-correction](https://huggingface.co/viklofg/swedish-ocr-correction) model.
Enter or upload OCR output and the model will attempt to correct it.
:clock2: Slow generation? Try a shorter input.
'''
# Example inputs
# Maps the label shown in the example picker to the text that pre-fills the
# input box. The strings are deliberately noisy OCR output, so the apparent
# misspellings are the data and must not be "corrected" here.
examples = {
# Placeholder entry shown first in the picker; `None` means no pre-filled text.
'Examples': None,
'Example 1': 'En Gosse fur plats nu genast ! inetallyrkc, JU 83 Drottninggatan.',
'Example 2': '— Storartad gåfva till Göteborgs Museum. Den i HandelstidniDgens g&rdagsnnmmer omtalade hvalfisken, sorn fångats i Frölnndaviken, har i dag af hr brukspatronen James Dickson blifvit inköpt för 1,500 rdr och skänkt till härvarande Museum.',
'Example 3': 'Sn underlig race att ſtudera, desfa uppſinnare! utropar en Londontidnings fronifôr. Wet ni hur ſtort antalet är af patenter, ſom ſiſtlidet är utfärdades i British Patent Office? Jo, 14,000 ſty>en !! Det kan man ju fkalla en rif rd! Fjorton tuſen uppfinninnar! Herre Gud, hwilfet märkrwoärdigt tidehrvarf wi lefroa i!'
}
# Load model
@st.cache_resource
def load_model():
    """Return the pretrained T5 model used for OCR correction.

    Decorated with ``st.cache_resource`` so the model object is reused
    instead of being rebuilt each time the script runs.
    """
    checkpoint = 'KBLab/swedish-ocr-correction'
    return T5ForConditionalGeneration.from_pretrained(checkpoint)


model = load_model()
# Load tokenizer
@st.cache_resource
def load_tokenizer():
    """Return the ByT5 tokenizer that is paired with the correction model.

    Decorated with ``st.cache_resource`` so the tokenizer object is reused
    instead of being rebuilt each time the script runs.
    """
    checkpoint = 'google/byt5-small'
    return AutoTokenizer.from_pretrained(checkpoint)


tokenizer = load_tokenizer()
# Hand the loaded model and tokenizer over to the post_ocr module
post_ocr.set_model(model, tokenizer)

# Page title
st.title(':memo: Swedish OCR correction')

# One tab for typed/pasted text, one for uploaded files
tab1, tab2 = st.tabs(['Text input', 'From file'])

# Per-tab session state: the last input that was processed and the output it
# produced. Each key gets its own fresh dict the first time the session runs.
for state_key in ('inputs', 'outputs'):
    if state_key not in st.session_state:
        st.session_state[state_key] = {'tab1': None, 'tab2': None}

# Sidebar (info)
st.sidebar.header('About')
st.sidebar.markdown(info)
def handle_input(input_, id_):
    """Generate corrected text for ``input_`` and display it.

    The last processed input and its output are kept in ``st.session_state``
    under ``id_`` so that generation only runs when the input has changed
    since the previous script run.

    Args:
        input_: Text to correct. A falsy value (``None`` or ``''``) skips
            generation; any previously stored output stays on display.
        id_: Key into ``st.session_state.inputs`` / ``.outputs`` (this file
            uses ``'tab1'`` and ``'tab2'``). Also used to give the
            "Show changes" toggle a unique widget key.
    """
    # Put everything output-related in a bordered container
    with st.container(border=True):
        st.caption('Output')

        # Only update the output if the input has been updated
        if input_ and st.session_state.inputs[id_] != input_:
            with st.spinner('Generating...'):
                output = post_ocr.process(input_)
            # Record the input only once generation has succeeded. Storing it
            # beforehand would mark a failed input as already handled, so the
            # next rerun would skip it and show stale (or no) output.
            st.session_state.inputs[id_] = input_
            st.session_state.outputs[id_] = output

        # This container is needed to display the `show changes` toggle
        # after the output text
        container = st.container()
        st.divider()
        show_changes = st.toggle('Show changes', key=f'show_changes_{id_}')

        with container:
            # Display output
            output = st.session_state.outputs[id_]
            if output is not None:
                if show_changes:
                    # Diff against the input that actually produced `output`.
                    # The current `input_` may be empty (e.g. the box was
                    # cleared) while an older output is still being shown.
                    source = st.session_state.inputs[id_]
                    st.write(post_ocr.diff(source, output))
                else:
                    st.write(output)
# Manual entry tab
with tab1:
    # Wide column for the text box, narrow column for the example picker
    col1, col2 = st.columns([4, 1])

    # The picker is created first because its selection seeds the text box
    example_title = col2.selectbox(
        'Examples',
        options=examples,
        label_visibility='collapsed',
    )

    text = col1.text_area(
        label='Input text',
        value=examples[example_title],
        height=200,
        label_visibility='collapsed',
        placeholder='Enter OCR generated text or choose an example',
    )

    # `None` means the box has no content to process yet
    if text is not None:
        handle_input(text, 'tab1')
# File upload tab
with tab2:
    uploaded_file = st.file_uploader('Choose a file', type='.txt')

    # Display file content
    if uploaded_file is not None:
        try:
            # 'utf-8-sig' reads plain UTF-8 and additionally drops a leading
            # byte-order mark, which would otherwise end up in the model input.
            file_content = uploaded_file.getvalue().decode('utf-8-sig')
        except UnicodeDecodeError:
            # A .txt file in another encoding would otherwise crash the app
            # with a traceback; tell the user what went wrong instead.
            st.error('Could not read the file: it is not valid UTF-8 text.')
        else:
            # The content is shown in an editable box so it can be adjusted
            # before correction; the box's current text is what gets processed.
            text = st.text_area('File content', value=file_content, height=300)
            handle_input(text, 'tab2')
|