File size: 3,802 Bytes
93d3903
 
 
 
2a8aa8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93d3903
 
 
 
7c2d88d
93d3903
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a8aa8e
 
93d3903
 
 
 
d61d898
93d3903
d61d898
93d3903
 
 
d61d898
93d3903
 
d61d898
93d3903
d61d898
2a8aa8e
 
93d3903
 
2a8aa8e
 
a5bb67d
93d3903
 
 
 
 
 
 
d61d898
93d3903
 
a5bb67d
 
 
 
 
 
 
 
 
 
 
93d3903
 
 
 
2a8aa8e
 
 
 
 
 
 
 
 
b789f36
2a8aa8e
 
 
 
 
 
93d3903
 
 
 
 
 
5dd6b29
 
 
 
93d3903
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import streamlit as st
from transformers import AutoTokenizer, T5ForConditionalGeneration
import post_ocr

# Sidebar information: Markdown text that is rendered in the sidebar's
# "About" section via `st.markdown(info)`.
info = '''Welcome to the demo of the [swedish-ocr-correction](https://huggingface.co/viklofg/swedish-ocr-correction) model.

Enter or upload OCR output and the model will attempt to correct it.

:clock2:  Slow generation? Try a shorter input.
'''

# Example inputs: maps the label offered in the example selectbox to the text
# used as the initial value of the manual-entry text area. The 'Examples'
# entry maps to None, i.e. no example text. The sample strings are
# intentionally uncorrected OCR output and must not be "fixed".
examples = {
    'Examples': None,
    'Example 1': 'En Gosse fur plats nu genast ! inetallyrkc, JU 83 Drottninggatan.',
    'Example 2': '— Storartad gåfva till Göteborgs Museum. Den i HandelstidniDgens g&rdagsnnmmer omtalade hvalfisken, sorn fångats i Frölnndaviken, har i dag af hr brukspatronen James Dickson blifvit inköpt för 1,500 rdr och skänkt till härvarande Museum.',
    'Example 3': 'Sn underlig race att ſtudera, desfa uppſinnare! utropar en Londontidnings fronifôr. Wet ni hur ſtort antalet är af patenter, ſom ſiſtlidet är utfärdades i British Patent Office? Jo, 14,000 ſty>en !! Det kan man ju fkalla en rif rd! Fjorton tuſen uppfinninnar! Herre Gud, hwilfet märkrwoärdigt tidehrvarf wi lefroa i!'
}


# Load model
@st.cache_resource
def load_model():
    """Return the pretrained T5 model used for OCR correction.

    Decorated with ``st.cache_resource`` so the model object is created once
    and reused instead of being reloaded on every script rerun.
    """
    checkpoint = 'KBLab/swedish-ocr-correction'
    return T5ForConditionalGeneration.from_pretrained(checkpoint)


model = load_model()


# Load tokenizer
@st.cache_resource
def load_tokenizer():
    """Return the pretrained tokenizer paired with the correction model.

    Decorated with ``st.cache_resource`` so the tokenizer object is created
    once and reused instead of being reloaded on every script rerun.
    """
    checkpoint = 'google/byt5-small'
    return AutoTokenizer.from_pretrained(checkpoint)


tokenizer = load_tokenizer()


# Hand the loaded model and tokenizer over to the post_ocr module
post_ocr.set_model(model, tokenizer)


# Page heading
st.title(':memo: Swedish OCR correction')


# One tab per input method; the output area is rendered inside each tab
tab1, tab2 = st.tabs(["Text input", "From file"])


# Per-tab session state, created only on the first run of a session:
# `inputs` holds the last processed input and `outputs` the text generated
# from it, each keyed by tab id and initially None.
if 'inputs' not in st.session_state:
    st.session_state.inputs = dict.fromkeys(('tab1', 'tab2'))

if 'outputs' not in st.session_state:
    st.session_state.outputs = dict.fromkeys(('tab1', 'tab2'))


# Sidebar (info)
st.sidebar.header('About')
st.sidebar.markdown(info)


def handle_input(input_, id_):
    """Generate and display the output for one input tab.

    The output is regenerated only when ``input_`` is non-empty and differs
    from the input last processed for this tab; otherwise the output stored in
    ``st.session_state`` is displayed again.

    Args:
        input_: Text to process (may be empty).
        id_: Key of the tab in ``st.session_state.inputs`` / ``outputs``
            (the callers in this file pass ``'tab1'`` or ``'tab2'``). It is
            also used to build a unique key for the "Show changes" toggle.
    """

    # Put everything output-related in a bordered container
    with st.container(border=True):
        st.caption('Output')

        # Only update the output if the input has been updated
        if input_ and st.session_state.inputs[id_] != input_:
            with st.spinner('Generating...'):
                output = post_ocr.process(input_)
            # Record input and output together, and only after processing has
            # succeeded. Storing the input first would, if processing raised,
            # make the next rerun treat the new input as already handled and
            # show the previous output for it.
            st.session_state.inputs[id_] = input_
            st.session_state.outputs[id_] = output

        # This container is needed to display the `show changes` toggle
        # after the output text
        container = st.container()
        st.divider()
        show_changes = st.toggle('Show changes', key=f'show_changes_{id_}')

        with container:
            # Display output
            output = st.session_state.outputs[id_]
            if output is not None:
                if show_changes:
                    # Diff against the input that actually produced the stored
                    # output. The live `input_` can be empty while an older
                    # output is still displayed, which would yield a
                    # meaningless comparison.
                    source = st.session_state.inputs[id_]
                    st.write(post_ocr.diff(source, output))
                else:
                    st.write(output)


# Manual entry tab: a wide text area next to a narrow example picker
with tab1:
    col1, col2 = st.columns([4, 1])

    # The picker is created first because its selection supplies the initial
    # value of the text area
    example_title = col2.selectbox(
        'Examples',
        options=examples,
        label_visibility='collapsed',
    )

    with col1:
        text = st.text_area(
            label='Input text',
            value=examples[example_title],
            height=200,
            label_visibility='collapsed',
            placeholder='Enter OCR generated text or choose an example',
        )

        # Skip the output area entirely while the widget reports no value
        if text is not None:
            handle_input(text, 'tab1')


# File upload tab
with tab2:
    uploaded_file = st.file_uploader('Choose a file', type='.txt')

    # Display file content
    if uploaded_file is not None:
        try:
            # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading
            # byte-order mark, so the BOM is not passed on as input text
            file_content = uploaded_file.getvalue().decode('utf-8-sig')
        except UnicodeDecodeError:
            # Report unreadable files in the UI instead of crashing the app
            # with a traceback
            st.error('Could not read the file: it is not valid UTF-8 text.')
        else:
            text = st.text_area('File content', value=file_content, height=300)
            handle_input(text, 'tab2')