Bram Vanroy
committed on
Commit
·
d1f2e36
1
Parent(s):
6fc246d
make style
Browse files
app.py
CHANGED
@@ -2,14 +2,11 @@ import base64
|
|
2 |
from io import StringIO
|
3 |
from math import ceil
|
4 |
|
5 |
-
from utils import get_resources, simplify
|
6 |
-
|
7 |
import streamlit as st
|
8 |
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
)
|
13 |
|
14 |
BATCH_SIZE = 8
|
15 |
|
@@ -33,8 +30,10 @@ if fupload_check:
|
|
33 |
st.session_state["text_to_simplify"] = None
|
34 |
else:
|
35 |
st.session_state["text_to_simplify"] = st.text_area(
|
36 |
-
label="Sentences to translate",
|
37 |
-
|
|
|
|
|
38 |
).strip()
|
39 |
|
40 |
|
@@ -44,6 +43,7 @@ def _get_increment_size(num_sents) -> int:
|
|
44 |
else:
|
45 |
return ceil(100 / (num_sents / BATCH_SIZE))
|
46 |
|
|
|
47 |
btn_col, results_col = st.columns(2)
|
48 |
btn_ct = btn_col.empty()
|
49 |
error_ct = st.empty()
|
@@ -51,7 +51,9 @@ simpl_ct = st.container()
|
|
51 |
if st.session_state["text_to_simplify"]:
|
52 |
if btn_ct.button("Simplify text"):
|
53 |
error_ct.empty()
|
54 |
-
lines = [
|
|
|
|
|
55 |
num_sentences = len(lines)
|
56 |
|
57 |
pbar = st.progress(0, text=f"Simplifying sentences in batches of {BATCH_SIZE}...")
|
@@ -73,7 +75,7 @@ if st.session_state["text_to_simplify"]:
|
|
73 |
<li><strong>Simplification:</strong> {simplification}</li>
|
74 |
</ul>
|
75 |
</li>"""
|
76 |
-
output_ct.markdown(html+"</ol>", unsafe_allow_html=True)
|
77 |
|
78 |
all_simplifications.extend(simplifications)
|
79 |
|
@@ -83,7 +85,10 @@ if st.session_state["text_to_simplify"]:
|
|
83 |
|
84 |
all_simplifications = "\n".join(all_simplifications) + "\n"
|
85 |
b64 = base64.b64encode(all_simplifications.encode("utf-8")).decode("utf-8")
|
86 |
-
results_col.markdown(
|
|
|
|
|
|
|
87 |
else:
|
88 |
btn_ct.empty()
|
89 |
error_ct.error("Text cannot be empty!", icon="⚠️")
|
@@ -95,7 +100,8 @@ else:
|
|
95 |
########################
|
96 |
st.header("Project background")
|
97 |
|
98 |
-
st.markdown(
|
|
|
99 |
|
100 |
Charlotte created a [dataset](https://huggingface.co/datasets/BramVanroy/chatgpt-dutch-simplification) that contains Dutch sentences and their simplified equivalents with ChatGPT. Bram then trained a number of models on this new dataset.
|
101 |
|
@@ -107,11 +113,14 @@ The following models are available, all finetuned from the awesome Dutch T5 mode
|
|
107 |
|
108 |
The training code can be found on [Github](https://github.com/BramVanroy/mai-simplification-nl-2023#22-hyperparameter-sweep).
|
109 |
|
110 |
-
"""
|
|
|
111 |
|
112 |
|
113 |
st.header("Contact ✒️")
|
114 |
|
115 |
-
st.markdown(
|
116 |
-
|
117 |
-
|
|
|
|
|
|
2 |
from io import StringIO
|
3 |
from math import ceil
|
4 |
|
|
|
|
|
5 |
import streamlit as st
|
6 |
|
7 |
+
from utils import get_resources, simplify
|
8 |
+
|
9 |
+
st.set_page_config(page_title="Text Simplification in Dutch", page_icon="🏃")
|
|
|
10 |
|
11 |
BATCH_SIZE = 8
|
12 |
|
|
|
30 |
st.session_state["text_to_simplify"] = None
|
31 |
else:
|
32 |
st.session_state["text_to_simplify"] = st.text_area(
|
33 |
+
label="Sentences to translate",
|
34 |
+
label_visibility="collapsed",
|
35 |
+
height=200,
|
36 |
+
value="Met het naderen van de zonovergoten middaghemel op deze betoverende dag, waarbij de atmosferische omstandigheden een onbelemmerde convergentie van cumulusbewolking en uitgestrekte stratosferische azuurblauwe wijdheid faciliteren, lijken de geaggregeerde weersverschijnselen van vandaag, die variëren van sporadische plensbuien tot kalme zuchtjes wind en zeldzame opvlammingen van bliksem, de delicate balans tussen meteorologische complexiteit en eenvoud te weerspiegelen, waardoor de gepassioneerde observator met een gevoel van ontzag en verwondering wordt vervuld.",
|
37 |
).strip()
|
38 |
|
39 |
|
|
|
43 |
else:
|
44 |
return ceil(100 / (num_sents / BATCH_SIZE))
|
45 |
|
46 |
+
|
47 |
btn_col, results_col = st.columns(2)
|
48 |
btn_ct = btn_col.empty()
|
49 |
error_ct = st.empty()
|
|
|
51 |
if st.session_state["text_to_simplify"]:
|
52 |
if btn_ct.button("Simplify text"):
|
53 |
error_ct.empty()
|
54 |
+
lines = [
|
55 |
+
strip_line for line in st.session_state["text_to_simplify"].splitlines() if (strip_line := line.strip())
|
56 |
+
]
|
57 |
num_sentences = len(lines)
|
58 |
|
59 |
pbar = st.progress(0, text=f"Simplifying sentences in batches of {BATCH_SIZE}...")
|
|
|
75 |
<li><strong>Simplification:</strong> {simplification}</li>
|
76 |
</ul>
|
77 |
</li>"""
|
78 |
+
output_ct.markdown(html + "</ol>", unsafe_allow_html=True)
|
79 |
|
80 |
all_simplifications.extend(simplifications)
|
81 |
|
|
|
85 |
|
86 |
all_simplifications = "\n".join(all_simplifications) + "\n"
|
87 |
b64 = base64.b64encode(all_simplifications.encode("utf-8")).decode("utf-8")
|
88 |
+
results_col.markdown(
|
89 |
+
f'<a download="dutch-simplifications.txt" href="data:file/txt;base64,{b64}" title="Download">Download simplifications</a>',
|
90 |
+
unsafe_allow_html=True,
|
91 |
+
)
|
92 |
else:
|
93 |
btn_ct.empty()
|
94 |
error_ct.error("Text cannot be empty!", icon="⚠️")
|
|
|
100 |
########################
|
101 |
st.header("Project background")
|
102 |
|
103 |
+
st.markdown(
|
104 |
+
"""This demo highlights work that has been done in light of a master thesis by Charlotte Van de Velde as part of the Master of Science in Artificial Intelligence at KU Leuven in 2023. Charlotte is supervised by Vincent Vandeghinste and Bram Vanroy.
|
105 |
|
106 |
Charlotte created a [dataset](https://huggingface.co/datasets/BramVanroy/chatgpt-dutch-simplification) that contains Dutch sentences and their simplified equivalents with ChatGPT. Bram then trained a number of models on this new dataset.
|
107 |
|
|
|
113 |
|
114 |
The training code can be found on [Github](https://github.com/BramVanroy/mai-simplification-nl-2023#22-hyperparameter-sweep).
|
115 |
|
116 |
+
"""
|
117 |
+
)
|
118 |
|
119 |
|
120 |
st.header("Contact ✒️")
|
121 |
|
122 |
+
st.markdown(
|
123 |
+
"Would you like additional functionality in the demo, do you have questions, or just want to get in touch?"
|
124 |
+
" Give me a shout on [Twitter](https://twitter.com/BramVanroy)"
|
125 |
+
" or add me on [LinkedIn](https://www.linkedin.com/in/bramvanroy/)!"
|
126 |
+
)
|
utils.py
CHANGED
@@ -1,18 +1,16 @@
|
|
1 |
-
from
|
2 |
-
from typing import Tuple, Generator, List
|
3 |
|
4 |
-
from optimum.bettertransformer import BetterTransformer
|
5 |
import streamlit as st
|
6 |
import torch
|
7 |
-
from
|
8 |
from torch import nn, qint8
|
9 |
-
from
|
|
|
10 |
|
11 |
|
12 |
@st.cache_resource(show_spinner=False)
|
13 |
def get_resources(quantize: bool = True, no_cuda: bool = False) -> Tuple[T5ForConditionalGeneration, T5Tokenizer]:
|
14 |
-
"""
|
15 |
-
"""
|
16 |
tokenizer = T5Tokenizer.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023", use_fast=False)
|
17 |
model = T5ForConditionalGeneration.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023")
|
18 |
|
@@ -30,20 +28,24 @@ def get_resources(quantize: bool = True, no_cuda: bool = False) -> Tuple[T5ForCo
|
|
30 |
|
31 |
|
32 |
def batchify(iterable, batch_size=16):
|
|
|
|
|
|
|
|
|
33 |
num_items = len(iterable)
|
34 |
for idx in range(0, num_items, batch_size):
|
35 |
-
yield iterable[idx:min(idx + batch_size, num_items)]
|
36 |
|
37 |
|
38 |
def simplify(
|
39 |
-
|
40 |
-
model: T5ForConditionalGeneration,
|
41 |
-
tokenizer: T5Tokenizer,
|
42 |
-
batch_size: int = 16
|
43 |
) -> List[str]:
|
|
|
|
|
|
|
|
|
|
|
44 |
"""
|
45 |
-
"""
|
46 |
-
|
47 |
for batch_texts in batchify(texts, batch_size=batch_size):
|
48 |
nlg_batch_texts = ["[NLG] " + text for text in batch_texts]
|
49 |
encoded = tokenizer(nlg_batch_texts, return_tensors="pt", padding=True, truncation=True)
|
|
|
1 |
+
from typing import List, Tuple
|
|
|
2 |
|
|
|
3 |
import streamlit as st
|
4 |
import torch
|
5 |
+
from optimum.bettertransformer import BetterTransformer
|
6 |
from torch import nn, qint8
|
7 |
+
from torch.quantization import quantize_dynamic
|
8 |
+
from transformers import T5ForConditionalGeneration, T5Tokenizer
|
9 |
|
10 |
|
11 |
@st.cache_resource(show_spinner=False)
|
12 |
def get_resources(quantize: bool = True, no_cuda: bool = False) -> Tuple[T5ForConditionalGeneration, T5Tokenizer]:
|
13 |
+
"""Load a T5 model and its (slow) tokenizer"""
|
|
|
14 |
tokenizer = T5Tokenizer.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023", use_fast=False)
|
15 |
model = T5ForConditionalGeneration.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023")
|
16 |
|
|
|
28 |
|
29 |
|
30 |
def batchify(iterable, batch_size=16):
|
31 |
+
"""Turn an iterable in a batch generator
|
32 |
+
:param iterable: iterable to batchify
|
33 |
+
:param batch_size: batch size
|
34 |
+
"""
|
35 |
num_items = len(iterable)
|
36 |
for idx in range(0, num_items, batch_size):
|
37 |
+
yield iterable[idx : min(idx + batch_size, num_items)]
|
38 |
|
39 |
|
40 |
def simplify(
|
41 |
+
texts: List[str], model: T5ForConditionalGeneration, tokenizer: T5Tokenizer, batch_size: int = 16
|
|
|
|
|
|
|
42 |
) -> List[str]:
|
43 |
+
"""Simplify a given set of texts with a given model and tokenizer. Yields results in batches of 'batch_size'
|
44 |
+
:param texts: texts to simplify
|
45 |
+
:param model: model to use for simplification
|
46 |
+
:param tokenizer: tokenizer to use for simplification
|
47 |
+
:param batch_size: batch size to yield results in
|
48 |
"""
|
|
|
|
|
49 |
for batch_texts in batchify(texts, batch_size=batch_size):
|
50 |
nlg_batch_texts = ["[NLG] " + text for text in batch_texts]
|
51 |
encoded = tokenizer(nlg_batch_texts, return_tensors="pt", padding=True, truncation=True)
|