Yehor Smoliakov committed
Commit b2fe391 · 1 Parent(s): 5c06b16
Files changed (7):
  1. .dockerignore +2 -0
  2. .gitignore +5 -0
  3. Dockerfile +61 -0
  4. README.md +26 -3
  5. app.py +188 -0
  6. requirements-dev.txt +1 -0
  7. requirements.txt +4 -0
.dockerignore ADDED
@@ -0,0 +1,2 @@
+ .ruff_cache/
+ .venv/
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .idea/
+ .venv/
+ .ruff_cache/
+
+ flagged/
Dockerfile ADDED
@@ -0,0 +1,61 @@
+ FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
+
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ RUN apt-get update && \
+     apt-get upgrade -y && \
+     apt-get install -y --no-install-recommends \
+     git \
+     git-lfs \
+     wget \
+     curl \
+     # python build dependencies \
+     build-essential \
+     libssl-dev \
+     zlib1g-dev \
+     libbz2-dev \
+     libreadline-dev \
+     libsqlite3-dev \
+     libncursesw5-dev \
+     xz-utils \
+     tk-dev \
+     libxml2-dev \
+     libxmlsec1-dev \
+     libffi-dev \
+     liblzma-dev \
+     # gradio dependencies \
+     ffmpeg \
+     && apt-get clean \
+     && rm -rf /var/lib/apt/lists/*
+
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:${PATH}
+ WORKDIR ${HOME}/app
+
+ RUN curl https://pyenv.run | bash
+ ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
+ ARG PYTHON_VERSION=3.10.12
+ RUN pyenv install ${PYTHON_VERSION} && \
+     pyenv global ${PYTHON_VERSION} && \
+     pyenv rehash && \
+     pip install --no-cache-dir -U pip setuptools wheel && \
+     pip install packaging ninja
+
+ COPY --chown=1000 ./requirements.txt /tmp/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
+
+ RUN git clone --depth 1 https://huggingface.co/Pravopysnyk/best-unlp ${HOME}/app/best-unlp
+
+ COPY --chown=1000 . ${HOME}/app
+ ENV PYTHONPATH=${HOME}/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ CMD ["python", "app.py"]
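Note that the image clones the Pravopysnyk/best-unlp checkpoint into `/home/user/app/best-unlp` at build time, which is the exact path `model_name` in app.py points to. A quick sanity check of that layout, as a sketch (assumes the image was built with the `grammar-correction-uk` tag from the README below):

```shell
# override the default CMD to list the model files baked into the image
docker run --rm grammar-correction-uk ls -la /home/user/app/best-unlp
```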
README.md CHANGED
@@ -1,10 +1,33 @@
  ---
- title: Grammar Correction Uk
- emoji: 🌍
+ title: Grammar Correction for Ukrainian
+ emoji: 📝
  colorFrom: purple
  colorTo: pink
  sdk: docker
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ## Install
+
+ ```shell
+ uv venv --python 3.10
+
+ source .venv/bin/activate
+
+ uv pip install -r requirements.txt
+
+ # in development mode
+ uv pip install -r requirements-dev.txt
+ ```
+
+ ## Build image
+
+ ```shell
+ docker build -t grammar-correction-uk .
+ ```
+
+ ## Run
+
+ ```shell
+ docker run -it --rm -p 8888:7860 grammar-correction-uk
+ ```
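The container serves Gradio on its default port 7860 (`GRADIO_SERVER_NAME=0.0.0.0` in the Dockerfile), so with the mapping above the UI should be reachable at http://localhost:8888. Since the base image is CUDA-enabled and app.py falls back to CPU when no GPU is visible, a GPU-backed run would look roughly like this (assumes the NVIDIA Container Toolkit is installed on the host):

```shell
# expose the host GPUs to the container; without --gpus all the model runs on CPU
docker run -it --rm --gpus all -p 8888:7860 grammar-correction-uk
```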
app.py ADDED
@@ -0,0 +1,188 @@
+ import sys
+ import time
+
+ from importlib.metadata import version
+
+ import torch
+ import gradio as gr
+
+ from transformers import MBartForConditionalGeneration, AutoTokenizer
+
+ # Config
+ model_name = "/home/user/app/best-unlp"
+ concurrency_limit = 5
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load the model
+ model = MBartForConditionalGeneration.from_pretrained(
+     model_name,
+     low_cpu_mem_usage=True,
+     device_map=device,
+ )
+ model.eval()
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ tokenizer.src_lang = "uk_UA"
+ tokenizer.tgt_lang = "uk_UA"
+
+ examples = [
+     "привіт як справі?",
+     "як твої дела?",
+ ]
+
+ title = "Grammar Correction for Ukrainian"
+
+ # https://www.tablesgenerator.com/markdown_tables
+ authors_table = """
+ ## Authors
+
+ Follow them on social networks and **contact** them if you need any help or have any questions:
+
+ | <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
+ |-------------------------------------------------------------------------------------------------|
+ | https://t.me/smlkw in Telegram |
+ | https://x.com/yehor_smoliakov at X |
+ | https://github.com/egorsmkv at GitHub |
+ | https://huggingface.co/Yehor at Hugging Face |
+ | or use [email protected] |
+ """.strip()
+
+ description_head = f"""
+ # {title}
+
+ ## Overview
+
+ This space uses the https://huggingface.co/Pravopysnyk/best-unlp model.
+
+ Paste the text you want to enhance.
+ """.strip()
+
+ description_foot = f"""
+ {authors_table}
+ """.strip()
+
+ normalized_text_value = """
+ Normalized text will appear here.
+
+ Choose **an example** below the Normalize button or paste **your text**.
+ """.strip()
+
+ tech_env = f"""
+ #### Environment
+
+ - Python: {sys.version}
+ """.strip()
+
+ tech_libraries = f"""
+ #### Libraries
+
+ - torch: {version('torch')}
+ - gradio: {version('gradio')}
+ - transformers: {version('transformers')}
+ """.strip()
+
+
+ def inference(text, progress=gr.Progress()):
+     if not text:
+         raise gr.Error("Please paste your text.")
+
+     gr.Info("Starting normalizing", duration=2)
+
+     progress(0, desc="Normalizing...")
+
+     results = []
+
+     sentences = [
+         text,
+     ]
+
+     for sentence in progress.tqdm(sentences, desc="Normalizing...", unit="sentence"):
+         sentence = sentence.strip()
+
+         if len(sentence) == 0:
+             continue
+
+         t0 = time.time()
+
+         input_text = "<verbalization>:" + sentence
+
+         encoded_input = tokenizer(
+             input_text,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+             max_length=1024,
+         ).to(device)
+         output_ids = model.generate(
+             **encoded_input, max_length=1024, num_beams=5, early_stopping=True
+         )
+         normalized_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+         if not normalized_text:
+             normalized_text = "-"
+
+         elapsed_time = round(time.time() - t0, 2)
+
+         normalized_text = normalized_text.strip()
+         results.append(
+             {
+                 "sentence": sentence,
+                 "normalized_text": normalized_text,
+                 "elapsed_time": elapsed_time,
+             }
+         )
+
+     gr.Info("Finished!", duration=2)
+
+     result_texts = []
+
+     for result in results:
+         result_texts.append(f'> {result["normalized_text"]}')
+         result_texts.append("\n")
+
+     sum_elapsed_text = sum([result["elapsed_time"] for result in results])
+     result_texts.append(f"Elapsed time: {sum_elapsed_text} seconds")
+
+     return "\n".join(result_texts)
+
+
+ demo = gr.Blocks(
+     title=title,
+     analytics_enabled=False,
+     # theme="huggingface",
+     theme=gr.themes.Base(),
+ )
+
+ with demo:
+     gr.Markdown(description_head)
+
+     gr.Markdown("## Usage")
+
+     with gr.Row():
+         text = gr.Textbox(label="Text", autofocus=True, max_lines=1)
+         normalized_text = gr.Textbox(
+             label="Normalized text",
+             placeholder=normalized_text_value,
+             show_copy_button=True,
+         )
+
+     gr.Button("Normalize").click(
+         inference,
+         concurrency_limit=concurrency_limit,
+         inputs=text,
+         outputs=normalized_text,
+     )
+
+     with gr.Row():
+         gr.Examples(label="Choose an example", inputs=text, examples=examples)
+
+     gr.Markdown(description_foot)
+
+     gr.Markdown("### Gradio app uses:")
+     gr.Markdown(tech_env)
+     gr.Markdown(tech_libraries)
+
+ if __name__ == "__main__":
+     demo.queue()
+     demo.launch()
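app.py hard-codes `model_name = "/home/user/app/best-unlp"`, a path that only exists inside the Docker image. Running the demo outside Docker would look roughly like this sketch (assumes the virtual environment from the README's Install section, and that `model_name` is edited to point at the local clone):

```shell
# fetch the model weights next to app.py (git-lfs is needed for the large files)
git lfs install
git clone --depth 1 https://huggingface.co/Pravopysnyk/best-unlp

# then point model_name in app.py at "./best-unlp" and start the app
python app.py
```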
requirements-dev.txt ADDED
@@ -0,0 +1 @@
+ ruff
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio
+
+ transformers
+ accelerate