Maslov-Artem
committed on
Commit
•
b90441b
1
Parent(s):
c747562
New weights and streamlit features
Browse files
- .gitignore +13 -2
- 17/config.json +1 -1
- 17/model.safetensors +1 -1
- app.py +48 -1
- enlightened_static.jpg +0 -0
- model/funcs.py +18 -0
- model/model_weights.pt +2 -2
- preprocessing.py +0 -30
- static_toxic.jpg +0 -0
.gitignore
CHANGED
@@ -1,6 +1,17 @@
|
|
1 |
.venv
|
2 |
healthcare_facilities_reviews.jsonl
|
3 |
*.ipynb
|
4 |
-
__pycache__/
|
5 |
*.csv
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
.venv
|
2 |
healthcare_facilities_reviews.jsonl
|
3 |
*.ipynb
|
4 |
+
/__pycache__/
|
5 |
*.csv
|
6 |
+
/.ipynb_checkoints/
|
7 |
+
.DS_Store
|
8 |
+
RNN/
|
9 |
+
bert/
|
10 |
+
cached_lm_GPT2Tokenizer_64_wiki_content.txt
|
11 |
+
cached_lm_GPT2Tokenizer_64_wiki_content.txt.lock
|
12 |
+
finetuned/
|
13 |
+
.gitattributes
|
14 |
+
*.txt
|
15 |
+
model/.ipynb_checkpoints/
|
16 |
+
model/__pycache__/
|
17 |
+
preprocessing/__pycache__/
|
17/config.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": "
|
3 |
"activation_function": "gelu_new",
|
4 |
"architectures": [
|
5 |
"GPT2LMHeadModel"
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "/content/drive/MyDrive/model__weights",
|
3 |
"activation_function": "gelu_new",
|
4 |
"architectures": [
|
5 |
"GPT2LMHeadModel"
|
17/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 500941440
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e39686188a07e05ea4860c12df5bb451c630233ccebeee26dc24a4c3219b3b53
|
3 |
size 500941440
|
app.py
CHANGED
@@ -1,3 +1,50 @@
|
|
1 |
import streamlit as st
|
2 |
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
+
static_toxicity_path = "https://imagizer.imageshack.com/v2/480x360q70/r/924/L4Ditq.jpg"
|
4 |
+
animated_toxicity_path = (
|
5 |
+
"https://i.kym-cdn.com/photos/images/original/001/264/967/cdc.gif"
|
6 |
+
)
|
7 |
+
animated_enlighten_path = "https://gifdb.com/images/high/zen-meditation-chakras-illustration-6lujnenasnfmn8dt.gif"
|
8 |
+
static_enlighten_path = "https://imagizer.imageshack.com/v2/668x500q70/r/922/bpoy6G.jpg"
|
9 |
+
|
10 |
+
# Calculate the column widths dynamically
|
11 |
+
|
12 |
+
|
13 |
+
toxicity_html = f"""
|
14 |
+
<div class="toxicity-image-container">
|
15 |
+
<a href="review_predictor" target="_self" class="toxicity-link">
|
16 |
+
<img src="{static_toxicity_path}" class="toxicity-image" />
|
17 |
+
</a>
|
18 |
+
</div>
|
19 |
+
<style>
|
20 |
+
/* Define the hover state for column 1 */
|
21 |
+
.toxicity-image-container:hover .toxicity-image {{
|
22 |
+
content: url("{animated_toxicity_path}");
|
23 |
+
transform: scale(1.1); /* Enlarge the image by 10% */
|
24 |
+
transition: transform 0.5s ease; /* Add smooth transition */
|
25 |
+
}}
|
26 |
+
</style>
|
27 |
+
"""
|
28 |
+
|
29 |
+
enlighten_html = f"""
|
30 |
+
<div class="enlighten-image-container">
|
31 |
+
<a href="text_generator" target="_self" class="enlighten-link">
|
32 |
+
<img src="{static_enlighten_path}" class="enlighten-image" />
|
33 |
+
</a>
|
34 |
+
</div>
|
35 |
+
<style>
|
36 |
+
/* Define the hover state for column 2 */
|
37 |
+
.enlighten-image-container:hover .enlighten-image {{
|
38 |
+
content: url("{animated_enlighten_path}");
|
39 |
+
transform: scale(1.1); /* Enlarge the image by 10% */
|
40 |
+
transition: transform 0.5s ease; /* Add smooth transition */
|
41 |
+
}}
|
42 |
+
</style>
|
43 |
+
"""
|
44 |
+
|
45 |
+
# Display HTML code with Streamlit
|
46 |
+
st.markdown(toxicity_html, unsafe_allow_html=True)
|
47 |
+
st.markdown(enlighten_html, unsafe_allow_html=True)
|
48 |
+
|
49 |
+
|
50 |
+
# Display JavaScript code with Streamlit
|
enlightened_static.jpg
ADDED
model/funcs.py
CHANGED
@@ -1,10 +1,27 @@
|
|
|
|
|
|
|
|
1 |
import matplotlib.pyplot as plt
|
|
|
2 |
import torch
|
3 |
import torch.nn as nn
|
4 |
from sklearn.metrics import f1_score
|
5 |
from torch.utils.data import Dataset
|
6 |
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
|
9 |
# Создаем объекты для токенизатора и модели
|
10 |
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
|
@@ -98,6 +115,7 @@ def train_model(
|
|
98 |
return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
|
99 |
|
100 |
|
|
|
101 |
def predict_sentiment(text, model, tokenizer, DEVICE):
|
102 |
# Модель должна быть в режиме оценки
|
103 |
model.eval()
|
|
|
1 |
+
import time
|
2 |
+
from functools import wraps
|
3 |
+
|
4 |
import matplotlib.pyplot as plt
|
5 |
+
import streamlit as st
|
6 |
import torch
|
7 |
import torch.nn as nn
|
8 |
from sklearn.metrics import f1_score
|
9 |
from torch.utils.data import Dataset
|
10 |
|
11 |
|
12 |
+
def execution_time(func):
|
13 |
+
@wraps(func)
|
14 |
+
def wrapper(*args, **kwargs):
|
15 |
+
start_time = time.time()
|
16 |
+
result = func(*args, **kwargs)
|
17 |
+
end_time = time.time()
|
18 |
+
execution_seconds = end_time - start_time
|
19 |
+
st.write(f"Model calculating time = {execution_seconds:.5f} seconds")
|
20 |
+
return result
|
21 |
+
|
22 |
+
return wrapper
|
23 |
+
|
24 |
+
|
25 |
def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
|
26 |
# Создаем объекты для токенизатора и модели
|
27 |
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
|
|
|
115 |
return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
|
116 |
|
117 |
|
118 |
+
@execution_time
|
119 |
def predict_sentiment(text, model, tokenizer, DEVICE):
|
120 |
# Модель должна быть в режиме оценки
|
121 |
model.eval()
|
model/model_weights.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38d0d9dfdc648de05fb1bd62dab307a558d045305c5fd4700331a0967ea5e1b5
|
3 |
+
size 50647220
|
preprocessing.py
DELETED
@@ -1,30 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
import string
|
3 |
-
|
4 |
-
import nltk
|
5 |
-
import pymorphy2
|
6 |
-
from nltk.tokenize import word_tokenize
|
7 |
-
|
8 |
-
nltk.download("punkt")
|
9 |
-
|
10 |
-
|
11 |
-
def clean_text(text: str) -> str:
|
12 |
-
text = text.lower()
|
13 |
-
text = re.sub(r"\w*(\w)\1{2,}\w*", "", text)
|
14 |
-
text = re.sub(r"\d+\w*", "", text)
|
15 |
-
text = re.sub(r"\[.*?\]", "", text)
|
16 |
-
text = text.translate(str.maketrans("", "", string.punctuation))
|
17 |
-
return text
|
18 |
-
|
19 |
-
|
20 |
-
def lemmize_and_tokenize_text(text: str) -> list[str]:
|
21 |
-
morph = pymorphy2.MorphAnalyzer()
|
22 |
-
tokens = word_tokenize(text)
|
23 |
-
lemmas = [morph.parse(token)[0].normal_form for token in tokens]
|
24 |
-
return lemmas
|
25 |
-
|
26 |
-
|
27 |
-
def data_preprocessing(text: str) -> list[str]:
|
28 |
-
cleaned_text = clean_text(text)
|
29 |
-
lemmized_text = lemmize_and_tokenize_text(cleaned_text)
|
30 |
-
return lemmized_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static_toxic.jpg
ADDED