Update app.py
app.py CHANGED
@@ -6,9 +6,11 @@ import spacy
 import re
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
-from tensorflow.keras.preprocessing.text import Tokenizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
+import requests
+import pickle
 
+# Download necessary resources
 import spacy.cli
 spacy.cli.download("en_core_web_sm")
 nltk.download('punkt_tab')
@@ -17,7 +19,6 @@ stop_words = set(stopwords.words('english'))
 nlp = spacy.load('en_core_web_sm')
 
 # Download the model file from Hugging Face
-import requests
 model_url = "https://huggingface.co/Zmorell/HIPA_2/resolve/main/saved_keras_model.keras"
 local_model_path = "saved_keras_model.keras"
 
@@ -31,12 +32,17 @@ print(f"Model downloaded to {local_model_path}")
 model = tf.keras.models.load_model(local_model_path)
 print(f"Model loaded from {local_model_path}")
 
+# Load the tokenizer
+tokenizer_file_path = "tokenizer.pickle"
+with open(tokenizer_file_path, 'rb') as handle:
+    tokenizer = pickle.load(handle)
+
+print("Tokenizer loaded from tokenizer.pickle")
+
 def preprocess_text(text):
-    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
-    # Tokenize and remove stopwords
+    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
     tokens = word_tokenize(text.lower())
     tokens = [word for word in tokens if word not in stop_words]
-    # Lemmatize
     doc = nlp(' '.join(tokens))
     lemmas = [token.lemma_ for token in doc]
     return ' '.join(lemmas)
@@ -47,7 +53,6 @@ def predict(text):
     inputs = preprocess_text(text)
     print(f"Preprocessed text: {inputs}")
 
-    # Ensure the input shape matches what the model expects
     inputs = tokenizer.texts_to_sequences([inputs])
     print(f"Tokenized text: {inputs}")
 
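For context, a minimal sketch of how the pieces introduced in this commit (the pickled Keras tokenizer and the pad_sequences import) would typically come together before calling the model. The maxlen value of 100 and the final model.predict() call are assumptions for illustration; they are not part of this diff.

# Sketch only, not the app's actual predict path: load the artifacts this commit
# relies on and run one text through tokenization, padding, and the model.
# maxlen=100 and model.predict() are assumed here; the diff does not show them.
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

with open("tokenizer.pickle", "rb") as handle:
    tokenizer = pickle.load(handle)          # Keras Tokenizer fitted at training time

model = tf.keras.models.load_model("saved_keras_model.keras")

def predict(text):
    sequences = tokenizer.texts_to_sequences([text])   # words -> integer ids
    padded = pad_sequences(sequences, maxlen=100)       # assumed training sequence length
    return model.predict(padded)                        # e.g. shape (1, num_classes)

If the model was trained on fixed-length sequences, the maxlen passed to pad_sequences should match the length used at training time.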