zmbfeng committed on
Commit
40ddb2b
1 Parent(s): 4c2c5b7

encode sentence extracted

Files changed (1)
  1. app.py  +13 -9
app.py CHANGED
@@ -78,6 +78,15 @@ if 'is_initialized' not in st.session_state:
     st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased").to('cuda')
     st.session_state.roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
     st.session_state.roberta_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
+def encode_sentence(sentence):
+    if len(sentence.strip()) < 4:
+        return None
+
+    sentence_tokens = st.session_state.bert_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(
+        'cuda')
+    with torch.no_grad():
+        sentence_encoding = st.session_state.bert_model(**sentence_tokens).last_hidden_state[:, 0, :].cpu().numpy()
+    return sentence_encoding

 if 'list_count' in st.session_state:
     st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
@@ -96,15 +105,10 @@ if 'list_count' in st.session_state:
         paragraph_without_newline = paragraph['paragraph'].replace("\n", "")
         sentences = sent_tokenize(paragraph_without_newline)
         for sentence in sentences:
-            if sentence.strip().endswith('?'):
-                sentence_encodings.append(None)
-                continue
-            if len(sentence.strip()) < 4:
-                sentence_encodings.append(None)
-                continue
-            sentence_tokens = st.session_state.bert_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to('cuda')
-            with torch.no_grad():
-                sentence_encoding = st.session_state.bert_model(**sentence_tokens).last_hidden_state[:, 0, :].cpu().numpy()
+            # if sentence.strip().endswith('?'):
+            #     sentence_encodings.append(None)
+            #     continue
+            sentence_encoding = encode_sentence(sentence)
             sentence_encodings.append([sentence, sentence_encoding])
             # sentence_encodings.append([sentence, bert_model(**sentence_tokens).last_hidden_state[:, 0, :].detach().numpy()])
         st.session_state.paragraph_sentence_encodings.append([paragraph, sentence_encodings])
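
For reference, a minimal standalone sketch of the extracted helper, runnable outside Streamlit. Assumptions not in this commit: the tokenizer and model live in module globals rather than st.session_state, AutoTokenizer stands in for whatever populated st.session_state.bert_tokenizer, and the device is chosen at runtime instead of the hard-coded 'cuda'.

# Sketch of encode_sentence from this commit, with the session-state
# plumbing replaced by module globals (an assumption, not part of the diff).
import nltk
import torch
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, BertModel

nltk.download("punkt", quiet=True)  # tokenizer data needed by sent_tokenize

device = "cuda" if torch.cuda.is_available() else "cpu"  # the diff hard-codes 'cuda'
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)

def encode_sentence(sentence):
    # Same guard as the commit: fragments under 4 characters get no encoding.
    if len(sentence.strip()) < 4:
        return None
    tokens = bert_tokenizer(sentence, return_tensors="pt",
                            padding=True, truncation=True).to(device)
    with torch.no_grad():
        # [CLS] embedding from the last hidden layer as the sentence vector.
        return bert_model(**tokens).last_hidden_state[:, 0, :].cpu().numpy()

paragraph = "BERT maps each sentence to one vector. Ok. Short fragments return None."
sentence_encodings = [[s, encode_sentence(s)] for s in sent_tokenize(paragraph)]

One behavioral note: the old loop appended a bare None and skipped the pair for questions and for short sentences, while after this commit the question check is commented out and short sentences are stored as [sentence, None] pairs, so code that consumes sentence_encodings should handle a None encoding.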