Spaces:
Sleeping
Sleeping
grapplerulrich
committed on
Use transformer tokenizer to make chunks
Browse files
Based off https://gist.github.com/saprativa/b5cb639e0c035876e0dd3c46e5a380fd
Replaces rudimentary and inaccurate method
app.py
CHANGED
@@ -5,7 +5,7 @@ import json
|
|
5 |
import streamlit as st
|
6 |
from googleapiclient.discovery import build
|
7 |
from slugify import slugify
|
8 |
-
from transformers import pipeline
|
9 |
import uuid
|
10 |
import spacy
|
11 |
from spacy.matcher import PhraseMatcher
|
@@ -93,7 +93,7 @@ def get_summary( url, keywords ):
|
|
93 |
content = prep_chunks_summary( strings, keywords )
|
94 |
# Save content to cache file.
|
95 |
with open( content_cache, 'w' ) as file:
|
96 |
-
print(content, file=file)
|
97 |
|
98 |
max_lenth = 200
|
99 |
# Rudementary method to count number of tokens in a chunk.
|
@@ -178,25 +178,25 @@ def filter_sentences_by_keywords( strings, keywords ):
|
|
178 |
|
179 |
return sentences
|
180 |
|
181 |
-
def split_content_into_chunks( sentences ):
|
182 |
"""
|
183 |
Split content into chunks.
|
184 |
"""
|
185 |
-
|
186 |
-
|
187 |
chunks = []
|
188 |
-
# Loop through sentences and split into chunks.
|
189 |
for sentence in sentences:
|
190 |
-
#
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
194 |
chunks.append(chunk)
|
195 |
chunk = '' # Reset chunk.
|
196 |
-
|
197 |
|
198 |
# Add sentence to chunk.
|
199 |
-
|
200 |
chunk += sentence + ' '
|
201 |
|
202 |
chunks.append(chunk)
|
@@ -208,29 +208,37 @@ def prep_chunks_summary( strings, keywords ):
|
|
208 |
Chunk summary.
|
209 |
"""
|
210 |
try:
|
|
|
|
|
|
|
|
|
211 |
sentences = filter_sentences_by_keywords( strings, keywords )
|
212 |
-
chunks
|
213 |
|
|
|
214 |
number_of_chunks = len( chunks )
|
215 |
# Loop through chunks if there are more than one.
|
216 |
if number_of_chunks > 1:
|
217 |
-
# Calculate the max summary length based on the number of chunks so that the final combined text is not longer than
|
218 |
-
max_length = int(
|
219 |
|
220 |
-
content = ''
|
221 |
# Loop through chunks and generate summary.
|
222 |
for chunk in chunks:
|
223 |
-
#
|
224 |
-
chunk_length = len(
|
225 |
# If chunk is shorter than max length, divide chunk length by 2.
|
226 |
if chunk_length < max_length:
|
227 |
max_length = int( chunk_length / 2 )
|
228 |
|
229 |
# Generate summary for chunk.
|
230 |
-
|
|
|
|
|
|
|
231 |
for summary in chunk_summary:
|
232 |
content += summary['summary_text'] + ' '
|
233 |
-
|
|
|
234 |
content = chunks[0]
|
235 |
|
236 |
return content
|
|
|
5 |
import streamlit as st
|
6 |
from googleapiclient.discovery import build
|
7 |
from slugify import slugify
|
8 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
9 |
import uuid
|
10 |
import spacy
|
11 |
from spacy.matcher import PhraseMatcher
|
|
|
93 |
content = prep_chunks_summary( strings, keywords )
|
94 |
# Save content to cache file.
|
95 |
with open( content_cache, 'w' ) as file:
|
96 |
+
print(content.strip(), file=file)
|
97 |
|
98 |
max_lenth = 200
|
99 |
# Rudementary method to count number of tokens in a chunk.
|
|
|
178 |
|
179 |
return sentences
|
180 |
|
181 |
+
def split_content_into_chunks( sentences, tokenizer ):
|
182 |
"""
|
183 |
Split content into chunks.
|
184 |
"""
|
185 |
+
combined_length = 0
|
186 |
+
chunk = ""
|
187 |
chunks = []
|
|
|
188 |
for sentence in sentences:
|
189 |
+
# Lenth of tokens in sentence.
|
190 |
+
length = len( tokenizer.tokenize( sentence ) )
|
191 |
+
|
192 |
+
# If the combined token length plus the current sentence is larger then max length, start a new chunk.
|
193 |
+
if combined_length + length > tokenizer.max_len_single_sentence:
|
194 |
chunks.append(chunk)
|
195 |
chunk = '' # Reset chunk.
|
196 |
+
combined_length = 0 # Reset token length.
|
197 |
|
198 |
# Add sentence to chunk.
|
199 |
+
combined_length += length
|
200 |
chunk += sentence + ' '
|
201 |
|
202 |
chunks.append(chunk)
|
|
|
208 |
Chunk summary.
|
209 |
"""
|
210 |
try:
|
211 |
+
checkpoint = "sshleifer/distilbart-cnn-12-6"
|
212 |
+
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
213 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
|
214 |
+
|
215 |
sentences = filter_sentences_by_keywords( strings, keywords )
|
216 |
+
chunks = split_content_into_chunks( sentences, tokenizer )
|
217 |
|
218 |
+
content = ''
|
219 |
number_of_chunks = len( chunks )
|
220 |
# Loop through chunks if there are more than one.
|
221 |
if number_of_chunks > 1:
|
222 |
+
# Calculate the max summary length based on the number of chunks so that the final combined text is not longer than max tokens.
|
223 |
+
max_length = int( tokenizer.max_len_single_sentence / number_of_chunks )
|
224 |
|
|
|
225 |
# Loop through chunks and generate summary.
|
226 |
for chunk in chunks:
|
227 |
+
# Number of tokens in a chunk.
|
228 |
+
chunk_length = len( tokenizer.tokenize( chunk ) )
|
229 |
# If chunk is shorter than max length, divide chunk length by 2.
|
230 |
if chunk_length < max_length:
|
231 |
max_length = int( chunk_length / 2 )
|
232 |
|
233 |
# Generate summary for chunk.
|
234 |
+
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
|
235 |
+
# https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
|
236 |
+
chunk_summary = summarizer(chunk, max_length, min_length=10, do_sample=False, truncation=True)
|
237 |
+
|
238 |
for summary in chunk_summary:
|
239 |
content += summary['summary_text'] + ' '
|
240 |
+
|
241 |
+
elif number_of_chunks == 1:
|
242 |
content = chunks[0]
|
243 |
|
244 |
return content
|