grapplerulrich committed
Commit 5cad0cc · 1 Parent(s): dc5c663

Attempt on batch processing
Files changed:
- .gitignore +1 -0
- README.md +5 -4
- app.py +99 -36
- requirements.txt +2 -0
.gitignore
CHANGED
@@ -6,3 +6,4 @@ __pycache__
 /page-content
 /summaries
 /.streamlit
+/transformer
README.md
CHANGED
@@ -34,14 +34,15 @@ google_search_engine_id = "search-engine-id"
 - To start the interface: `streamlit run app.py`
 
 ### Todo
-- [x] Fix issue of duplicate content extracted by beautifulsoup.
-- [x] Exclude code from content
 - [ ] Improve fetched content.
+- [x] Fix issue of duplicate content extracted by beautifulsoup.
+- [x] Exclude code from content
 - [x] Find sentences that contain the search keywords.
 - [ ] Find sentences that contain the search keywords taking into account different spellings health care vs healthcare.
 - [ ] Get some content from every search result.
-- [ ] Div's with text & tags. Extract text from tags and then decompose the tags.
+- [ ] Div's with text & tags. Extract text from tags and then decompose the tags. Keep order of content and no duplicates.
 - [ ] Summarization requires truncation. Find solution where not needed.
-- [ ] Support German content.
+- [ ] Support German content with language switcher.
 - [ ] Improve queries to include more keywords (Expand abrivations & define context)
 - [ ] Control the number of results from the UI.
+- [ ] Control summary length via settings: https://docs.streamlit.io/library/advanced-features/session-state
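The last new todo item points at Streamlit session state for a configurable summary length. A minimal sketch of how such a setting could look (the widget label and session key are assumptions, not part of this commit):

```python
import streamlit as st

# Hypothetical settings control for the "control summary length" todo:
# a sidebar slider whose value persists in st.session_state across reruns.
st.sidebar.slider(
    "Summary length (tokens)",
    min_value=30,
    max_value=200,
    value=130,
    key="summary_max_length",
)

max_length = st.session_state["summary_max_length"]
# ...later passed to generate_summary() defined in app.py.
```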
app.py
CHANGED
@@ -7,14 +7,18 @@ from googleapiclient.discovery import build
 from slugify import slugify
 from transformers import pipeline
 import uuid
+import spacy
+from spacy.matcher import PhraseMatcher
 
 from beautiful_soup.beautiful_soup import get_url_content
 
-"""
-Request Google Search API with query and return results.
-"""
+
 @cache
 def google_search_api_request( query ):
+    """
+    Request Google Search API with query and return results.
+    """
+
     api_key = st.secrets["google_search_api_key"]
     cx = st.secrets["google_search_engine_id"]
     service = build(
@@ -35,10 +39,11 @@ def google_search_api_request( query ):
         fields='items(title,link),searchInformation(totalResults)'
     ).execute()
 
-"""
-Request Google Search API with query and return results. Results are cached in files.
-"""
+
 def search_results( query ):
+    """
+    Request Google Search API with query and return results. Results are cached in files.
+    """
     file_path = 'search-results/' + slugify( query ) + '.json'
 
     results = []
@@ -58,54 +63,96 @@ def search_results( query ):
 
     return results
 
-"""
-Generate summary for content.
-"""
-def generate_summary( url_id, content ):
+def get_summary( url_id, content ):
     file_path = 'summaries/' + url_id + '.json'
     makedirs(dirname(file_path), exist_ok=True)
     if exists( file_path ):
         with open( file_path, 'r' ) as file:
             summary = json.load( file )
     else:
-        try:
-            summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
-            # https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
-            summary = summarizer(content, max_length=130, min_length=30, do_sample=False, truncation=True)
-        except Exception as exception:
-            raise exception
+        summary = generate_summary( content )
 
         with open( file_path, 'w' ) as file:
             json.dump( summary, file )
 
     return summary
 
-
-
-
+def generate_summary( content, max_length = 200 ):
+    """
+    Generate summary for content.
+    """
+    try:
+        summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+        # https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
+        summary = summarizer(content, max_length, min_length=30, do_sample=False, truncation=True)
+    except Exception as exception:
+        raise exception
+
+    return summary
+
 def exception_notice( exception ):
+    """
+    Helper function for exception notices.
+    """
     query_params = st.experimental_get_query_params()
     if 'debug' in query_params.keys() and query_params['debug'][0] == 'true':
         st.exception(exception)
     else:
         st.warning(str(exception))
 
-"""
-Checks if string contains keyword.
-"""
 def is_keyword_in_string( keywords, string ):
+    """
+    Checks if string contains keyword.
+    """
    for keyword in keywords:
        if keyword in string:
            return True
    return False
 
-def
-
+def filter_sentences_by_keywords( strings, keywords ):
+    nlp = spacy.load("en_core_web_sm")
+    matcher = PhraseMatcher(nlp.vocab)
+    phrases = keywords
+    patterns = [nlp(phrase) for phrase in phrases]
+    matcher.add("QueryList", patterns)
+
+    sentences = []
     for string in strings:
-        #
-
-
-
+        # Exclude short sentences
+        string_length = len( string.split(' ') )
+        if string_length < 5:
+            continue
+        doc = nlp(string)
+        for sentence in doc.sents:
+            matches = matcher(nlp(sentence.text))
+            for match_id, start, end in matches:
+                if nlp.vocab.strings[match_id] in ["QueryList"]:
+                    sentences.append(sentence.text)
+
+    return sentences
+
+def split_content_into_chunks( sentences ):
+    """
+    Split content into chunks.
+    """
+    chunk = ''
+    word_count = 0
+    chunks = []
+    for sentence in sentences:
+        current_word_count = len(sentence.split(' '))
+        if word_count + current_word_count > 512:
+            st.write("Number of words(tokens): {}".format(word_count))
+            chunks.append(chunk)
+            chunk = ''
+            word_count = 0
+
+        word_count += current_word_count
+        chunk += sentence + ' '
+
+    st.write("Number of words(tokens): {}".format(word_count))
+    chunks.append(chunk)
+
+    return chunks
 
 def main():
     st.title('Racoon Search')
@@ -140,14 +187,30 @@ def main():
         st.markdown('### ' + result['title'])
         url_id = uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex
         try:
-            strings
-            keywords
-
-
-
-
-
-
+            strings = get_url_content( result['link'] )
+            keywords = query.split(' ')
+            sentences = filter_sentences_by_keywords( strings, keywords )
+            chunks = split_content_into_chunks( sentences )
+
+            number_of_chunks = len( chunks )
+            if number_of_chunks > 1:
+                max_length = int( 512 / len( chunks ) )
+                st.write("Max length: {}".format(max_length))
+
+                content = ''
+                for chunk in chunks:
+                    chunk_length = len( chunk.split(' ') )
+                    chunk_max_length = 200
+                    if chunk_length < max_length:
+                        chunk_max_length = int( chunk_length / 2 )
+                    chunk_summary = generate_summary( chunk, min( max_length, chunk_max_length ) )
+                    for summary in chunk_summary:
+                        content += summary['summary_text'] + ' '
+            else:
+                content = chunks[0]
+
+            summary = get_summary( url_id, content )
+
         except Exception as exception:
             exception_notice(exception)
 
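Taken together, the new helpers implement the batch-processing attempt: page strings are filtered down to sentences containing the query keywords, grouped into chunks of at most ~512 words (roughly the summarizer's input budget), each chunk is summarized with a proportionally smaller max_length, and the concatenated chunk summaries are passed to get_summary() for a final, cached summary. A rough usage sketch, assuming these functions can be imported from app.py without side effects; the sample strings and query are invented for illustration:

```python
from app import filter_sentences_by_keywords, split_content_into_chunks, generate_summary

query = "health care costs"
strings = [
    "Health care costs have risen steadily over the last decade.",
    "This sentence is unrelated and would be filtered out.",
]

# 1. Keep only sentences that match the query keywords.
sentences = filter_sentences_by_keywords(strings, query.split(' '))

# 2. Group the kept sentences into chunks of at most ~512 words.
chunks = split_content_into_chunks(sentences)

# 3. Summarize each chunk, capping max_length so the combined chunk
#    summaries still fit into one final summarization pass.
max_length = int(512 / len(chunks)) if len(chunks) > 1 else 200
chunk_summaries = [generate_summary(chunk, max_length) for chunk in chunks]
```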
requirements.txt
CHANGED
@@ -3,3 +3,5 @@ google-api-python-client
 beautifulsoup4
 python-slugify
 transformers[sentencepiece,torch]
+spacy
+https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
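The added wheel URL pins the spaCy English model so that spacy.load("en_core_web_sm") resolves inside the Space; when installing elsewhere, running `python -m spacy download en_core_web_sm` after `pip install -r requirements.txt` is the usual equivalent.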