grapplerulrich committed on
Commit
5cad0cc
·
1 Parent(s): dc5c663

Attempt at batch processing

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +5 -4
  3. app.py +99 -36
  4. requirements.txt +2 -0
.gitignore CHANGED
@@ -6,3 +6,4 @@ __pycache__
6
  /page-content
7
  /summaries
8
  /.streamlit
 
 
6
  /page-content
7
  /summaries
8
  /.streamlit
9
+ /transformer
README.md CHANGED
@@ -34,14 +34,15 @@ google_search_engine_id = "search-engine-id"
34
  - To start the interface: `streamlit run app.py`
35
 
36
  ### Todo
37
- - [x] Fix issue of duplicate content extracted by beautifulsoup.
38
- - [x] Exclude code from content
39
  - [ ] Improve fetched content.
 
 
40
  - [x] Find sentences that contain the search keywords.
41
  - [ ] Find sentences that contain the search keywords, taking into account different spellings (e.g. "health care" vs. "healthcare").
42
  - [ ] Get some content from every search result.
43
- - [ ] Divs with text & tags: extract the text from the tags and then decompose the tags.
44
  - [ ] Summarization requires truncation. Find solution where not needed.
45
- - [ ] Support German content.
46
  - [ ] Improve queries to include more keywords (expand abbreviations & define context)
47
  - [ ] Control the number of results from the UI.
 
 
34
  - To start the interface: `streamlit run app.py`
35
 
36
  ### Todo
 
 
37
  - [ ] Improve fetched content.
38
+ - [x] Fix issue of duplicate content extracted by beautifulsoup.
39
+ - [x] Exclude code from content
40
  - [x] Find sentences that contain the search keywords.
41
  - [ ] Find sentences that contain the search keywords, taking into account different spellings (e.g. "health care" vs. "healthcare").
42
  - [ ] Get some content from every search result.
43
+ - [ ] Divs with text & tags: extract the text from the tags and then decompose the tags. Keep the order of the content and avoid duplicates.
44
  - [ ] Summarization requires truncation. Find solution where not needed.
45
+ - [ ] Support German content with language switcher.
46
  - [ ] Improve queries to include more keywords (expand abbreviations & define context)
47
  - [ ] Control the number of results from the UI.
48
+ - [ ] Control summary length via settings: https://docs.streamlit.io/library/advanced-features/session-state
app.py CHANGED
@@ -7,14 +7,18 @@ from googleapiclient.discovery import build
7
  from slugify import slugify
8
  from transformers import pipeline
9
  import uuid
 
 
10
 
11
  from beautiful_soup.beautiful_soup import get_url_content
12
 
13
- """
14
- Request Google Search API with query and return results.
15
- """
16
  @cache
17
  def google_search_api_request( query ):
 
 
 
 
18
  api_key = st.secrets["google_search_api_key"]
19
  cx = st.secrets["google_search_engine_id"]
20
  service = build(
@@ -35,10 +39,11 @@ def google_search_api_request( query ):
35
  fields='items(title,link),searchInformation(totalResults)'
36
  ).execute()
37
 
38
- """
39
- Request Google Search API with query and return results. Results are cached in files.
40
- """
41
  def search_results( query ):
 
 
 
42
  file_path = 'search-results/' + slugify( query ) + '.json'
43
 
44
  results = []
@@ -58,54 +63,96 @@ def search_results( query ):
58
 
59
  return results
60
 
61
- """
62
- Generate summary for content.
63
- """
64
- def generate_summary( url_id, content ):
65
  file_path = 'summaries/' + url_id + '.json'
66
  makedirs(dirname(file_path), exist_ok=True)
67
  if exists( file_path ):
68
  with open( file_path, 'r' ) as file:
69
  summary = json.load( file )
70
  else:
71
- try:
72
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
73
- # https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
74
- summary = summarizer(content, max_length=130, min_length=30, do_sample=False, truncation=True)
75
- except Exception as exception:
76
- raise exception
77
 
78
  with open( file_path, 'w' ) as file:
79
  json.dump( summary, file )
80
 
81
  return summary
82
 
83
- """
84
- Helper function for exception notices.
85
- """
 
 
 
 
 
 
 
 
 
 
86
  def exception_notice( exception ):
 
 
 
87
  query_params = st.experimental_get_query_params()
88
  if 'debug' in query_params.keys() and query_params['debug'][0] == 'true':
89
  st.exception(exception)
90
  else:
91
  st.warning(str(exception))
92
 
93
- """
94
- Checks if string contains keyword.
95
- """
96
  def is_keyword_in_string( keywords, string ):
 
 
 
97
  for keyword in keywords:
98
  if keyword in string:
99
  return True
100
  return False
101
 
102
- def filter_strings_by_keywords( strings, keywords ):
103
- content = ''
 
 
 
 
 
 
104
  for string in strings:
105
- # Filter strings with keywords
106
- if is_keyword_in_string( keywords, string ):
107
- content += string + '\n'
108
- return content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  def main():
111
  st.title('Racoon Search')
@@ -140,14 +187,30 @@ def main():
140
  st.markdown('### ' + result['title'])
141
  url_id = uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex
142
  try:
143
- strings = get_url_content( result['link'] )
144
- keywords = query.split(' ')
145
- content = filter_strings_by_keywords( strings, keywords )
146
- # print(content)
147
- # print(len(content.split()))
148
- summary = generate_summary( url_id, content )
149
- for sentence in summary:
150
- st.write(sentence['summary_text'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  except Exception as exception:
152
  exception_notice(exception)
153
 
 
7
  from slugify import slugify
8
  from transformers import pipeline
9
  import uuid
10
+ import spacy
11
+ from spacy.matcher import PhraseMatcher
12
 
13
  from beautiful_soup.beautiful_soup import get_url_content
14
 
15
+
 
 
16
  @cache
17
  def google_search_api_request( query ):
18
+ """
19
+ Request Google Search API with query and return results.
20
+ """
21
+
22
  api_key = st.secrets["google_search_api_key"]
23
  cx = st.secrets["google_search_engine_id"]
24
  service = build(
 
39
  fields='items(title,link),searchInformation(totalResults)'
40
  ).execute()
41
 
42
+
 
 
43
  def search_results( query ):
44
+ """
45
+ Request Google Search API with query and return results. Results are cached in files.
46
+ """
47
  file_path = 'search-results/' + slugify( query ) + '.json'
48
 
49
  results = []
 
63
 
64
  return results
65
 
66
+ def get_summary( url_id, content ):
 
 
 
67
  file_path = 'summaries/' + url_id + '.json'
68
  makedirs(dirname(file_path), exist_ok=True)
69
  if exists( file_path ):
70
  with open( file_path, 'r' ) as file:
71
  summary = json.load( file )
72
  else:
73
+ summary = generate_summary( content )
 
 
 
 
 
74
 
75
  with open( file_path, 'w' ) as file:
76
  json.dump( summary, file )
77
 
78
  return summary
79
 
80
+ def generate_summary( content, max_length = 200 ):
81
+ """
82
+ Generate summary for content.
83
+ """
84
+ try:
85
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
86
+ # https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
87
+ summary = summarizer(content, max_length, min_length=30, do_sample=False, truncation=True)
88
+ except Exception as exception:
89
+ raise exception
90
+
91
+ return summary
92
+
93
  def exception_notice( exception ):
94
+ """
95
+ Helper function for exception notices.
96
+ """
97
  query_params = st.experimental_get_query_params()
98
  if 'debug' in query_params.keys() and query_params['debug'][0] == 'true':
99
  st.exception(exception)
100
  else:
101
  st.warning(str(exception))
102
 
 
 
 
103
  def is_keyword_in_string( keywords, string ):
104
+ """
105
+ Checks if string contains keyword.
106
+ """
107
  for keyword in keywords:
108
  if keyword in string:
109
  return True
110
  return False
111
 
112
+ def filter_sentences_by_keywords( strings, keywords ):
113
+ nlp = spacy.load("en_core_web_sm")
114
+ matcher = PhraseMatcher(nlp.vocab)
115
+ phrases = keywords
116
+ patterns = [nlp(phrase) for phrase in phrases]
117
+ matcher.add("QueryList", patterns)
118
+
119
+ sentences = []
120
  for string in strings:
121
+ # Exclude short sentences
122
+ string_length = len( string.split(' ') )
123
+ if string_length < 5:
124
+ continue
125
+ doc = nlp(string)
126
+ for sentence in doc.sents:
127
+ matches = matcher(nlp(sentence.text))
128
+ for match_id, start, end in matches:
129
+ if nlp.vocab.strings[match_id] in ["QueryList"]:
130
+ sentences.append(sentence.text)
131
+
132
+ return sentences
133
+
134
+ def split_content_into_chunks( sentences ):
135
+ """
136
+ Split content into chunks.
137
+ """
138
+ chunk = ''
139
+ word_count = 0
140
+ chunks = []
141
+ for sentence in sentences:
142
+ current_word_count = len(sentence.split(' '))
143
+ if word_count + current_word_count > 512:
144
+ st.write("Number of words(tokens): {}".format(word_count))
145
+ chunks.append(chunk)
146
+ chunk = ''
147
+ word_count = 0
148
+
149
+ word_count += current_word_count
150
+ chunk += sentence + ' '
151
+
152
+ st.write("Number of words(tokens): {}".format(word_count))
153
+ chunks.append(chunk)
154
+
155
+ return chunks
156
 
157
  def main():
158
  st.title('Racoon Search')
 
187
  st.markdown('### ' + result['title'])
188
  url_id = uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex
189
  try:
190
+ strings = get_url_content( result['link'] )
191
+ keywords = query.split(' ')
192
+ sentences = filter_sentences_by_keywords( strings, keywords )
193
+ chunks = split_content_into_chunks( sentences )
194
+
195
+ number_of_chunks = len( chunks )
196
+ if number_of_chunks > 1:
197
+ max_length = int( 512 / len( chunks ) )
198
+ st.write("Max length: {}".format(max_length))
199
+
200
+ content = ''
201
+ for chunk in chunks:
202
+ chunk_length = len( chunk.split(' ') )
203
+ chunk_max_length = 200
204
+ if chunk_length < max_length:
205
+ chunk_max_length = int( chunk_length / 2 )
206
+ chunk_summary = generate_summary( chunk, min( max_length, chunk_max_length ) )
207
+ for summary in chunk_summary:
208
+ content += summary['summary_text'] + ' '
209
+ else:
210
+ content = chunks[0]
211
+
212
+ summary = get_summary( url_id, content )
213
+
214
  except Exception as exception:
215
  exception_notice(exception)
216
 
requirements.txt CHANGED
@@ -3,3 +3,5 @@ google-api-python-client
3
  beautifulsoup4
4
  python-slugify
5
  transformers[sentencepiece,torch]
 
 
 
3
  beautifulsoup4
4
  python-slugify
5
  transformers[sentencepiece,torch]
6
+ spacy
7
+ https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl