bertugmirasyedi committed on
Commit
b402f97
·
1 Parent(s): 94b6bc9

Complete overhaul

Browse files
Files changed (1) hide show
  1. app.py +433 -239
app.py CHANGED
@@ -1,22 +1,33 @@
1
  from fastapi import FastAPI
 
2
 
3
  # Define the FastAPI app
4
  app = FastAPI(docs_url="/")
5
 
6
- @app.get("/search={query}&similarity={similarity}")
7
- def search(query, similarity=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import time
9
  import requests
10
 
11
  start_time = time.time()
12
 
13
- # Set the API endpoint and query parameters
14
- url = "https://www.googleapis.com/books/v1/volumes"
15
- params = {"q": str(query), "printType": "books", "maxResults": 1}
16
-
17
- # Send a GET request to the API with the specified parameters
18
- response = requests.get(url, params=params)
19
-
20
  # Initialize the lists to store the results
21
  titles = []
22
  authors = []
@@ -24,230 +35,255 @@ def search(query, similarity=False):
24
  descriptions = []
25
  images = []
26
 
27
- # Parse the response JSON and append the results
28
- data = response.json()
29
-
30
- for item in data["items"]:
31
- volume_info = item["volumeInfo"]
32
- try:
33
- titles.append(f"{volume_info['title']}: {volume_info['subtitle']}")
34
- except KeyError:
35
- titles.append(volume_info["title"])
36
-
37
- try:
38
- descriptions.append(volume_info["description"])
39
- except KeyError:
40
- descriptions.append("Null")
41
-
42
- try:
43
- publishers.append(volume_info["publisher"])
44
- except KeyError:
45
- publishers.append("Null")
46
-
47
- try:
48
- authors.append(volume_info["authors"][0])
49
- except KeyError:
50
- authors.append("Null")
51
-
52
- try:
53
- images.append(volume_info["imageLinks"]["thumbnail"])
54
- except KeyError:
55
- images.append(
56
- "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
57
- )
58
-
59
- ### Openalex ###
60
- import pyalex
61
- from pyalex import Works
62
-
63
- # Add email to the config
64
- pyalex.config.email = "[email protected]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
- # Define a pager object with the same query
67
- pager = Works().search(str(query)).paginate(per_page=1, n_max=1)
68
-
69
- # Generate a list of the results
70
- openalex_results = list(pager)
71
-
72
- # Get the titles, descriptions, and publishers and append them to the lists
73
- for result in openalex_results[0]:
74
- try:
75
- titles.append(result["title"])
76
- except KeyError:
77
- titles.append("Null")
78
-
79
- try:
80
- descriptions.append(result["abstract"])
81
- except KeyError:
82
- descriptions.append("Null")
83
-
84
- try:
85
- publishers.append(result["host_venue"]["publisher"])
86
- except KeyError:
87
- publishers.append("Null")
88
-
89
- try:
90
- authors.append(result["authorships"][0]["author"]["display_name"])
91
- except KeyError:
92
- authors.append("Null")
93
-
94
- images.append(
95
- "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
96
- )
97
-
98
- ### OpenAI ###
99
- import openai
100
-
101
- # Set the OpenAI API key
102
- openai.api_key = "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"
103
-
104
- # Create ChatGPT query
105
- chatgpt_response = openai.ChatCompletion.create(
106
- model="gpt-3.5-turbo",
107
- messages=[
108
- {
109
- "role": "system",
110
- "content": "You are a librarian. You are helping a patron find a book.",
111
- },
112
- {
113
- "role": "user",
114
- "content": f"Recommend me 1 books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
115
- },
116
- ],
117
- )
118
-
119
- # Split the response into a list of results
120
- chatgpt_results = chatgpt_response["choices"][0]["message"]["content"].split("\n")[
121
- 2::2
122
- ]
123
-
124
- # Define a function to parse the results
125
- def parse_result(result, ordered_keys=["Title", "Author", "Publisher", "Summary"]):
126
- # Create a dict to store the key-value pairs
127
- parsed_result = {}
128
-
129
- for key in ordered_keys:
130
- # Split the result string by the key and append the value to the list
131
- if key != ordered_keys[-1]:
132
- parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
133
- else:
134
- parsed_result[key] = result.split(f"{key}: ")[1]
135
-
136
- return parsed_result
137
-
138
- ordered_keys = ["Title", "Author", "Publisher", "Summary"]
139
-
140
- for result in chatgpt_results:
141
- try:
142
- # Parse the result
143
- parsed_result = parse_result(result, ordered_keys=ordered_keys)
144
-
145
- # Append the parsed result to the lists
146
- titles.append(parsed_result["Title"])
147
- authors.append(parsed_result["Author"])
148
- publishers.append(parsed_result["Publisher"])
149
- descriptions.append(parsed_result["Summary"])
150
  images.append(
151
  "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
152
  )
153
 
154
- # In case the OpenAI API hits the limit
155
- except IndexError:
156
- break
157
-
158
-
159
- ### Prediction ###
160
- from transformers import (
161
- AutoTokenizer,
162
- AutoModelForSeq2SeqLM,
163
- AutoModelForSequenceClassification,
164
- pipeline,
165
- )
166
- from sentence_transformers import SentenceTransformer, CrossEncoder
167
- from sentence_transformers.util import cos_sim, dot_score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
- # Load the classifiers
170
- # classifier = TextClassifier.load(
171
- # "trainers/deberta-v3-base-tasksource-nli/best-model.pt"
172
- # )
173
- # sentence_transformer = SentenceTransformer("all-MiniLM-L12-v2")
174
- # cross_encoder = CrossEncoder("cross-encoder/stsb-distilroberta-base")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  # Combine title, description, and publisher into a single string
177
  combined_data = [
178
- f"{title} {description} {publisher}"
179
  for title, description, publisher in zip(titles, descriptions, publishers)
180
  ]
181
 
182
- # Prepare the Sentence object
183
- # sentences = [
184
- # Sentence(doc, use_tokenizer=SegtokTokenizer()) for doc in combined_data
185
- # ]
186
-
187
- # Classify the sentences
188
- # classifier.predict(sentences)
189
-
190
- # Get the predicted labels
191
- # classes = [sentence.labels for sentence in sentences]
192
-
193
- # Define the summarizer model and tokenizer
194
- sum_tokenizer = AutoTokenizer.from_pretrained("lidiya/bart-base-samsum")
195
-
196
- # sum_model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-xsum-12-6")
197
- sum_model = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-base-samsum")
198
-
199
- summarizer_pipeline = pipeline(
200
- "summarization",
201
- model=sum_model,
202
- tokenizer=sum_tokenizer,
203
- batch_size=64,
204
- )
205
-
206
- # Define the zero-shot classifier
207
- zs_tokenizer = AutoTokenizer.from_pretrained(
208
- "sileod/deberta-v3-base-tasksource-nli"
209
- )
210
- # Quickfix for the tokenizer
211
- # zs_tokenizer.model_input_names = ["input_ids", "attention_mask"]
212
-
213
- zs_model = AutoModelForSequenceClassification.from_pretrained(
214
- "sileod/deberta-v3-base-tasksource-nli"
215
- )
216
- zs_classifier = pipeline(
217
- "zero-shot-classification",
218
- model=zs_model,
219
- tokenizer=zs_tokenizer,
220
- batch_size=64,
221
- hypothesis_template="This book is {}.",
222
- multi_label=True,
223
- )
224
-
225
- # Summarize the descriptions
226
- summaries = [
227
- summarizer_pipeline(description[0:1024])
228
- if (description != None)
229
- else [{"summary_text": "Null"}]
230
- for description in descriptions
231
- ]
232
-
233
- # Predict the level of the book
234
- candidate_labels = [
235
- "Introductory",
236
- "Advanced",
237
- "Academic",
238
- "Not Academic",
239
- "Manual",
240
- ]
241
-
242
- # Get the predicted labels
243
- classes = [zs_classifier(doc, candidate_labels) for doc in combined_data]
244
-
245
- # Calculate the elapsed time
246
- end_time = time.time()
247
- runtime = f"{end_time - start_time:.2f} seconds"
248
-
249
- # Calculate the similarity between the books
250
- if similarity:
251
  from sentence_transformers import util
252
 
253
  sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
@@ -255,36 +291,194 @@ def search(query, similarity=False):
255
  combined_data, convert_to_tensor=True
256
  )
257
 
 
 
 
258
  similar_books = []
259
- for i in range(len(titles)):
 
260
  current_embedding = book_embeddings[i]
261
 
 
262
  similarity_sorted = util.semantic_search(
263
- current_embedding, book_embeddings, top_k=20
264
  )
265
 
 
266
  similar_books.append(
267
  {
268
  "sorted_by_similarity": similarity_sorted[0][1:],
269
  }
270
  )
271
 
272
- # Create a list of dictionaries to store the results
273
- results = []
274
- for i in range(len(titles)):
275
- results.append(
276
- {
277
- "id": i,
278
- "title": titles[i],
279
- "author": authors[i],
280
- "publisher": publishers[i],
281
- "image_link": images[i],
282
- "labels": classes[i]["labels"][0:2],
283
- "label_confidences": classes[i]["scores"][0:2],
284
- "summary": summaries[i][0]["summary_text"],
285
- "similar_books": similar_books[i]["sorted_by_similarity"],
286
- "runtime": runtime,
287
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  )
289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  return results
 
1
  from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
 
4
  # Define the FastAPI app
5
  app = FastAPI(docs_url="/")
6
 
7
# Add the CORS middleware to the app.
#
# FastAPI's CORS documentation states that allow_origins, allow_methods and
# allow_headers cannot be ["*"] when allow_credentials is True; they must be
# listed explicitly in that case.  The /search endpoint visible in this file
# reads no cookies or Authorization header, so credentials are disabled and the
# wildcard (any origin may call the API) is kept.
# NOTE(review): if a front-end ever needs credentialed requests, re-enable
# allow_credentials and replace the wildcards with explicit values.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
15
+
16
+
17
+ @app.get("/search")
18
+ def search(
19
+ query: str,
20
+ classification: bool = True,
21
+ summarization: bool = True,
22
+ similarity: bool = False,
23
+ add_chatgpt_results: bool = True,
24
+ n_results: int = 10,
25
+ ):
26
  import time
27
  import requests
28
 
29
  start_time = time.time()
30
 
 
 
 
 
 
 
 
31
  # Initialize the lists to store the results
32
  titles = []
33
  authors = []
 
35
  descriptions = []
36
  images = []
37
 
38
def gbooks_search(query, n_results=30):
    """
    Access the Google Books API and return the results.

    Args:
        query: Search terms; converted to ``str`` before being sent.
        n_results: Number of volumes to request. Values above 40 are clamped,
            because the Google Books API rejects a larger ``maxResults``.

    Returns:
        A tuple ``(titles, authors, publishers, descriptions, images)`` of five
        equally long lists. A field missing from a volume is stored as the
        string ``"Null"`` (or the placeholder cover URL for images). All lists
        are empty when the search has no hits.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    placeholder_image = "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"

    # Set the API endpoint and query parameters
    url = "https://www.googleapis.com/books/v1/volumes"
    params = {
        "q": str(query),
        "printType": "books",
        "maxResults": min(n_results, 40),
    }

    # Bound the wait so a stalled upstream cannot hang the request handler, and
    # fail loudly on an error status instead of parsing an error payload.
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()

    # Initialize the lists to store the results
    titles = []
    authors = []
    publishers = []
    descriptions = []
    images = []

    # "items" is absent from the payload when the search has no hits.
    for item in data.get("items", []):
        volume_info = item.get("volumeInfo", {})

        title = volume_info.get("title", "Null")
        subtitle = volume_info.get("subtitle")
        titles.append(f"{title}: {subtitle}" if subtitle else title)

        descriptions.append(volume_info.get("description", "Null"))
        publishers.append(volume_info.get("publisher", "Null"))

        # Only the first listed author is kept; guard against an empty list.
        author_list = volume_info.get("authors") or ["Null"]
        authors.append(author_list[0])

        image_links = volume_info.get("imageLinks") or {}
        images.append(image_links.get("thumbnail", placeholder_image))

    return titles, authors, publishers, descriptions, images
89
+
90
# Query Google Books and fold its five result lists into the shared lists.
(
    gbooks_titles,
    gbooks_authors,
    gbooks_publishers,
    gbooks_descriptions,
    gbooks_images,
) = gbooks_search(query, n_results=n_results)

titles.extend(gbooks_titles)
authors.extend(gbooks_authors)
publishers.extend(gbooks_publishers)
descriptions.extend(gbooks_descriptions)
images.extend(gbooks_images)

# Whole seconds elapsed since the request started (Google Books stage).
first_checkpoint = time.time()
first_checkpoint_time = int(first_checkpoint - start_time)
109
+
110
def openalex_search(query, n_results=10):
    """
    Run a search on OpenAlex and return the results.

    Args:
        query: Search terms; converted to ``str`` before being sent.
        n_results: Page size and maximum number of works to fetch.

    Returns:
        A tuple ``(titles, authors, publishers, descriptions, images)`` of five
        equally long lists. A field that is missing, empty or ``None`` is
        stored as the string ``"Null"``; every image is the placeholder cover.
        All lists are empty when the search yields no page.
    """
    import pyalex
    from pyalex import Works

    placeholder_image = "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"

    # Add email to the config
    # NOTE(review): this address looks like a scraper-obfuscated placeholder;
    # confirm the real contact address before deploying.
    pyalex.config.email = "[email protected]"

    # Define a pager object with the same query
    pager = Works().search(str(query)).paginate(per_page=n_results, n_max=n_results)

    # per_page equals n_max, so only the first page is consumed; an empty
    # pager must not raise.
    pages = list(pager)
    first_page = pages[0] if pages else []

    def _field(work, getter):
        """Return getter(work), or "Null" when the path is missing or None."""
        try:
            value = getter(work)
        except (KeyError, IndexError, TypeError):
            # KeyError: key absent; IndexError: empty list (e.g. no
            # authorships); TypeError: an intermediate value is None.
            return "Null"
        return "Null" if value is None else value

    # Initialize the lists to store the results
    titles = []
    authors = []
    publishers = []
    descriptions = []
    images = []

    # Get the titles, descriptions, publishers and first authors
    for result in first_page:
        titles.append(_field(result, lambda work: work["title"]))
        descriptions.append(_field(result, lambda work: work["abstract"]))
        publishers.append(_field(result, lambda work: work["host_venue"]["publisher"]))
        authors.append(
            _field(result, lambda work: work["authorships"][0]["author"]["display_name"])
        )
        images.append(placeholder_image)

    return titles, authors, publishers, descriptions, images
160
+
161
# Query OpenAlex and fold its five result lists into the shared lists.
(
    openalex_titles,
    openalex_authors,
    openalex_publishers,
    openalex_descriptions,
    openalex_images,
) = openalex_search(query, n_results=n_results)

titles.extend(openalex_titles)
authors.extend(openalex_authors)
publishers.extend(openalex_publishers)
descriptions.extend(openalex_descriptions)
images.extend(openalex_images)

# Whole seconds spent on the OpenAlex stage (since the first checkpoint).
second_checkpoint = time.time()
second_checkpoint_time = int(second_checkpoint - first_checkpoint)
180
+
181
def openai_search(query, n_results=10):
    """
    Create a query to the OpenAI ChatGPT API and return the results.

    Args:
        query: Topic the recommendations should be about.
        n_results: Number of books requested in the prompt. The model may
            return fewer or more; nothing is enforced here.

    Returns:
        A tuple ``(titles, authors, publishers, descriptions, images)`` of five
        equally long lists, one entry per response line that contained all of
        Title, Author, Publisher and Summary, in that order. Lines missing any
        of the four are skipped. Every image is the placeholder cover.
    """
    import os
    import re

    import openai

    placeholder_image = "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
    ordered_keys = ("Title", "Author", "Publisher", "Summary")

    # Initialize the lists to store the results
    titles = []
    authors = []
    publishers = []
    descriptions = []
    images = []

    # SECURITY: the API key was previously hard-coded on this line and is
    # therefore exposed in the repository history; it must be revoked. The key
    # is now read from the environment so it never lives in source control.
    openai.api_key = os.environ.get("OPENAI_API_KEY")

    # Create ChatGPT query
    chatgpt_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a librarian. You are helping a patron find a book.",
            },
            {
                "role": "user",
                "content": f"Recommend me {n_results} books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
            },
        ],
    )
    response_text = chatgpt_response["choices"][0]["message"]["content"]

    # The prompt asks for lowercase keys but replies may capitalize them, so
    # the markers are matched case-insensitively.
    key_patterns = [
        (key, re.compile(rf"{re.escape(key)}:\s*", re.IGNORECASE))
        for key in ordered_keys
    ]

    def parse_result(result):
        """Return {key: value} for one response line, or None if a key is missing."""
        matches = []
        cursor = 0
        for _, pattern in key_patterns:
            match = pattern.search(result, cursor)
            if match is None:
                return None
            matches.append(match)
            cursor = match.end()

        parsed_result = {}
        for index, (key, _) in enumerate(key_patterns):
            value_start = matches[index].end()
            # A value runs up to the next key marker, not the first comma, so
            # titles and publishers that contain commas are kept whole.
            if index + 1 < len(matches):
                value_end = matches[index + 1].start()
            else:
                value_end = len(result)
            parsed_result[key] = result[value_start:value_end].strip().rstrip(",").strip()
        return parsed_result

    # Inspect every line instead of assuming a fixed intro/blank-line layout;
    # intro text, blank lines and a truncated final entry simply do not parse.
    for line in response_text.splitlines():
        parsed_result = parse_result(line)
        if parsed_result is None:
            continue

        titles.append(parsed_result["Title"])
        authors.append(parsed_result["Author"])
        publishers.append(parsed_result["Publisher"])
        descriptions.append(parsed_result["Summary"])
        images.append(placeholder_image)

    return titles, authors, publishers, descriptions, images
254
+
255
if add_chatgpt_results:
    # Ask ChatGPT for the same number of results as the other two sources;
    # previously n_results was not forwarded, so the default of 10 was always
    # used regardless of the request.
    (
        chatgpt_titles,
        chatgpt_authors,
        chatgpt_publishers,
        chatgpt_descriptions,
        chatgpt_images,
    ) = openai_search(query, n_results=n_results)

    # Append the results to the shared lists
    titles.extend(chatgpt_titles)
    authors.extend(chatgpt_authors)
    publishers.extend(chatgpt_publishers)
    descriptions.extend(chatgpt_descriptions)
    images.extend(chatgpt_images)

# Whole seconds spent on the ChatGPT stage (since the second checkpoint);
# recorded even when the stage is skipped.
third_checkpoint = time.time()
third_checkpoint_time = int(third_checkpoint - second_checkpoint)
275
 
276
# Build one descriptive sentence per book from its title, publisher and
# description; zip() stops at the shortest of the three lists.
combined_data = [
    f"The book's title is {book_title}. It is published by {book_publisher}. This book is about {book_description}"
    for book_title, book_description, book_publisher in zip(
        titles, descriptions, publishers
    )
]
281
 
282
+ def find_similar(combined_data, top_k=10):
283
+ """
284
+ Calculate the similarity between the books and return the top_k results.
285
+ """
286
+ from sentence_transformers import SentenceTransformer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  from sentence_transformers import util
288
 
289
  sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
 
291
  combined_data, convert_to_tensor=True
292
  )
293
 
294
+ # Make sure that the top_k value is not greater than the number of books
295
+ top_k = len(combined_data) if top_k > len(combined_data) else top_k
296
+
297
  similar_books = []
298
+ for i in range(len(combined_data)):
299
+ # Get the embedding for the ith book
300
  current_embedding = book_embeddings[i]
301
 
302
+ # Calculate the similarity between the ith book and the rest of the books
303
  similarity_sorted = util.semantic_search(
304
+ current_embedding, book_embeddings, top_k=top_k
305
  )
306
 
307
+ # Append the results to the list
308
  similar_books.append(
309
  {
310
  "sorted_by_similarity": similarity_sorted[0][1:],
311
  }
312
  )
313
 
314
+ return similar_books
315
+
316
def summarize(descriptions):
    """
    Summarize the descriptions and return the results.

    Args:
        descriptions: Iterable of description strings. Entries that are empty,
            ``None`` or the ``"Null"`` placeholder used by the search helpers
            are not sent to the model.

    Returns:
        A list with one element per description. Each element is a
        one-item list ``[{"summary_text": ...}]``: the model output for real
        descriptions, or "No summary text is available." otherwise.
    """
    from transformers import (
        AutoTokenizer,
        AutoModelForSeq2SeqLM,
        pipeline,
    )

    # Define the summarizer model and tokenizer
    # NOTE(review): the model is loaded on every call; consider caching it at
    # module level if request latency matters.
    model_name = "sshleifer/distilbart-cnn-12-6"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Create the summarizer pipeline
    summarizer_pipe = pipeline(
        "summarization",
        model=model,
        tokenizer=tokenizer,
        min_length=10,
        max_length=128,
    )

    summaries = []
    for description in descriptions:
        if not description or description == "Null":
            # Nothing meaningful to summarize; summarizing the placeholder
            # would only produce invented text.
            summaries.append([{"summary_text": "No summary text is available."}])
        else:
            # truncation=True cuts inputs that exceed the model's maximum
            # input length instead of letting the forward pass fail.
            summaries.append(summarizer_pipe(description, truncation=True))

    return summaries
348
+
349
def classify(combined_data, parallel=False):
    """
    Create classifier pipeline and return the results.

    Args:
        combined_data: List of text documents, one per book.
        parallel: When True, classify the documents as Ray tasks; otherwise
            classify them sequentially in this process.

    Returns:
        A list with one zero-shot classification result per document, each the
        pipeline's output for that document (indexed by the caller via its
        "labels" and "scores" entries).
    """
    from transformers import (
        AutoTokenizer,
        AutoModelForSequenceClassification,
        pipeline,
    )

    # Define the zero-shot classifier
    model_name = "sileod/deberta-v3-base-tasksource-nli"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    classifier_pipe = pipeline(
        "zero-shot-classification",
        model=model,
        tokenizer=tokenizer,
        hypothesis_template="This book is {}.",
        batch_size=1,
        device=-1,
        multi_label=True,
    )

    # Define the candidate labels
    candidate_labels = [
        "Introductory",
        "Advanced",
        "Academic",
        "Not Academic",
        "Manual",
    ]

    if parallel:
        import ray
        import psutil

        # Define the number of cores to use
        num_cores = psutil.cpu_count(logical=True)

        # Initialize Ray and place the pipeline in the object store once so
        # every task shares the same copy.
        ray.init(num_cpus=num_cores, ignore_reinit_error=True)
        classifier_ref = ray.put(classifier_pipe)

        # Ray resolves an ObjectRef passed as a top-level task argument before
        # the task runs, so the task receives the pipeline itself and must not
        # call ray.get() on it again.
        @ray.remote
        def classify_parallel(classifier, doc, candidate_labels):
            return classifier(doc, candidate_labels)

        pending = [
            classify_parallel.remote(classifier_ref, doc, candidate_labels)
            for doc in combined_data
        ]

        # Resolve the futures so callers get results, not ObjectRefs.
        classes = ray.get(pending)
    else:
        # Get the predicted labels
        classes = [classifier_pipe(doc, candidate_labels) for doc in combined_data]

    return classes
413
+
414
# Each optional stage either runs its model or is replaced by a same-shaped
# placeholder list, so the result assembly below can index every list alike.
classes = (
    classify(combined_data, parallel=False)
    if classification
    else [
        {"labels": ["No labels available."], "scores": [0]}
        for _ in combined_data
    ]
)

# Whole seconds spent on classification (since the third checkpoint).
fourth_checkpoint = time.time()
classification_time = int(fourth_checkpoint - third_checkpoint)

if not summarization:
    # Without summarization the raw description doubles as the summary.
    summaries = [
        [{"summary_text": "No summary text is available."}]
        if len(description) == 0
        else [{"summary_text": description}]
        for description in descriptions
    ]
else:
    summaries = summarize(descriptions)

# Whole seconds spent on summarization (since the fourth checkpoint).
fifth_checkpoint = time.time()
summarization_time = int(fifth_checkpoint - fourth_checkpoint)

similar_books = (
    find_similar(combined_data)
    if similarity
    else [
        {"sorted_by_similarity": ["No similar books available."]}
        for _ in combined_data
    ]
)

# Whole seconds spent on similarity search (since the fifth checkpoint).
sixth_checkpoint = time.time()
similarity_time = int(sixth_checkpoint - fifth_checkpoint)

# Total wall-clock time for the request, formatted for display.
end_time = time.time()
runtime = f"{end_time - start_time:.2f} seconds"

# The per-stage timings are identical for every book; each result gets its
# own copy of the mapping.
stage_timings = {
    "Google Books Time": first_checkpoint_time,
    "OpenAlex Time": second_checkpoint_time,
    "OpenAI Time": third_checkpoint_time,
    "Classification Time": classification_time,
    "Summarization Time": summarization_time,
    "Similarity Computing Time": similarity_time,
}

# One dictionary per book, keyed by its position in combined_data.
results = []
for book_id in range(len(combined_data)):
    results.append(
        {
            "id": book_id,
            "title": titles[book_id],
            "author": authors[book_id],
            "publisher": publishers[book_id],
            "image_link": images[book_id],
            "labels": classes[book_id]["labels"][0:2],
            "label_confidences": classes[book_id]["scores"][0:2],
            "summary": summaries[book_id][0]["summary_text"],
            "similar_books": similar_books[book_id]["sorted_by_similarity"],
            "checkpoints": [dict(stage_timings)],
            "total_runtime": runtime,
        }
    )
483
+
484
  return results