bertugmirasyedi committed on
Commit
6b67b82
·
1 Parent(s): 99b3772

First commit

Browse files
Files changed (3) hide show
  1. Dockerfile +11 -0
  2. requirements.txt +8 -0
  3. search.py +308 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the FastAPI search service defined in search.py.
FROM python:3.9

WORKDIR /code

# Install dependencies first so this layer is cached until requirements.txt changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Run as an unprivileged user with a real, writable home directory.
# search.py downloads model files at runtime, and those are cached under $HOME.
# NOTE(review): UID 1000 matches the user Hugging Face Spaces runs Docker
# containers as (port 7860 below suggests that target) - confirm the platform.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user

# Copy the application sources, owned by the runtime user.
COPY --chown=user . /code

# Serve the `app` object from search.py on all interfaces, port 7860.
CMD ["uvicorn", "search:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
fastapi==0.95.0
flair==0.11.3
openai==0.27.0
# search.py imports optimum.onnxruntime; the [onnxruntime] extra installs the
# ONNX Runtime backend that plain `optimum` does not pull in.
optimum[onnxruntime]==1.7.1
pyalex==0.7
requests==2.25.1
sentence_transformers==2.2.2
transformers==4.26.1
# ASGI server started by the Dockerfile CMD; fastapi does not install it.
uvicorn==0.21.1
search.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ import sys
4
+
5
# Raise the interpreter recursion limit to 10000.
# NOTE(review): the reason for the higher limit is not stated in this file -
# confirm it is still required before removing it.
sys.setrecursionlimit(10000)

# The ASGI application served by uvicorn (see the Dockerfile CMD: "search:app").
app = FastAPI()

# Allow any web origin to call the API from a browser.
# Credentials are disabled: the API uses no cookies or authentication, and a
# wildcard origin must not be combined with credentialed requests.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
19
+
20
+
21
import functools
import logging
import os
import re
import time

_logger = logging.getLogger(__name__)

# Placeholder used throughout the response for any field a source did not provide.
_MISSING = "Null"

# Cover image used when a source offers no thumbnail.
_PLACEHOLDER_IMAGE = "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"

# Field names, in order, expected on each ChatGPT recommendation line.
_CHATGPT_KEYS = ("Title", "Author", "Publisher", "Summary")

# Labels offered to the zero-shot classifier.
_CANDIDATE_LABELS = [
    "Introductory",
    "Advanced",
    "Academic",
    "Not Academic",
    "Manual",
]


def _lookup(record, *path, default=_MISSING):
    """Follow *path* (keys and indexes) into nested data, or return *default*.

    A missing key, an out-of-range index, a ``None`` intermediate value or a
    ``None`` final value all yield *default*.
    """
    value = record
    for step in path:
        try:
            value = value[step]
        except (KeyError, IndexError, TypeError):
            return default
    return default if value is None else value


def _record(title, author, publisher, description, image=_PLACEHOLDER_IMAGE):
    """Build the common per-book record shared by all three sources."""
    return {
        "title": title,
        "author": author,
        "publisher": publisher,
        "description": description,
        "image": image,
    }


def _google_books(query):
    """Return up to 30 book records from the Google Books volumes API."""
    import requests

    response = requests.get(
        "https://www.googleapis.com/books/v1/volumes",
        params={"q": str(query), "printType": "books", "maxResults": 30},
        timeout=30,  # never let a stalled upstream hang the request forever
    )
    data = response.json()

    records = []
    # "items" is absent when the query has no hits.
    for item in data.get("items", []):
        info = _lookup(item, "volumeInfo", default={})
        title = _lookup(info, "title")
        if "subtitle" in info:
            title = f"{title}: {info['subtitle']}"
        records.append(
            _record(
                title=title,
                author=_lookup(info, "authors", 0),
                publisher=_lookup(info, "publisher"),
                description=_lookup(info, "description"),
                image=_lookup(
                    info, "imageLinks", "thumbnail", default=_PLACEHOLDER_IMAGE
                ),
            )
        )
    return records


def _openalex_works(query):
    """Return records for the first page (10 works) of an OpenAlex search."""
    import pyalex
    from pyalex import Works

    # The contact e-mail is configuration, not code: set OPENALEX_EMAIL to use it.
    email = os.environ.get("OPENALEX_EMAIL")
    if email:
        pyalex.config.email = email

    pages = list(Works().search(str(query)).paginate(per_page=10, n_max=10))
    first_page = pages[0] if pages else []

    return [
        _record(
            title=_lookup(work, "title"),
            author=_lookup(work, "authorships", 0, "author", "display_name"),
            publisher=_lookup(work, "host_venue", "publisher"),
            description=_lookup(work, "abstract"),
        )
        for work in first_page
    ]


def _parse_result(result, ordered_keys=_CHATGPT_KEYS):
    """Parse one ``key: value, key: value, ...`` line into a dict.

    Keys are matched case-insensitively and must appear in *ordered_keys*
    order. Each value runs up to the next key, so values may contain commas.
    Returns ``None`` when the line does not contain every key.
    """
    pattern = (
        r",\s*".join(rf"{re.escape(key)}:\s*(.*?)" for key in ordered_keys)
        + r"\s*$"
    )
    match = re.search(pattern, result, flags=re.IGNORECASE)
    if match is None:
        return None
    return {key: value.strip() for key, value in zip(ordered_keys, match.groups())}


def _chatgpt_recommendations(query):
    """Return book records recommended by ChatGPT, or [] when no key is configured."""
    # Secrets must never live in source control; the key comes from the environment.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        _logger.warning("OPENAI_API_KEY is not set; skipping ChatGPT recommendations")
        return []

    import openai

    openai.api_key = api_key
    chatgpt_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a librarian. You are helping a patron find a book.",
            },
            {
                "role": "user",
                "content": f"Recommend me 10 books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
            },
        ],
    )
    content = chatgpt_response["choices"][0]["message"]["content"]

    records = []
    # The reply is free text: keep every line that parses and ignore the rest
    # (greetings, blank separators, closing remarks).
    for line in content.split("\n"):
        parsed = _parse_result(line)
        if parsed is None:
            continue
        records.append(
            _record(
                title=parsed["Title"],
                author=parsed["Author"],
                publisher=parsed["Publisher"],
                description=parsed["Summary"],
            )
        )
    return records


@functools.lru_cache(maxsize=1)
def _load_text_pipelines():
    """Load the summarizer and zero-shot classifier once and reuse them.

    Returns a ``(summarizer_pipeline, zs_classifier)`` tuple. Caching avoids
    reloading the models on every request.
    """
    from optimum.onnxruntime import ORTModelForSeq2SeqLM
    from optimum.pipelines import pipeline as optimum_pipeline
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        pipeline,
    )

    sum_tokenizer = AutoTokenizer.from_pretrained("lidiya/bart-base-samsum")
    sum_model_quantized = ORTModelForSeq2SeqLM.from_pretrained(
        "trainers/bart-base-samsum-quantized"
    )
    summarizer_pipeline = optimum_pipeline(
        "summarization",
        model=sum_model_quantized,
        tokenizer=sum_tokenizer,
        batch_size=64,
    )

    zs_tokenizer = AutoTokenizer.from_pretrained(
        "sileod/deberta-v3-base-tasksource-nli"
    )
    zs_model = AutoModelForSequenceClassification.from_pretrained(
        "sileod/deberta-v3-base-tasksource-nli"
    )
    zs_classifier = pipeline(
        "zero-shot-classification",
        model=zs_model,
        tokenizer=zs_tokenizer,
        batch_size=64,
        hypothesis_template="This book is {}.",
        multi_label=True,
    )
    return summarizer_pipeline, zs_classifier


@functools.lru_cache(maxsize=1)
def _load_sentence_transformer():
    """Load the sentence-embedding model once and reuse it."""
    from sentence_transformers import SentenceTransformer

    return SentenceTransformer("all-MiniLM-L6-v2")


@app.get("/search={query}&similarity={similarity}")
def search(query: str, similarity: bool = False):
    """Search several sources for books about *query* and annotate each hit.

    Results from Google Books, OpenAlex and ChatGPT are merged. Every result
    gets a generated summary and its two top-ranked level labels.

    Configuration is read from the environment: ``OPENAI_API_KEY`` enables the
    ChatGPT source (it is skipped when unset) and ``OPENALEX_EMAIL`` sets the
    contact e-mail passed to pyalex.

    Args:
        query: Free-text search terms.
        similarity: When true, also rank the other results by embedding
            similarity for each book. Declared as ``bool`` so the path
            segment is converted instead of arriving as an always-truthy
            string.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``author``,
        ``publisher``, ``image_link``, ``labels``, ``label_confidences``,
        ``summary``, ``similar_books`` (empty when *similarity* is false)
        and ``runtime``. The list is empty when no source returned anything.
    """
    start_time = time.time()

    records = (
        _google_books(query)
        + _openalex_works(query)
        + _chatgpt_recommendations(query)
    )
    if not records:
        return []

    summarizer_pipeline, zs_classifier = _load_text_pipelines()

    # Title, description and publisher combined into one text per book.
    combined_data = [
        f"{record['title']} {record['description']} {record['publisher']}"
        for record in records
    ]

    # Summarize real descriptions only; the input is capped at 1024 characters.
    summaries = [
        summarizer_pipeline(record["description"][0:1024])[0]["summary_text"]
        if record["description"] and record["description"] != _MISSING
        else _MISSING
        for record in records
    ]

    # Predict the level of each book.
    classes = [zs_classifier(doc, _CANDIDATE_LABELS) for doc in combined_data]

    # One (possibly empty) similarity ranking per book, so the response shape
    # does not depend on the flag.
    similar_books = [[] for _ in records]
    if similarity:
        from sentence_transformers import util

        book_embeddings = _load_sentence_transformer().encode(
            combined_data, convert_to_tensor=True
        )
        ranked = util.semantic_search(book_embeddings, book_embeddings, top_k=20)
        # Drop the first hit of each ranking (expected to be the book itself).
        similar_books = [hits[1:] for hits in ranked]

    # Measured after all work, including the optional similarity step.
    runtime = f"{time.time() - start_time:.2f} seconds"

    return [
        {
            "id": index,
            "title": record["title"],
            "author": record["author"],
            "publisher": record["publisher"],
            "image_link": record["image"],
            "labels": prediction["labels"][0:2],
            "label_confidences": prediction["scores"][0:2],
            "summary": summary,
            "similar_books": similar,
            "runtime": runtime,
        }
        for index, (record, prediction, summary, similar) in enumerate(
            zip(records, classes, summaries, similar_books)
        )
    ]