zolicsaki commited on
Commit
57d4532
·
verified ·
1 Parent(s): 6e26061

Upload 7 files

Browse files
Files changed (7) hide show
  1. logo.png +0 -0
  2. paper2slides.py +703 -0
  3. pdf_helper.py +181 -0
  4. pptx_utils.py +695 -0
  5. requirements.txt +5 -0
  6. sambaAPI.py +64 -0
  7. utils.py +39 -0
logo.png ADDED
paper2slides.py ADDED
@@ -0,0 +1,703 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Lookup from a slide section name to a list of paper part names
# (presumably the parts used as source material for that slide - confirm
# against the consumers of this table).
# NOTE(review): the key 'methodoloy' looks like a typo for 'methodology'. It is
# kept verbatim because it is a runtime key that other code may look up.
slide_datasource = {
    'introduction': ['abstract', 'Introduction'],
    'objective': ['abstract', 'Introduction'],
    'methodoloy': ['abstract', 'Introduction', 'Conclusion', 'Methods'],
    'results': ['abstract', 'Experiments', 'Conclusion'],
    'conclusion': ['abstract', 'Introduction', 'Conclusion'],
}
8
+
9
+ from pdf_helper import PDFPaper4LLMParser, dict_to_markdown_list
10
+ from sambaAPI import call_llama_chat, MODEL_ALIAS
11
+ from pdf_helper import markdown_to_slide_dicts
12
+ from pptx_utils import Dict2PPT, os
13
+ import json
14
+ import time
15
+ import string
16
+
17
# Marker string substituted into SLIDES_REVISION_PROMPT as the separator
# between slide sections.
SLIDE_SEP = '<slide_sep>'
18
+
19
def trim_string(s):
    """Return ``s`` without leading/trailing whitespace and ASCII punctuation.

    Only the two ends are stripped; characters inside the string are kept.
    """
    return s.strip(string.whitespace + string.punctuation)
21
+
22
# Section category -> lower-case phrases; get_section_category() assigns a
# title to the first category (in this insertion order) one of whose phrases
# occurs in the lower-cased title.
# NOTE(review): ' references' is already covered by the substring match on
# 'references'; it is kept so the table's contents stay unchanged.
section_title_key_phrases = {
    'Introduction': ['introduction'],
    'Related Works': ['related work'],
    'Methods': ['method', 'approach'],
    'Experiments': ['experiment'],
    'Conclusion': ['conclusion'],
    'Acknowledgements': ['acknowledgement'],
    'References': ['references', ' references'],
}
31
+
32
def find_string_index(string_list, target: str):
    """Return the index of the first occurrence of ``target`` in ``string_list``.

    Parameters:
        string_list (list): A list of strings.
        target (str): The string to look for.

    Returns:
        int: The index of the first match, or -1 if ``target`` is not present.
    """
    try:
        return string_list.index(target)
    except ValueError:
        # list.index signals "not found" with ValueError; callers expect -1.
        return -1
48
+
49
+
50
def get_section_category(section_name: str):
    """Map a paper section title to a category of ``section_title_key_phrases``.

    The title is lower-cased and matched by substring against each category's
    phrases, in the table's insertion order; the first category with a
    matching phrase is returned.

    Parameters:
        section_name (str): The section title as it appears in the paper.

    Returns:
        str: The matching category name, or 'Other' when no phrase matches.
    """
    lowered_name = section_name.lower()  # lower-case once, not once per phrase
    for key, phrases in section_title_key_phrases.items():
        if any(phrase in lowered_name for phrase in phrases):
            return key
    return 'Other'
59
+
60
+
61
class PaperReader(object):
    """Read a paper PDF and annotate its sections with topic categories."""

    def __init__(self, page_chunks=False):
        """Create the underlying PDF parser; ``page_chunks`` is passed through."""
        self.paper_reader = PDFPaper4LLMParser(page_chunks=page_chunks)

    def pdf2text(self, paper_pdf_path: str):
        """Run the PDF parser (non-verbose) on ``paper_pdf_path`` and return its result."""
        return self.paper_reader.run(pdf_path=paper_pdf_path, verbose=False)

    @staticmethod
    def _contiguous_run(topics, start, label):
        """Return the consecutive indices, beginning at ``start``, labelled ``label``.

        ``start`` is a ``find_string_index`` result: -1 means ``label`` does
        not occur in ``topics`` at all, so the run is empty.
        """
        indices = []
        if start < 0:
            return indices
        for idx in range(start, len(topics)):
            if topics[idx] != label:
                break
            indices.append(idx)
        return indices

    def structurize(self, main_text_array: list):
        """Label every section with a topic and locate the key sections.

        Parameters:
            main_text_array (list): Section dicts; each must have a 'title' key.

        Returns:
            tuple: ``(section_name_topics, paper_structure_dict)``.
            ``section_name_topics`` holds one topic label per section.
            ``paper_structure_dict`` maps 'Introduction', 'Related Works',
            'References', 'Conclusion' and 'Acknowledgements' to a one-element
            list with the section index (-1 when absent), and 'Experiments' /
            'Methods' to the list of indices of the first contiguous run of
            sections carrying that label (empty when absent).
        """
        section_name_topics = [get_section_category(section['title']) for section in main_text_array]
        introduction_idx = find_string_index(section_name_topics, target='Introduction')
        reference_idx = find_string_index(section_name_topics, target='References')
        experiment_idx = find_string_index(section_name_topics, target='Experiments')
        conclusion_idx = find_string_index(section_name_topics, target='Conclusion')

        if reference_idx > 0:
            for idx, topic in enumerate(section_name_topics):
                if idx < reference_idx:
                    # Unrecognised sections in the paper body count as method sections.
                    if topic == 'Other':
                        section_name_topics[idx] = 'Methods'
                elif idx > reference_idx and 'appendix' not in topic.lower():
                    # Everything after the references is marked as appendix material.
                    section_name_topics[idx] = 'Appendix: ' + topic

        if experiment_idx > 0:
            # 'Methods' sections between the first experiment section and the
            # references are relabelled as experiments. When there is no
            # References section (index -1) this range is empty.
            for idx in range(experiment_idx + 1, reference_idx):
                if section_name_topics[idx] == 'Methods':
                    section_name_topics[idx] = 'Experiments'

        # Recompute after relabelling.
        experiment_idx = find_string_index(section_name_topics, target='Experiments')
        method_idx = find_string_index(section_name_topics, target='Methods')
        relatedwork_idx = find_string_index(section_name_topics, target='Related Works')
        ack_idx = find_string_index(section_name_topics, target='Acknowledgements')

        paper_structure_dict = {
            'Introduction': [introduction_idx],
            'Related Works': [relatedwork_idx],
            'References': [reference_idx],
            'Conclusion': [conclusion_idx],
            'Acknowledgements': [ack_idx],
        }

        # The original code had a fallback scan between introduction and
        # conclusion for the case where the label was not found. That scan only
        # collected sections carrying the very label that was just established
        # to be absent, so it always produced an empty list and is omitted.
        paper_structure_dict['Experiments'] = self._contiguous_run(
            section_name_topics, experiment_idx, 'Experiments')
        paper_structure_dict['Methods'] = self._contiguous_run(
            section_name_topics, method_idx, 'Methods')
        return section_name_topics, paper_structure_dict

    def run(self, paper_file_name: str):
        """Parse ``paper_file_name`` and attach 'structure' and 'section_topic'.

        The parser result must contain a 'main_text' entry; it is passed to
        ``structurize``. The elapsed time is printed.
        """
        start_time = time.time()
        paper_content = self.pdf2text(paper_pdf_path=paper_file_name)
        section_name_topics, paper_structure_dict = self.structurize(main_text_array=paper_content['main_text'])
        paper_content['structure'] = paper_structure_dict
        paper_content['section_topic'] = section_name_topics
        print('Runtime for pdf2text = {:.4f} seconds.'.format(time.time() - start_time))
        return paper_content
149
+
150
+ ### 1. General System Prompt
151
+
152
+ SCHOLAR_PROMPT = """
153
+ You are an assistant being skilled at critically reading and analyzing academic papers to extract key insights, trends, and findings.
154
+ """
155
+
156
+ ### 2. Paper Outline Generation from Abstract
157
+
158
+ ABSTRACT_SUMMARY_PROMPT = """
159
+ You are given the **title** and **abstract** of an academic paper. Please first identity the research topic, and then extract the following aspects in a minimal title draft (max 15 words) for PowerPoint presentation:
160
+
161
+ 1. **Background**: Introduces the research context and importance.
162
+ 2. **Research Problem**: Identifies the specific problem or knowledge gap.
163
+ 3. **Objectives**: States the research goals or hypotheses.
164
+ 4. **Methodology**: Summarizes the research design and key methods.
165
+ 5. **Results**: Highlights the most significant findings.
166
+ 6. **Conclusions**: Provides the main takeaways and their relation to the research question.
167
+
168
+ Reminder: Strictly output in JSON format **only**, using the keys: "Research topic", "Background", "Research problem", "Objectives", "Methodology", "Results" and "Conclusions".
169
+ """
170
+
171
+ ### 3. Evidence extraction from main paper text for "Background"
172
+ BACKGROUD_EVIDENCE_PROMPT = """
173
+ You are given the **title**, briefly description of **problem backgroud** and **introduction** of a research paper. From the introduction, extract an itemized list of **1 to 3 pieces of evidence** that support the problem background, each evidence should be described in a **minimal draft (min 10 words and max 25 words)** for PowerPoint presentation.
174
+
175
+ Each piece of evidence must:
176
+ 1. Be directly relevant to the problem background.
177
+ 2. Be clear and concise.
178
+ 3. Be unique, not repeating other evidence.
179
+
180
+ **Important**: Strictly output the itemized evidences ONLY.
181
+ """
182
+
183
+
184
+ ### 4. Evidence extraction from main paper text for "Research Problem"
185
+ RESEARCH_PROBLEM_PROMPT = """
186
+ You are given the **title**, briefly description of **research problem** and **introduction** of a research paper. Solely from the given introduction, extract the definition of the research problem for PowerPoint presentation, focusing on:
187
+
188
+ 1. **Scope**: Define the problem’s boundaries as individual items;
189
+ 2. **Challenges**: Identify key gaps or obstacles the research addresses as individual items;
190
+ 3. **Assumptions**: State any assumptions guiding the research as individual items;
191
+ 4. **Relevance*: Specify who benefits from solving the problem as individual items.
192
+
193
+ **Note**: Each item must be in one concise sentence. **Only** output "Scope", "Challenges", "Assumptions" and "Relevance".
194
+ """
195
+
196
+
197
+ ### 5. Evidence extraction from main paper text for "Objectives"
198
+
199
+ OBJECTIVE_PROMPT = """
200
+ You are given the **title**, **objectives** and **introduction** of a research paper. Solely from the given introduction, extract a list of **2 to 5 pieces of evidence** to support these objectives, each evidence should be described in a **minimal draft (min 10 words and max 20 words)** for PowerPoint presentation.
201
+
202
+ Each piece of evidence must:
203
+ 1. Be directly relevant to the objectives.
204
+ 2. Be clear and concise.
205
+ 3. Be unique, not repeating other evidence.
206
+
207
+ **Note**: Strictly output the itemized evidences ONLY.
208
+ """
209
+
210
+ ### 6. Evidence extraction from main paper text for "Conclusion"
211
+
212
+ CONCLUSION_PROMT = """
213
+ You are given the **title**, **birief conclusion**, and **full text conclusion** and **introduction** of a research paper. From the given conclusion and introduction, extract the **conclusion** for PowerPoint presentation, ensuring it includes:
214
+
215
+ 1. **Summary of key results**: Highlight the main results.
216
+ 2. **Implications**: Explain the significance or impact of these findings.
217
+ 3. **Future directions**: Mention any suggestions for future research or applications.
218
+ 4. **Final takeaway**: Provide the overall takeaway message of the study.
219
+
220
+ **Note**: Only output the conclusion. Limit each point in a minimal concise draft (at least 10 words).”
221
+ """
222
+
223
+ ### 7. Evidence extraction from main paper text for "Experimental results" (iterative)
224
+
225
+ RESULT_PROMPT_DICT = {
226
+ "system_instruction": """Given the title, the main results of an experimental study, and a paragraph from a research paper, your task is to extract and summarize evidence from the paragraph that supports the 'main results'.
227
+
228
+ Follow these steps for each paragraph:
229
+ 1. **Detect Evidence**: Check if the paragraph contains:
230
+ 1) Any evidence supporting the main results, or
231
+ 2) Experimental study information, including:
232
+ - **Dataset**: Details on datasets, preprocessing, or train/test splits.
233
+ - **Model Description**: Information of baselines, hyperparameters, and training.
234
+ - **Evaluation Metrics**: Relevant metrics like accuracy, F1 score, and their justification.
235
+ - **Comparative Analysis**: Comparisons with baselines, ablation studies, statistical significance.
236
+ - **Runtime & Scalability**: Computational complexity and scalability.
237
+ 2. **Response**: Choose 'YES' or 'NO':
238
+ - If 'YES', extract and summarize the evidence or experimental details in 200 words. Ensure the summary is:
239
+ - Clear and concise
240
+ - Well-formatted for easy reading
241
+ - Focused on key points: dataset, model Description, evaluation metrics, comparative analysis and runtime & scalability.
242
+ - If 'NO', just respond with 'NO EVIDENCE'.
243
+ """,
244
+
245
+ "iterative_prompt": """Summarize the experimental details or evidence supporting the 'main results' in 200 words from the following paragraph (with title and content) if experiment-related information is detected. Follow these instructions:
246
+
247
+ 1. List 2 to 4 itemized points.
248
+ 2. Each point must specify the type ('Evidence' or 'Experimental Setup') and provide a minimal draft sentence of content (max 15 words).
249
+
250
+ **Note**: Only provide the itemized summary.
251
+ """,
252
+
253
+ "final_prompt": """Using the **title**, the **main results** of an experimental study, and a list of experiment summaries from the research paper, follow these steps to summarize the results:
254
+
255
+ 1. **Evidence Summary**: prive a numbered, itemized summary of **2-3** key points. Keep each point brief and focused (only 1 sentence).
256
+
257
+ 2. **Experimental Summary**: Based all 'Experimental Setup' points and provide a concise summary covering the following aspects:
258
+ 1) **Datasets**: List only the names of all datasets or benchmarks used.
259
+ 2) **Baselines**: List only the names of all models/algorithms used.
260
+ 3) **Metrics**: List only the evaluation metrics used for model performance, such as accuracy, F1-score, recall, precision, AUC, etc.
261
+ 4) **Results**: Summarize key comparisons and ablation results, focusing on the most important details.
262
+
263
+ **Note**: Only output the “Evidence Summary” and “Experimental Summary”
264
+ """
265
+ }
266
+
267
+ ## Methodology extraction
268
+
269
+ METHOD_PROMPT_DICT = {
270
+ "system_instruction": """Given the **title**, the **method overview**, and a paragraph of a research paper. You task is identify and extract text being relevant to 'method overview' from the given paragraph for PowerPoint presentation.
271
+
272
+ Follow these steps:
273
+ 1. **Method Information Detection**: Check if the paragraph contains:
274
+ 1) Any mention of the **method overview** or
275
+ 2) Specific method details, such as:
276
+ - **Problem Definition**: The task, input, and expected output.
277
+ - **Model Architecture**: Structure, key components, and learning type.
278
+ - **Algorithm**: Steps of the method.
279
+ - **Training Process**: Training data, optimization method, and loss function.
280
+ 2. **Response**: Choose 'YES' or 'NO':
281
+ - If 'YES', summarize the method details in a minimal draft with max 20 words, ensuring it is:
282
+ - Clear and concise
283
+ - Well-formatted for readability
284
+ - Focused on key points.
285
+ - If 'NO', simply respond with 'NO Information'.
286
+ """,
287
+ "iterative_prompt": """Summarize the method description in 200 words from the following paragraph (with title and content) if method-related information is found. Follow these steps:
288
+
289
+ 1. List **2 to 4** method steps in numbered format..
290
+ 2. Ensure each step is related to the **method overview**.
291
+ 3. Keep each step clear and concise (only minimal draft with max 15 words).
292
+
293
+ **Note**: Only output the itemized method steps.
294
+ """,
295
+
296
+ "final_prompt": """Using **title**, **method overview**, and a list of itemized method step summary from a research paper, follow these instructions to summarize the method description::
297
+
298
+ 1. Provide a numbered list of **3-6 method steps** detailing the **method overview**.
299
+ 2. Keep each step clear and concise (only 1 sentence).
300
+
301
+ **Note**: Only output the itemized method steps.
302
+ """
303
+ }
304
+
305
+ SLIDES_REVISION_PROMPT = """You are an expert research assistant. Revise the following research paper slides to enhance clarity and readability while preserving the original markdown structure. Keep all first-level markdown headers unchanged. Sections are separated by '{}'. Follow these guidelines:
306
+
307
+ 1. Simplify language and make content more concise, especially in the outline.
308
+ 2. Preserve the logical flow and overall structure.
309
+ 3. Make key points and conclusions clear and easy to follow.
310
+ 4. Use bullet points where appropriate for better clarity.
311
+ 5. Minimize jargon to ensure accessibility for a broad academic audience.
312
+
313
+ """.format(SLIDE_SEP)
314
+
315
def make_api_call(model, messages, max_tokens, temperature):
    """Call the chat model once and return its response.

    Parameters:
        model: Model identifier forwarded to ``call_llama_chat``.
        messages: Chat messages forwarded to ``call_llama_chat``.
        max_tokens: Generation limit forwarded to ``call_llama_chat``.
        temperature: Sampling temperature forwarded to ``call_llama_chat``.

    Returns:
        The value returned by ``call_llama_chat``; if that call raises, a
        string starting with "Failed to generate final answer." that carries
        the error text. No exception propagates to the caller.
    """
    try:
        return call_llama_chat(messages=messages, model=model, temperature=temperature, max_tokens=max_tokens)
    except Exception as e:
        # Failure is reported in-band as a plain string. The previous
        # ``(message, {})`` tuple made callers' ``'Failed' in result`` test
        # perform tuple membership instead of a substring search.
        return f"Failed to generate final answer. Error: {str(e)}"
321
+
322
def convert_to_dict(input_string: str):
    """Parse newline-separated ``key: value`` text into a dictionary.

    The input is stripped and split on '\\n'. Each line containing a colon is
    split at the first colon; key and value are stripped of surrounding
    whitespace. Lines without a colon are ignored, and a repeated key keeps
    the value from its last occurrence.

    Parameters:
        input_string (str): The text to parse.

    Returns:
        dict: Mapping of stripped keys to stripped values (all strings).
    """
    result_dict = {}
    for line in input_string.strip().split('\n'):
        # partition splits at the first ':' only, so values may contain colons.
        key, separator, value = line.partition(':')
        if separator:
            result_dict[key.strip()] = value.strip()
    return result_dict
335
+
336
+
337
+ class Paper2Slides(object):
338
def __init__(self, paper_contents: dict, model: str, max_tokens = 512, temprature=0.1):
    """Validate the parsed paper and store the generation settings.

    Parameters:
        paper_contents (dict): Parsed paper; checked by ``valid_paper_checking``.
        model (str): Key into ``MODEL_ALIAS`` selecting the generation model.
        max_tokens: Token limit passed to every model call.
        temprature: Sampling temperature (parameter name kept as spelled for
            backward compatibility).

    Raises:
        SystemExit: With code 1 when the paper fails validation.
        KeyError: When ``model`` is not a key of ``MODEL_ALIAS``.
    """
    self.paper_contents = paper_contents
    if not self.valid_paper_checking():
        print('Not a valid paper structure, cannot generate slides')
        # Raised directly instead of calling exit(), which is only injected by
        # the `site` module and is missing when Python runs with -S.
        raise SystemExit(1)
    self.model = MODEL_ALIAS[model]
    self.is_rate_limitation = ('405B' in self.model) or ('70B' in self.model)
    self.temprature = temprature
    self.max_failure_attempt_each_step = 3
    # The original branched on '405B' but assigned 0.25 in both arms.
    self.sleep_time = 0.25
    self.max_tokens = max_tokens
    print('{} model is used for slides generation!\nRate limitation = {}'.format(self.model, self.is_rate_limitation))
    self.revise_model = MODEL_ALIAS['llama3_70b']
354
+
355
def valid_paper_checking(self):
    """Check that the paper has the parts required for slide generation.

    Required: an 'abstract' key, a 'title' key, and a 'structure' entry whose
    'Introduction' and 'Conclusion' lists start with a non-negative index.
    The first problem found is printed.

    Returns:
        bool: True when all checks pass, False otherwise.
    """
    # Explicit checks instead of `assert`: assertions are stripped under
    # `python -O`, which would silently disable this validation.
    contents = self.paper_contents
    problem = None
    if 'abstract' not in contents:
        problem = 'No abstract is detected'
    elif 'title' not in contents:
        problem = 'No title is detected'
    else:
        # A missing or empty structure entry counts as "section not found"
        # rather than raising KeyError/IndexError.
        paper_structure = contents.get('structure') or {}
        introduction_idx_array = paper_structure.get('Introduction') or [-1]
        conclusion_idx_array = paper_structure.get('Conclusion') or [-1]
        if introduction_idx_array[0] < 0:
            problem = 'No introduction is detected'
        elif conclusion_idx_array[0] < 0:
            problem = 'No conclusion is detected'

    if problem is not None:
        # Message prefix kept as printed by the original implementation.
        print(f"AssertionError: {problem}")
        return False
    return True
368
+
369
def step(self, messages):
    """Run one model call via ``self.run`` and apply the pacing sleeps.

    Sleeps ``self.sleep_time`` seconds when the result reports a failure, and
    again when ``self.is_rate_limitation`` is set.

    Parameters:
        messages: Chat messages forwarded to ``self.run``.

    Returns:
        Whatever ``self.run`` returned, unchanged.
    """
    result = self.run(messages=messages)
    # A failure may arrive as a plain string or as a tuple whose first item is
    # the message; look at the text in both cases so that `in` is always a
    # substring test and never tuple membership.
    text = result[0] if isinstance(result, tuple) and result else result
    if isinstance(text, str) and 'Failed' in text:
        time.sleep(self.sleep_time)
    if self.is_rate_limitation:
        print('sleep {} seconds'.format(self.sleep_time))
        time.sleep(self.sleep_time)
    return result
377
+
378
def run(self, messages):
    """Call the model, retrying up to ``self.max_failure_attempt_each_step`` times.

    An attempt fails when ``make_api_call`` raises or when it reports the
    failure in-band (a tuple, or a string starting with 'Failed to generate').
    Failed attempts are followed by a 2 second pause, except the last one.

    Parameters:
        messages: Chat messages forwarded to ``make_api_call``.

    Returns:
        The model response on success; otherwise a string starting with
        'Failed to generate' that describes the last error.
    """
    max_attempts = self.max_failure_attempt_each_step
    last_error = None
    for attempt in range(max_attempts):
        try:
            response = make_api_call(messages=messages, model=self.model, max_tokens=self.max_tokens, temperature=self.temprature)
        except Exception as e:
            last_error = str(e)
        else:
            if isinstance(response, tuple):
                # Failure reported as a (message, ...) tuple.
                last_error = str(response[0]) if response else 'empty response'
            elif isinstance(response, str) and response.startswith('Failed to generate'):
                last_error = response
            else:
                return response
        # The original returned from both `except` branches, so it never
        # retried and its sleep was unreachable.
        if attempt < max_attempts - 1:
            time.sleep(2)  # wait 2 seconds before retrying
    if last_error is None:
        # No attempt was made (max_failure_attempt_each_step <= 0).
        return 'Failed to generate reasoning step.'
    return "Failed to generate step after {} attempts. $ERROR$: {}".format(max_attempts, last_error)
390
+
391
+
392
def abstract_summary(self):
    """Extract the slide outline from the paper's title and abstract.

    Sends the title and abstract with ``ABSTRACT_SUMMARY_PROMPT`` and parses
    the reply as JSON; if that fails, falls back to ``convert_to_dict``.

    Returns:
        dict: Outline entries, with every key passed through ``trim_string``.

    Raises:
        AssertionError: If the title is empty or the abstract is not longer
            than 512 characters.
    """
    title = self.paper_contents['title']
    abstract = self.paper_contents['abstract']
    # Raised explicitly (same exception type as before) so the check is not
    # stripped when Python runs with -O.
    if not (len(title) > 0 and len(abstract) > 512):
        raise AssertionError('A non-empty title and an abstract longer than 512 characters are required')

    prompt = "**title**: {}\n\n**abstract**: {}".format(title, abstract)
    messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": ABSTRACT_SUMMARY_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "I will extract the evidences following my instructions."}
    ]
    abstract_summary = self.step(messages=messages)

    # Try the raw reply first, then the outermost {...} span: replies often
    # wrap the JSON object in prose or a code fence.
    candidates = [abstract_summary]
    if isinstance(abstract_summary, str):
        start, end = abstract_summary.find('{'), abstract_summary.rfind('}')
        if 0 <= start < end:
            candidates.append(abstract_summary[start:end + 1])

    abstract_summary_dict = None
    for candidate in candidates:
        try:
            loaded = json.loads(candidate)
        except (TypeError, ValueError):  # JSONDecodeError is a ValueError
            continue
        if isinstance(loaded, dict):  # a JSON list/scalar has no .items()
            abstract_summary_dict = loaded
            break
    if abstract_summary_dict is None:
        abstract_summary_dict = convert_to_dict(input_string=abstract_summary)

    return {trim_string(k): v for k, v in abstract_summary_dict.items()}
414
+
415
def support_background(self, background: str, introduction: str):
    """Ask the model for evidence from the introduction supporting the background.

    Parameters:
        background (str): Short background statement (from the outline).
        introduction (str): Text of the paper's introduction.

    Returns:
        tuple: ``(evidences, step_num)`` - the result of ``self.step`` and the
        number of model calls made (always 1).
    """
    prompt = "**title**: {}\n\n**promblem background**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], background, introduction)
    messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": BACKGROUD_EVIDENCE_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "I will extract the evidences following my instructions."}
    ]
    evidences = self.step(messages=messages)
    return evidences, 1
430
+
431
def support_research_problem(self, research_problem: str, introduction: str):
    """Ask the model to define the research problem from the introduction.

    Parameters:
        research_problem (str): Short problem statement (from the outline).
        introduction (str): Text of the paper's introduction.

    Returns:
        tuple: ``(evidences, step_num)`` - the result of ``self.step`` and the
        number of model calls made (always 1).
    """
    prompt = "**title**: {}\n\n**research problem**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], research_problem, introduction)
    messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": RESEARCH_PROBLEM_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "I will extract the evidences following my instructions."}
    ]
    evidences = self.step(messages=messages)
    return evidences, 1
445
+
446
def support_objectives(self, objectives: str, introduction: str):
    """Ask the model for evidence from the introduction supporting the objectives.

    Parameters:
        objectives (str): Short objectives statement (from the outline).
        introduction (str): Text of the paper's introduction.

    Returns:
        tuple: ``(evidences, step_num)`` - the result of ``self.step`` and the
        number of model calls made (always 1).
    """
    prompt = "**title**: {}\n\n**objectives**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], objectives, introduction)
    messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": OBJECTIVE_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "I will extract the evidences following my instructions."}
    ]
    evidences = self.step(messages=messages)
    return evidences, 1
460
+
461
def support_conclusion(self, conclusion: str, introduction: str, conclusion_text: str, step_wise=True):
    """Expand the brief conclusion using the full conclusion and the introduction.

    When ``step_wise`` is True the introduction is first summarised by the
    model with a focus on conclusion-related content, and that summary is used
    in place of the raw introduction in the final request.

    Parameters:
        conclusion (str): Brief conclusion (from the outline).
        introduction (str): Text of the paper's introduction.
        conclusion_text (str): Full text of the paper's conclusion section.
        step_wise (bool): Whether to summarise the introduction first.

    Returns:
        tuple: ``(evidences, step_num)`` - the result of the final
        ``self.step`` call and the number of model calls made (2 when
        ``step_wise`` is True, otherwise 1).
    """
    step_num = 0
    if step_wise:
        prompt = "**title**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], introduction)
        messages = [
            {"role": "system", "content": SCHOLAR_PROMPT},
            # Label fixed from '**tititle**' so it matches the '**title**'
            # field actually present in the user message.
            {"role": "system", "content": "Given a **title** and **introduction** of a research paper, summarize and extract conclusion related information in about 200 words."},
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": "I will extract the conclusion following my instructions."}
        ]
        introduction_summary = self.step(messages=messages)
        step_num += 1
    else:
        introduction_summary = introduction

    # Each field now sits on its own paragraph; previously the conclusion text
    # ran straight into the '**introduction**' label with no separator.
    prompt = "**title**: {}\n\n**brief conclusion**: {}\n\n**conclusion**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], conclusion, conclusion_text, introduction_summary)
    messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": CONCLUSION_PROMT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "I will extract the conclusions following my instructions."}
    ]
    evidences = self.step(messages=messages)
    step_num += 1
    return evidences, step_num
492
+
493
def support_experiment_results(self, main_results: str, paragraph_list: list):
    """Summarise experimental evidence paragraph by paragraph, then overall.

    Each paragraph is sent to the model together with the shared instructions;
    the per-paragraph summaries are then combined in one final request.

    Parameters:
        main_results (str): Brief statement of the main results (from the outline).
        paragraph_list (list): Dicts with 'title' and 'content' keys.

    Returns:
        tuple: ``(result_summary, step_num)`` - the final ``self.step`` result
        and the number of model calls made (one per paragraph plus one).
    """
    prompt = "**title**: {}\n\n**main results**: {}\n\n".format(self.paper_contents['title'], main_results)
    base_messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": RESULT_PROMPT_DICT['system_instruction']},
        {"role": "user", "content": prompt},
        {"role": "system", "content": RESULT_PROMPT_DICT['iterative_prompt']},
    ]
    follow_instruction = {"role": "assistant", "content": "I will extract the experimental information following my instructions."}

    paragraph_summary_array = []
    for paragraph in paragraph_list:
        para_input_prompt = "Paragraph title: {}\n\nContent: {}\n\n".format(paragraph['title'], paragraph['content'])
        user_input = {'role': 'user', 'content': para_input_prompt}
        # A fresh list per call replaces the append/append/pop/pop sequence;
        # the shared prefix is never mutated.
        para_summary = self.step(messages=base_messages + [user_input, follow_instruction])
        paragraph_summary_array.append(para_summary)

    ## Experimental result summary
    summary_prompt = '\n'.join(['**summary** {}:\n\n{}'.format(idx + 1, summary) for idx, summary in enumerate(paragraph_summary_array)])
    input_prompt = prompt + summary_prompt

    messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": RESULT_PROMPT_DICT['final_prompt']},
        {"role": "user", "content": input_prompt},
        {"role": "assistant", "content": "I will summarize the experimental results following my instructions."},
    ]
    result_summary = self.step(messages=messages)
    step_num = len(paragraph_summary_array) + 1
    return result_summary, step_num
534
+
535
def experiment_paragraph_extraction(self,):
    """Collect the sections used as input for the results slide.

    Uses the indices listed under structure['Experiments']; when that list is
    empty, every section strictly between the introduction and the conclusion
    is used instead. The introduction is always placed first.

    Returns:
        list: The selected entries of ``paper_contents['main_text']``.

    Raises:
        AssertionError: If no section is selected or an index lies beyond the
            end of 'main_text'.
    """
    structure = self.paper_contents['structure']
    main_text = self.paper_contents['main_text']
    intro_idx = structure['Introduction'][0]
    conclusion_idx = structure['Conclusion'][0]

    experiment_idx_array = list(structure['Experiments'])
    if not experiment_idx_array:
        experiment_idx_array = list(range(intro_idx + 1, conclusion_idx))
    # Explicit raise (same exception type as the former `assert`) so the check
    # still runs under `python -O`.
    if not experiment_idx_array or max(experiment_idx_array) >= len(main_text):
        raise AssertionError('No usable experiment sections were found in the paper')

    return [main_text[idx] for idx in [intro_idx] + experiment_idx_array]
545
+
546
def support_methodology(self, method_overview: str, paragraph_list: list):
    """Summarise method details paragraph by paragraph, then overall.

    Each paragraph is sent to the model together with the shared instructions;
    the per-paragraph summaries are then combined in one final request.

    Parameters:
        method_overview (str): Brief method overview (from the outline).
        paragraph_list (list): Dicts with 'title' and 'content' keys.

    Returns:
        tuple: ``(method_summary, step_num)`` - the final ``self.step`` result
        and the number of model calls made (one per paragraph plus one).
    """
    prompt = "**title**: {}\n\n**method overview**: {}\n\n".format(self.paper_contents['title'], method_overview)
    base_messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": METHOD_PROMPT_DICT['system_instruction']},
        {"role": "user", "content": prompt},
        {"role": "system", "content": METHOD_PROMPT_DICT['iterative_prompt']},
    ]
    follow_instruction = {"role": "assistant", "content": "I will extract the method information following my instructions."}

    method_summary_array = []
    for paragraph in paragraph_list:
        para_input_prompt = "Paragraph title: {}\n\nContent: {}\n\n".format(paragraph['title'], paragraph['content'])
        user_input = {'role': 'user', 'content': para_input_prompt}
        # A fresh list per call replaces the append/append/pop/pop sequence;
        # the shared prefix is never mutated.
        paragraph_summary = self.step(messages=base_messages + [user_input, follow_instruction])
        method_summary_array.append(paragraph_summary)

    ## Method summary
    method_summary_prompt = '\n'.join(['**method summary** {}:\n\n{}'.format(idx + 1, summary) for idx, summary in enumerate(method_summary_array)])
    input_prompt = prompt + method_summary_prompt

    messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": METHOD_PROMPT_DICT['final_prompt']},
        {"role": "user", "content": input_prompt},
        {"role": "assistant", "content": "I will generate a step-by-step method summary following my instructions."},
    ]
    method_summary = self.step(messages=messages)
    step_num = len(method_summary_array) + 1
    return method_summary, step_num
585
+
586
def method_paragraph_extraction(self,):
    """Collect the sections used as input for the methodology slide.

    Uses the indices listed under structure['Methods']; when that list is
    empty, every section strictly between the introduction and the conclusion
    is used instead. The introduction is always placed first.

    Returns:
        list: The selected entries of ``paper_contents['main_text']``.

    Raises:
        AssertionError: If no section is selected or an index lies beyond the
            end of 'main_text'.
    """
    structure = self.paper_contents['structure']
    main_text = self.paper_contents['main_text']
    intro_idx = structure['Introduction'][0]
    conclusion_idx = structure['Conclusion'][0]

    method_idx_array = list(structure['Methods'])
    if not method_idx_array:
        method_idx_array = list(range(intro_idx + 1, conclusion_idx))
    # Explicit raise (same exception type as the former `assert`) so the check
    # still runs under `python -O`.
    if not method_idx_array or max(method_idx_array) >= len(main_text):
        raise AssertionError('No usable method sections were found in the paper')

    return [main_text[idx] for idx in [intro_idx] + method_idx_array]
596
+
597
def generate_slides(self, verbose=False, revision=True):
    """Build the slide content for the parsed paper, section by section.

    The abstract is first summarised into an outline (``abstract_summary``);
    each outline entry is then expanded by the matching ``support_*`` method
    using the introduction, conclusion, method and experiment text.

    Args:
        verbose: when true, print the generated markdown / JSON.
        revision: when true, pass the markdown rendering of the slides
            through ``slides_revision`` and return the re-parsed result.

    Returns:
        dict: with ``revision`` true, the merged dicts returned by
        ``markdown_to_slide_dicts``; otherwise the ``slides`` dict built
        here (keys 'Title', 'Outline', 'Background', 'Research problem',
        'Objectives', 'Conclusions', 'Results', 'Methodology').

    Raises:
        AssertionError: if the introduction is not longer than 512
            characters or the conclusion is not longer than 128 characters.
    """
    ## Step 1: Paper content extraction
    intro_idx = self.paper_contents['structure']['Introduction'][0]
    introduction = self.paper_contents['main_text'][intro_idx]['content']
    assert len(introduction) > 512, 'introduction = {}, content = {}'.format(introduction, self.paper_contents['main_text'])
    conclusion_idx = self.paper_contents['structure']['Conclusion'][0]
    conclusion = self.paper_contents['main_text'][conclusion_idx]['content']
    # Report the conclusion text itself; the message previously echoed the introduction.
    assert len(conclusion) > 128, 'conclusion = {}, content = {}'.format(conclusion, self.paper_contents['main_text'])
    method_paragraphs = self.method_paragraph_extraction()
    experiment_paragraphs = self.experiment_paragraph_extraction()

    start_time = time.time()
    ## Step 2: slides structure extraction from abstract
    model_call_number = 0
    print('Slides structure generation')
    slides = {'Title': self.paper_contents['title']}
    outline_dict = self.abstract_summary()
    model_call_number += 1
    slides['Outline'] = outline_dict

    print('Slides generation...')
    background = outline_dict.get('Background', '')
    slides['Background'], b_steps = self.support_background(background=background, introduction=introduction)
    model_call_number += b_steps

    research_problem = outline_dict.get('Research problem', '')
    slides['Research problem'], r_steps = self.support_research_problem(research_problem=research_problem, introduction=introduction)
    model_call_number += r_steps

    objectives = outline_dict.get('Objectives', '')
    slides['Objectives'], o_steps = self.support_objectives(objectives=objectives, introduction=introduction)
    model_call_number += o_steps

    brief_conclusion = outline_dict.get('Conclusions', '')
    slides['Conclusions'], c_steps = self.support_conclusion(conclusion=brief_conclusion, introduction=introduction, conclusion_text=conclusion, step_wise=True)
    model_call_number += c_steps

    results = outline_dict.get('Results', '')
    result_summary, res_steps = self.support_experiment_results(main_results=results, paragraph_list=experiment_paragraphs)
    slides['Results'] = result_summary
    model_call_number += res_steps

    methodology = outline_dict.get('Methodology', '')
    method_summary, m_steps = self.support_methodology(method_overview=methodology, paragraph_list=method_paragraphs)
    model_call_number += m_steps
    slides['Methodology'] = method_summary

    runtime = time.time() - start_time
    print('Slide generation takes {:.4f} seconds with {} function calls'.format(runtime, model_call_number))

    # The markdown rendering is needed by the revision step as well as by the
    # verbose dump. It used to be built only when verbose was set, so the
    # default call (verbose=False, revision=True) hit an unbound local.
    slides_content = None
    if verbose or revision:
        slides_content = self.slides2markdown_v2(slides=slides)

    if revision:
        revised_content = self.slides_revision(slide_content=slides_content)
        slides_array = markdown_to_slide_dicts(full_markdown=revised_content)
        # Merge the per-slide dicts into one; later slides win on key clashes.
        revised_slides = {k: v for d in slides_array for k, v in d.items()}
        if verbose:
            print('Json format:\n{}'.format(json.dumps(revised_slides, indent=4)))
            print('\n' * 3)
            print('paper keywords:\n{}'.format(self.paper_contents.keys()))
        return revised_slides

    if verbose:
        print('Generated slides:\n{}'.format(slides_content))
        print('Json format:\n{}'.format(json.dumps(slides, indent=4)))
    return slides
660
+
661
def slides_revision(self, slide_content: str):
    """Ask the revision model to rewrite the slide markdown.

    Args:
        slide_content: markdown text of the generated slides, sent as the
            user turn of the conversation.

    Returns:
        Whatever ``make_api_call`` returns for the revision request.
    """
    system_turn = {"role": "system", "content": SLIDES_REVISION_PROMPT}
    user_turn = {"role": "user", "content": slide_content}
    # Pre-filled assistant turn that commits the model to the instructions.
    assistant_turn = {
        "role": "assistant",
        "content": "I will revise the representation slides following my instructions.",
    }
    conversation = [system_turn, user_turn, assistant_turn]
    print('Slides final revision')
    return make_api_call(
        model=self.revise_model,
        messages=conversation,
        max_tokens=2048,
        temperature=self.temprature,
    )
670
+
671
def slides2markdown(self, slides: dict):
    """Render the slides dict as markdown-like text split by ``SLIDE_SEP``.

    Output order: a title block, an outline block listing every outline
    entry, then one block per outline section that also exists as a key of
    ``slides``. Each block is followed by a separator line.

    Args:
        slides: dict holding at least 'Title' and 'Outline' (a dict).

    Returns:
        str: the concatenated text.
    """
    chunks = ['**Title**\n{}\n\n'.format(slides['Title'])]
    chunks.append('{}\n'.format(SLIDE_SEP))
    chunks.append('**Outline**\n\n')

    outline = slides['Outline']
    chunks.extend(
        '{}\n--\t\t{}\n\n'.format(name, summary) for name, summary in outline.items()
    )
    chunks.append('{}\n'.format(SLIDE_SEP))

    for name in outline:
        if name not in slides:
            continue
        chunks.append('**{}**\n\n'.format(name))
        chunks.append('{}\n\n'.format(slides[name]))
        chunks.append('{}\n'.format(SLIDE_SEP))
    return ''.join(chunks)
686
+
687
def slides2markdown_v2(self, slides: dict, indent=0):
    """Render ``slides`` as a nested markdown bullet list.

    Thin wrapper that forwards both arguments to ``dict_to_markdown_list``.
    """
    return dict_to_markdown_list(d=slides, indent=indent)
690
+
691
def save_to_slides(self, slides: dict, logo_path='logo.png', file_name='slides.pptx'):
    """Render ``slides`` into a .pptx file and return its absolute path.

    Args:
        slides: slide content dict handed to ``Dict2PPT.build_slides``.
        logo_path: image path forwarded to ``Dict2PPT``.
        file_name: output path of the presentation.

    Returns:
        str: absolute path of ``file_name``.
    """
    authors = self.paper_contents.get('author', None)
    # Use the first author entry when there is one. An empty list used to
    # raise IndexError on authors[0]; it now falls back to None like any
    # other non-list value.
    if isinstance(authors, list) and authors:
        authors = authors[0]
    else:
        authors = None

    dict2ppt = Dict2PPT(logo_path=logo_path)
    dict2ppt.build_slides(slide_dict=slides, authors=authors)
    dict2ppt.save(file_name=file_name)
    return os.path.abspath(file_name)
703
+
pdf_helper.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdf4llm
2
+ import re
3
+
4
def py4llm_pdf_reader(pdf_path: str):
    """Return the result of ``pdf4llm.to_markdown`` for the PDF at ``pdf_path``."""
    return pdf4llm.to_markdown(pdf_path)
7
+
8
def split_markdown_sections(text):
    """Split markdown text into sections, one per ``#``..``######`` header.

    Text before the first header is not part of any section.

    Args:
        text: markdown source.

    Returns:
        list[dict]: one dict per header, in document order, with keys
        'level' (number of ``#``), 'title' (header text) and 'content'
        (stripped text up to the next header or the end of ``text``).
    """
    heading_re = re.compile(r'^(#{1,6})\s*(.+)$', re.MULTILINE)
    headings = list(heading_re.finditer(text))

    # A section stops where the next header starts; the last one runs to the end.
    stops = [following.start() for following in headings[1:]] + [len(text)]

    return [
        {
            'level': len(heading.group(1)),
            'title': heading.group(2),
            'content': text[heading.end():stop].strip(),
        }
        for heading, stop in zip(headings, stops)
    ]
39
+
40
+
41
class PDFPaper4LLMParser(object):
    """Convert a paper PDF to markdown and group it into title / author /
    abstract / main-text parts."""

    def __init__(self, write_images=False, page_chunks=False) -> None:
        # Both flags are forwarded unchanged to pdf4llm.to_markdown.
        self.write_images = write_images
        self.page_chunks = page_chunks

    def pdf2text(self, pdf_path: str):
        """Return the markdown text of the PDF at ``pdf_path``.

        With ``page_chunks`` enabled the converter result is iterated and
        the 'text' entry of every chunk is joined with newlines.
        """
        converted = pdf4llm.to_markdown(pdf_path, write_images=self.write_images, page_chunks=self.page_chunks)
        if not self.page_chunks:
            return converted
        page_texts = [chunk['text'] for chunk in converted]
        return '\n'.join(page_texts)

    def structured_paper_content(self, markdown_sections: list):
        """Group header-split sections into paper parts.

        Args:
            markdown_sections: non-empty list of dicts with keys 'level',
                'title' and 'content'.

        Returns:
            dict with
              * 'title': title of the first section,
              * 'abstract': content of the first later section whose title
                or content mentions "abstract" (key absent if none does),
              * 'author': list of metadata strings collected before the
                abstract,
              * 'main_text': the sections after the abstract, or all
                sections when no abstract was found.
        """
        assert len(markdown_sections) > 0
        opening = markdown_sections[0]
        struct_sections = {'title': opening['title']}

        # Anything between the title and the abstract is kept as author metadata.
        meta_data = []
        if len(opening['content']) > 0:
            meta_data.append(opening['content'])

        main_text_idx = -1
        for position, section in enumerate(markdown_sections[1:], start=1):
            heading = section['title']
            body = section['content']
            if 'abstract' in heading.lower() or 'abstract' in body.lower():
                struct_sections['abstract'] = body
                main_text_idx = position + 1
                break
            meta_data.append(heading + body)
        struct_sections['author'] = meta_data

        if main_text_idx == -1:
            # No abstract located: treat the whole document as main text.
            main_text_idx = 0
        struct_sections['main_text'] = markdown_sections[main_text_idx:]
        return struct_sections

    def run(self, pdf_path: str, verbose=True):
        """Parse ``pdf_path`` and return the structured sections dict.

        When ``verbose`` is true a plain-text rendering of the result is
        printed as well.
        """
        markdown_text = self.pdf2text(pdf_path=pdf_path)
        sections = split_markdown_sections(text=markdown_text)
        struct_sections = self.structured_paper_content(markdown_sections=sections)
        if verbose:
            pieces = []
            for part, payload in struct_sections.items():
                if part == 'title':
                    pieces.append('\nTitle: ' + payload + '\n\n')
                elif part == 'abstract':
                    pieces.append('\nAbstract: \n' + payload + '\n\n')
                elif part == 'author':
                    pieces.append('\nAuthor: \n' + '\n'.join(payload) + '\n\n')
                elif part == 'main_text':
                    pieces.extend(
                        '\n' + section['title'] + '\n\n' + section['content'] + '\n\n'
                        for section in payload
                    )
            print(''.join(pieces))
        return struct_sections
114
+
115
+
116
def dict_to_markdown_list(d: dict, indent=0):
    """Render a (possibly nested) dict as a markdown bullet list.

    Every key becomes a ``- **key**: `` bullet. Nested dicts are rendered
    recursively on the following lines with ``indent + 1``; any other value
    is appended to the bullet via ``str``.

    Returns:
        str: the bullet lines joined with newlines.
    """
    pad = ' ' * indent
    rendered = []
    for name, item in d.items():
        bullet = pad + f"- **{name}**: "
        if not isinstance(item, dict):
            rendered.append(bullet + str(item))
            continue
        rendered.extend((bullet, dict_to_markdown_list(item, indent + 1)))
    return "\n".join(rendered)
126
+
127
+
128
def split_markdown_slides(markdown: str, sep: str = "<slide_sep>"):
    """Split ``markdown`` on ``sep`` and return the non-empty, stripped pieces."""
    slides = []
    for piece in markdown.strip().split(sep):
        trimmed = piece.strip()
        if trimmed:
            slides.append(trimmed)
    return slides
130
+
131
+
132
def parse_slide_to_dict(slide: str):
    """Parse one markdown slide into ``{heading: [items]}``.

    A ``##`` or ``###`` heading opens a new entry. Every following non-blank
    line becomes one item of that entry: numbered (``1. x``) and bulleted
    (``- x`` / ``* x``) lines lose their marker, any other line is kept as
    free text. Lines are stripped first, so indented (nested) bullets are
    handled by the same bullet rule.

    Changes from the earlier version:
      * list items before the first heading are dropped instead of leaking
        into the first heading's list;
      * blank lines no longer produce empty-string items;
      * the separate nested-bullet rule is gone, because it could never match
        a stripped line.

    Args:
        slide: markdown text of a single slide.

    Returns:
        dict: heading text mapped to its list of item strings. Headings
        without any item are omitted.
    """
    result = {}
    current_key = None
    sub_items = []

    for raw_line in slide.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Capture headings (### or ##)
        heading_match = re.match(r"^#{2,3}\s+(.*)", line)
        if heading_match:
            if current_key is not None and sub_items:
                result[current_key] = sub_items
            sub_items = []
            current_key = heading_match.group(1).strip()
            continue

        if current_key is None:
            # Content that belongs to no heading cannot be stored anywhere.
            continue

        # Numbered ("1. x") or bulleted ("- x" / "* x") item: keep the text only.
        item_match = re.match(r"^(?:\d+\.|[\*\-])\s+(.*)", line)
        if item_match:
            sub_items.append(item_match.group(1).strip())
        else:
            sub_items.append(line)

    # Save the last block
    if current_key is not None and sub_items:
        result[current_key] = sub_items

    return result
177
+
178
+
179
def markdown_to_slide_dicts(full_markdown: str):
    """Split ``full_markdown`` into slides and parse each into a dict.

    Returns:
        list: one ``parse_slide_to_dict`` result per slide, in order.
    """
    parsed = []
    for slide_text in split_markdown_slides(full_markdown):
        parsed.append(parse_slide_to_dict(slide_text))
    return parsed
pptx_utils.py ADDED
@@ -0,0 +1,695 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pptx import Presentation
2
+ from pptx.dml.color import RGBColor
3
+ from pptx.util import Inches
4
+ from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
5
+ from pptx.util import Pt
6
+ import string
7
+ from datetime import datetime
8
+ import os
9
+ import re
10
+
11
def clean_leading_numbering(text):
    """Remove a leading list marker such as "1. ", "1) ", "(1) " or "- 1. ".

    Leading whitespace and dash bullets are removed as well. Digits that are
    part of the content are kept: the earlier character-class pattern
    stripped every leading digit, turning "3D convolution" into
    "D convolution" and "1.5x faster" into "x faster".

    Args:
        text: a single line of text.

    Returns:
        str: ``text`` without its leading marker.
    """
    marker = (
        r'^\s*'            # leading whitespace
        r'(?:-\s*)*'       # optional dash bullet(s)
        r'(?:\(\d+\)'      # "(1)"
        r'|\d+[.)]+(?!\d)' # "1." / "1)" but not the start of "1.5"
        r')?'
        r'\s*'
    )
    return re.sub(marker, '', text, count=1)
14
+
15
def is_logo_exist(file_path: str):
    """Return True when ``file_path`` exists on disk.

    The path is always printed; a notice is printed as well when it is missing.
    """
    print(file_path)
    found = os.path.exists(file_path)
    if not found:
        print("File does not exist.")
    return found
23
+
24
+ class Dict2PPT:
25
def __init__(self, logo_path: str = 'logo.png', title_size: int = 32, content_size: int=24) -> None:
    """Create an empty presentation and remember the styling options.

    Args:
        logo_path: image file placed on each slide by ``_add_logo``.
        title_size: title font size in points.
        content_size: body font size in points.
    """
    # Sizes are stored as Pt objects.
    self.title_font_size, self.content_font_size = Pt(title_size), Pt(content_size)
    self.logo_path = logo_path
    self.prs = Presentation()
30
+
31
+ def _title_preprocess(self, title: str):
32
+ words = title.split()
33
+ capitalized_words = [word.capitalize() for word in words]
34
+ result = ' '.join(capitalized_words)
35
+ return result
36
+
37
def _add_time_footnote(self, slide):
    """Add today's date as a small centered footnote near the slide bottom.

    Args:
        slide: the slide receiving the text box.
    """
    # A 3in x 0.3in box, centered horizontally, 0.5in above the bottom edge.
    box_width = Inches(3)
    box_height = Inches(0.3)
    left = (self.prs.slide_width - box_width) / 2
    top = self.prs.slide_height - Inches(0.5)

    footnote = slide.shapes.add_textbox(left, top, box_width, box_height)
    paragraph = footnote.text_frame.paragraphs[0]
    date_run = paragraph.add_run()
    date_run.text = datetime.today().strftime("%B %d, %Y")  # e.g., March 26, 2025
    date_run.font.size = Pt(12)
    paragraph.alignment = PP_ALIGN.CENTER
58
+
59
def _add_logo(self, slide):
    """Place the logo image in the top-right corner of ``slide``.

    Does nothing when ``is_logo_exist`` reports that ``self.logo_path`` is
    missing.
    """
    if not is_logo_exist(file_path=self.logo_path):
        return

    # 1in x 1in picture, 0.2in away from the top and right edges.
    logo_width, logo_height = Inches(1.0), Inches(1.0)
    right_margin, top = Inches(0.2), Inches(0.2)
    left = self.prs.slide_width - logo_width - right_margin

    slide.shapes.add_picture(self.logo_path, left, top, width=logo_width, height=logo_height)
78
+
79
def _set_background_color(self, slide):
    """Fill the background of ``slide`` with a solid light blue."""
    background_fill = slide.background.fill
    background_fill.solid()
    # RGB (240, 248, 255)
    background_fill.fore_color.rgb = RGBColor(0xF0, 0xF8, 0xFF)
83
+
84
def title_slide(self, title: str, authors: str):
    """Add the title slide: bold title plus a subtitle with today's date.

    Args:
        title: text of the slide title.
        authors: accepted but not used; the subtitle always shows the
            placeholder text 'Author Here'.
    """
    slide = self.prs.slides.add_slide(self.prs.slide_layouts[0])  # Title Slide layout
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    heading = slide.shapes.title
    heading.text = title
    for run in heading.text_frame.paragraphs[0].runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    subtitle = slide.placeholders[1]
    today = datetime.today().strftime("%B %d, %Y")  # e.g., March 25, 2025
    subtitle.text = '\nAuthor Here\n' + today

    # Paragraph 0 is the leading blank line; paragraph 1 is the author line.
    # Only the author line is restyled (24 pt).
    for run in subtitle.text_frame.paragraphs[1].runs:
        run.font.name = 'Times New Roman'
        run.font.size = Pt(24)
112
+
113
def outline_slide(self, outline: dict):
    """Add the 'Outline' slide.

    Each outline entry becomes a bold level-0 paragraph (20 pt). A non-empty
    description follows as a level-1 paragraph (12 pt).

    Args:
        outline: mapping of topic to description string.
    """
    slide = self.prs.slides.add_slide(self.prs.slide_layouts[1])  # title and Content Slide Layout
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    heading = slide.shapes.title
    heading.text = 'Outline'
    heading_paragraph = heading.text_frame.paragraphs[0]
    heading_paragraph.alignment = PP_ALIGN.LEFT
    for run in heading_paragraph.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    # Clear existing content
    body = slide.placeholders[1].text_frame
    body.clear()

    for topic, desc in outline.items():
        topic_paragraph = body.add_paragraph()
        topic_paragraph.text = topic
        topic_paragraph.level = 0
        topic_paragraph.font.size = Pt(20)
        topic_paragraph.font.name = 'Times New Roman'
        topic_paragraph.font.bold = True
        topic_paragraph.alignment = PP_ALIGN.LEFT

        if len(desc) == 0:
            continue
        # Description on its own indented line.
        desc_paragraph = body.add_paragraph()
        desc_paragraph.text = desc
        desc_paragraph.level = 1
        desc_paragraph.font.size = Pt(12)
        desc_paragraph.font.name = 'Times New Roman'
        desc_paragraph.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)
152
+
153
+ def _outline_preprocess_(self, outline):
154
+ if isinstance(outline, dict):
155
+ clean_outline = {}
156
+ for topic, desc in outline.items():
157
+ topic = topic.strip().strip(string.punctuation).strip()
158
+ desc = desc.strip().strip(string.punctuation).strip()
159
+ clean_outline[topic] = desc
160
+ return clean_outline
161
+
162
+ elif isinstance(outline, str):
163
+ sentences = outline.split('\n')
164
+ sentences = [text.strip().strip(string.punctuation).strip() for text in sentences]
165
+ sent_dict = {}
166
+ for sent in sentences:
167
+ tokens = sent.split(':')
168
+ if len(tokens) == 1:
169
+ sent_dict[tokens[0]] = ''
170
+ else:
171
+ key = tokens[0].strip().strip(string.punctuation).strip()
172
+ value = ''.join(tokens[1:])
173
+ value = value.strip().strip(string.punctuation).strip()
174
+ sent_dict[key] = value
175
+ return sent_dict
176
+ else:
177
+ print('Wrong format')
178
+ return {}
179
+
180
+ def _background_preprocess(self, background: str):
181
+ background_array = []
182
+ sentences = background.strip().splitlines()
183
+ for sent in sentences:
184
+ sent = clean_leading_numbering(sent)
185
+ background_array.append(sent.strip().strip(string.punctuation).strip())
186
+ return background_array
187
+
188
def background_slide(self, background):
    """Add the 'Background' slide with one numbered paragraph per item.

    Args:
        background: sequence of strings; its length selects the font size.
    """
    slide = self.prs.slides.add_slide(self.prs.slide_layouts[1])  # title and Content Slide Layout
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    title_shape = slide.shapes.title
    title_shape.text = 'Background'
    title_paragraph = title_shape.text_frame.paragraphs[0]
    title_paragraph.alignment = PP_ALIGN.LEFT
    for run in title_paragraph.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    # Clear existing content and center the body vertically in the placeholder.
    text_frame = slide.placeholders[1].text_frame
    text_frame.clear()
    text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE

    # Shrink the font as the list grows. The smallest size used to require
    # more than 7 items while the middle tier stopped at 6, so a list of
    # exactly 7 items fell through to the largest font.
    background_item_num = len(background)
    if background_item_num >= 7:
        fontsize = 18
    elif background_item_num >= 4:
        fontsize = 20
    else:
        fontsize = 22

    for idx, topic in enumerate(background, start=1):
        p1 = text_frame.add_paragraph()
        p1.text = f"{idx}. {topic}"
        p1.level = 0
        p1.font.size = Pt(fontsize)
        p1.font.name = 'Times New Roman'
        p1.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)
226
+
227
+ def _problem_define_preprocess(self, problem_desc: str):
228
+ from collections import OrderedDict
229
+ def split_text_by_headers(text, headers):
230
+ sections = OrderedDict({header: [] for header in headers})
231
+ current = None
232
+ for line in text.strip().strip(string.punctuation).splitlines():
233
+ line_clean = line.strip().strip(string.punctuation).strip()
234
+ if len(line_clean) == 0:
235
+ continue
236
+ # Check if line matches any of the section headers
237
+ matched = [h for h in headers if h.lower() == line_clean.lower()]
238
+ if matched:
239
+ current = matched[0]
240
+ continue
241
+ if current:
242
+ cleaned_line = clean_leading_numbering(text=line_clean)
243
+ cleaned_line = cleaned_line.strip().strip(string.punctuation).strip()
244
+ sections[current].append(cleaned_line)
245
+
246
+ # Convert lists to joined text blocks
247
+ return {k: v for k, v in sections.items()}
248
+
249
+ sections = ["Scope", "Challenges", "Assumptions", "Relevance"]
250
+ problem_dict = {}
251
+ if any([_ in problem_desc for _ in sections]):
252
+ problem_dict = split_text_by_headers(text=problem_desc, headers=sections)
253
+
254
+ if all([len(v)==0 for k, v in problem_dict.items()]) or len(problem_dict) == 0:
255
+ problem_dict = {}
256
+ cleaned_sentences = []
257
+ sentences = problem_desc.strip().strip(string.punctuation).splitlines()
258
+ for sent in sentences:
259
+ cleaned_line = clean_leading_numbering(text=sent)
260
+ cleaned_line = cleaned_line.strip().strip(string.punctuation).strip()
261
+ cleaned_sentences.append(cleaned_line)
262
+ problem_dict['Scope'] = cleaned_sentences
263
+
264
+ return problem_dict
265
+
266
def problem_def_slide(self, problems):
    """Add one slide per non-empty problem-definition part.

    Parts are visited in the order Scope, Challenges, Assumptions,
    Relevance. A part with no entries gets no slide.

    Args:
        problems: mapping of part name to a list of strings.
    """
    for sect_name in ("Scope", "Challenges", "Assumptions", "Relevance"):
        entries = problems.get(sect_name, [])
        if len(entries) == 0:
            continue

        slide = self.prs.slides.add_slide(self.prs.slide_layouts[1])  # title and Content Slide Layout
        self._set_background_color(slide=slide)
        self._add_logo(slide=slide)

        # Scope carries the plain title, Relevance a dedicated one.
        if sect_name == 'Scope':
            slide_title = 'Problem Definition'
        elif sect_name in {'Challenges', 'Assumptions'}:
            slide_title = 'Problem Definition - {}'.format(sect_name)
        else:
            slide_title = 'Interested Practitioners'
        heading = slide.shapes.title
        heading.text = slide_title
        heading_paragraph = heading.text_frame.paragraphs[0]
        heading_paragraph.alignment = PP_ALIGN.LEFT
        for run in heading_paragraph.runs:
            run.font.bold = True
            run.font.name = 'Times New Roman'
            run.font.size = Pt(36)

        # Clear existing content and center the body vertically.
        body = slide.placeholders[1].text_frame
        body.clear()
        body.vertical_anchor = MSO_ANCHOR.MIDDLE

        for number, entry in enumerate(entries, start=1):
            paragraph = body.add_paragraph()
            paragraph.text = f"{number}. {entry}"
            paragraph.level = 0
            paragraph.font.size = Pt(20)
            paragraph.font.name = 'Times New Roman'
            paragraph.alignment = PP_ALIGN.LEFT

        self._add_time_footnote(slide=slide)
311
+
312
+ def _objective_preprocess(self, objective: str):
313
+ objective_array = []
314
+ sentences = objective.strip().splitlines()
315
+ for sent in sentences:
316
+ sent = clean_leading_numbering(text=sent)
317
+ objective_array.append(sent.strip().strip(string.punctuation).strip())
318
+ return objective_array
319
+
320
def objective_slide(self, objectives):
    """Add the 'Objectives & How' slide with one numbered paragraph per item.

    Args:
        objectives: sequence of strings; its length selects the font size.
    """
    slide = self.prs.slides.add_slide(self.prs.slide_layouts[1])  # title and Content Slide Layout
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    title_shape = slide.shapes.title
    title_shape.text = 'Objectives & How'
    title_paragraph = title_shape.text_frame.paragraphs[0]
    title_paragraph.alignment = PP_ALIGN.LEFT
    for run in title_paragraph.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    # Clear existing content and center the body vertically in the placeholder.
    text_frame = slide.placeholders[1].text_frame
    text_frame.clear()
    text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE

    # Shrink the font as the list grows. The smallest size used to require
    # more than 7 items while the middle tier stopped at 6, so a list of
    # exactly 7 items fell through to the largest font.
    objective_item_num = len(objectives)
    if objective_item_num >= 7:
        fontsize = 20
    elif objective_item_num >= 4:
        fontsize = 22
    else:
        fontsize = 24

    for idx, topic in enumerate(objectives, start=1):
        p1 = text_frame.add_paragraph()
        p1.text = f"{idx}. {topic}"
        p1.level = 0
        p1.font.size = Pt(fontsize)
        p1.font.name = 'Times New Roman'
        p1.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)
357
+
358
+ def _method_preprocess(self, methodology: str):
359
+ method_array = []
360
+ sentences = methodology.strip().splitlines()
361
+ for sent in sentences:
362
+ sent_trim = clean_leading_numbering(text=sent)
363
+ sent_trim = sent_trim.strip().strip(string.punctuation).strip()
364
+ method_array.append(sent_trim)
365
+ return method_array
366
+
367
def method_slide(self, methods):
    """Add the 'Proposed Method' slide.

    Every entry becomes one paragraph made of a bold "Step N. " run
    followed by the step text in regular weight.

    Args:
        methods: sequence of step strings.
    """
    slide = self.prs.slides.add_slide(self.prs.slide_layouts[1])  # title and Content Slide Layout
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    heading = slide.shapes.title
    heading.text = 'Proposed Method'
    heading_paragraph = heading.text_frame.paragraphs[0]
    heading_paragraph.alignment = PP_ALIGN.LEFT
    for run in heading_paragraph.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    # Clear existing content and center the body vertically.
    body = slide.placeholders[1].text_frame
    body.clear()
    body.vertical_anchor = MSO_ANCHOR.MIDDLE

    step_size = Pt(20)
    for number, step_text in enumerate(methods, start=1):
        paragraph = body.add_paragraph()

        label = paragraph.add_run()
        label.text = "Step {}. ".format(number)
        label.font.bold = True
        label.font.size = step_size

        detail = paragraph.add_run()
        detail.text = step_text
        detail.font.bold = False
        detail.font.size = step_size

        paragraph.font.name = 'Times New Roman'
        paragraph.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)
406
+
407
+ def _experiment_preprocess(self, experiment: str):
408
+ def split_sections_by_keywords(text: str, keyword1: str, keyword2: str) -> dict:
409
+ lines = text.strip().splitlines()
410
+ part1_lines = []
411
+ part2_lines = []
412
+ current_section = None
413
+ for line in lines:
414
+ stripped = clean_leading_numbering(line)
415
+ stripped = stripped.strip().strip(string.punctuation).strip()
416
+ if len(stripped) == 0:
417
+ continue
418
+ if keyword1 in stripped:
419
+ current_section = keyword1
420
+ continue
421
+ elif keyword2 in stripped:
422
+ current_section = keyword2
423
+ continue
424
+
425
+ if current_section == keyword1:
426
+ tokens = stripped.split(':')
427
+ key = tokens[0].strip().strip(string.punctuation).strip()
428
+ if len(tokens) > 1:
429
+ parse_stripped = key + ": " + ':'.join(tokens[1:]).strip().strip(string.punctuation).strip()
430
+ else:
431
+ parse_stripped = key
432
+ part1_lines.append(parse_stripped)
433
+ elif current_section == keyword2:
434
+ tokens = stripped.split(':')
435
+ key = tokens[0].strip().strip(string.punctuation).strip()
436
+ if len(tokens) > 1:
437
+ parse_stripped = (key, ':'.join(tokens[1:]))
438
+ else:
439
+ parse_stripped = (key, '')
440
+ part2_lines.append(parse_stripped)
441
+ return {
442
+ keyword1: part1_lines,
443
+ keyword2: part2_lines
444
+ }
445
+
446
+ experiment_dict = {}
447
+ sentences = experiment.strip().splitlines()
448
+ evidence_keyword = 'Evidence Summary'
449
+ exp_summary_keyword = 'Experimental Summary'
450
+ if (evidence_keyword in experiment) and (exp_summary_keyword in experiment):
451
+ experiment_dict = split_sections_by_keywords(text=experiment, keyword1=evidence_keyword, keyword2=exp_summary_keyword)
452
+ else:
453
+ experiment_array = []
454
+ for sent in sentences:
455
+ sent = clean_leading_numbering(sent)
456
+ sent = sent.strip().strip(string.punctuation).strip()
457
+ experiment_array.append(sent)
458
+ experiment_dict[exp_summary_keyword] = experiment_array
459
+ return experiment_dict
460
+
461
def experiment_slide(self, experiments):
    """Add the 'Experimental Study' slide and, if present, a summary slide.

    Args:
        experiments: dict with key 'Experimental Summary' and optionally
            'Evidence Summary'. Summary entries are either plain strings
            (rendered as bold numbered lines) or ``(key, value)`` tuples
            (rendered as a bold key followed by the value; entries with an
            empty value are skipped). Evidence entries are rendered as
            numbered lines on a second slide.

    Raises:
        AssertionError: if ``experiments`` has neither one nor two entries.
    """
    evidence_keyword = 'Evidence Summary'
    exp_summary_keyword = 'Experimental Summary'
    if len(experiments) == 1:
        experiments_part1 = experiments[exp_summary_keyword]
        experiments_part2 = []
    else:
        assert len(experiments) == 2
        experiments_part1 = experiments[exp_summary_keyword]
        experiments_part2 = experiments[evidence_keyword]

    content_slide_layout = self.prs.slide_layouts[1]  # title and Content Slide Layout
    slide = self.prs.slides.add_slide(content_slide_layout)
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)
    title_shape = slide.shapes.title
    title_shape.text = 'Experimental Study'
    title_paragraph = title_shape.text_frame.paragraphs[0]
    title_paragraph.alignment = PP_ALIGN.LEFT
    for run in title_paragraph.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    # Clear existing content and center the body vertically.
    text_frame = slide.placeholders[1].text_frame
    text_frame.clear()
    text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE

    fontsize = 20
    # Choose the rendering from the entry itself. The choice used to depend
    # on whether the evidence list was empty, so tuple entries paired with
    # an empty evidence list were printed as raw tuple reprs.
    for idx, entry in enumerate(experiments_part1, start=1):
        if isinstance(entry, tuple):
            key, value = entry
            if len(value) == 0:
                continue
            p = text_frame.add_paragraph()
            run1 = p.add_run()
            run1.text = key
            run1.font.bold = True
            run1.font.size = Pt(fontsize)

            # Second run: normal text
            run2 = p.add_run()
            run2.text = value
            run2.font.bold = False
            run2.font.size = Pt(fontsize)
            p.font.name = 'Times New Roman'
            p.alignment = PP_ALIGN.LEFT
        else:
            p1 = text_frame.add_paragraph()
            p1.text = f"{idx}. {entry}"
            p1.level = 0
            p1.font.size = Pt(fontsize)
            p1.font.name = 'Times New Roman'
            p1.font.bold = True
            p1.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)

    # Evidence entries go on a second slide.
    if len(experiments_part2) > 0:
        slide_2 = self.prs.slides.add_slide(self.prs.slide_layouts[1])
        self._set_background_color(slide=slide_2)
        self._add_logo(slide=slide_2)
        title_shape = slide_2.shapes.title
        title_shape.text = 'Experimental Study (Summary)'
        title_paragraph = title_shape.text_frame.paragraphs[0]
        title_paragraph.alignment = PP_ALIGN.LEFT
        for run in title_paragraph.runs:
            run.font.bold = True
            run.font.name = 'Times New Roman'
            run.font.size = Pt(36)

        text_frame = slide_2.placeholders[1].text_frame
        text_frame.clear()
        text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE
        self._add_time_footnote(slide=slide_2)
        for idx, sent in enumerate(experiments_part2, start=1):
            p1 = text_frame.add_paragraph()
            p1.text = f"{idx}. {sent}"
            p1.level = 0
            p1.font.size = Pt(fontsize)
            p1.font.name = 'Times New Roman'
            p1.alignment = PP_ALIGN.LEFT
552
+
553
def _conclusion_preprocess(self, conclusion: str):
    """Turn the raw conclusion text into a ``{topic: description}`` dict.

    The text is processed line by line. Each line is stripped of surrounding
    whitespace/punctuation and passed through ``clean_leading_numbering``
    (helper defined elsewhere; presumably removes a leading list number --
    confirm against its definition). Empty lines and lines that start with
    "conclusion" (case-insensitive) are dropped. What remains is divided at
    the first ':' -- the left part is the topic, the right part the
    description ('' when the line has no ':').
    """
    punct = string.punctuation
    parsed = {}
    for raw_line in conclusion.strip().splitlines():
        line = clean_leading_numbering(text=raw_line.strip().strip(punct).strip())
        if len(line) == 0 or line.lower().startswith('conclusion'):
            continue
        # Everything after the first ':' belongs to the description,
        # including any further ':' characters.
        topic, _sep, rest = line.partition(':')
        topic = topic.strip().strip(punct).strip()
        parsed[topic] = rest.strip().strip(punct).strip()
    return parsed
570
+
571
def conclusion_slide(self, conclusion):
    """Append the 'Conclusions & Future Work' slide.

    ``conclusion`` is iterated with ``.items()`` as (topic, description)
    pairs. Pairs with an empty description are skipped; every other pair is
    rendered as a bold level-0 paragraph (topic) followed by an italic
    level-1 paragraph (description).
    """
    layout = self.prs.slide_layouts[1]  # title and content layout
    slide = self.prs.slides.add_slide(layout)
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    # Slide title: left aligned, bold 36 pt Times New Roman.
    heading = slide.shapes.title
    heading.text = 'Conclusions & Future Work'
    heading_par = heading.text_frame.paragraphs[0]
    heading_par.alignment = PP_ALIGN.LEFT
    for piece in heading_par.runs:
        piece.font.bold = True
        piece.font.name = 'Times New Roman'
        piece.font.size = Pt(36)

    # Reset the content placeholder and vertically center whatever is added.
    body = slide.placeholders[1].text_frame
    body.clear()
    body.vertical_anchor = MSO_ANCHOR.MIDDLE

    for topic, desc in conclusion.items():
        if len(desc) > 0:
            # Topic line.
            topic_par = body.add_paragraph()
            topic_par.text = topic
            topic_par.level = 0
            topic_par.font.size = Pt(20)
            topic_par.font.name = 'Times New Roman'
            topic_par.font.bold = True
            topic_par.alignment = PP_ALIGN.LEFT

            # Description line, one indent level deeper.
            desc_par = body.add_paragraph()
            desc_par.text = desc
            desc_par.level = 1
            desc_par.font.size = Pt(16)
            desc_par.font.italic = True
            desc_par.font.name = 'Times New Roman'
            desc_par.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)
614
+
615
def build_slides(self, slide_dict: dict, authors: str = 'Author here'):
    """Build the whole deck from ``slide_dict``.

    The title and outline slides are always produced (the outline must be
    non-empty after preprocessing). The sections 'Background',
    'Research problem', 'Objectives', 'Methodology', 'Results' and
    'Conclusions' are each optional: a section is preprocessed and rendered
    only when its entry in ``slide_dict`` is truthy. A closing Q&A slide is
    always added last.
    """
    self.title_slide(
        title=self._title_preprocess(title=slide_dict.get('Title', '')),
        authors=authors,
    )

    outline = self._outline_preprocess_(outline=slide_dict.get('Outline', {}))
    assert len(outline) > 0, 'No outline detected!!!'
    self.outline_slide(outline=outline)

    # (slide_dict key, preprocess method, its keyword, slide method, its keyword)
    optional_sections = (
        ('Background', '_background_preprocess', 'background', 'background_slide', 'background'),
        ('Research problem', '_problem_define_preprocess', 'problem_desc', 'problem_def_slide', 'problems'),
        ('Objectives', '_objective_preprocess', 'objective', 'objective_slide', 'objectives'),
        ('Methodology', '_method_preprocess', 'methodology', 'method_slide', 'methods'),
        ('Results', '_experiment_preprocess', 'experiment', 'experiment_slide', 'experiments'),
        ('Conclusions', '_conclusion_preprocess', 'conclusion', 'conclusion_slide', 'conclusion'),
    )
    for key, prep_name, prep_kw, render_name, render_kw in optional_sections:
        raw = slide_dict.get(key, '')
        if not raw:
            continue
        prepared = getattr(self, prep_name)(**{prep_kw: raw})
        getattr(self, render_name)(**{render_kw: prepared})

    self.qa_slides()
    print('Done!!')
661
+
662
def qa_slides(self):
    """Append the closing slide showing "Thank you!" and "Q & A"."""
    # Layout 6 is usually the blank layout.
    slide = self.prs.slides.add_slide(self.prs.slide_layouts[6])
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    # Text box geometry: left, top, width, height.
    box = slide.shapes.add_textbox(Inches(2), Inches(2.5), Inches(6), Inches(2))
    frame = box.text_frame
    frame.clear()

    thanks = frame.add_paragraph()
    thanks.text = "Thank you!"
    thanks.font.size = Pt(44)
    thanks.font.bold = True
    thanks.alignment = PP_ALIGN.CENTER

    questions = frame.add_paragraph()
    questions.text = "\nQ & A"
    questions.font.size = Pt(36)
    questions.alignment = PP_ALIGN.CENTER

    self._add_time_footnote(slide=slide)
693
+
694
def save(self, file_name='slides.pptx'):
    """Write the presentation held in ``self.prs`` to ``file_name``."""
    presentation = self.prs
    presentation.save(file_name)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ pdf4llm
3
+ openai
4
+ python-dotenv
5
+ python-pptx
sambaAPI.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ from openai import OpenAI
3
+ import os
4
+
5
# Short alias -> full model identifier.
MODEL_ALIAS = {
    "llama3_8b": "Meta-Llama-3.1-8B-Instruct",
    "llama3_70b": "Meta-Llama-3.1-70B-Instruct",
    "llama3_3_70b": "Meta-Llama-3.3-70B-Instruct",
    "llama3_405b": "Meta-Llama-3.1-405B-Instruct",
    "llama3_1b": "Meta-Llama-3.2-1B-Instruct",
    "llama3_3b": "Meta-Llama-3.2-3B-Instruct",
}
11
+
12
# Load variables from a .env file (if any) before the API key is read below.
load_dotenv()

# OpenAI-compatible client pointed at the SambaNova endpoint.
# Other hosts noted by the original author: cloud.sambanova.ai, fast-api.snova.ai
client = OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBA_API_KEY"),
)
18
+
19
+
20
def call_llama(system_prompt, prompt, model="Meta-Llama-3.1-8B-Instruct", **kwargs):
    """Send a system + user prompt to the chat endpoint and return the reply text.

    The completion is requested with ``stream=True`` and the content deltas
    of all chunks are concatenated.

    Args:
        system_prompt: Content of the ``system`` message.
        prompt: Content of the ``user`` message.
        model: Model identifier passed to the API.
        **kwargs: Forwarded unchanged to ``client.chat.completions.create``,
            e.g. ``temperature=0.1``, ``top_p=0.1``, ``max_tokens=50``.

    Returns:
        The full response text, or ``""`` if the request raised. The error
        is printed, not re-raised (best-effort behaviour kept from the
        original).
    """
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            stream=True,
            **kwargs,
        )
        pieces = []
        for chunk in completion:
            # A chunk may carry no choices (e.g. a usage-only chunk). Indexing
            # it would raise IndexError and the handler below would throw
            # away everything received so far, so skip it instead.
            if not chunk.choices:
                continue
            pieces.append(chunk.choices[0].delta.content or "")
        return "".join(pieces)
    except Exception as e:
        print('API Error = {}'.format(e))
        return ""
44
+
45
def call_llama_chat(messages, model="Meta-Llama-3.1-8B-Instruct", **kwargs):
    """Send a prepared message list to the chat endpoint and return the reply text.

    The completion is requested with ``stream=True`` and the content deltas
    of all chunks are concatenated.

    Args:
        messages: Chat messages passed as-is to the API.
        model: Model identifier passed to the API.
        **kwargs: Forwarded unchanged to ``client.chat.completions.create``,
            e.g. ``temperature=0.1``, ``top_p=0.1``.

    Returns:
        The full response text, or ``""`` if the request raised. The error
        is printed, not re-raised (best-effort behaviour kept from the
        original).
    """
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=messages,
            stream=True,
            **kwargs,
        )
        pieces = []
        for chunk in completion:
            # A chunk may carry no choices (e.g. a usage-only chunk). Indexing
            # it would raise IndexError and the handler below would throw
            # away everything received so far, so skip it instead.
            if not chunk.choices:
                continue
            pieces.append(chunk.choices[0].delta.content or "")
        return "".join(pieces)
    except Exception as e:
        print('API Error = {}'.format(e))
        return ""
utils.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from paper2slides import Paper2Slides, PaperReader
3
+ import os
4
+
5
def read_json(file_path: str):
    """Load and return the JSON document stored at ``file_path``.

    The file is read as UTF-8 so the result does not depend on the platform's
    default encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file does not exist or
        does not contain valid JSON (an error message is printed in both
        cases).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as json_file:
            return json.load(json_file)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Bytes that are not valid UTF-8 cannot be valid JSON text either.
        print(f"Error: The file '{file_path}' is not a valid JSON.")
    return None
16
+
17
+
18
def get_file_name(full_path: str):
    """Return the last path component of ``full_path`` without its final extension."""
    base_name = os.path.basename(full_path)
    stem, _extension = os.path.splitext(base_name)
    return stem
21
+
22
+
23
def run_pdf2text(paper_pdf_path: str, save_json_name: str):
    """Run ``PaperReader`` on a PDF, dump its result to a JSON file and return it.

    The value returned by ``PaperReader().run(...)`` is written to
    ``save_json_name`` with 4-space indentation and also handed back to the
    caller.
    """
    extracted = PaperReader().run(paper_file_name=paper_pdf_path)
    # Keep a copy of the extracted content on disk.
    with open(save_json_name, 'w') as out_file:
        json.dump(extracted, out_file, indent=4)
    return extracted
30
+
31
+
32
def run_paper2slides(paper_json_name: str, model='llama3_70b', temprature=0.2, logo_path='logo.png', save_file_name:str='slides.pptx'):
    """Generate slides from a paper JSON file and save them as a presentation.

    The paper content is loaded with ``read_json`` and handed to
    ``Paper2Slides`` together with ``model`` and ``temprature``. The generated
    slide dictionary is also dumped to ``slides.json`` in the current working
    directory before it is passed to ``save_to_slides``.

    Returns:
        Whatever ``Paper2Slides.save_to_slides`` returns.
    """
    generator = Paper2Slides(
        paper_contents=read_json(paper_json_name),
        model=model,
        temprature=temprature,
    )
    slide_dict = generator.generate_slides(verbose=False, revision=False)
    # Pretty-printed copy of the generated slide content.
    with open('slides.json', 'w') as dump_file:
        json.dump(slide_dict, dump_file, indent=4)
    assert isinstance(slide_dict, dict)
    return generator.save_to_slides(
        slides=slide_dict,
        logo_path=logo_path,
        file_name=save_file_name,
    )