Upload 7 files
Browse files- logo.png +0 -0
- paper2slides.py +703 -0
- pdf_helper.py +181 -0
- pptx_utils.py +695 -0
- requirements.txt +5 -0
- sambaAPI.py +64 -0
- utils.py +39 -0
logo.png
ADDED
![]() |
paper2slides.py
ADDED
@@ -0,0 +1,703 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Maps each slide topic to the parsed-paper sections that feed it.
# NOTE(review): the 'methodoloy' key is misspelled; left as-is because any
# consumer looks it up by this exact spelling — confirm before renaming.
slide_datasource = {
    'introduction': ['abstract', 'Introduction'],
    'objective': ['abstract', 'Introduction'],
    'methodoloy': ['abstract', 'Introduction', 'Conclusion', 'Methods'],
    'results': ['abstract', 'Experiments', 'Conclusion'],
    'conclusion': ['abstract', 'Introduction', 'Conclusion'],
}
|
8 |
+
|
9 |
+
from pdf_helper import PDFPaper4LLMParser, dict_to_markdown_list
|
10 |
+
from sambaAPI import call_llama_chat, MODEL_ALIAS
|
11 |
+
from pdf_helper import markdown_to_slide_dicts
|
12 |
+
from pptx_utils import Dict2PPT, os
|
13 |
+
import json
|
14 |
+
import time
|
15 |
+
import string
|
16 |
+
|
17 |
+
# Separator token placed between slides in the combined markdown output
# (interpolated into SLIDES_REVISION_PROMPT below).
SLIDE_SEP = '<slide_sep>'
|
18 |
+
|
19 |
+
def trim_string(s):
    """Strip leading and trailing whitespace plus punctuation from *s*."""
    strip_chars = string.whitespace + string.punctuation
    return s.strip(strip_chars)
|
21 |
+
|
22 |
+
# Canonical section categories mapped to lowercase key phrases that are
# searched for as substrings of a paper's section titles; consumed by
# get_section_category().
section_title_key_phrases = {
    'Introduction': ['introduction'],
    'Related Works': ['related work'],
    'Methods': ['method', 'approach'],
    'Experiments': ['experiment'],
    'Conclusion': ['conclusion'],
    'Acknowledgements': ['acknowledgement'],
    # NOTE(review): the ' references' entry is redundant — the substring test
    # for 'references' already matches any title containing ' references'.
    'References': ['references', ' references'], #
}
|
31 |
+
|
32 |
+
def find_string_index(string_list, target: str):
    """
    Locate *target* in *string_list*.

    Parameters:
        string_list (list): A list of strings
        target (str): The string to find in the list

    Returns:
        int: Index of the first occurrence of *target*, or -1 if absent.
    """
    for idx, item in enumerate(string_list):
        if item == target:
            return idx
    return -1
|
48 |
+
|
49 |
+
|
50 |
+
def get_section_category(section_name: str):
    """Map a raw paper section title onto a canonical category.

    Returns the first category in section_title_key_phrases whose key phrase
    occurs in the lowercased title, or 'Other' when nothing matches.
    """
    lowered = section_name.lower()
    for category, phrases in section_title_key_phrases.items():
        if any(phrase in lowered for phrase in phrases):
            return category
    return 'Other'
|
59 |
+
|
60 |
+
|
61 |
+
class PaperReader(object):
    """Parse a paper PDF into text and label its sections by topic."""

    def __init__(self, page_chunks=False):
        # Underlying PDF-to-text parser; page_chunks toggles per-page chunking
        # in the parser (semantics owned by PDFPaper4LLMParser).
        self.paper_reader = PDFPaper4LLMParser(page_chunks=page_chunks)

    def pdf2text(self, paper_pdf_path: str):
        """Run the PDF parser and return its content dict."""
        paper_content = self.paper_reader.run(pdf_path=paper_pdf_path, verbose=False)
        return paper_content

    def structurize(self, main_text_array: list):
        """Label every section with a topic and index the key sections.

        Parameters:
            main_text_array: list of section dicts, each with a 'title' key.

        Returns:
            (section_name_topics, paper_structure_dict): the per-section topic
            labels and a dict mapping topic names to lists of section indices
            (-1 inside the list when the topic was not found).
        """
        section_names = [_['title'] for _ in main_text_array]
        section_name_topics = [get_section_category(_) for _ in section_names]
        introduction_idx = find_string_index(section_name_topics, target='Introduction')
        refference_idx = find_string_index(section_name_topics, target='References')
        experiment_idx = find_string_index(section_name_topics, target='Experiments')
        conclusion_idx = find_string_index(section_name_topics, target='Conclusion')
        # Before References: unlabeled ('Other') sections are assumed Methods.
        # After References: sections are prefixed as appendix material.
        # NOTE(review): a References section at index 0 is skipped ('> 0') —
        # confirm '>= 0' was not intended.
        if refference_idx > 0:
            for idx in range(len(section_name_topics)):
                if idx < refference_idx:
                    if section_name_topics[idx] == 'Other':
                        section_name_topics[idx] = 'Methods'
                elif idx > refference_idx:
                    # NOTE(review): this inspects the topic label, not the raw
                    # title, so 'appendix' can never match on the first visit
                    # and the prefix is always added — was section_names[idx]
                    # intended here?
                    if not ('appendix' in section_name_topics[idx].lower()):
                        section_name_topics[idx] = 'Appendix: ' + section_name_topics[idx]
                else:
                    continue
        # print(section_name_topics)
        # Methods-labeled sections between Experiments and References are
        # relabeled as Experiments.
        if experiment_idx > 0:
            for idx in range(experiment_idx +1, refference_idx):
                if section_name_topics[idx] == 'Methods':
                    section_name_topics[idx] = 'Experiments'
        # print(section_name_topics)
        # Re-resolve indices after the relabeling passes above.
        experiment_idx = find_string_index(section_name_topics, target='Experiments')
        method_idx = find_string_index(section_name_topics, target='Methods')
        relatedwork_idx = find_string_index(section_name_topics, target='Related Works')
        ack_idx = find_string_index(section_name_topics, target='Acknowledgements')

        paper_structure_dict = {
            'Introduction': [introduction_idx],
            'Related Works': [relatedwork_idx],
            'References': [refference_idx],
            'Conclusion': [conclusion_idx],
            'Acknowledgements': [ack_idx]
        }

        ## Experiments and methodology
        # Collect the contiguous run of Methods sections starting at the first
        # one; fall back to scanning between Introduction and Conclusion when
        # no section was labeled Methods.
        method_idx_array = []
        if method_idx >=0:
            for idx in range(method_idx, len(section_name_topics)):
                if section_name_topics[idx] == 'Methods':
                    method_idx_array.append(idx)
                else:
                    break
        else:
            if introduction_idx >=0 and conclusion_idx >=0:
                for idx in range(introduction_idx+1, conclusion_idx):
                    if section_name_topics[idx] == 'Methods':
                        method_idx_array.append(idx)
                    else:
                        break

        # Same contiguous-run collection for Experiments sections.
        exp_idx_array = []
        if experiment_idx >=0:
            for idx in range(experiment_idx, len(section_name_topics)):
                if section_name_topics[idx] == 'Experiments':
                    exp_idx_array.append(idx)
                else:
                    break
        else:
            if introduction_idx >=0 and conclusion_idx >=0:
                for idx in range(introduction_idx+1, conclusion_idx):
                    if section_name_topics[idx] == 'Experiments':
                        exp_idx_array.append(idx)
                    else:
                        break

        paper_structure_dict['Experiments'] = exp_idx_array
        paper_structure_dict['Methods'] = method_idx_array
        return section_name_topics, paper_structure_dict

    def run(self, paper_file_name: str):
        """Parse the PDF, attach 'structure' and 'section_topic', return the dict."""
        start_time = time.time()
        paper_content = self.pdf2text(paper_pdf_path=paper_file_name)
        section_name_topics, paper_structure_dict = self.structurize(main_text_array=paper_content['main_text'])
        paper_content['structure'] = paper_structure_dict
        paper_content['section_topic'] = section_name_topics
        print('Runtime for pdf2text = {:.4f} seconds.'.format(time.time() - start_time))
        return paper_content
|
149 |
+
|
150 |
+
### 1. General System Prompt

# Base persona prepended as the first system message of every LLM call below.
SCHOLAR_PROMPT = """
You are an assistant being skilled at critically reading and analyzing academic papers to extract key insights, trends, and findings.
"""
|
155 |
+
|
156 |
+
### 2. Paper Outline Generation from Abstract

# Requests a JSON outline from title + abstract; parsed in
# Paper2Slides.abstract_summary() with a line-based fallback parser.
# (Typos such as 'identity' are inside the runtime prompt string and are
# intentionally left untouched to preserve behavior.)
ABSTRACT_SUMMARY_PROMPT = """
You are given the **title** and **abstract** of an academic paper. Please first identity the research topic, and then extract the following aspects in a minimal title draft (max 15 words) for PowerPoint presentation:

1. **Background**: Introduces the research context and importance.
2. **Research Problem**: Identifies the specific problem or knowledge gap.
3. **Objectives**: States the research goals or hypotheses.
4. **Methodology**: Summarizes the research design and key methods.
5. **Results**: Highlights the most significant findings.
6. **Conclusions**: Provides the main takeaways and their relation to the research question.

Reminder: Strictly output in JSON format **only**, using the keys: "Research topic", "Background", "Research problem", "Objectives", "Methodology", "Results" and "Conclusions".
"""
|
170 |
+
|
171 |
+
### 3. Evidence extraction from main paper text for "Background"
# Used by Paper2Slides.support_background(); runtime string kept verbatim
# (including the 'backgroud' typo).
BACKGROUD_EVIDENCE_PROMPT = """
You are given the **title**, briefly description of **problem backgroud** and **introduction** of a research paper. From the introduction, extract an itemized list of **1 to 3 pieces of evidence** that support the problem background, each evidence should be described in a **minimal draft (min 10 words and max 25 words)** for PowerPoint presentation.

Each piece of evidence must:
1. Be directly relevant to the problem background.
2. Be clear and concise.
3. Be unique, not repeating other evidence.

**Important**: Strictly output the itemized evidences ONLY.
"""
|
182 |
+
|
183 |
+
|
184 |
+
### 4. Evidence extraction from main paper text for "Research Problem"
# Used by Paper2Slides.support_research_problem().
RESEARCH_PROBLEM_PROMPT = """
You are given the **title**, briefly description of **research problem** and **introduction** of a research paper. Solely from the given introduction, extract the definition of the research problem for PowerPoint presentation, focusing on:

1. **Scope**: Define the problem’s boundaries as individual items;
2. **Challenges**: Identify key gaps or obstacles the research addresses as individual items;
3. **Assumptions**: State any assumptions guiding the research as individual items;
4. **Relevance*: Specify who benefits from solving the problem as individual items.

**Note**: Each item must be in one concise sentence. **Only** output "Scope", "Challenges", "Assumptions" and "Relevance".
"""
|
195 |
+
|
196 |
+
|
197 |
+
### 5. Evidence extraction from main paper text for "Objectives"

# Used by Paper2Slides.support_objectives().
OBJECTIVE_PROMPT = """
You are given the **title**, **objectives** and **introduction** of a research paper. Solely from the given introduction, extract a list of **2 to 5 pieces of evidence** to support these objectives, each evidence should be described in a **minimal draft (min 10 words and max 20 words)** for PowerPoint presentation.

Each piece of evidence must:
1. Be directly relevant to the objectives.
2. Be clear and concise.
3. Be unique, not repeating other evidence.

**Note**: Strictly output the itemized evidences ONLY.
"""
|
209 |
+
|
210 |
+
### 6. Evidence extraction from main paper text for "Conclusion"

# Used by Paper2Slides.support_conclusion().
# NOTE(review): the constant name is misspelled ('PROMT') but is referenced
# by that exact spelling elsewhere in this file; renaming would break it.
CONCLUSION_PROMT = """
You are given the **title**, **birief conclusion**, and **full text conclusion** and **introduction** of a research paper. From the given conclusion and introduction, extract the **conclusion** for PowerPoint presentation, ensuring it includes:

1. **Summary of key results**: Highlight the main results.
2. **Implications**: Explain the significance or impact of these findings.
3. **Future directions**: Mention any suggestions for future research or applications.
4. **Final takeaway**: Provide the overall takeaway message of the study.

**Note**: Only output the conclusion. Limit each point in a minimal concise draft (at least 10 words).”
"""
|
222 |
+
|
223 |
+
### 7. Evidence extraction from main paper text for "Experimental results" (iterative)

# Three-stage prompt set driven by Paper2Slides.support_experiment_results():
# 'system_instruction' frames the task, 'iterative_prompt' is applied once per
# paragraph, and 'final_prompt' merges the per-paragraph summaries.
# Runtime strings kept verbatim (including typos like 'prive', 'Based all').
RESULT_PROMPT_DICT = {
    "system_instruction": """Given the title, the main results of an experimental study, and a paragraph from a research paper, your task is to extract and summarize evidence from the paragraph that supports the 'main results'.

Follow these steps for each paragraph:
1. **Detect Evidence**: Check if the paragraph contains:
1) Any evidence supporting the main results, or
2) Experimental study information, including:
- **Dataset**: Details on datasets, preprocessing, or train/test splits.
- **Model Description**: Information of baselines, hyperparameters, and training.
- **Evaluation Metrics**: Relevant metrics like accuracy, F1 score, and their justification.
- **Comparative Analysis**: Comparisons with baselines, ablation studies, statistical significance.
- **Runtime & Scalability**: Computational complexity and scalability.
2. **Response**: Choose 'YES' or 'NO':
- If 'YES', extract and summarize the evidence or experimental details in 200 words. Ensure the summary is:
- Clear and concise
- Well-formatted for easy reading
- Focused on key points: dataset, model Description, evaluation metrics, comparative analysis and runtime & scalability.
- If 'NO', just respond with 'NO EVIDENCE'.
""",

    "iterative_prompt": """Summarize the experimental details or evidence supporting the 'main results' in 200 words from the following paragraph (with title and content) if experiment-related information is detected. Follow these instructions:

1. List 2 to 4 itemized points.
2. Each point must specify the type ('Evidence' or 'Experimental Setup') and provide a minimal draft sentence of content (max 15 words).

**Note**: Only provide the itemized summary.
""",

    "final_prompt": """Using the **title**, the **main results** of an experimental study, and a list of experiment summaries from the research paper, follow these steps to summarize the results:

1. **Evidence Summary**: prive a numbered, itemized summary of **2-3** key points. Keep each point brief and focused (only 1 sentence).

2. **Experimental Summary**: Based all 'Experimental Setup' points and provide a concise summary covering the following aspects:
1) **Datasets**: List only the names of all datasets or benchmarks used.
2) **Baselines**: List only the names of all models/algorithms used.
3) **Metrics**: List only the evaluation metrics used for model performance, such as accuracy, F1-score, recall, precision, AUC, etc.
4) **Results**: Summarize key comparisons and ablation results, focusing on the most important details.

**Note**: Only output the “Evidence Summary” and “Experimental Summary”
"""
}
|
266 |
+
|
267 |
+
## Methodology extraction

# Same three-stage pattern as RESULT_PROMPT_DICT, but targeting the method
# description; consumed by Paper2Slides.support_methodology().
# Runtime strings kept verbatim (including 'You task is identify').
METHOD_PROMPT_DICT = {
    "system_instruction": """Given the **title**, the **method overview**, and a paragraph of a research paper. You task is identify and extract text being relevant to 'method overview' from the given paragraph for PowerPoint presentation.

Follow these steps:
1. **Method Information Detection**: Check if the paragraph contains:
1) Any mention of the **method overview** or
2) Specific method details, such as:
- **Problem Definition**: The task, input, and expected output.
- **Model Architecture**: Structure, key components, and learning type.
- **Algorithm**: Steps of the method.
- **Training Process**: Training data, optimization method, and loss function.
2. **Response**: Choose 'YES' or 'NO':
- If 'YES', summarize the method details in a minimal draft with max 20 words, ensuring it is:
- Clear and concise
- Well-formatted for readability
- Focused on key points.
- If 'NO', simply respond with 'NO Information'.
""",
    "iterative_prompt": """Summarize the method description in 200 words from the following paragraph (with title and content) if method-related information is found. Follow these steps:

1. List **2 to 4** method steps in numbered format..
2. Ensure each step is related to the **method overview**.
3. Keep each step clear and concise (only minimal draft with max 15 words).

**Note**: Only output the itemized method steps.
""",

    "final_prompt": """Using **title**, **method overview**, and a list of itemized method step summary from a research paper, follow these instructions to summarize the method description::

1. Provide a numbered list of **3-6 method steps** detailing the **method overview**.
2. Keep each step clear and concise (only 1 sentence).

**Note**: Only output the itemized method steps.
"""
}
|
304 |
+
|
305 |
+
SLIDES_REVISION_PROMPT = """You are an expert research assistant. Revise the following research paper slides to enhance clarity and readability while preserving the original markdown structure. Keep all first-level markdown headers unchanged. Sections are separated by '{}'. Follow these guidelines:
|
306 |
+
|
307 |
+
1. Simplify language and make content more concise, especially in the outline.
|
308 |
+
2. Preserve the logical flow and overall structure.
|
309 |
+
3. Make key points and conclusions clear and easy to follow.
|
310 |
+
4. Use bullet points where appropriate for better clarity.
|
311 |
+
5. Minimize jargon to ensure accessibility for a broad academic audience.
|
312 |
+
|
313 |
+
""".format(SLIDE_SEP)
|
314 |
+
|
315 |
+
def make_api_call(model, messages, max_tokens, temperature):
    """Call the chat API once and return its text response.

    On any exception the error is folded into a plain string beginning with
    'Failed', so downstream substring checks such as ``'Failed' in result``
    (see Paper2Slides.step) work on both the success and failure paths.

    Parameters:
        model: model identifier forwarded to call_llama_chat.
        messages: chat message list (role/content dicts).
        max_tokens: generation cap forwarded to the API.
        temperature: sampling temperature forwarded to the API.

    Returns:
        str: the model response, or a 'Failed...' error string.
    """
    try:
        return call_llama_chat(messages=messages, model=model, temperature=temperature, max_tokens=max_tokens)
    except Exception as e:
        # Bug fix: this previously returned a (str, dict) tuple, so the
        # callers' "'Failed' in result" check tested tuple membership and
        # silently missed every error.
        return f"Failed to generate final answer. Error: {str(e)}"
|
321 |
+
|
322 |
+
def convert_to_dict(input_string: str):
    """Parse 'key: value' lines of *input_string* into a dict.

    Lines without a colon are skipped; only the first colon splits the line,
    and both key and value are whitespace-trimmed.
    """
    parsed = {}
    for raw_line in input_string.strip().split('\n'):
        head, sep, tail = raw_line.partition(':')
        if sep:
            parsed[head.strip()] = tail.strip()
    return parsed
|
335 |
+
|
336 |
+
|
337 |
+
class Paper2Slides(object):
    """Drive the LLM pipeline that turns parsed paper contents into slide text."""

    def __init__(self, paper_contents: dict, model: str, max_tokens = 512, temprature=0.1):
        # Parsed paper dict produced by PaperReader.run(); validated below and
        # the process exits when the structure is unusable.
        self.paper_contents = paper_contents
        if not self.valid_paper_checking():
            print('Not a valid paper structure, cannot generate slides')
            exit(1)
        self.model = MODEL_ALIAS[model]
        # Larger models are throttled by the API provider; step() sleeps
        # between calls when this is set.
        self.is_rate_limitation = ('405B' in self.model) or ('70B' in self.model)
        self.temprature = temprature
        self.max_failure_attempt_each_step = 3
        # NOTE(review): both branches assign the same sleep time (0.25 s); the
        # '405B' special case is currently a no-op — confirm intended values.
        if '405B' in self.model:
            self.sleep_time = 0.25
        else:
            self.sleep_time = 0.25
        self.max_tokens = max_tokens
        print('{} model is used for slides generation!\nRate limitation = {}'.format(self.model, self.is_rate_limitation))
        # Separate (70B) model used for the final slide-revision pass.
        self.revise_model = MODEL_ALIAS['llama3_70b']
|
354 |
+
|
355 |
+
def valid_paper_checking(self):
|
356 |
+
try:
|
357 |
+
assert 'abstract' in self.paper_contents, 'No abstract is detected'
|
358 |
+
assert 'title' in self.paper_contents, 'No title is detected'
|
359 |
+
paper_structure = self.paper_contents['structure']
|
360 |
+
introduction_idx_array = paper_structure['Introduction']
|
361 |
+
conclusion_idx_array = paper_structure['Conclusion']
|
362 |
+
assert introduction_idx_array[0] >=0, 'No introduction is detected'
|
363 |
+
assert conclusion_idx_array[0] >=0, 'No conclusion is detected'
|
364 |
+
except AssertionError as e:
|
365 |
+
print(f"AssertionError: {e}")
|
366 |
+
return False
|
367 |
+
return True
|
368 |
+
|
369 |
+
    def step(self, messages):
        """Run one LLM call with brief back-off on failure or rate limits.

        Returns the raw model response string (or a 'Failed...' message from
        the underlying run()).
        """
        result = self.run(messages=messages)
        # Back off after a failed call before the caller moves on.
        if 'Failed' in result:
            time.sleep(self.sleep_time)
        # NOTE(review): indentation reconstructed from a mangled source — this
        # throttle fires on EVERY call for rate-limited models; confirm it was
        # not meant to be nested under the failure branch above.
        if self.is_rate_limitation:
            print('sleep {} seconds'.format(self.sleep_time))
            time.sleep(self.sleep_time)
        return result
|
377 |
+
|
378 |
+
def run(self, messages):
|
379 |
+
for attempt in range(self.max_failure_attempt_each_step):
|
380 |
+
try:
|
381 |
+
response = make_api_call(messages=messages, model=self.model, max_tokens=self.max_tokens, temperature=self.temprature)
|
382 |
+
return response
|
383 |
+
except Exception as e:
|
384 |
+
if attempt == self.max_failure_attempt_each_step - 1:
|
385 |
+
return "Failed to generate step after {} attempts. $ERROR$: {}".format(self.max_failure_attempt_each_step, str(e))
|
386 |
+
else:
|
387 |
+
return "Failed to generate step. $ERROR$: {}".format(str(e))
|
388 |
+
time.sleep(2) # Wait for 1 second before retrying
|
389 |
+
return 'Failed to generate reasoning step.'
|
390 |
+
|
391 |
+
|
392 |
+
    def abstract_summary(self):
        """
        Extract the outline for the slides from abstract.

        Asks the model for a JSON outline (ABSTRACT_SUMMARY_PROMPT); falls
        back to line-based 'key: value' parsing when the response is not
        valid JSON. Returns a dict whose keys have surrounding whitespace
        and punctuation trimmed.
        """
        # NOTE(review): this requires the abstract to be LONGER than 512
        # characters — confirm the comparison direction/threshold is intended.
        assert len(self.paper_contents['title']) > 0 and len(self.paper_contents['abstract']) > 512
        prompt = "**title**: {}\n\n**abstract**: {}".format(self.paper_contents['title'], self.paper_contents['abstract'])
        messages = [
            {"role": "system", "content": SCHOLAR_PROMPT},
            {"role": "system", "content": ABSTRACT_SUMMARY_PROMPT},
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": "I will extract the evidences following my instructions."}
        ]
        abstract_summary = self.step(messages=messages)
        try:
            abstract_summary_dict = json.loads(abstract_summary)
        except Exception as e:
            # Model did not return strict JSON; parse 'key: value' lines instead.
            abstract_summary_dict = convert_to_dict(input_string=abstract_summary)

        # Normalize keys like '**Background**:' to 'Background'.
        trim_abstract_summary_dict = {}
        for k, v in abstract_summary_dict.items():
            trim_abstract_summary_dict[trim_string(k)] = v
        return trim_abstract_summary_dict
|
414 |
+
|
415 |
+
def support_background(self, background: str, introduction: str):
|
416 |
+
"""
|
417 |
+
Extract support evidences for background from introduction
|
418 |
+
"""
|
419 |
+
prompt = "**title**: {}\n\n**promblem background**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], background, introduction)
|
420 |
+
messages = [
|
421 |
+
{"role": "system", "content": SCHOLAR_PROMPT},
|
422 |
+
{"role": "system", "content": BACKGROUD_EVIDENCE_PROMPT},
|
423 |
+
{"role": "user", "content": prompt},
|
424 |
+
{"role": "assistant", "content": "I will extract the evidences following my instructions."}
|
425 |
+
]
|
426 |
+
evidences = self.step(messages=messages)
|
427 |
+
# print('Background evidences = {}'.format(evidences))
|
428 |
+
step_num = 1
|
429 |
+
return evidences, step_num
|
430 |
+
|
431 |
+
def support_research_problem(self, research_problem: str, introduction: str):
|
432 |
+
"""
|
433 |
+
Extract support evidences for research problem from introduction
|
434 |
+
"""
|
435 |
+
prompt = "**title**: {}\n\n**research problem**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], research_problem, introduction)
|
436 |
+
messages = [
|
437 |
+
{"role": "system", "content": SCHOLAR_PROMPT},
|
438 |
+
{"role": "system", "content": RESEARCH_PROBLEM_PROMPT},
|
439 |
+
{"role": "user", "content": prompt},
|
440 |
+
{"role": "assistant", "content": "I will extract the evidences following my instructions."}
|
441 |
+
]
|
442 |
+
evidences = self.step(messages=messages)
|
443 |
+
step_num = 1
|
444 |
+
return evidences, step_num
|
445 |
+
|
446 |
+
def support_objectives(self, objectives: str, introduction: str):
|
447 |
+
"""
|
448 |
+
Extract support evidences for objectives from introduction
|
449 |
+
"""
|
450 |
+
prompt = "**title**: {}\n\n**objectives**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], objectives, introduction)
|
451 |
+
messages = [
|
452 |
+
{"role": "system", "content": SCHOLAR_PROMPT},
|
453 |
+
{"role": "system", "content": OBJECTIVE_PROMPT},
|
454 |
+
{"role": "user", "content": prompt},
|
455 |
+
{"role": "assistant", "content": "I will extract the evidences following my instructions."}
|
456 |
+
]
|
457 |
+
evidences = self.step(messages=messages)
|
458 |
+
step_num = 1
|
459 |
+
return evidences, step_num
|
460 |
+
|
461 |
+
    def support_conclusion(self, conclusion: str, introduction: str, conclusion_text: str, step_wise=True):
        """
        Expand conclusion based on full-text conclusion and introduction.

        If step_wise = True:
        1. Summarize introduction while focusing on conclusion part
        2. Extract conclusion points from introduction summary and full-context conclusion.

        Returns (raw model response, number of LLM calls made: 1 or 2).
        """
        step_num = 0
        prompt = "**title**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], introduction)
        if step_wise:
            # Stage 1: condense the introduction down to conclusion-relevant text.
            messages = [
                {"role": "system", "content": SCHOLAR_PROMPT},
                {"role": "system", "content": "Given a **tititle** and **introduction** of a research paper, summarize and extract conclusion related information in about 200 words."},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": "I will extract the conclusion following my instructions."}
            ]
            instruction_conclusion_summary = self.step(messages=messages)
            step_num = step_num + 1
        else:
            # Skip the summarization stage and feed the raw introduction.
            instruction_conclusion_summary = introduction

        # Stage 2: extract the final conclusion points.
        # NOTE(review): the format string below has no separator between the
        # conclusion text and '**introduction**' — a '\n\n' looks intended.
        prompt = "**title**: {}\n\n**brief conclusion**: {}\n\n**conclusion**: \n\n{}**introduction**: {}".format(self.paper_contents['title'], conclusion, conclusion_text, instruction_conclusion_summary)
        messages = [
            {"role": "system", "content": SCHOLAR_PROMPT},
            {"role": "system", "content": CONCLUSION_PROMT},
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": "I will extract the conclusions following my instructions."}
        ]
        evidences = self.step(messages=messages)
        step_num = step_num + 1
        return evidences, step_num
|
492 |
+
|
493 |
+
    def support_experiment_results(self, main_results: str, paragraph_list: list):
        """Summarize experimental evidence supporting the main results.

        Iterates over paragraph_list (dicts with 'title' and 'content'),
        asking the model to summarize each paragraph against the main
        results, then issues one final call that merges the per-paragraph
        summaries (RESULT_PROMPT_DICT['final_prompt']).

        Returns (final summary text, number of LLM calls made).
        """
        step_num = 0
        prompt = "**title**: {}\n\n**main results**: {}\n\n".format(self.paper_contents['title'], main_results)
        iterative_sys_prompt = RESULT_PROMPT_DICT['iterative_prompt']
        # Base message stack reused for every paragraph below.
        messages = [
            {"role": "system", "content": SCHOLAR_PROMPT},
            {"role": "system", "content": RESULT_PROMPT_DICT['system_instruction']},
            {"role": "user", "content": prompt},
            {"role": "system", "content": iterative_sys_prompt},
        ]

        follow_instruction = {"role": "assistant", "content": "I will extract the experimental information following my instructions."}

        paragraph_summary_array = []
        for para_idx in range(len(paragraph_list)):
            para_input_prompt = "Paragraph title: {}\n\nContent: {}\n\n".format(paragraph_list[para_idx]['title'], paragraph_list[para_idx]['content'])
            user_input = {'role': 'user', 'content': para_input_prompt}
            messages.append(user_input)
            messages.append(follow_instruction)
            para_summary = self.step(messages=messages)
            step_num = step_num + 1
            paragraph_summary_array.append(para_summary)
            # Pop the paragraph + instruction so the base messages are reused
            # unchanged for the next paragraph.
            messages.pop()
            messages.pop()

        ## Experimental result summary
        # Final call: merge all per-paragraph summaries into one result summary.
        prompt = "**title**: {}\n\n**main results**: {}\n\n".format(self.paper_contents['title'], main_results)
        summary_prompt = '\n'.join(['**summary** {}:\n\n{}'.format(idx+1, summary) for idx, summary in enumerate(paragraph_summary_array)])
        input_prompt = prompt + summary_prompt

        messages = [
            {"role": "system", "content": SCHOLAR_PROMPT},
            {"role": "system", "content": RESULT_PROMPT_DICT['final_prompt']},
            {"role": "user", "content": input_prompt},
            {"role": "assistant", "content": "I will summarize the experimental results following my instructions."},
        ]

        result_summary = self.step(messages=messages)
        step_num = step_num + 1
        return result_summary, step_num
|
534 |
+
|
535 |
+
def experiment_paragraph_extraction(self,):
|
536 |
+
intro_idx = self.paper_contents['structure']['Introduction'][0]
|
537 |
+
conclusion_idx = self.paper_contents['structure']['Conclusion'][0]
|
538 |
+
experiment_idx_array = self.paper_contents['structure']['Experiments']
|
539 |
+
if len(experiment_idx_array) == 0:
|
540 |
+
experiment_idx_array = [_ for _ in range(intro_idx+1, conclusion_idx)]
|
541 |
+
assert len(experiment_idx_array) > 0 and max(experiment_idx_array) < len(self.paper_contents['main_text'])
|
542 |
+
experiment_idx_array = [intro_idx] + experiment_idx_array
|
543 |
+
paragraphs = [self.paper_contents['main_text'][_] for _ in experiment_idx_array]
|
544 |
+
return paragraphs
|
545 |
+
|
546 |
+
def support_methodology(self, method_overview: str, paragraph_list: list):
    """Summarize the paper's methodology.

    Each paragraph is summarized independently against a shared prompt
    prefix, then the per-paragraph summaries are merged by a final model
    call. Returns (final summary text, number of model calls made).
    """
    call_count = 0
    overview_prompt = "**title**: {}\n\n**method overview**: {}\n\n".format(self.paper_contents['title'], method_overview)
    messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": METHOD_PROMPT_DICT['system_instruction']},
        {"role": "user", "content": overview_prompt},
        {"role": "system", "content": METHOD_PROMPT_DICT['iterative_prompt']},
    ]
    follow_instruction = {"role": "assistant", "content": "I will extract the method information following my instructions."}

    per_paragraph_summaries = []
    for paragraph in paragraph_list:
        paragraph_prompt = "Paragraph title: {}\n\nContent: {}\n\n".format(paragraph['title'], paragraph['content'])
        # Temporarily extend the shared prefix with this paragraph, run the
        # model, then restore the prefix for the next paragraph.
        messages.append({'role': 'user', 'content': paragraph_prompt})
        messages.append(follow_instruction)
        per_paragraph_summaries.append(self.step(messages=messages))
        call_count += 1
        messages.pop()
        messages.pop()

    ## Method summary: merge the per-paragraph summaries into one.
    merge_prompt = overview_prompt + '\n'.join(
        ['**method summary** {}:\n\n{}'.format(idx + 1, summary) for idx, summary in enumerate(per_paragraph_summaries)]
    )
    messages = [
        {"role": "system", "content": SCHOLAR_PROMPT},
        {"role": "system", "content": METHOD_PROMPT_DICT['final_prompt']},
        {"role": "user", "content": merge_prompt},
        {"role": "assistant", "content": "I will generate a step-by-step method summary following my instructions."},
    ]
    final_summary = self.step(messages=messages)
    call_count += 1
    return final_summary, call_count
|
586 |
+
def method_paragraph_extraction(self,):
    """Collect the paragraphs describing the paper's methodology.

    Uses the 'Methods' section indices from the parsed paper structure;
    when none were detected, falls back to every section strictly between
    the introduction and the conclusion. The introduction paragraph is
    always prepended for context.
    """
    structure = self.paper_contents['structure']
    main_text = self.paper_contents['main_text']
    intro_idx = structure['Introduction'][0]
    conclusion_idx = structure['Conclusion'][0]
    section_indices = structure['Methods']
    if not section_indices:
        # Fallback: everything between introduction and conclusion.
        section_indices = list(range(intro_idx + 1, conclusion_idx))
    assert len(section_indices) > 0 and max(section_indices) < len(main_text)
    return [main_text[idx] for idx in [intro_idx] + section_indices]
|
597 |
+
def generate_slides(self, verbose=False, revision=True):
    """Generate presentation slides for the parsed paper.

    Pipeline:
      1. Extract introduction/conclusion text and method/experiment paragraphs.
      2. Build the slide outline from the abstract.
      3. Expand each outline entry via the dedicated support_* helpers.
      4. Optionally run a final LLM revision pass over the markdown deck.

    Returns a dict mapping slide titles to contents (the revised deck when
    revision=True, otherwise the raw one).
    """
    ## Step 1: Paper content extraction
    intro_idx = self.paper_contents['structure']['Introduction'][0]
    introduction = self.paper_contents['main_text'][intro_idx]['content']
    assert len(introduction) > 512, 'introduction = {}, content = {}'.format(introduction, self.paper_contents['main_text'])
    conclusion_idx = self.paper_contents['structure']['Conclusion'][0]
    conclusion = self.paper_contents['main_text'][conclusion_idx]['content']
    assert len(conclusion) > 128, 'conclusion = {}, content = {}'.format(introduction, self.paper_contents['main_text'])
    method_paragraphs = self.method_paragraph_extraction()
    experiment_paragraphs = self.experiment_paragraph_extraction()

    start_time = time.time()
    ## Step 2: slides structure extraction from abstract
    model_call_number = 0
    print('Slides structure generation')
    slides = {'Title': self.paper_contents['title']}
    outline_dict = self.abstract_summary()
    model_call_number += 1
    slides['Outline'] = outline_dict

    print('Slides generation...')
    background = outline_dict.get('Background', '')
    slides['Background'], b_steps = self.support_background(background=background, introduction=introduction)
    model_call_number += b_steps

    research_problem = outline_dict.get('Research problem', '')
    slides['Research problem'], r_steps = self.support_research_problem(research_problem=research_problem, introduction=introduction)
    model_call_number += r_steps

    objectives = outline_dict.get('Objectives', '')
    slides['Objectives'], o_steps = self.support_objectives(objectives=objectives, introduction=introduction)
    model_call_number += o_steps

    brief_conclusion = outline_dict.get('Conclusions', '')
    slides['Conclusions'], c_steps = self.support_conclusion(conclusion=brief_conclusion, introduction=introduction, conclusion_text=conclusion, step_wise=True)
    model_call_number += c_steps

    results = outline_dict.get('Results', '')
    result_summary, res_steps = self.support_experiment_results(main_results=results, paragraph_list=experiment_paragraphs)
    slides['Results'] = result_summary
    model_call_number += res_steps

    methodology = outline_dict.get('Methodology', '')
    method_summary, m_steps = self.support_methodology(method_overview=methodology, paragraph_list=method_paragraphs)
    model_call_number += m_steps
    slides['Methodology'] = method_summary
    runtime = time.time() - start_time
    print('Slide generation takes {:.4f} seconds with {} function calls'.format(runtime, model_call_number))

    # BUG FIX: the markdown rendering used to run only under `verbose`, so the
    # default call (verbose=False, revision=True) raised a NameError on
    # `slides_content`. Render unconditionally before the revision pass.
    slides_content = self.slides2markdown_v2(slides=slides)
    if revision:
        slides_content = self.slides_revision(slide_content=slides_content)
        slides_array = markdown_to_slide_dicts(full_markdown=slides_content)
        # Flatten the per-slide dicts into one {heading: items} mapping.
        revised_slides = {k: v for d in slides_array for k, v in d.items()}
        if verbose:
            print('Json format:\n{}'.format(json.dumps(revised_slides, indent=4)))
            print('\n' * 3)
            print('paper keywords:\n{}'.format(self.paper_contents.keys()))
        return revised_slides
    if verbose:
        print('Generated slides:\n{}'.format(slides_content))
        print('Json format:\n{}'.format(json.dumps(slides, indent=4)))
    return slides
|
661 |
+
def slides_revision(self, slide_content: str):
    """Run one LLM pass that polishes the rendered slide markdown."""
    revision_messages = [
        {"role": "system", "content": SLIDES_REVISION_PROMPT},
        {"role": "user", "content": slide_content},
        {"role": "assistant", "content": "I will revise the representation slides following my instructions."}
    ]
    print('Slides final revision')
    # NOTE: `self.temprature` matches the attribute name used elsewhere in
    # this class (original spelling preserved).
    return make_api_call(model=self.revise_model, messages=revision_messages, max_tokens=2048, temperature=self.temprature)
|
671 |
+
def slides2markdown(self, slides: dict):
    """Render the slide dict as flat markdown, slides separated by SLIDE_SEP.

    Emits the title slide, an outline slide, then one slide per outline
    section that has generated content.
    """
    parts = [
        '**Title**\n{}\n\n'.format(slides['Title']),
        '{}\n'.format(SLIDE_SEP),
        '**Outline**\n\n',
    ]
    outline_dict = slides['Outline']
    for sect_name, sect_content in outline_dict.items():
        parts.append('{}\n--\t\t{}\n\n'.format(sect_name, sect_content))
    parts.append('{}\n'.format(SLIDE_SEP))
    for sect_name in outline_dict.keys():
        if sect_name in slides:
            parts.append('**{}**\n\n'.format(sect_name))
            parts.append('{}\n\n'.format(slides[sect_name]))
            parts.append('{}\n'.format(SLIDE_SEP))
    return ''.join(parts)
|
687 |
+
def slides2markdown_v2(self, slides: dict, indent=0):
    """Render the slide dict as a nested markdown bullet list."""
    return dict_to_markdown_list(d=slides, indent=indent)
|
691 |
+
def save_to_slides(self, slides: dict, logo_path='logo.png', file_name='slides.pptx'):
    """Build a .pptx deck from the slide dict and return its absolute path.

    Only a list-valued 'author' entry is used (its first element); any
    other shape is treated as no author, matching the original behavior.
    """
    author_info = self.paper_contents.get('author', None)
    author_line = author_info[0] if isinstance(author_info, list) else None
    builder = Dict2PPT(logo_path=logo_path)
    builder.build_slides(slide_dict=slides, authors=author_line)
    builder.save(file_name=file_name)
    return os.path.abspath(file_name)
|
pdf_helper.py
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pdf4llm
|
2 |
+
import re
|
3 |
+
|
4 |
+
def py4llm_pdf_reader(pdf_path: str):
    """Convert a PDF file to markdown text via pdf4llm."""
    return pdf4llm.to_markdown(pdf_path)
|
8 |
+
def split_markdown_sections(text):
    """Split markdown into sections delimited by ATX headings (# .. ######).

    Returns a list of {'level': int, 'title': str, 'content': str}; the
    content is everything between a heading and the next one (stripped).
    Text before the first heading is not returned.
    """
    heading_re = re.compile(r'^(#{1,6})\s*(.+)$', re.MULTILINE)
    hits = list(heading_re.finditer(text))

    # Each section ends where the next heading starts (or at end of text).
    boundaries = [m.start() for m in hits[1:]] + [len(text)]

    result = []
    for m, end in zip(hits, boundaries):
        result.append({
            'level': len(m.group(1)),
            'title': m.group(2),
            'content': text[m.end():end].strip(),
        })
    return result
|
41 |
+
class PDFPaper4LLMParser(object):
    """Parse a PDF paper into {title, author, abstract, main_text} via pdf4llm."""

    def __init__(self, write_images=False, page_chunks=False) -> None:
        # Options forwarded verbatim to pdf4llm.to_markdown.
        self.write_images = write_images
        self.page_chunks = page_chunks

    def pdf2text(self, pdf_path: str):
        """Convert the PDF at pdf_path to a single markdown string."""
        raw = pdf4llm.to_markdown(pdf_path, write_images=self.write_images, page_chunks=self.page_chunks)
        if not self.page_chunks:
            return raw
        # Page-chunk mode returns one dict per page; join their text fields.
        return '\n'.join(chunk['text'] for chunk in raw)

    def structured_paper_content(self, markdown_sections: list):
        """Group parsed markdown sections into a structured paper dict.

        markdown_sections: list of {'level', 'title', 'content'} dicts.
        The first section supplies the title; everything up to the section
        mentioning 'abstract' is collected as author metadata; all sections
        after the abstract become the main text. If no abstract is found,
        main_text falls back to the entire section list.
        """
        assert len(markdown_sections) > 0
        structured = {}
        meta_data = []
        main_text_idx = -1
        for sec_idx, section in enumerate(markdown_sections):
            title_i = section['title']
            content_i = section['content']
            if sec_idx == 0:
                # The very first heading is taken as the paper title.
                structured['title'] = title_i
                if len(content_i) > 0:
                    meta_data.append(content_i)
                continue
            if 'abstract' in title_i.lower() or 'abstract' in content_i.lower():
                structured['abstract'] = content_i
                main_text_idx = sec_idx + 1
                break
            meta_data.append(title_i + content_i)
        structured['author'] = meta_data
        if main_text_idx == -1 and len(markdown_sections) > 0:
            # No abstract found: keep every section as main text.
            main_text_idx = 0
        assert main_text_idx >= 0
        structured['main_text'] = markdown_sections[main_text_idx:]
        return structured

    def run(self, pdf_path: str, verbose=True):
        """Parse pdf_path; when verbose, also print a readable dump."""
        markdown_text = self.pdf2text(pdf_path=pdf_path)
        sections = split_markdown_sections(text=markdown_text)
        struct_sections = self.structured_paper_content(markdown_sections=sections)
        if verbose:
            paper_text = ''
            for k, v in struct_sections.items():
                if k == 'title':
                    paper_text += '\nTitle: ' + v + '\n\n'
                elif k == 'abstract':
                    paper_text += '\nAbstract: \n' + v + '\n\n'
                elif k == 'author':
                    paper_text += '\nAuthor: \n' + '\n'.join(v) + '\n\n'
                elif k == 'main_text':
                    for section in v:
                        paper_text += '\n' + section['title'] + '\n\n' + section['content'] + '\n\n'
            print(paper_text)
        return struct_sections
|
115 |
+
|
116 |
+
def dict_to_markdown_list(d: dict, indent=0):
    """Render a (possibly nested) dict as a markdown bullet list.

    Each nesting level is indented by one additional space; nested dicts
    are rendered recursively under their key's bullet.
    """
    rendered = []
    pad = ' ' * indent
    for key, value in d.items():
        bullet = f"{pad}- **{key}**: "
        if isinstance(value, dict):
            rendered.append(bullet)
            rendered.append(dict_to_markdown_list(value, indent + 1))
        else:
            rendered.append(bullet + str(value))
    return "\n".join(rendered)
|
127 |
+
|
128 |
+
def split_markdown_slides(markdown: str, sep: str = "<slide_sep>"):
    """Split slide-deck markdown on the separator token, dropping blanks."""
    chunks = markdown.strip().split(sep)
    slides = []
    for chunk in chunks:
        trimmed = chunk.strip()
        if trimmed:
            slides.append(trimmed)
    return slides
+
|
132 |
+
def parse_slide_to_dict(slide: str):
    """Parse one markdown slide into {heading: [items]}.

    Headings are '##'/'###' lines; items under the current heading are
    numbered entries ("1. x"), bullets ("- x" / "* x"), or free-form text
    lines. Text before the first heading is dropped, as are headings with
    no items.

    FIXES vs. original:
    - blank lines are skipped (they used to be appended as '' items);
    - removed the nested-bullet branch, which was unreachable because
      every line is stripped first (nested bullets already match the
      plain bullet pattern).
    """
    result = {}
    current_key = None
    sub_items = []

    for line in slide.splitlines():
        line = line.strip()
        if not line:
            # Skip blank lines so they do not become empty items.
            continue

        # Capture headings (### or ##)
        heading_match = re.match(r"^#{2,3}\s+(.*)", line)
        if heading_match:
            if current_key and sub_items:
                result[current_key] = sub_items
                sub_items = []
            current_key = heading_match.group(1).strip()
            continue

        # Capture numbered list entries
        numbered_match = re.match(r"^\d+\.\s+(.*)", line)
        if numbered_match:
            sub_items.append(numbered_match.group(1).strip())
            continue

        # Capture bulleted list entries
        bullet_match = re.match(r"^[\*\-]\s+(.*)", line)
        if bullet_match:
            sub_items.append(bullet_match.group(1).strip())
            continue

        # Fallback: treat the line as free-form text under the heading.
        if current_key:
            sub_items.append(line)

    # Save the final heading's block.
    if current_key and sub_items:
        result[current_key] = sub_items

    return result
+
|
179 |
+
def markdown_to_slide_dicts(full_markdown: str):
    """Parse a full slide-deck markdown string into per-slide dicts."""
    parsed = []
    for slide_text in split_markdown_slides(full_markdown):
        parsed.append(parse_slide_to_dict(slide_text))
    return parsed
pptx_utils.py
ADDED
@@ -0,0 +1,695 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pptx import Presentation
|
2 |
+
from pptx.dml.color import RGBColor
|
3 |
+
from pptx.util import Inches
|
4 |
+
from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
|
5 |
+
from pptx.util import Pt
|
6 |
+
import string
|
7 |
+
from datetime import datetime
|
8 |
+
import os
|
9 |
+
import re
|
10 |
+
|
11 |
+
def clean_leading_numbering(text):
    """Strip leading list markers (whitespace, digits, dots, dashes, parens).

    Handles prefixes like "1. ", "1) ", "(1) ", "- 1. ".
    """
    marker_pattern = r'^[\s\(\-\.\d\)]*'
    return re.sub(marker_pattern, '', text)
|
15 |
+
def is_logo_exist(file_path: str):
    """Return True when file_path exists on disk.

    Prints the path (and a message when missing), matching the original's
    diagnostic output.
    """
    print(file_path)
    found = os.path.exists(file_path)
    if not found:
        print("File does not exist.")
    return found
|
24 |
+
class Dict2PPT:
|
25 |
+
def __init__(self, logo_path: str = 'logo.png', title_size: int = 32, content_size: int = 24) -> None:
    """Initialize an empty deck plus font sizes and logo location.

    logo_path: image placed in the top-right corner of each slide.
    title_size / content_size: font sizes in points.
    """
    self.prs = Presentation()
    self.logo_path = logo_path
    self.title_font_size = Pt(title_size)
    self.content_font_size = Pt(content_size)
|
31 |
+
def _title_preprocess(self, title: str):
|
32 |
+
words = title.split()
|
33 |
+
capitalized_words = [word.capitalize() for word in words]
|
34 |
+
result = ' '.join(capitalized_words)
|
35 |
+
return result
|
36 |
+
|
37 |
+
def _add_time_footnote(self, slide):
    """Add today's date as a centered footnote near the slide bottom."""
    deck_width = self.prs.slide_width
    deck_height = self.prs.slide_height

    today_text = datetime.today().strftime("%B %d, %Y")  # e.g., March 26, 2025

    box_width = Inches(3)
    box_height = Inches(0.3)
    # Center horizontally; sit 0.5" above the bottom edge.
    left = (deck_width - box_width) / 2
    top = deck_height - Inches(0.5)

    footnote_box = slide.shapes.add_textbox(left, top, box_width, box_height)
    paragraph = footnote_box.text_frame.paragraphs[0]
    run = paragraph.add_run()
    run.text = today_text
    run.font.size = Pt(12)
    paragraph.alignment = PP_ALIGN.CENTER
|
59 |
+
def _add_logo(self, slide):
    """Place the configured logo image in the slide's top-right corner.

    Silently does nothing when the logo file is missing.
    """
    if not is_logo_exist(file_path=self.logo_path):
        return
    logo_width = Inches(1.0)
    logo_height = Inches(1.0)
    edge_margin = Inches(0.2)
    # Anchor to the right edge with a small margin.
    left = self.prs.slide_width - logo_width - edge_margin
    slide.shapes.add_picture(self.logo_path, left, Inches(0.2), width=logo_width, height=logo_height)
|
79 |
+
def _set_background_color(self, slide):
    """Give the slide a solid light-blue (alice-blue) background."""
    background_fill = slide.background.fill
    background_fill.solid()
    background_fill.fore_color.rgb = RGBColor(240, 248, 255)
|
84 |
+
def title_slide(self, title: str, authors: str):
    """Create the deck's title slide with title, author line and date.

    BUG FIX: the `authors` argument used to be ignored (a hard-coded
    'Author Here' placeholder was always shown, with the intended logic
    left commented out); `authors` is now rendered when provided, with
    the placeholder as fallback.
    """
    title_slide_layout = self.prs.slide_layouts[0]  # Title Slide layout
    slide = self.prs.slides.add_slide(title_slide_layout)

    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    title_shape = slide.shapes.title
    title_shape.text = title
    title_paragraph = title_shape.text_frame.paragraphs[0]
    for run in title_paragraph.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    author_shape = slide.placeholders[1]
    today = datetime.today().strftime("%B %d, %Y")  # e.g., March 25, 2025
    if authors:
        author_shape.text = '\n' + authors + '\n' + today
    else:
        author_shape.text = '\nAuthor Here\n' + today
    # Paragraph 0 is the leading blank line; paragraph 1 holds the text.
    author_paragraph = author_shape.text_frame.paragraphs[1]
    for run in author_paragraph.runs:
        run.font.name = 'Times New Roman'
        run.font.size = Pt(24)
|
113 |
+
def outline_slide(self, outline: dict):
    """Add an 'Outline' slide listing each topic with its short description."""
    layout = self.prs.slide_layouts[1]  # title + content layout
    slide = self.prs.slides.add_slide(layout)
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    heading = slide.shapes.title
    heading.text = 'Outline'
    heading_para = heading.text_frame.paragraphs[0]
    heading_para.alignment = PP_ALIGN.LEFT
    for run in heading_para.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    # Reset the content placeholder before filling it.
    body = slide.placeholders[1].text_frame
    body.clear()

    for topic, desc in outline.items():
        topic_para = body.add_paragraph()
        topic_para.text = topic
        topic_para.level = 0
        topic_para.font.size = Pt(20)
        topic_para.font.name = 'Times New Roman'
        topic_para.font.bold = True
        topic_para.alignment = PP_ALIGN.LEFT

        # Description goes on an indented second line, when present.
        if len(desc) > 0:
            desc_para = body.add_paragraph()
            desc_para.text = desc
            desc_para.level = 1
            desc_para.font.size = Pt(12)
            desc_para.font.name = 'Times New Roman'
            desc_para.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)
|
153 |
+
def _outline_preprocess_(self, outline):
|
154 |
+
if isinstance(outline, dict):
|
155 |
+
clean_outline = {}
|
156 |
+
for topic, desc in outline.items():
|
157 |
+
topic = topic.strip().strip(string.punctuation).strip()
|
158 |
+
desc = desc.strip().strip(string.punctuation).strip()
|
159 |
+
clean_outline[topic] = desc
|
160 |
+
return clean_outline
|
161 |
+
|
162 |
+
elif isinstance(outline, str):
|
163 |
+
sentences = outline.split('\n')
|
164 |
+
sentences = [text.strip().strip(string.punctuation).strip() for text in sentences]
|
165 |
+
sent_dict = {}
|
166 |
+
for sent in sentences:
|
167 |
+
tokens = sent.split(':')
|
168 |
+
if len(tokens) == 1:
|
169 |
+
sent_dict[tokens[0]] = ''
|
170 |
+
else:
|
171 |
+
key = tokens[0].strip().strip(string.punctuation).strip()
|
172 |
+
value = ''.join(tokens[1:])
|
173 |
+
value = value.strip().strip(string.punctuation).strip()
|
174 |
+
sent_dict[key] = value
|
175 |
+
return sent_dict
|
176 |
+
else:
|
177 |
+
print('Wrong format')
|
178 |
+
return {}
|
179 |
+
|
180 |
+
def _background_preprocess(self, background: str):
    """Split background text into cleaned one-per-line bullet items.

    Strips leading list numbering and surrounding punctuation from each line.
    """
    items = []
    for raw_line in background.strip().splitlines():
        without_numbering = clean_leading_numbering(raw_line)
        items.append(without_numbering.strip().strip(string.punctuation).strip())
    return items
|
188 |
+
def background_slide(self, background):
    """Add a 'Background' slide with numbered items; font shrinks with count.

    BUG FIX: a list of exactly 7 items used to fall into the gap between
    the `4..6` branch and the `> 7` branch and got the largest font; the
    last branch now uses `>= 7`.
    """
    content_slide_layout = self.prs.slide_layouts[1]  # title + content layout
    slide = self.prs.slides.add_slide(content_slide_layout)
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)
    title_shape = slide.shapes.title
    title_shape.text = 'Background'
    title_paragraph = title_shape.text_frame.paragraphs[0]
    title_paragraph.alignment = PP_ALIGN.LEFT
    for run in title_paragraph.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    # Reset the content placeholder and center its text vertically.
    text_frame = slide.placeholders[1].text_frame
    text_frame.clear()
    text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE

    # Shrink the font as the number of bullet items grows.
    item_count = len(background)
    fontsize = 22
    if 4 <= item_count <= 6:
        fontsize = 20
    elif item_count >= 7:
        fontsize = 18
    for idx, topic in enumerate(background, start=1):
        para = text_frame.add_paragraph()
        para.text = f"{idx}. {topic}"
        para.level = 0
        para.font.size = Pt(fontsize)
        para.font.name = 'Times New Roman'
        para.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)
|
227 |
+
def _problem_define_preprocess(self, problem_desc: str):
    """Turn the raw research-problem text into {section: [items]}.

    First tries to bucket lines under the known section headers
    (Scope/Challenges/Assumptions/Relevance); when that yields nothing,
    the whole text is treated as a flat 'Scope' list instead.
    """
    from collections import OrderedDict
    def split_text_by_headers(text, headers):
        # Bucket each non-empty line under the most recent header line.
        sections = OrderedDict({header: [] for header in headers})
        current = None
        for line in text.strip().strip(string.punctuation).splitlines():
            line_clean = line.strip().strip(string.punctuation).strip()
            if len(line_clean) == 0:
                continue
            # Check if line matches any of the section headers
            matched = [h for h in headers if h.lower() == line_clean.lower()]
            if matched:
                current = matched[0]
                continue
            # NOTE: lines before the first header are dropped (current is None).
            if current:
                cleaned_line = clean_leading_numbering(text=line_clean)
                cleaned_line = cleaned_line.strip().strip(string.punctuation).strip()
                sections[current].append(cleaned_line)

        # Convert lists to joined text blocks
        return {k: v for k, v in sections.items()}

    sections = ["Scope", "Challenges", "Assumptions", "Relevance"]
    problem_dict = {}
    # Only attempt the header split when at least one header name appears.
    if any([_ in problem_desc for _ in sections]):
        problem_dict = split_text_by_headers(text=problem_desc, headers=sections)

    # Fallback: header split produced nothing -> every line goes under 'Scope'.
    if all([len(v)==0 for k, v in problem_dict.items()]) or len(problem_dict) == 0:
        problem_dict = {}
        cleaned_sentences = []
        sentences = problem_desc.strip().strip(string.punctuation).splitlines()
        for sent in sentences:
            cleaned_line = clean_leading_numbering(text=sent)
            cleaned_line = cleaned_line.strip().strip(string.punctuation).strip()
            cleaned_sentences.append(cleaned_line)
        problem_dict['Scope'] = cleaned_sentences

    return problem_dict
|
266 |
+
def problem_def_slide(self, problems):
    """Add one slide per non-empty problem-definition section.

    Slide titles: Scope -> 'Problem Definition'; Challenges/Assumptions ->
    'Problem Definition - <name>'; Relevance -> 'Interested Practitioners'.

    FIX: removed four unused locals (scope/challenges/assumptions/relevance)
    that duplicated the per-section lookups done inside the loop.
    """
    sections = ["Scope", "Challenges", "Assumptions", "Relevance"]
    for sect_name in sections:
        section_contents = problems.get(sect_name, [])
        if len(section_contents) == 0:
            continue
        content_slide_layout = self.prs.slide_layouts[1]  # title + content layout
        slide = self.prs.slides.add_slide(content_slide_layout)
        self._set_background_color(slide=slide)
        self._add_logo(slide=slide)
        title_shape = slide.shapes.title
        if sect_name == 'Scope':
            title_shape.text = 'Problem Definition'
        elif sect_name in {'Challenges', 'Assumptions'}:
            title_shape.text = 'Problem Definition - {}'.format(sect_name)
        else:
            title_shape.text = 'Interested Practitioners'
        title_paragraph = title_shape.text_frame.paragraphs[0]
        title_paragraph.alignment = PP_ALIGN.LEFT
        for run in title_paragraph.runs:
            run.font.bold = True
            run.font.name = 'Times New Roman'
            run.font.size = Pt(36)

        # Reset the content placeholder and center its text vertically.
        text_frame = slide.placeholders[1].text_frame
        text_frame.clear()
        text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE
        fontsize = 20
        for idx, topic in enumerate(section_contents, start=1):
            para = text_frame.add_paragraph()
            para.text = f"{idx}. {topic}"
            para.level = 0
            para.font.size = Pt(fontsize)
            para.font.name = 'Times New Roman'
            para.alignment = PP_ALIGN.LEFT

        self._add_time_footnote(slide=slide)
|
312 |
+
def _objective_preprocess(self, objective: str):
    """Split objectives text into cleaned one-per-line items.

    Strips leading list numbering and surrounding punctuation from each line.
    """
    cleaned = []
    for raw_line in objective.strip().splitlines():
        item = clean_leading_numbering(text=raw_line)
        cleaned.append(item.strip().strip(string.punctuation).strip())
    return cleaned
|
320 |
+
def objective_slide(self, objectives):
    """Add the 'Objectives & How' slide, one numbered bullet per objective.

    The bullet font size shrinks as the number of objectives grows so the
    list stays inside the content placeholder.

    :param objectives: list of objective strings (see _objective_preprocess).
    """
    content_slide_layout = self.prs.slide_layouts[1]  # Title and Content layout
    slide = self.prs.slides.add_slide(content_slide_layout)
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    title_shape = slide.shapes.title
    title_shape.text = 'Objectives & How'
    title_paragraph = title_shape.text_frame.paragraphs[0]
    title_paragraph.alignment = PP_ALIGN.LEFT
    for run in title_paragraph.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)  # e.g., 36 pt

    # Clear existing content and vertically center it inside the placeholder.
    content_shape = slide.placeholders[1]
    text_frame = content_shape.text_frame
    text_frame.clear()
    text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE

    # Scale the font down for longer lists so all bullets fit on one slide.
    objective_item_num = len(objectives)
    fontsize = 24
    if 4 <= objective_item_num <= 6:
        fontsize = 22
    elif objective_item_num >= 7:  # was "> 7": exactly 7 items wrongly kept 24 pt
        fontsize = 20
    for idx, topic in enumerate(objectives, start=1):
        p1 = text_frame.add_paragraph()
        p1.text = f"{idx}. {topic}"
        p1.level = 0
        p1.font.size = Pt(fontsize)
        p1.font.name = 'Times New Roman'
        p1.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)
|
357 |
+
|
358 |
+
def _method_preprocess(self, methodology: str):
    """Split raw methodology text into a list of cleaned step strings.

    Removes leading numbering and surrounding punctuation from each line,
    and drops blank lines so the slide never renders an empty "Step N."
    (the original appended empty strings).

    :param methodology: multi-line methodology text produced by the LLM.
    :return: list of cleaned, non-empty step strings.
    """
    method_array = []
    sentences = methodology.strip().splitlines()
    for sent in sentences:
        sent_trim = clean_leading_numbering(text=sent)
        sent_trim = sent_trim.strip().strip(string.punctuation).strip()
        if sent_trim:  # skip blank lines -> no empty steps on the slide
            method_array.append(sent_trim)
    return method_array
|
366 |
+
|
367 |
+
def method_slide(self, methods):
    """Add the 'Proposed Method' slide listing each step as one paragraph.

    Every step is rendered as a bold "Step N. " prefix followed by the
    step description in regular weight.

    :param methods: list of step strings (see _method_preprocess).
    """
    layout = self.prs.slide_layouts[1]  # Title and Content layout
    slide = self.prs.slides.add_slide(layout)
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    title_shape = slide.shapes.title
    title_shape.text = 'Proposed Method'
    heading = title_shape.text_frame.paragraphs[0]
    heading.alignment = PP_ALIGN.LEFT
    for run in heading.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    # Reset the body placeholder and vertically center its content.
    body = slide.placeholders[1].text_frame
    body.clear()
    body.vertical_anchor = MSO_ANCHOR.MIDDLE

    fontsize = 20
    for number, description in enumerate(methods, start=1):
        para = body.add_paragraph()
        # First run: bold "Step N. " prefix.
        prefix = para.add_run()
        prefix.text = "Step {}. ".format(number)
        prefix.font.bold = True
        prefix.font.size = Pt(fontsize)
        # Second run: the step text in regular weight.
        detail = para.add_run()
        detail.text = description
        detail.font.bold = False
        detail.font.size = Pt(fontsize)
        para.font.name = 'Times New Roman'
        para.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)
|
406 |
+
|
407 |
+
def _experiment_preprocess(self, experiment: str):
    """Parse the raw 'Results' text into a dict of slide-ready sections.

    If both headers ('Evidence Summary' and 'Experimental Summary') appear
    in the text, it is split into two sections; otherwise every cleaned
    line becomes one flat 'Experimental Summary' list.

    NOTE(review): the two sections are parsed asymmetrically -- keyword1
    lines are re-joined into "key: value" strings, while keyword2 lines
    become (key, value) tuples.  experiment_slide relies on exactly this
    shape; confirm before changing either side.

    :param experiment: multi-line results text produced by the LLM.
    :return: dict mapping section header -> list of strings or tuples.
    """
    def split_sections_by_keywords(text: str, keyword1: str, keyword2: str) -> dict:
        # Walk the lines, switching the active section whenever a line
        # containing one of the two header keywords is encountered.
        lines = text.strip().splitlines()
        part1_lines = []
        part2_lines = []
        current_section = None  # lines before the first header are dropped
        for line in lines:
            stripped = clean_leading_numbering(line)
            stripped = stripped.strip().strip(string.punctuation).strip()
            if len(stripped) == 0:
                continue
            if keyword1 in stripped:
                current_section = keyword1
                continue
            elif keyword2 in stripped:
                current_section = keyword2
                continue

            if current_section == keyword1:
                # keyword1 section: rebuild a single "key: value" string.
                tokens = stripped.split(':')
                key = tokens[0].strip().strip(string.punctuation).strip()
                if len(tokens) > 1:
                    parse_stripped = key + ": " + ':'.join(tokens[1:]).strip().strip(string.punctuation).strip()
                else:
                    parse_stripped = key
                part1_lines.append(parse_stripped)
            elif current_section == keyword2:
                # keyword2 section: keep (key, value) as a tuple so the
                # slide can style the key and the value differently.
                tokens = stripped.split(':')
                key = tokens[0].strip().strip(string.punctuation).strip()
                if len(tokens) > 1:
                    parse_stripped = (key, ':'.join(tokens[1:]))
                else:
                    parse_stripped = (key, '')
                part2_lines.append(parse_stripped)
        return {
            keyword1: part1_lines,
            keyword2: part2_lines
        }

    experiment_dict = {}
    sentences = experiment.strip().splitlines()
    evidence_keyword = 'Evidence Summary'
    exp_summary_keyword = 'Experimental Summary'
    if (evidence_keyword in experiment) and (exp_summary_keyword in experiment):
        experiment_dict = split_sections_by_keywords(text=experiment, keyword1=evidence_keyword, keyword2=exp_summary_keyword)
    else:
        # No explicit section headers: treat every cleaned line as part of
        # a single 'Experimental Summary' section (list of plain strings).
        experiment_array = []
        for sent in sentences:
            sent = clean_leading_numbering(sent)
            sent = sent.strip().strip(string.punctuation).strip()
            experiment_array.append(sent)
        experiment_dict[exp_summary_keyword] = experiment_array
    return experiment_dict
|
460 |
+
|
461 |
+
def experiment_slide(self, experiments):
    """Add the 'Experimental Study' slide(s) from the preprocessed dict.

    *experiments* comes from _experiment_preprocess and has either one key
    ('Experimental Summary' -> list of strings) or two keys.

    NOTE(review): in the two-key case the variable naming is swapped
    relative to the headers -- experiments['Experimental Summary'] holds
    (key, value) tuples rendered on the first slide, while
    experiments['Evidence Summary'] holds plain strings rendered on a
    second '(Summary)' slide.  This matches what the preprocess actually
    produces; verify before renaming.

    :param experiments: dict produced by _experiment_preprocess.
    """
    evidence_keyword = 'Evidence Summary'
    exp_summary_keyword = 'Experimental Summary'
    if len(experiments) == 1:
        experiments_part1 = experiments[exp_summary_keyword]  # list of strings
        experiments_part2 = []
    else:
        assert len(experiments) == 2
        experiments_part1 = experiments[exp_summary_keyword]  # list of (key, value) tuples
        experiments_part2 = experiments[evidence_keyword]  # list of strings

    content_slide_layout = self.prs.slide_layouts[1]  # title and Content Slide Layout
    slide = self.prs.slides.add_slide(content_slide_layout)
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)
    title_shape = slide.shapes.title
    title_shape.text = 'Experimental Study'
    title_paragraph = title_shape.text_frame.paragraphs[0]
    title_paragraph.alignment = PP_ALIGN.LEFT
    for run in title_paragraph.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)  # e.g., 36 pt

    # Clear existing content
    content_shape = slide.placeholders[1]
    text_frame = content_shape.text_frame
    text_frame.clear()
    # Vertically center content inside the placeholder
    text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE

    fontsize = 20
    if len(experiments_part2) == 0:
        # Single-section case: numbered bold bullets, one per sentence.
        for idx, sent in enumerate(experiments_part1, start=1):
            p1 = text_frame.add_paragraph()
            p1.text = f"{idx}. {sent}"
            p1.level = 0
            p1.font.size = Pt(fontsize)
            p1.font.name = 'Times New Roman'
            p1.font.bold = True
            p1.alignment = PP_ALIGN.LEFT
    else:
        # Two-section case: bold key + regular value per paragraph;
        # tuples with an empty value are skipped entirely.
        for idx, step in enumerate(experiments_part1, start=1):
            key, value = step
            if len(value) == 0:
                continue
            p = text_frame.add_paragraph()
            run1 = p.add_run()
            run1.text = key
            run1.font.bold = True
            run1.font.size = Pt(fontsize)

            # Second run: normal text
            run2 = p.add_run()
            run2.text = value
            run2.font.bold = False
            run2.font.size = Pt(fontsize)
            p.font.name = 'Times New Roman'
            p.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)

    # Experimental study spills onto a second page when both sections exist.
    if len(experiments_part2) > 0:
        content_slide_layout = self.prs.slide_layouts[1]  # title and Content Slide Layout
        slide_2 = self.prs.slides.add_slide(content_slide_layout)
        self._set_background_color(slide=slide_2)
        self._add_logo(slide=slide_2)
        title_shape = slide_2.shapes.title
        title_shape.text = 'Experimental Study (Summary)'
        title_paragraph = title_shape.text_frame.paragraphs[0]
        title_paragraph.alignment = PP_ALIGN.LEFT
        for run in title_paragraph.runs:
            run.font.bold = True
            run.font.name = 'Times New Roman'
            run.font.size = Pt(36)  # e.g., 36 pt

        # Clear existing content
        content_shape = slide_2.placeholders[1]
        text_frame = content_shape.text_frame
        text_frame.clear()
        # Vertically center content inside the placeholder
        text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE
        self._add_time_footnote(slide=slide_2)
        for idx, sent in enumerate(experiments_part2, start=1):
            p1 = text_frame.add_paragraph()
            p1.text = f"{idx}. {sent}"
            p1.level = 0
            p1.font.size = Pt(fontsize)
            p1.font.name = 'Times New Roman'
            p1.alignment = PP_ALIGN.LEFT
|
552 |
+
|
553 |
+
def _conclusion_preprocess(self, conclusion: str):
    """Turn raw conclusion text into an ordered {topic: description} dict.

    Each line is cleaned of surrounding punctuation and leading numbering.
    Empty lines and lines starting with 'conclusion' (the section header)
    are skipped.  A line without ':' maps its topic to ''.

    :param conclusion: multi-line conclusions text produced by the LLM.
    :return: dict mapping topic -> description (possibly empty string).
    """
    conclusion_dict = {}
    for raw_line in conclusion.strip().splitlines():
        line = raw_line.strip().strip(string.punctuation).strip()
        line = clean_leading_numbering(text=line)
        if not line or line.lower().startswith('conclusion'):
            continue
        # Split on the first colon: left side is the topic, the rest is
        # the description (colons inside the description are preserved).
        topic, _, rest = line.partition(':')
        topic = topic.strip().strip(string.punctuation).strip()
        conclusion_dict[topic] = rest.strip().strip(string.punctuation).strip()
    return conclusion_dict
|
570 |
+
|
571 |
+
def conclusion_slide(self, conclusion):
    """Add the 'Conclusions & Future Work' slide.

    Each (topic, description) pair becomes a bold topic line followed by
    an indented italic description line.  Topics whose description is
    empty are skipped entirely.

    :param conclusion: dict mapping topic -> description.
    """
    layout = self.prs.slide_layouts[1]  # Title and Content layout
    slide = self.prs.slides.add_slide(layout)
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    title_shape = slide.shapes.title
    title_shape.text = 'Conclusions & Future Work'
    heading = title_shape.text_frame.paragraphs[0]
    heading.alignment = PP_ALIGN.LEFT
    for run in heading.runs:
        run.font.bold = True
        run.font.name = 'Times New Roman'
        run.font.size = Pt(36)

    # Reset the body placeholder and vertically center its content.
    body = slide.placeholders[1].text_frame
    body.clear()
    body.vertical_anchor = MSO_ANCHOR.MIDDLE

    for topic, desc in conclusion.items():
        if not desc:  # no description -> nothing worth showing
            continue
        # Line 1: bold topic bullet.
        topic_par = body.add_paragraph()
        topic_par.text = topic
        topic_par.level = 0
        topic_par.font.size = Pt(20)
        topic_par.font.name = 'Times New Roman'
        topic_par.font.bold = True
        topic_par.alignment = PP_ALIGN.LEFT

        # Line 2: indented, italic description.
        desc_par = body.add_paragraph()
        desc_par.text = desc
        desc_par.level = 1
        desc_par.font.size = Pt(16)
        desc_par.font.italic = True
        desc_par.font.name = 'Times New Roman'
        desc_par.alignment = PP_ALIGN.LEFT

    self._add_time_footnote(slide=slide)
|
614 |
+
|
615 |
+
def build_slides(self, slide_dict: dict, authors: str = 'Author here'):
    """Assemble the full deck from the LLM-generated section dict.

    'Title' and 'Outline' are required (asserts if the preprocessed
    outline is empty).  The remaining sections ('Background',
    'Research problem', 'Objectives', 'Methodology', 'Results',
    'Conclusions') are optional and silently skipped when missing or
    empty.  Always closes with the Thank-you/Q&A slide.

    :param slide_dict: section name -> raw LLM text for that section.
    :param authors: author line shown on the title slide.
    """
    title = slide_dict.get('Title', '')
    title = self._title_preprocess(title=title)
    self.title_slide(title=title, authors=authors)

    # The outline is the only mandatory body section.
    outline = slide_dict.get('Outline', {})
    outline = self._outline_preprocess_(outline=outline)
    assert len(outline) > 0, 'No outline detected!!!'
    self.outline_slide(outline=outline)

    background = slide_dict.get('Background', '')
    if background:
        background = self._background_preprocess(background=background)
        self.background_slide(background=background)

    problem_definition = slide_dict.get('Research problem', '')
    # print('problem_definition', problem_definition)
    if problem_definition:
        problems = self._problem_define_preprocess(problem_desc=problem_definition)
        # print('problems', problems)
        self.problem_def_slide(problems=problems)

    objectives = slide_dict.get('Objectives', '')
    if objectives:
        objectives = self._objective_preprocess(objective=objectives)
        self.objective_slide(objectives=objectives)

    methodology = slide_dict.get('Methodology', '')
    if methodology:
        methodology = self._method_preprocess(methodology=methodology)
        # print('Method', methodology)
        self.method_slide(methods=methodology)

    experimental_study = slide_dict.get('Results', '')
    if experimental_study:
        experiments = self._experiment_preprocess(experiment=experimental_study)
        # print('experiments', experiments)
        self.experiment_slide(experiments=experiments)

    conclusion = slide_dict.get('Conclusions', '')
    if conclusion:
        conclusion = self._conclusion_preprocess(conclusion=conclusion)
        self.conclusion_slide(conclusion=conclusion)

    self.qa_slides()
    print('Done!!')
|
661 |
+
|
662 |
+
def qa_slides(self):
    """Append the closing 'Thank you! / Q & A' slide on a blank layout."""
    # Layout 6 is conventionally the blank layout.
    slide = self.prs.slides.add_slide(self.prs.slide_layouts[6])
    self._set_background_color(slide=slide)
    self._add_logo(slide=slide)

    # Centered textbox holding the closing message.
    box = slide.shapes.add_textbox(Inches(2), Inches(2.5), Inches(6), Inches(2))
    frame = box.text_frame
    frame.clear()

    thanks = frame.add_paragraph()
    thanks.text = "Thank you!"
    thanks.font.size = Pt(44)
    thanks.font.bold = True
    thanks.alignment = PP_ALIGN.CENTER

    qa = frame.add_paragraph()
    qa.text = "\nQ & A"
    qa.font.size = Pt(36)
    qa.alignment = PP_ALIGN.CENTER

    self._add_time_footnote(slide=slide)
|
693 |
+
|
694 |
+
def save(self, file_name='slides.pptx'):
    """Write the assembled presentation to *file_name* (.pptx)."""
    self.prs.save(file_name)
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
pdf4llm
|
3 |
+
openai
|
4 |
+
python-dotenv
|
5 |
+
python-pptx
|
sambaAPI.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
from openai import OpenAI
import os

# Short alias -> full SambaNova model identifier.
MODEL_ALIAS = {'llama3_8b': 'Meta-Llama-3.1-8B-Instruct',
               'llama3_70b': 'Meta-Llama-3.1-70B-Instruct',
               'llama3_3_70b': 'Meta-Llama-3.3-70B-Instruct',
               'llama3_405b': 'Meta-Llama-3.1-405B-Instruct',
               'llama3_1b': "Meta-Llama-3.2-1B-Instruct",
               'llama3_3b': "Meta-Llama-3.2-3B-Instruct"}

# Load SAMBA_API_KEY (and any other settings) from a local .env file.
load_dotenv()

# Module-level client created at import time; all call_* helpers share it.
# NOTE(review): api_key may be None if SAMBA_API_KEY is unset -- calls will
# then fail at request time, not here.
client = OpenAI(
    base_url="https://api.sambanova.ai/v1",  # the endpoint IP running on vLLM cloud.sambanova.ai, https://api.sambanova.ai. fast-api.snova.ai
    api_key=os.environ.get("SAMBA_API_KEY"),
)
|
18 |
+
|
19 |
+
|
20 |
+
def call_llama(system_prompt, prompt, model="Meta-Llama-3.1-8B-Instruct", **kwargs):
    """Single-turn convenience wrapper around call_llama_chat.

    Builds the two-message (system + user) conversation and streams the
    completion.  Returns the accumulated response text, or "" on any API
    error (best-effort: errors are printed, not raised).

    kwargs are forwarded to the completions API, e.g.:
        temperature = 0.1,
        top_p = 0.1
        max_tokens = 50
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    # Delegate so the streaming/error-handling logic lives in one place
    # (it was duplicated verbatim between the two helpers).
    return call_llama_chat(messages, model=model, **kwargs)
|
44 |
+
|
45 |
+
def call_llama_chat(messages, model="Meta-Llama-3.1-8B-Instruct", **kwargs):
    """Stream a chat completion for *messages* and return the full text.

    Returns "" if the API call fails for any reason; the error is printed,
    not raised, so callers treat this as best-effort.

    kwargs are forwarded to the completions API, e.g.:
        temperature = 0.1,
        top_p = 0.1
    """
    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            stream=True,
            **kwargs,
        )
        # Accumulate streamed deltas; some chunks carry content=None.
        pieces = [chunk.choices[0].delta.content or "" for chunk in stream]
        return "".join(pieces)
    except Exception as e:
        print('API Error = {}'.format(e))
        return ""
|
utils.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from paper2slides import Paper2Slides, PaperReader
|
3 |
+
import os
|
4 |
+
|
5 |
+
def read_json(file_path: str):
    """Load and return the JSON content of *file_path*.

    Returns None (after printing an error message) when the file does not
    exist or does not contain valid JSON.
    """
    try:
        with open(file_path, "r") as handle:
            return json.load(handle)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except json.JSONDecodeError:
        print(f"Error: The file '{file_path}' is not a valid JSON.")
    return None
|
16 |
+
|
17 |
+
|
18 |
+
def get_file_name(full_path: str):
    """Return the base name of *full_path* without its final extension."""
    base = os.path.basename(full_path)
    stem, _ext = os.path.splitext(base)
    return stem
|
21 |
+
|
22 |
+
|
23 |
+
def run_pdf2text(paper_pdf_path: str, save_json_name: str):
    """Extract a paper's content from a PDF and persist it as JSON.

    Runs PaperReader on *paper_pdf_path*, writes the resulting structure
    to *save_json_name*, and returns it.
    """
    paper_content = PaperReader().run(paper_file_name=paper_pdf_path)
    # Save the extracted content as a JSON file for later reuse.
    with open(save_json_name, 'w') as json_file:
        json.dump(paper_content, json_file, indent=4)
    return paper_content
|
30 |
+
|
31 |
+
|
32 |
+
def run_paper2slides(paper_json_name: str, model='llama3_70b', temprature=0.2, logo_path='logo.png', save_file_name: str = 'slides.pptx'):
    """End-to-end: paper JSON -> generated slide dict -> saved .pptx.

    NOTE(review): the misspelled 'temprature' keyword is preserved because
    it is part of the public interface (and of Paper2Slides' signature).
    """
    paper_content = read_json(paper_json_name)
    builder = Paper2Slides(paper_contents=paper_content, model=model, temprature=temprature)
    slides = builder.generate_slides(verbose=False, revision=False)
    # Persist the intermediate slide dict for inspection/debugging.
    with open('slides.json', 'w') as f:
        json.dump(slides, f, indent=4)
    assert isinstance(slides, dict)
    return builder.save_to_slides(slides=slides, logo_path=logo_path, file_name=save_file_name)
|