ruslanmv committed on
Commit
d53ccad
·
verified ·
1 Parent(s): 7552eec

Upload splitgpt.py

Browse files
Files changed (1) hide show
  1. splitgpt.py +345 -331
splitgpt.py CHANGED
@@ -1,331 +1,345 @@
1
- import os
2
- import json
3
- from dotenv import load_dotenv
4
- import fitz # PyMuPDF
5
- from langchain_openai import ChatOpenAI # Correct import from langchain-openai
6
- from langchain.schema import HumanMessage, SystemMessage # For creating structured chat messages
7
-
8
- QUESTIONS_PATH = "questions.json"
9
-
10
- # Load environment variables
11
- load_dotenv()
12
-
13
def split_text_into_chunks(text: str, chunk_size: int) -> list:
    """
    Split ``text`` into whitespace-normalised chunks of at most ``chunk_size`` characters.

    Runs of whitespace are collapsed to single spaces and words are never
    broken, so a single word longer than ``chunk_size`` becomes a chunk of its
    own. Empty or whitespace-only text yields an empty list (no empty chunks
    are ever produced).
    """
    chunks = []
    current_chunk = []
    current_length = 0

    # str.split() with no argument collapses whitespace runs and drops empties.
    for word in text.split():
        # A separating space is only needed when the chunk already holds a word.
        added_length = len(word) + (1 if current_chunk else 0)
        if current_chunk and current_length + added_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += added_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
38
-
39
-
40
def distribute_questions_across_chunks(n_chunks: int, n_questions: int) -> list:
    """
    Distribute ``n_questions`` across ``n_chunks`` as evenly as possible.

    Returns a list of length ``n_chunks`` whose entries sum to ``n_questions``;
    earlier chunks receive the extra questions when the split is uneven.
    A non-positive ``n_chunks`` yields an empty list and a negative
    ``n_questions`` is treated as zero.
    """
    if n_chunks <= 0:
        return []

    # divmod keeps every question, even when there are more than two per chunk.
    base, extra = divmod(max(n_questions, 0), n_chunks)
    return [base + 1 if index < extra else base for index in range(n_chunks)]
58
-
59
-
60
def extract_text_from_pdf(pdf_path):
    """
    Return the concatenated text of every page of the PDF at ``pdf_path``.

    The document is opened with ``fitz`` (PyMuPDF). Any failure while opening
    or reading is logged and re-raised as ``RuntimeError`` with the original
    exception attached as its cause.
    """
    try:
        print(f"[DEBUG] Opening PDF: {pdf_path}")
        with fitz.open(pdf_path) as pdf:
            print(f"[DEBUG] Extracting text from PDF: {pdf_path}")
            # join avoids repeated string concatenation across pages.
            return "".join(page.get_text() for page in pdf)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        # Chain the cause so the underlying error stays visible to callers.
        raise RuntimeError("Unable to extract text from PDF.") from e
72
-
73
-
74
def generate_questions_from_text(text, n_questions=5):
    """
    Ask the chat model for ``n_questions`` interview questions about ``text``.

    Returns a list of non-empty question strings, one per line of the model's
    reply. Returns an empty list without contacting the API when
    ``n_questions`` is not positive. If the request or parsing fails, the
    error is logged and a single placeholder message is returned.

    Raises:
        RuntimeError: if the ``OPENAI_API_KEY`` environment variable is unset.
    """
    if n_questions <= 0:
        # Nothing requested: skip the (billable) API round-trip entirely.
        return []

    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        raise RuntimeError(
            "OpenAI API key not found. Please add it to your .env file as OPENAI_API_KEY."
        )

    chat = ChatOpenAI(
        openai_api_key=openai_api_key, model="gpt-4", temperature=0.7, max_tokens=750
    )

    messages = [
        SystemMessage(
            content="You are an expert interviewer who generates concise technical interview questions. Do not enumerate the questions. Answer only with questions."
        ),
        HumanMessage(
            content=f"Based on the following content, generate {n_questions} technical interview questions:\n{text}"
        ),
    ]

    try:
        print(f"[DEBUG] Sending request to OpenAI with {n_questions} questions.")
        response = chat.invoke(messages)
        # Split on every line break: replies separated by single newlines would
        # otherwise collapse into one item when splitting on blank lines only.
        questions = [
            line.strip() for line in response.content.splitlines() if line.strip()
        ]
    except Exception as e:
        print(f"[ERROR] Failed to generate questions: {e}")
        questions = ["An error occurred while generating questions."]

    return questions
105
-
106
-
107
def save_questions(questions):
    """Write ``questions`` to ``QUESTIONS_PATH`` as indented JSON, replacing any existing file."""
    # An explicit encoding keeps the file independent of the platform default.
    with open(QUESTIONS_PATH, "w", encoding="utf-8") as f:
        json.dump(questions, f, indent=4)
110
-
111
-
112
def generate_and_save_questions_from_pdf(pdf_path, total_questions=5, chunk_size=2000):
    """
    Generate interview questions from a PDF and save them to ``QUESTIONS_PATH``.

    The PDF text is split into chunks of at most ``chunk_size`` characters,
    ``total_questions`` is distributed across those chunks, and questions are
    requested per chunk.

    Returns:
        A ``(status_message, {"questions": [...]})`` tuple. Any failure is
        caught, logged and reported through the status message with an empty
        question list; this function does not raise.
    """
    print(f"[INFO] Generating questions from PDF: {pdf_path}")

    try:
        pdf_text = extract_text_from_pdf(pdf_path)

        if not pdf_text.strip():
            raise RuntimeError("The PDF content is empty or could not be read.")

        chunks = split_text_into_chunks(pdf_text, chunk_size)
        n_chunks = len(chunks)

        questions_distribution = distribute_questions_across_chunks(n_chunks, total_questions)
        combined_questions = []

        for i, (chunk, n_questions) in enumerate(zip(chunks, questions_distribution), start=1):
            print(f"[DEBUG] Processing chunk {i} of {n_chunks}")
            if n_questions > 0:
                combined_questions.extend(
                    generate_questions_from_text(chunk, n_questions=n_questions)
                )

        if not combined_questions:
            raise RuntimeError("No questions generated from the PDF content.")

        print(f"[INFO] Total questions generated: {len(combined_questions)}")
        save_questions(combined_questions)
        print(f"[INFO] Questions saved to {QUESTIONS_PATH}")

        # Return a status message and the JSON-serialisable payload.
        return "Questions generated successfully.", {"questions": combined_questions}

    except Exception as e:
        # Report every failure through the status message instead of raising.
        error_message = f"Error during question generation: {str(e)}"
        print(f"[ERROR] {error_message}")
        return error_message, {"questions": []}
149
-
150
-
151
-
152
-
153
-
154
-
155
- import gradio as gr
156
- import json
157
- import os
158
- import time
159
-
160
def generate_and_save_questions_from_pdf3_mock(pdf_path, total_questions=5):
    """
    Mock generator that mimics the PDF question pipeline without reading the PDF or calling the model.

    Yields ``(status_message, payload)`` tuples; ``payload`` is ``{}`` until the
    final step, where it is ``{"questions": [...]}``. One sample question is
    produced per chunk of a fixed mock text, and the result is written to
    ``generated_questions_from_pdf.json``.

    ``total_questions`` is accepted for signature parity with the real
    generators and is not used by the mock.
    """
    print(f"[INFO] Generating questions from PDF: {pdf_path}")

    if not os.path.exists(pdf_path):
        yield "❌ Error: PDF file not found.", {}
        return

    yield "📄 PDF uploaded successfully. Processing started...", {}

    try:
        # Simulate PDF text extraction and processing.
        time.sleep(1)
        pdf_text = "This is some mock PDF text for testing purposes."

        if not pdf_text.strip():
            yield "❌ Error: The PDF content is empty or could not be read.", {}
            return

        chunk_size = 2000
        chunks = [pdf_text[i:i + chunk_size] for i in range(0, len(pdf_text), chunk_size)]
        n_chunks = len(chunks)

        yield f"🔄 Splitting text into {n_chunks} chunks...", {}

        combined_questions = []
        for chunk_number in range(1, n_chunks + 1):
            yield f"🔄 Processing chunk {chunk_number} of {n_chunks}...", {}
            time.sleep(1)  # Simulating processing time
            combined_questions.append(f"Sample Question from Chunk {chunk_number}")

        if not combined_questions:
            yield "❌ Error: No questions generated from the PDF content.", {}
            return

        yield f"✅ Total {len(combined_questions)} questions generated. Saving questions...", {}
        save_path = "generated_questions_from_pdf.json"
        # An explicit encoding keeps the file independent of the platform default.
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump({"questions": combined_questions}, f)

        yield "✅ PDF processing complete. Questions saved successfully!", {"questions": combined_questions}

    except Exception as e:
        yield f"❌ Error during question generation: {str(e)}", {}
205
-
206
def generate_and_save_questions_from_pdf3_v1(pdf_path, total_questions=5, chunk_size=2000):
    """
    Generate questions from a PDF, yielding progress updates along the way.

    Yields ``(status_message, payload)`` tuples; ``payload`` is ``{}`` for
    progress and error updates and ``{"questions": [...]}`` on the final,
    successful step. The questions are written to
    ``generated_questions_from_pdf.json``.

    Args:
        pdf_path: Path of the PDF to read.
        total_questions: Number of questions to distribute across the chunks.
        chunk_size: Maximum chunk length passed to ``split_text_into_chunks``.
    """
    print(f"[INFO] Generating questions from PDF: {pdf_path}")

    if not os.path.exists(pdf_path):
        yield "❌ Error: PDF file not found.", {}
        return

    yield "📄 PDF uploaded successfully. Processing started...", {}

    try:
        # Extract text from the PDF file.
        pdf_text = extract_text_from_pdf(pdf_path)

        if not pdf_text.strip():
            yield "❌ Error: The PDF content is empty or could not be read.", {}
            return

        # Split the PDF content into chunks.
        chunks = split_text_into_chunks(pdf_text, chunk_size)
        n_chunks = len(chunks)

        yield f"🔄 Splitting text into {n_chunks} chunks...", {}

        # Distribute the total number of questions across chunks.
        questions_distribution = distribute_questions_across_chunks(n_chunks, total_questions)
        combined_questions = []

        # Process each chunk and generate questions.
        for i, (chunk, n_questions) in enumerate(zip(chunks, questions_distribution), start=1):
            yield f"🔄 Processing chunk {i} of {n_chunks}...", {}
            if n_questions > 0:
                combined_questions.extend(
                    generate_questions_from_text(chunk, n_questions=n_questions)
                )

        if not combined_questions:
            yield "❌ Error: No questions generated from the PDF content.", {}
            return

        yield f"✅ Total {len(combined_questions)} questions generated. Saving questions...", {}

        # Save generated questions; explicit encoding avoids the platform default.
        save_path = "generated_questions_from_pdf.json"
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump({"questions": combined_questions}, f)

        yield "✅ PDF processing complete. Questions saved successfully!", {"questions": combined_questions}

    except Exception as e:
        error_message = f"❌ Error during question generation: {str(e)}"
        print(f"[ERROR] {error_message}")
        yield error_message, {}
258
-
259
- import json
260
- import os
261
-
262
def generate_and_save_questions_from_pdf3(pdf_path, total_questions=5, chunk_size=2000):
    """
    Generate questions from a PDF, yielding progress updates, and save them twice.

    Yields ``(status_message, payload)`` tuples; ``payload`` is ``{}`` for
    progress and error updates and ``{"questions": [...]}`` on the final,
    successful step. The questions are written as ``{"questions": [...]}`` to
    ``generated_questions_from_pdf.json`` and as a bare list to
    ``QUESTIONS_PATH``.

    Args:
        pdf_path: Path of the PDF to read.
        total_questions: Number of questions to distribute across the chunks.
        chunk_size: Maximum chunk length passed to ``split_text_into_chunks``.
    """
    print(f"[INFO] Generating questions from PDF: {pdf_path}")

    if not os.path.exists(pdf_path):
        yield "❌ Error: PDF file not found.", {}
        return

    yield "📄 PDF uploaded successfully. Processing started...", {}

    try:
        # Extract text from the PDF file.
        pdf_text = extract_text_from_pdf(pdf_path)

        if not pdf_text.strip():
            yield "❌ Error: The PDF content is empty or could not be read.", {}
            return

        # Split the PDF content into chunks.
        chunks = split_text_into_chunks(pdf_text, chunk_size)
        n_chunks = len(chunks)

        yield f"🔄 Splitting text into {n_chunks} chunks...", {}

        # Distribute the total number of questions across chunks.
        questions_distribution = distribute_questions_across_chunks(n_chunks, total_questions)
        combined_questions = []

        # Process each chunk and generate questions.
        for i, (chunk, n_questions) in enumerate(zip(chunks, questions_distribution), start=1):
            yield f"🔄 Processing chunk {i} of {n_chunks}...", {}
            if n_questions > 0:
                combined_questions.extend(
                    generate_questions_from_text(chunk, n_questions=n_questions)
                )

        if not combined_questions:
            yield "❌ Error: No questions generated from the PDF content.", {}
            return

        yield f"✅ Total {len(combined_questions)} questions generated. Saving questions...", {}

        # Detailed version: the questions wrapped in a {"questions": ...} object.
        detailed_save_path = "generated_questions_from_pdf.json"
        with open(detailed_save_path, "w", encoding="utf-8") as f:
            json.dump({"questions": combined_questions}, f)

        # Simple version: the bare list, overwriting QUESTIONS_PATH if it exists.
        # The shared constant is used instead of repeating the file name.
        with open(QUESTIONS_PATH, "w", encoding="utf-8") as f:
            json.dump(combined_questions, f)

        yield "✅ PDF processing complete. Questions saved successfully!", {"questions": combined_questions}

    except Exception as e:
        error_message = f"❌ Error during question generation: {str(e)}"
        print(f"[ERROR] {error_message}")
        yield error_message, {}
319
-
320
-
321
-
322
if __name__ == "__main__":
    # Script entry point: generate five questions from the sample exam guide.
    pdf_path = "professional_machine_learning_engineer_exam_guide_english.pdf"

    try:
        # The function returns a (status, payload) tuple, so unpack it rather
        # than treating the whole tuple as the generated questions.
        status, generated_questions = generate_and_save_questions_from_pdf(
            pdf_path, total_questions=5
        )
        print(status)
        print(f"Generated Questions:\n{json.dumps(generated_questions, indent=2)}")
    except Exception as e:
        print(f"Failed to generate questions: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from dotenv import load_dotenv
4
+ import fitz # PyMuPDF
5
+ from langchain_openai import ChatOpenAI # Correct import from langchain-openai
6
+ from langchain.schema import HumanMessage, SystemMessage # For creating structured chat messages
7
+
8
+ QUESTIONS_PATH = "questions.json"
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
def split_text_into_chunks(text: str, chunk_size: int) -> list:
    """
    Split ``text`` into whitespace-normalised chunks of at most ``chunk_size`` characters.

    Runs of whitespace are collapsed to single spaces and words are never
    broken, so a single word longer than ``chunk_size`` becomes a chunk of its
    own. Empty or whitespace-only text yields an empty list (no empty chunks
    are ever produced).
    """
    chunks = []
    current_chunk = []
    current_length = 0

    # str.split() with no argument collapses whitespace runs and drops empties.
    for word in text.split():
        # A separating space is only needed when the chunk already holds a word.
        added_length = len(word) + (1 if current_chunk else 0)
        if current_chunk and current_length + added_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += added_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
38
+
39
+
40
def distribute_questions_across_chunks(n_chunks: int, n_questions: int) -> list:
    """
    Distribute ``n_questions`` across ``n_chunks`` as evenly as possible.

    Returns a list of length ``n_chunks`` whose entries sum to ``n_questions``;
    earlier chunks receive the extra questions when the split is uneven.
    A non-positive ``n_chunks`` yields an empty list and a negative
    ``n_questions`` is treated as zero.
    """
    if n_chunks <= 0:
        return []

    # divmod keeps every question, even when there are more than two per chunk.
    base, extra = divmod(max(n_questions, 0), n_chunks)
    return [base + 1 if index < extra else base for index in range(n_chunks)]
58
+
59
+
60
def extract_text_from_pdf(pdf_path):
    """
    Return the concatenated text of every page of the PDF at ``pdf_path``.

    The document is opened with ``fitz`` (PyMuPDF). Any failure while opening
    or reading is logged and re-raised as ``RuntimeError`` with the original
    exception attached as its cause.
    """
    try:
        print(f"[DEBUG] Opening PDF: {pdf_path}")
        with fitz.open(pdf_path) as pdf:
            print(f"[DEBUG] Extracting text from PDF: {pdf_path}")
            # join avoids repeated string concatenation across pages.
            return "".join(page.get_text() for page in pdf)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        # Chain the cause so the underlying error stays visible to callers.
        raise RuntimeError("Unable to extract text from PDF.") from e
72
+
73
+
74
def generate_questions_from_text(text, n_questions=5):
    """
    Ask the chat model for ``n_questions`` interview questions about ``text``.

    Returns a list of strings of the form ``"Question <k>: <question>"``, one
    per non-empty line of the model's reply. Returns an empty list without
    contacting the API when ``n_questions`` is not positive. If the request or
    parsing fails, the error is logged and a single placeholder message is
    returned.

    Raises:
        RuntimeError: if the ``OPENAI_API_KEY`` environment variable is unset.
    """
    if n_questions <= 0:
        # Nothing requested: skip the (billable) API round-trip entirely.
        return []

    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        raise RuntimeError(
            "OpenAI API key not found. Please add it to your .env file as OPENAI_API_KEY."
        )

    chat = ChatOpenAI(
        openai_api_key=openai_api_key, model="gpt-4", temperature=0.7, max_tokens=750
    )

    messages = [
        SystemMessage(
            content="You are an expert interviewer who generates concise technical interview questions. Do not enumerate the questions. Answer only with questions."
        ),
        HumanMessage(
            content=f"Based on the following content, generate {n_questions} technical interview questions:\n{text}"
        ),
    ]

    try:
        print(f"[DEBUG] Sending request to OpenAI with {n_questions} questions.")
        response = chat.invoke(messages)
        # Split on every line break: replies separated by single newlines would
        # otherwise collapse into one item when splitting on blank lines only.
        questions = [
            line.strip() for line in response.content.splitlines() if line.strip()
        ]
        print(f"[DEBUG] Raw questions from LLM: {questions}")

        formatted_questions = [
            f"Question {number}: {question}"
            for number, question in enumerate(questions, start=1)
        ]

        print(f"[DEBUG] Formatted questions: {formatted_questions}")
        return formatted_questions
    except Exception as e:
        print(f"[ERROR] Failed to generate questions: {e}")
        return ["An error occurred while generating questions."]
111
+
112
+
113
+
114
+
115
def save_questions(questions):
    """Write ``questions`` to ``QUESTIONS_PATH`` as indented JSON, replacing any existing file."""
    # An explicit encoding keeps the file independent of the platform default.
    with open(QUESTIONS_PATH, "w", encoding="utf-8") as f:
        json.dump(questions, f, indent=4)
118
+
119
+
120
+
121
+ import os
122
+ import json
123
+ import re
124
+
125
+
126
def generate_and_save_questions_from_pdf3(pdf_path, total_questions=5, chunk_size=2000):
    """
    Generate questions from a PDF, yielding progress updates, and save them.

    Yields ``(status_message, questions)`` tuples; ``questions`` is ``[]`` for
    progress, warning and error updates and the full list of question strings
    on the final, successful step. The questions are written as
    ``{"questions": [...]}`` to ``generated_questions_from_pdf.json`` and, via
    ``save_questions``, as a bare list to ``QUESTIONS_PATH``.

    Args:
        pdf_path: Path of the PDF to read.
        total_questions: Number of questions to distribute across the chunks.
        chunk_size: Maximum chunk length passed to ``split_text_into_chunks``.
    """
    print(f"[INFO] Generating questions from PDF: {pdf_path}")
    print(f"[DEBUG] Number of total questions to generate: {total_questions}")

    if not os.path.exists(pdf_path):
        yield "❌ Error: PDF file not found.", []
        return

    yield "📄 PDF uploaded successfully. Processing started...", []

    def split_into_individual_questions(text_block):
        """
        Attempts to split a text block that might contain multiple questions
        (like '1. Some question? 2. Another question?') into separate items.
        """
        # 1) Remove any "Question X:" prefix (e.g., "Question 1: ").
        text_block = re.sub(r'Question\s*\d+\s*:\s*', '', text_block, flags=re.IGNORECASE)

        # 2) Split on numbering such as "1. ", "2. " (digits, a dot, whitespace).
        pieces = re.split(r'\d+\.\s+', text_block.strip())

        # 3) Clean up and filter out empty items.
        return [piece.strip() for piece in pieces if piece.strip()]

    try:
        # 1. Extract text from the PDF.
        pdf_text = extract_text_from_pdf(pdf_path)
        if not pdf_text.strip():
            yield "❌ Error: The PDF content is empty or could not be read.", []
            return

        # 2. Split the PDF content into chunks.
        chunks = split_text_into_chunks(pdf_text, chunk_size)
        n_chunks = len(chunks)

        yield f"🔄 Splitting text into {n_chunks} chunks...", []

        # 3. Distribute total_questions evenly across the chunks, using the
        #    helper shared with the job-description pipeline.
        questions_per_chunk = distribute_questions_evenly(total_questions, n_chunks)
        print(f"[DEBUG] Questions per chunk distribution: {questions_per_chunk}")

        combined_questions = []

        # 4. Process each chunk and generate questions.
        for i, (chunk, n_questions) in enumerate(zip(chunks, questions_per_chunk), start=1):
            yield f"🔄 Processing chunk {i} of {n_chunks} with {n_questions} questions...", []

            if n_questions <= 0:
                continue

            questions_output = generate_questions_from_text(chunk, n_questions=n_questions)
            # Tolerate a single string as well as a list of strings.
            if not isinstance(questions_output, list):
                questions_output = [questions_output]
            for item in questions_output:
                combined_questions.extend(split_into_individual_questions(str(item)))

        # 5. Never report success (or overwrite the saved files) with nothing.
        if not combined_questions:
            yield "❌ Error: No questions generated from the PDF content.", []
            return

        # 6. Warn when the number of generated questions differs from the request.
        if len(combined_questions) != total_questions:
            yield f"⚠️ Warning: Expected {total_questions}, but generated {len(combined_questions)}.", []

        yield f"✅ Total {len(combined_questions)} questions generated. Saving questions...", []

        # 7. Save the combined questions in `generated_questions_from_pdf.json`.
        detailed_save_path = "generated_questions_from_pdf.json"
        with open(detailed_save_path, "w", encoding="utf-8") as f:
            json.dump({"questions": combined_questions}, f, indent=4, ensure_ascii=False)

        # 8. Save only the questions (overwrites QUESTIONS_PATH if it exists).
        save_questions(combined_questions)
        print(f"[INFO] Questions saved to {QUESTIONS_PATH}")

        yield "✅ PDF processing complete. Questions saved successfully!", combined_questions

    except Exception as e:
        error_message = f"❌ Error during question generation: {str(e)}"
        print(f"[ERROR] {error_message}")
        yield error_message, []
220
+
221
def generate_questions_from_job_description_old(job_description, num_questions):
    """
    Generate ``num_questions`` questions from a job description in a single request.

    Returns:
        A ``(status_message, questions)`` tuple. ``questions`` is an empty list
        when the description is missing/blank, when nothing was generated, or
        when an error occurred; errors are logged and reported through the
        status message rather than raised.
    """
    print(f"[DEBUG] Generating {num_questions} questions from job description.")

    # Treat None the same as a blank description instead of crashing on .strip().
    if not job_description or not job_description.strip():
        return "❌ Error: Job description is empty.", []

    try:
        # generate_questions_from_text names its parameter `n_questions`; the
        # previous `num_questions=` keyword raised TypeError on every call.
        questions = generate_questions_from_text(job_description, n_questions=num_questions)

        if not questions:
            return "❌ Error: No questions generated.", []

        return "✅ Questions generated successfully!", questions

    except Exception as e:
        error_message = f"❌ Error during question generation: {str(e)}"
        print(f"[ERROR] {error_message}")
        return error_message, []
239
+
240
+ import os
241
+ import json
242
+ import math
243
+ import re
244
+ import os
245
+ import json
246
+ import math
247
+ import re
248
+
249
def distribute_questions_evenly(total_questions, n_chunks):
    """
    Split ``total_questions`` into ``n_chunks`` counts that differ by at most one.

    The first ``total_questions % n_chunks`` chunks receive one extra question.
    A non-positive ``n_chunks`` yields an empty list (instead of dividing by
    zero) and a negative ``total_questions`` is treated as zero.
    """
    if n_chunks <= 0:
        return []

    base, remainder = divmod(max(total_questions, 0), n_chunks)
    # The first `remainder` chunks absorb the leftover questions.
    return [base + 1] * remainder + [base] * (n_chunks - remainder)
260
+
261
+
262
def generate_questions_from_job_description(job_description, total_questions=5, chunk_size=2000):
    """
    Generate questions from a job description, chunk by chunk, and save them.

    The description is split into chunks of at most ``chunk_size`` characters,
    ``total_questions`` is distributed evenly across them, and each chunk's
    output is split into individual questions. The questions are written as
    ``{"questions": [...]}`` to ``generated_questions_from_job_description.json``
    and, via ``save_questions``, as a bare list to ``QUESTIONS_PATH``.

    Returns:
        A ``(status_message, questions)`` tuple. ``questions`` is an empty list
        when the description is missing/blank, when nothing was generated, or
        when an error occurred; errors are logged and reported through the
        status message rather than raised.
    """
    print(f"[DEBUG] Generating {total_questions} questions from job description.")

    # Treat None the same as a blank description instead of crashing on .strip().
    if not job_description or not job_description.strip():
        return "❌ Error: Job description is empty.", []

    def split_into_individual_questions(text_block):
        """
        Attempts to split a text block that might contain multiple questions
        (like '1. Some question? 2. Another question?') into separate items.
        """
        # Remove any "Question X:" prefix (e.g., "Question 1: ").
        text_block = re.sub(r'Question\s*\d+\s*:\s*', '', text_block, flags=re.IGNORECASE)

        # Split on numbering such as "1. ", "2. " (digits, a dot, whitespace).
        pieces = re.split(r'\d+\.\s+', text_block.strip())

        # Clean up and filter out empty items.
        return [piece.strip() for piece in pieces if piece.strip()]

    try:
        # 1. Split the job description into chunks.
        chunks = split_text_into_chunks(job_description, chunk_size)
        n_chunks = len(chunks)

        print(f"[DEBUG] Splitting text into {n_chunks} chunks...")

        # 2. Distribute total_questions evenly across the chunks.
        questions_per_chunk = distribute_questions_evenly(total_questions, n_chunks)
        print(f"[DEBUG] Questions per chunk distribution: {questions_per_chunk}")

        combined_questions = []

        # 3. Process each chunk and generate questions.
        for i, (chunk, n_questions) in enumerate(zip(chunks, questions_per_chunk), start=1):
            print(f"[DEBUG] Processing chunk {i} of {n_chunks} with {n_questions} questions...")

            if n_questions <= 0:
                continue

            questions_output = generate_questions_from_text(chunk, n_questions=n_questions)
            # Tolerate a single string as well as a list of strings.
            if not isinstance(questions_output, list):
                questions_output = [questions_output]
            for item in questions_output:
                combined_questions.extend(split_into_individual_questions(str(item)))

        # Never report success (or overwrite the saved files) with nothing.
        if not combined_questions:
            return "❌ Error: No questions generated.", []

        if len(combined_questions) != total_questions:
            print(f"⚠️ Warning: Expected {total_questions}, but generated {len(combined_questions)}.")

        print(f"✅ Total {len(combined_questions)} questions generated. Saving questions...")

        # Save the combined questions in `generated_questions_from_job_description.json`.
        detailed_save_path = "generated_questions_from_job_description.json"
        with open(detailed_save_path, "w", encoding="utf-8") as f:
            json.dump({"questions": combined_questions}, f, indent=4, ensure_ascii=False)

        # Save only the questions (overwrites QUESTIONS_PATH if it exists).
        save_questions(combined_questions)
        print(f"[INFO] Questions saved to {QUESTIONS_PATH}")
        return "✅ Job description processing complete. Questions saved successfully!", combined_questions

    except Exception as e:
        error_message = f"❌ Error during question generation: {str(e)}"
        print(f"[ERROR] {error_message}")
        return error_message, []
333
+
334
+
335
if __name__ == "__main__":
    # Script entry point: stream progress for the sample exam guide.
    pdf_path = "professional_machine_learning_engineer_exam_guide_english.pdf"  # Replace with your PDF path

    try:
        # Each update from the generator is a (status, questions) pair.
        for update in generate_and_save_questions_from_pdf3(pdf_path, total_questions=5):
            status, questions = update
            print(status)
            if not questions:
                continue
            # Only the final update carries questions; show them as JSON.
            print(json.dumps(questions, indent=2))
    except Exception as e:
        print(f"Failed to generate questions: {e}")