Spaces:

ruslanmv
/

Job-Interview

Running

App Files Files Community

Job-Interview / questions.py

ruslanmv

First commit

5798cfc 5 months ago

raw

history blame

4.93 kB

	import os
	import json
	from dotenv import load_dotenv
	import fitz # PyMuPDF
	from langchain_openai import ChatOpenAI # Correct import from langchain-openai
	from langchain.schema import HumanMessage, SystemMessage # For creating structured chat messages

	QUESTIONS_PATH = "questions.json"

	# Load environment variables
	load_dotenv()

	def split_text_into_chunks(text: str, chunk_size: int) -> list:
	"""
	Splits the text into chunks of a specified maximum size.
	"""
	# Trim the text to remove leading/trailing whitespace and reduce multiple spaces to a single space
	cleaned_text = " ".join(text.split())
	words = cleaned_text.split(" ")

	chunks = []
	current_chunk = []
	current_length = 0

	for word in words:
	if current_length + len(word) + 1 > chunk_size:
	chunks.append(" ".join(current_chunk))
	current_chunk = [word]
	current_length = len(word)
	else:
	current_chunk.append(word)
	current_length += len(word) + 1

	if current_chunk:
	chunks.append(" ".join(current_chunk))

	return chunks


	def distribute_questions_across_chunks(n_chunks: int, n_questions: int) -> list:
	"""
	Distributes a specified number of questions across a specified number of chunks.
	"""
	questions_per_chunk = [1] * min(n_chunks, n_questions)
	remaining_questions = n_questions - len(questions_per_chunk)

	if remaining_questions > 0:
	for i in range(len(questions_per_chunk)):
	if remaining_questions == 0:
	break
	questions_per_chunk[i] += 1
	remaining_questions -= 1

	while len(questions_per_chunk) < n_chunks:
	questions_per_chunk.append(0)

	return questions_per_chunk


	def extract_text_from_pdf(pdf_path):
	text = ""
	try:
	print(f"[DEBUG] Opening PDF: {pdf_path}")
	with fitz.open(pdf_path) as pdf:
	print(f"[DEBUG] Extracting text from PDF: {pdf_path}")
	for page in pdf:
	text += page.get_text()
	except Exception as e:
	print(f"Error reading PDF: {e}")
	raise RuntimeError("Unable to extract text from PDF.")
	return text


	def generate_questions_from_text(text, n_questions=5):
	openai_api_key = os.getenv("OPENAI_API_KEY")

	if not openai_api_key:
	raise RuntimeError(
	"OpenAI API key not found. Please add it to your .env file as OPENAI_API_KEY."
	)

	chat = ChatOpenAI(
	openai_api_key=openai_api_key, model="gpt-4", temperature=0.7, max_tokens=750
	)

	messages = [
	SystemMessage(
	content="You are an expert interviewer who generates concise technical interview questions. Do not enumerate the questions. Answer only with questions."
	),
	HumanMessage(
	content=f"Based on the following content, generate {n_questions} technical interview questions:\n{text}"
	),
	]

	try:
	print(f"[DEBUG] Sending request to OpenAI with {n_questions} questions.")
	response = chat.invoke(messages)
	questions = response.content.strip().split("\n\n")
	questions = [q.strip() for q in questions if q.strip()]
	except Exception as e:
	print(f"[ERROR] Failed to generate questions: {e}")
	questions = ["An error occurred while generating questions."]

	return questions


	def save_questions(questions):
	with open(QUESTIONS_PATH, "w") as f:
	json.dump(questions, f, indent=4)


	def generate_and_save_questions_from_pdf(pdf_path, total_questions=5):
	print(f"[INFO] Generating questions from PDF: {pdf_path}")
	pdf_text = extract_text_from_pdf(pdf_path)

	if not pdf_text.strip():
	raise RuntimeError("The PDF content is empty or could not be read.")

	chunk_size = 2000
	chunks = split_text_into_chunks(pdf_text, chunk_size)
	n_chunks = len(chunks)

	questions_distribution = distribute_questions_across_chunks(n_chunks, total_questions)
	combined_questions = []

	for i, (chunk, n_questions) in enumerate(zip(chunks, questions_distribution)):
	print(f"[DEBUG] Processing chunk {i + 1} of {n_chunks}")
	if n_questions > 0:
	questions = generate_questions_from_text(chunk, n_questions=n_questions)
	combined_questions.extend(questions)

	print(f"[INFO] Total questions generated: {len(combined_questions)}")
	save_questions(combined_questions)
	print(f"[INFO] Questions saved to {QUESTIONS_PATH}")
	return combined_questions


	if __name__ == "__main__":
	pdf_path = "professional_machine_learning_engineer_exam_guide_english.pdf"

	try:
	generated_questions = generate_and_save_questions_from_pdf(
	pdf_path, total_questions=5
	)
	print(f"Generated Questions:\n{json.dumps(generated_questions, indent=2)}")
	except Exception as e:
	print(f"Failed to generate questions: {e}")