from PIL import Image
import pytesseract
import os
import pymupdf
import spaces
import torch
import gradio as gr
from prepare import prepare
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
import huggingface_hub

# zero = torch.Tensor([0]).cuda()

# Authenticate with the Hugging Face Hub using the HF_TOKEN environment variable.
load_dotenv()
api_token = os.getenv("HF_TOKEN")
huggingface_hub.login(token=api_token)

# Load the Gemma 2 2B model and tokenizer once at startup.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b')
model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b').to(device)


# @spaces.GPU
def read_pdf(file_path):
    """Extract text from a PDF, falling back to OCR for pages without a text layer."""
    output = ''
    doc = pymupdf.open(file_path)
    for page_index in range(len(doc)):
        text = doc[page_index].get_text()
        if text:
            output += text
        else:
            # No extractable text on this page: OCR its images (Vietnamese language pack).
            image_list = doc[page_index].get_images()
            for image_index, img_info in enumerate(image_list, start=1):
                xref = img_info[0]                # XREF of the image
                pix = pymupdf.Pixmap(doc, xref)   # create a Pixmap
                if pix.n - pix.alpha > 3:         # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                path = "page_{}-image_{}.png".format(page_index, image_index)
                pix.save(path)                    # save the image as PNG
                image = Image.open(path)
                pix = None
                output += pytesseract.image_to_string(image, lang='vie') + '\n'
                os.remove(path)
    return output


@spaces.GPU(duration=60)
def LLM_Inference(cv_text):
    """Prompt the model to extract structured information from the CV text."""
    text = f'''
You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:

**CV**
{cv_text}

**Information extraction and output format**
1. Candidate Information
   - Full Name
   - Contact Information (Phone, Email, Address, etc.)
   - Date of Birth (if available)
2. Education
   - Degree Name (e.g., Bachelor's, Master's, Ph.D.)
   - Field of Study (e.g., Computer Science, Business Administration)
   - Institution Name
   - Year(s) of Graduation
3. Professional Experience
   For each job, extract:
   - Job Title
   - Company Name
   - Duration (start and end dates)
   - Summary of key Responsibilities and Achievements
4. Skills
   - List of technical, soft, or industry-specific skills mentioned.
5. Certifications
   - Name of Certification
   - Issuing Organization
   - Year of Issuance
6. Languages
   - List the languages mentioned in the CV along with proficiency levels (if specified).

Do not explain, comment, or make up any information that is not related to the list above. Respond in the language of the CV.
Let's work this out in a step-by-step way to ensure the correct answer.
Do not repeat the steps.
'''
    inputs = tokenizer(text, return_tensors='pt', max_length=2048, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False,  # greedy decoding: deterministic, always picks the most likely token
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def process(file_path):
    cv_text = read_pdf(file_path)
    cv_summary = LLM_Inference(cv_text)
    return cv_text, cv_summary


# Create the Gradio app
interface = gr.Interface(
    fn=process,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=[
        gr.Textbox(label="PDF Content"),  # raw text extracted from the PDF
        gr.Textbox(label="CV Summary"),   # structured information extracted by the model
    ],
    title="PDF Processor",
    description="Upload a PDF file and extract its content."
)

# Launch the Gradio app
if __name__ == "__main__":
    prepare()
    interface.launch()