Namanj46 commited on
Commit
ef33567
1 Parent(s): 61005f4

Initial commit

Browse files
.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import openai
import gradio as gr
import pdfplumber
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.node_parser import SentenceWindowNodeParser

# Read the OpenAI API key from the environment (never hard-code secrets).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

# Directory that holds the resume PDFs, relative to the working directory.
resume_path = 'resumes'
if not os.path.exists(resume_path):
    # Fail fast, naming the offending path — the original used an f-string
    # with no placeholder, so the path never appeared in the error message.
    raise ValueError(f"Directory '{resume_path}' not found")
19
+
20
# Function to load PDFs using pdfplumber
def load_pdfs_with_pdfplumber(directory):
    """Extract text from every ``*.pdf`` in *directory* as llama-index Documents.

    Pages whose extraction yields nothing (``extract_text()`` returns ``None``)
    contribute an empty string. A file that raises is skipped with a diagnostic
    so one corrupt PDF cannot abort the whole load.

    Returns a list of ``Document`` objects, one per readable PDF.
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            try:
                with pdfplumber.open(os.path.join(directory, filename)) as pdf:
                    text = ""
                    for page in pdf.pages:
                        text += page.extract_text() or ""
                    documents.append(Document(text=text))
            except Exception as e:
                # Name the failing file — the original printed a placeholder
                # instead of the filename, making failures undiagnosable.
                print(f"Error processing {filename}: {e}")
    return documents
34
+
35
# Build the corpus from the resume directory.
documents = load_pdfs_with_pdfplumber(resume_path)
print(f"Number of documents: {len(documents)}")

# LLM used both for answer synthesis and for reranking.
llm = OpenAI(model="gpt-4o", temperature=0.9)

# Embedding model backing the vector index.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Sentence-window parsing: each node carries a 3-sentence context window
# in its metadata, restored later by MetadataReplacementPostProcessor.
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Register the components globally so the index construction picks them up.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = sentence_node_parser

# Embed and index every resume document.
index = VectorStoreIndex.from_documents(documents)
59
+
60
# Custom reranking function
def custom_rerank(nodes, query):
    """Rerank retrieved *nodes* by asking the LLM to score each chunk 1-10.

    Returns the top 5 nodes by rating. Falls back to the first 5 nodes in
    their original order whenever the LLM response cannot be parsed into
    exactly one rating per node.
    """
    import re  # local import: only needed here, for rating extraction

    if not nodes:
        # Nothing to rank — avoid a pointless (and crashing) LLM round-trip.
        return []

    rerank_prompt = (
        "Given the following query and text chunks, rate each chunk's relevance "
        "to the query on a scale of 1-10, where 10 is most relevant.\n\n"
        f"Query: {query}\n\n"
    )
    for i, node in enumerate(nodes):
        rerank_prompt += f"Chunk {i+1}:\n{node.get_content()}\n\n"

    rerank_prompt += "Provide your ratings as a comma-separated list of numbers, e.g., '7,4,9,2,6'"

    response = llm.complete(rerank_prompt)
    try:
        # Extract integers even when the model wraps them in extra prose;
        # int(r.strip()) on raw comma-splits breaks on any surrounding text.
        ratings = [int(r) for r in re.findall(r"\d+", response.text)]
        if len(ratings) != len(nodes):
            raise ValueError("Number of ratings does not match number of nodes")
        sorted_nodes = [
            node
            for _, node in sorted(zip(ratings, nodes), key=lambda x: x[0], reverse=True)
        ]
        return sorted_nodes[:5]  # Return top 5 reranked nodes
    except Exception as e:
        print(f"Error in reranking: {e}, returning original order")
        return nodes[:5]
82
+
83
# Query engine: retrieve 20 candidate chunks, then replace each node's text
# with its stored sentence window before synthesis.
query_engine = index.as_query_engine(
    similarity_top_k=20,
    node_postprocessors=[MetadataReplacementPostProcessor("window")],
)
90
+
91
# Chatbot function
def chatbot(message, history):
    """Answer *message* over the resume index, reranking retrieved chunks.

    *history* is the Gradio ChatInterface list of (human, ai) pairs; it is
    folded into the query text so follow-up questions keep their context.
    Returns the LLM's answer as plain text.
    """
    turns = [f"Human: {user}\nAI: {ai}" for user, ai in history]
    history_text = "\n".join(turns)
    full_query = f"Given the following chat history:\n{history_text}\n\nHuman: {message}\nAI:"

    # Retrieve candidate nodes, then let the LLM rerank them.
    retrieved_nodes = query_engine.retrieve(full_query)
    reranked_nodes = custom_rerank(retrieved_nodes, full_query)

    # Assemble the context block and synthesize the final answer.
    context = "\n".join(node.get_content() for node in reranked_nodes)
    answer = llm.complete(
        f"Using the following context, answer the query:\n\nContext: {context}\n\nQuery: {full_query}"
    )
    return answer.text
109
+
110
# Gradio chat UI wired to the chatbot() handler above.
iface = gr.ChatInterface(
    chatbot,
    title="Resume Chatbot",
    description="Ask questions about resumes in the database.",
    theme="soft",
    examples=[
        "Out of all the resumes tell me three of them who have experience in SQL?",
        "Give me key summary takeaways of the resumes who have experience in Project Management?",
        "Give me the names of 10 candidates who have more than two years of experience in general?",
    ],
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(scale=5),
)

# Launch the interface (public share link enabled).
iface.launch(share=True)
.ipynb_checkpoints/config-checkpoint.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=REPLACE_WITH_YOUR_OPENAI_API_KEY
.ipynb_checkpoints/requirements-checkpoint.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ huggingface_hub==0.22.2
app.py CHANGED
@@ -1,63 +1,132 @@
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
- """
43
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
- """
45
- demo = gr.ChatInterface(
46
- respond,
47
- additional_inputs=[
48
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
- gr.Slider(
52
- minimum=0.1,
53
- maximum=1.0,
54
- value=0.95,
55
- step=0.05,
56
- label="Top-p (nucleus sampling)",
57
- ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  ],
59
  )
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- if __name__ == "__main__":
63
- demo.launch()
 
1
import os

import openai
import gradio as gr
import pdfplumber
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.node_parser import SentenceWindowNodeParser

from dotenv import load_dotenv
# Pull OPENAI_API_KEY (and any other secrets) from config.env into os.environ.
load_dotenv("config.env")

# Read the OpenAI API key from the environment (never hard-code secrets).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

# Directory that holds the resume PDFs, relative to the working directory.
resume_path = 'resumes'
if not os.path.exists(resume_path):
    # Fail fast, naming the offending path — the original used an f-string
    # with no placeholder, so the path never appeared in the error message.
    raise ValueError(f"Directory '{resume_path}' not found")
22
+
23
# Function to load PDFs using pdfplumber
def load_pdfs_with_pdfplumber(directory):
    """Extract text from every ``*.pdf`` in *directory* as llama-index Documents.

    Pages whose extraction yields nothing (``extract_text()`` returns ``None``)
    contribute an empty string. A file that raises is skipped with a diagnostic
    so one corrupt PDF cannot abort the whole load.

    Returns a list of ``Document`` objects, one per readable PDF.
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            try:
                with pdfplumber.open(os.path.join(directory, filename)) as pdf:
                    text = ""
                    for page in pdf.pages:
                        text += page.extract_text() or ""
                    documents.append(Document(text=text))
            except Exception as e:
                # Name the failing file — the original printed a placeholder
                # instead of the filename, making failures undiagnosable.
                print(f"Error processing {filename}: {e}")
    return documents
37
+
38
# Build the corpus from the resume directory.
documents = load_pdfs_with_pdfplumber(resume_path)
print(f"Number of documents: {len(documents)}")

# LLM used both for answer synthesis and for reranking.
llm = OpenAI(model="gpt-4o", temperature=0.9)

# Embedding model backing the vector index.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Sentence-window parsing: each node carries a 3-sentence context window
# in its metadata, restored later by MetadataReplacementPostProcessor.
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Register the components globally so the index construction picks them up.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = sentence_node_parser

# Embed and index every resume document.
index = VectorStoreIndex.from_documents(documents)
62
+
63
# Custom reranking function
def custom_rerank(nodes, query):
    """Rerank retrieved *nodes* by asking the LLM to score each chunk 1-10.

    Returns the top 5 nodes by rating. Falls back to the first 5 nodes in
    their original order whenever the LLM response cannot be parsed into
    exactly one rating per node.
    """
    import re  # local import: only needed here, for rating extraction

    if not nodes:
        # Nothing to rank — avoid a pointless (and crashing) LLM round-trip.
        return []

    rerank_prompt = (
        "Given the following query and text chunks, rate each chunk's relevance "
        "to the query on a scale of 1-10, where 10 is most relevant.\n\n"
        f"Query: {query}\n\n"
    )
    for i, node in enumerate(nodes):
        rerank_prompt += f"Chunk {i+1}:\n{node.get_content()}\n\n"

    rerank_prompt += "Provide your ratings as a comma-separated list of numbers, e.g., '7,4,9,2,6'"

    response = llm.complete(rerank_prompt)
    try:
        # Extract integers even when the model wraps them in extra prose;
        # int(r.strip()) on raw comma-splits breaks on any surrounding text.
        ratings = [int(r) for r in re.findall(r"\d+", response.text)]
        if len(ratings) != len(nodes):
            raise ValueError("Number of ratings does not match number of nodes")
        sorted_nodes = [
            node
            for _, node in sorted(zip(ratings, nodes), key=lambda x: x[0], reverse=True)
        ]
        return sorted_nodes[:5]  # Return top 5 reranked nodes
    except Exception as e:
        print(f"Error in reranking: {e}, returning original order")
        return nodes[:5]
85
+
86
# Query engine: retrieve 20 candidate chunks, then replace each node's text
# with its stored sentence window before synthesis.
query_engine = index.as_query_engine(
    similarity_top_k=20,
    node_postprocessors=[MetadataReplacementPostProcessor("window")],
)
93
 
94
# Chatbot function
def chatbot(message, history):
    """Answer *message* over the resume index, reranking retrieved chunks.

    *history* is the Gradio ChatInterface list of (human, ai) pairs; it is
    folded into the query text so follow-up questions keep their context.
    Returns the LLM's answer as plain text.
    """
    turns = [f"Human: {user}\nAI: {ai}" for user, ai in history]
    history_text = "\n".join(turns)
    full_query = f"Given the following chat history:\n{history_text}\n\nHuman: {message}\nAI:"

    # Retrieve candidate nodes, then let the LLM rerank them.
    retrieved_nodes = query_engine.retrieve(full_query)
    reranked_nodes = custom_rerank(retrieved_nodes, full_query)

    # Assemble the context block and synthesize the final answer.
    context = "\n".join(node.get_content() for node in reranked_nodes)
    answer = llm.complete(
        f"Using the following context, answer the query:\n\nContext: {context}\n\nQuery: {full_query}"
    )
    return answer.text
112
+
113
# Gradio chat UI wired to the chatbot() handler above.
iface = gr.ChatInterface(
    chatbot,
    title="Resume Chatbot",
    description="Ask questions about resumes in the database.",
    theme="soft",
    examples=[
        "Out of all the resumes tell me three of them who have experience in SQL?",
        "Give me key summary takeaways of the resumes who have experience in Project Management?",
        "Give me the names of 10 candidates who have more than two years of experience in general?",
    ],
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(scale=5),
)

# Launch the interface (public share link enabled).
iface.launch(share=True)
config.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=REPLACE_WITH_YOUR_OPENAI_API_KEY
requirements.txt CHANGED
@@ -1 +1,4 @@
1
- huggingface_hub==0.22.2
 
 
 
 
1
+ gradio
2
+ pdfplumber
3
+ openai
4
+ llama_index
5
+ python-dotenv