Spaces:
Runtime error
Runtime error
adityasugandhi
commited on
Commit
·
919910a
1
Parent(s):
95e4875
,,
Browse files- Dockerfile +8 -7
- Front-End Design.docx +0 -0
- Inferencer.py +138 -0
- Personal LLM by stranzersweb.docx +0 -0
- Pipfile +11 -0
- Readme.Md +60 -0
- SystemDesign.png +0 -0
- __pycache__/Inferencer.cpython-310.pyc +0 -0
- __pycache__/dataloader.cpython-310.pyc +0 -0
- app.py +85 -0
- data/Aditya_test.txt +17 -0
- data/Aditya_train.txt +28 -0
- data/Resume_Vishwam_Shah_Back_end.pdf +0 -0
- data/mf.txt +17 -0
- dataloader.py +106 -0
- env.yaml +349 -0
- output_results.json +3 -0
- output_results.txt +3 -0
- rag_model.ipynb +328 -0
- req.txt +135 -0
- test.ipynb +470 -0
- test2.ipynb +272 -0
- test_trainer/runs/.DS_Store +0 -0
- test_trainer/runs/Feb22_22-15-01_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658123.bfs-v13-skynet.coaps.fsu.edu.3062760.0 +0 -0
- test_trainer/runs/Feb22_22-17-41_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658261.bfs-v13-skynet.coaps.fsu.edu.3062760.1 +0 -0
- test_trainer/runs/Feb22_22-17-41_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658535.bfs-v13-skynet.coaps.fsu.edu.3062760.2 +0 -0
- test_trainer/runs/Feb22_22-24-50_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658690.bfs-v13-skynet.coaps.fsu.edu.3062760.3 +0 -0
- utils/ExtractQA.py +27 -0
- utils/__init__.py +0 -0
- utils/dataloader.py +29 -0
- utils/prompt_builder.py +29 -0
Dockerfile
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
-
|
2 |
-
FROM python:3.10
|
3 |
|
4 |
-
# Set the working directory inside the container
|
5 |
WORKDIR /app
|
6 |
|
7 |
-
|
8 |
-
COPY requirements.txt /requirements.txt
|
9 |
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9.5-slim
|
|
|
2 |
|
|
|
3 |
WORKDIR /app
|
4 |
|
5 |
+
COPY ./req.txt /app/req.txt
|
|
|
6 |
|
7 |
+
RUN pip install --upgrade pip && \
|
8 |
+
pip install --no-cache-dir -r req.txt
|
9 |
+
|
10 |
+
COPY . /app
|
11 |
+
|
12 |
+
CMD ["python", "app.py"]
|
Front-End Design.docx
ADDED
Binary file (14.2 kB). View file
|
|
Inferencer.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from haystack import Pipeline
|
2 |
+
from haystack.utils import Secret
|
3 |
+
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
|
4 |
+
# from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator
|
5 |
+
from haystack.components.readers import ExtractiveReader
|
6 |
+
# from haystack.components.generators import GPTGenerator
|
7 |
+
from haystack.components.builders.prompt_builder import PromptBuilder
|
8 |
+
from haystack.components.builders.answer_builder import AnswerBuilder
|
9 |
+
from haystack.components.generators import OpenAIGenerator
|
10 |
+
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
11 |
+
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
|
12 |
+
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
|
13 |
+
from dataloader import DataLoader
|
14 |
+
from dotenv import load_dotenv
|
15 |
+
import os
|
16 |
+
# Populate os.environ from a local .env file (if present) before any code
# below reads configuration such as OPENAI_API_KEY.
load_dotenv()  # Load variables from .env file


# Module-level DataLoader instance; the Inferncer class reads its
# `chroma_store` and `InMemory_store` attributes when it is constructed.
chroma_store_loader = DataLoader()
|
20 |
+
class Inferncer:
    """Question answering over the loaded documents with a Haystack pipeline.

    The document stores are taken from the module-level
    ``chroma_store_loader`` when an instance is created.
    """

    def __init__(self):
        self.chroma_store = chroma_store_loader.chroma_store
        self.InMemory_store = chroma_store_loader.InMemory_store

    def OpenAI(self, query):
        """Answer ``query`` with retrieval, extractive reading and an OpenAI LLM.

        The pipeline is: Chroma text retriever -> extractive reader ->
        prompt builder -> OpenAI generator.

        Args:
            query: The user's question as a string.

        Returns:
            The result dictionary produced by ``Pipeline.run``.

        Raises:
            ValueError: If the ``OPENAI_API_KEY`` environment variable is
                missing or empty.
        """
        template = """

        Utilize the provided context related to Aditya Sugandhi to answer the question. If the answer is not explicitly available in the given information, generate a response using the Language Model (LLM). Optimize the process for clarity and efficiency.
        Context:
        {% for context in answers %}
            {{ context }}
        {% endfor %}
        Question: {{question}}
        Answer:
        """

        # SECURITY: the key is read from the environment only. A literal key
        # used to be hard-coded here and silently replaced the environment
        # value; any key that was ever committed must be treated as leaked
        # and rotated.
        api_key_value = os.environ.get("OPENAI_API_KEY")
        if not api_key_value:
            raise ValueError(
                "OPENAI_API_KEY is not set; define it in the environment or in .env"
            )
        api_key = Secret.from_token(api_key_value)

        prompt_builder = PromptBuilder(template=template)
        retriever = ChromaQueryTextRetriever(document_store=self.chroma_store)
        llm = OpenAIGenerator(model="gpt-3.5-turbo-0125", api_key=api_key)
        # ExtractiveReader extracts answer spans from the retrieved documents.
        reader = ExtractiveReader(model="deepset/roberta-base-squad2-distilled")

        extractive_qa_pipeline = Pipeline()
        extractive_qa_pipeline.add_component("retriever", retriever)
        extractive_qa_pipeline.add_component("reader", reader)
        extractive_qa_pipeline.add_component(instance=prompt_builder, name="prompt_builder")
        extractive_qa_pipeline.add_component("llm", llm)

        extractive_qa_pipeline.connect("retriever.documents", "reader.documents")
        extractive_qa_pipeline.connect("reader.answers", "prompt_builder.answers")
        extractive_qa_pipeline.connect("prompt_builder", "llm")

        # Per-component inputs: the same question goes to the retriever,
        # the reader and the prompt template.
        input_data = {
            "retriever": {"query": query, "top_k": 2},
            "reader": {"query": query, "top_k": 2},
            "prompt_builder": {"question": query},
        }

        # Run the pipeline with the input data
        results = extractive_qa_pipeline.run(input_data)
        return results
|
73 |
+
|
74 |
+
# def LlamaCpp(self,query):
|
75 |
+
# template = """
|
76 |
+
# ` Answer the question using the provided context based on Aditya.
|
77 |
+
|
78 |
+
# Context:
|
79 |
+
# {% for doc in documents %}
|
80 |
+
# {{ doc.content }}
|
81 |
+
# {% endfor %}
|
82 |
+
# Question: {{question}}
|
83 |
+
# Answer:
|
84 |
+
# """
|
85 |
+
# self.InMemory_store = chroma_store_loader.InMemory_dataloader()
|
86 |
+
# prompt_builder = PromptBuilder(template=template)
|
87 |
+
# retriever = InMemoryEmbeddingRetriever(document_store = self.InMemory_store)
|
88 |
+
# #ExtractiveReader to extract answers from the relevant context
|
89 |
+
|
90 |
+
# llm = LlamaCppGenerator(
|
91 |
+
# model_path="openchat-3.5-1210.Q3_K_S.ggml",
|
92 |
+
# n_ctx=30000,
|
93 |
+
# n_batch=256,
|
94 |
+
# model_kwargs={"n_gpu_layers": 2, "main_gpu": 1},
|
95 |
+
# generation_kwargs={"max_tokens": 250, "temperature": 0.7},
|
96 |
+
# )
|
97 |
+
# llm.warm_up()
|
98 |
+
|
99 |
+
# # reader = ExtractiveReader(model="deepset/roberta-base-squad2-distilled",)
|
100 |
+
# extractive_qa_pipeline = Pipeline()
|
101 |
+
# text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
|
102 |
+
# extractive_qa_pipeline.add_component('text_embedder', text_embedder)
|
103 |
+
# extractive_qa_pipeline.add_component("retriever", retriever)
|
104 |
+
# # extractive_qa_pipeline.add_component("reader",reader)
|
105 |
+
|
106 |
+
# extractive_qa_pipeline.add_component(instance=prompt_builder, name="prompt_builder")
|
107 |
+
# extractive_qa_pipeline.add_component("llm", llm)
|
108 |
+
# # extractive_qa_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")
|
109 |
+
|
110 |
+
# # extractive_qa_pipeline.connect("retriever.documents", "reader")
|
111 |
+
# extractive_qa_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
|
112 |
+
# extractive_qa_pipeline.connect("retriever.documents", "prompt_builder.documents")
|
113 |
+
# extractive_qa_pipeline.connect("prompt_builder", "llm")
|
114 |
+
# # extractive_qa_pipeline.connect("llm.replies", "answer_builder.replies")
|
115 |
+
# # extractive_qa_pipeline.connect("retriever", "answer_builder.documents")
|
116 |
+
|
117 |
+
# # Define the input data for the pipeline components
|
118 |
+
# input_data = {
|
119 |
+
# "text_embedder": {"text": query},
|
120 |
+
# # "retriever": {"query": query, "top_k": 3},
|
121 |
+
# # "reader": {"query": query},
|
122 |
+
# "prompt_builder": {"question": query},
|
123 |
+
# # "answer_builder": {"query": query},
|
124 |
+
# # Use 'max_tokens' instead of 'max_new_tokens'
|
125 |
+
# }
|
126 |
+
|
127 |
+
# # Run the pipeline with the updated input data
|
128 |
+
# results = extractive_qa_pipeline.run(input_data)
|
129 |
+
# return results
|
130 |
+
|
131 |
+
|
132 |
+
|
133 |
+
# #{
|
134 |
+
# "error": "Cannot connect 'text_embedder' with 'retriever': no matching connections available.\n'text_embedder':\n - embedding: List[float]\n'retriever':\n - query: str (available)\n - _: Optional[Dict[str, Any]] (available)\n - top_k: Optional[int] (available)"
|
135 |
+
# }
|
136 |
+
|
137 |
+
|
138 |
+
|
Personal LLM by stranzersweb.docx
ADDED
Binary file (13.4 kB). View file
|
|
Pipfile
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[[source]]
|
2 |
+
url = "https://pypi.org/simple"
|
3 |
+
verify_ssl = true
|
4 |
+
name = "pypi"
|
5 |
+
|
6 |
+
[packages]
|
7 |
+
|
8 |
+
[dev-packages]
|
9 |
+
|
10 |
+
[requires]
|
11 |
+
python_version = "3.9"
|
Readme.Md
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Personal LLM by StranzersWeb Inc: System Design with Flask Backend
|
2 |
+
|
3 |
+
## Introduction
|
4 |
+
|
5 |
+
StranzersWeb Inc proudly presents its innovative Personal Large Language Model (LLM) application, employing Flask as the backend service to harness the capabilities of AI and Large Language Models for precise Question and Answer inference. The integration of Haystack-ai ensures efficient custom database management, enhancing the overall accuracy of data retrieval.
|
6 |
+
|
7 |
+
## System Workflow
|
8 |
+
|
9 |
+
1. **Dataset Loading:**
|
10 |
+
- Flask handles the backend service responsible for loading the dataset into a Document store (ChromaStore, InMemoryStore, or Elastic Store).
|
11 |
+
- Efficient storage and retrieval are facilitated by Flask's capabilities.
|
12 |
+
|
13 |
+
2. **Embedding Conversion:**
|
14 |
+
- The Haystack-ai controller, integrated with Flask, takes charge of converting the dataset into embeddings.
|
15 |
+
- Flask manages the communication between the application and Haystack-ai, ensuring a smooth embedding conversion process.
|
16 |
+
|
17 |
+
3. **Haystack Pipeline Components:**
|
18 |
+
- **Retriever:**
|
19 |
+
- Flask manages the Retriever component, retrieving a list of relevant data based on user queries.
|
20 |
+
- **Reader:**
|
21 |
+
- The Reader component, under Flask's control, scans documents to identify the best context-match for queries.
|
22 |
+
- **Prompt Builder:**
|
23 |
+
- Flask oversees the generation of prompts by the Prompt Builder component based on the context provided by the Reader.
|
24 |
+
- **LLM (Large Language Model):**
|
25 |
+
- Flask integrates with the Large Language Model to utilize its powerful inference capabilities for generating desired outputs.
|
26 |
+
|
27 |
+
## Key Features
|
28 |
+
|
29 |
+
1. **Pinpoint Data Retrieval:**
|
30 |
+
- Flask, in conjunction with Haystack-ai libraries, ensures accurate data retrieval.
|
31 |
+
- Pre-processing with Flask enhances the efficiency of the Large Language Model, leading to precise responses.
|
32 |
+
|
33 |
+
2. **Flexible Document Stores:**
|
34 |
+
- Users can select from various Document stores (ChromaStore, InMemoryStore, or Elastic Store) based on preferences, all seamlessly managed by Flask.
|
35 |
+
|
36 |
+
3. **Streamlined Inferencing Pipeline:**
|
37 |
+
- Flask orchestrates the seamless collaboration of Haystack pipeline components, ensuring an efficient and streamlined inferencing process.
|
38 |
+
- The integration leads to faster response times and an improved user experience.
|
39 |
+
|
40 |
+
## LLM Application System Design
|
41 |
+
|
42 |
+
1. **Flask Backend:**
|
43 |
+
- Manages the backend services using Flask, providing a robust foundation for handling HTTP requests and serving API endpoints.
|
44 |
+
- Integration with Haystack-ai and other components for efficient communication.
|
45 |
+
|
46 |
+
2. **Frontend Integration:**
|
47 |
+
- User-friendly interface for interacting with the application.
|
48 |
+
- Communicates with Flask backend through API calls for smooth user experience.
|
49 |
+
|
50 |
+
3. **Scalability and Performance:**
|
51 |
+
- Deployed on cloud infrastructure with Flask's capabilities for scalability.
|
52 |
+
- Load balancing and auto-scaling to handle varying loads effectively.
|
53 |
+
|
54 |
+
4. **Security and Privacy:**
|
55 |
+
- Flask incorporates robust security measures to protect user data and ensure privacy.
|
56 |
+
- Implements encryption for communication channels and secure storage practices.
|
57 |
+
|
58 |
+
## System Design
|
59 |
+
|
60 |
+
![System Design](SystemDesign.png)
|
SystemDesign.png
ADDED
__pycache__/Inferencer.cpython-310.pyc
ADDED
Binary file (2.74 kB). View file
|
|
__pycache__/dataloader.cpython-310.pyc
ADDED
Binary file (3.46 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, jsonify, request
|
2 |
+
from Inferencer import Inferncer
|
3 |
+
from dataloader import DataLoader
|
4 |
+
import logging
|
5 |
+
# Flask application object that all route decorators in this module attach to.
app = Flask(__name__)

# Directory (relative to the process working directory) where the /upload
# route saves incoming files.
UPLOAD_FOLDER = './data/'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Module-level service objects shared by the request handlers.
inferencer = Inferncer()
data_loader = DataLoader()

#app logger

# DEBUG-level logging is written to the file "app.log" using this format.
log_format = "%(asctime)s [%(levelname)s] - %(message)s"
logging.basicConfig(filename="app.log", level=logging.DEBUG, format=log_format)
logger = logging.getLogger(__name__)

# Initialize chroma_store as a global variable
# chroma_store = data_loader.dataloader()
# in_memory_store = data_loader.InMemory_dataloader()
# Both stay None until the /sync route assigns them; /ask refuses to run
# while chroma_store is None.
chroma_store = None
in_memory_store = None
|
24 |
+
|
25 |
+
@app.route("/")
def home():
    """Serve the plain-text greeting shown at the site root."""
    greeting = "Welcome to the Flask app!"
    return greeting
|
28 |
+
|
29 |
+
@app.route('/upload', methods=['POST'])
def upload_document():
    """Save the multipart file sent under the ``file`` field into UPLOAD_FOLDER.

    Returns:
        A ``(json, status)`` pair: 200 on success, 400 when no usable file
        was supplied, 500 when saving fails.
    """
    # Imported locally because this module has no file-level ``import os``;
    # without it every upload failed with a NameError.
    import os

    try:
        if 'file' not in request.files:
            return jsonify({"error": "No file provided"}), 400

        uploaded = request.files['file']

        # Keep only the final path component so a crafted name such as
        # "../../app.py" cannot escape the upload directory.
        filename = os.path.basename(uploaded.filename or '')
        if filename in ('', '.', '..'):
            return jsonify({"error": "No file selected"}), 400

        upload_dir = app.config['UPLOAD_FOLDER']
        os.makedirs(upload_dir, exist_ok=True)
        uploaded.save(os.path.join(upload_dir, filename))
        return jsonify({"message": "File uploaded successfully"}), 200

    except Exception as e:
        # Top-level request boundary: record the traceback and report the
        # failure with an error status instead of an implicit 200.
        logger.exception("File upload failed")
        return jsonify({"error": str(e)}), 500
|
45 |
+
|
46 |
+
@app.route("/sync", methods=["POST"])
def sync_and_run_dataloader():
    """Run both DataLoader ingestion methods and keep their results globally.

    The return values of ``data_loader.dataloader()`` and
    ``data_loader.InMemory_dataloader()`` are stored in the module globals
    ``chroma_store`` and ``in_memory_store``.

    Returns:
        A JSON success message, or a JSON object with an ``error`` key
        holding the exception text if either call fails.
    """
    global chroma_store, in_memory_store
    try:
        # NOTE: authentication or other checks could be added here.
        chroma_store = data_loader.dataloader()
        in_memory_store = data_loader.InMemory_dataloader()

        success_payload = {"message": "DataLoader executed successfully", "result": "success"}
        return jsonify(success_payload)
    except Exception as exc:
        failure_payload = {"error": str(exc)}
        return jsonify(failure_payload)
|
61 |
+
|
62 |
+
@app.route("/ask", methods=["POST"])
def ask_question():
    """Answer a question with the model backend named in the JSON body.

    Expected JSON body: ``{"question": <str>, "model": "OpenAI" | "LlamaCpp"}``.

    Returns:
        ``{"results": ...}`` on success, otherwise ``{"error": <message>}``
        with a 4xx/5xx status code.
    """
    try:
        # silent=True yields None instead of raising on a missing or invalid
        # JSON body; anything that is not a JSON object is treated as empty.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        query = data.get("question", "")
        model = data.get("model", "")

        if chroma_store is None:
            return jsonify({"error": "Chroma store not initialized. Run sync_and_run_dataloader first."}), 409

        if not query:
            return jsonify({"error": "No question provided"}), 400

        if model not in ("OpenAI", "LlamaCpp"):
            return jsonify({"error": f"Invalid model specified: {model}"}), 400

        # Look the backend up by name: a backend method that is not defined
        # on the inferencer is reported explicitly rather than surfacing as
        # an AttributeError message.
        backend = getattr(inferencer, model, None)
        if backend is None:
            return jsonify({"error": f"Model backend not available: {model}"}), 501

        results = backend(query=query)
        return jsonify({"results": results})

    except Exception as e:
        # Top-level request boundary: log the traceback and return the
        # message with an error status.
        logger.exception("Question answering failed")
        return jsonify({"error": str(e)}), 500
|
83 |
+
|
84 |
+
if __name__ == "__main__":
    # Imported locally because this module has no file-level ``import os``.
    import os

    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server execute code, so it is opt-in via FLASK_DEBUG
    # instead of always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")

    # HOST/PORT default to Flask's own defaults; set HOST=0.0.0.0 when the
    # app must be reachable from outside a container.
    app.run(
        host=os.environ.get("HOST", "127.0.0.1"),
        port=int(os.environ.get("PORT", "5000")),
        debug=debug_enabled,
    )
|
data/Aditya_test.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Aditya Sugandhi is a seasoned Software Engineer with a rich background and diverse skill set, encompassing more than three years of industry experience. Currently pursuing a Master’s of Science in Computer Science at Florida State University, Aditya has consistently demonstrated a passion for innovation and a strong commitment to driving technical excellence.
|
2 |
+
|
3 |
+
In his role as a Research Assistant at the Department of Scientific Computing at FSU, Aditya has been actively involved in conducting in-depth analysis for Monsoon Forecast Prediction. His work spans a century's worth of data, focusing on variables like Salinity, Surface Temperature, and Surface-to-Air Temperature. Utilizing Apache Spark for efficient data handling and transformation, Aditya leveraged Spark's distributed computing capabilities to process vast datasets in parallel, resulting in a remarkable 30% reduction in overall training time for machine learning models. This experience highlights his proficiency in handling big data and implementing cutting-edge technologies for scientific research.
|
4 |
+
|
5 |
+
His previous role as a Software Engineer at Aspire Systems in Chennai, India, showcases Aditya's versatility in both backend and frontend development. Leading the redesign of a Life Insurance Company's architecture, he prioritized low latency and high throughput, emphasizing a customer-centric approach. Aditya engineered 20 SOAP APIs for responsive patient data management, collaborated on front-end enhancements, and implemented secure payment gateways and Single Sign-On for authentication. His contribution to debugging strategies, real-time log analysis with Splunk, and CI/CD pipelines with Jenkins further underscore his commitment to optimizing system performance.
|
6 |
+
|
7 |
+
Aditya's experience extends to client-facing roles, where he addressed varied client needs in occupational health. His solution-oriented approach ensured compliance and security, crafting robust solutions with custom analytic modules and dynamic dashboards powered by Kafka-streams. These efforts transformed insurance dynamics, fortifying coverage in medical, life, and pet domains with enhanced security.
|
8 |
+
|
9 |
+
As a Web Developer Intern at Impetus in Indore, India, Aditya showcased his creativity by innovating a PHP-centric design using the Laravel framework. His proficiency extended to managing diverse CMS systems and successfully migrating Joomla versions, highlighting his adaptability and problem-solving skills.
|
10 |
+
|
11 |
+
Aditya's academic journey includes a Bachelor of Technology in Computer Science Engineering from SRM University in Chennai, India. Additionally, his publication on "Review Classification & False Feedback Detection using different ML algorithms" reflects his engagement in academic research.
|
12 |
+
|
13 |
+
His skill set is expansive, covering a wide array of programming languages including Ruby, JavaScript, Go, Kotlin, PHP, C#, Swift, R, TypeScript, Scala, SQL, Rust, C/C++, Java, and Python. Aditya is a Full-Stack developer with expertise in React.js, Node.js, Git, MySQL, PostgreSQL, Scala, Kafka, Docker, GraphQL, AWS, RDMS, TypeScript, and data analysis tools such as Pandas, Scikit-learn, TensorFlow, NumPy, SciPy, Keras, Algorithmic Design, OpenCV, MapWindow, and GIS Tools.
|
14 |
+
|
15 |
+
Aditya has further demonstrated his commitment to continuous learning and professional development by obtaining certifications such as "ChatGPT Prompt Engineering for Developers" by DeepLearning.ai and engaging with topics like Code, Production Environments, Distributed Systems, Product Managers, Code Reviews, Google Cloud, Load Testing, Kubernetes, Testing, Uptime, Azure, Envoy, and C++.
|
16 |
+
|
17 |
+
Aditya is Dating Crystal
|
data/Aditya_train.txt
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence, a passion for technological advancements, and a strong foundation in both theoretical knowledge and practical application.
|
2 |
+
|
3 |
+
During his tenure as a Research Assistant at the Department of Scientific Computing at Florida State University, Aditya not only demonstrated technical prowess but also showcased his ability to contribute meaningfully to academic research. His work on the EOAS/pyutils library, marked by the implementation of mathematical optimization and algorithms, reflects a keen understanding of optimizing software for efficiency. The achievement of reducing complexity from O(n^2) to O(n) is a testament to his problem-solving skills and dedication to pushing the boundaries of what is possible in software development.
|
4 |
+
|
5 |
+
In the realm of Full Stack Development at Aspire Systems, Aditya exhibited a holistic approach to software architecture. His role in revamping the software architecture for a leading UK-based Life Insurance Company not only resulted in a significant increase in online appointments and customer engagement but also showcased his strategic thinking in incorporating technologies like Kafka, Redis, and NoSQL databases for efficient data processing. The integration of RESTful APIs, along with frontend enhancements and the implementation of secure payment gateways, highlights his versatility in both backend and frontend development.
|
6 |
+
|
7 |
+
Aditya's client-facing responsibilities further underscore his ability to translate complex technical solutions into user-friendly, client-centric outcomes. His attention to detail in addressing client needs, considering variations, mandates, and security clearances, speaks to his understanding of the real-world implications of software solutions in diverse contexts.
|
8 |
+
|
9 |
+
Aditya served as a Customer Service Executive at Pollo Tropical in Tallahassee, FL, from August 2022 to August 2023. In this role, he contributed to creating a positive dining experience for customers by greeting them warmly and processing orders accurately through the point-of-sale (POS) system. His role also involved responding to customer inquiries, providing product information, making recommendations, and addressing concerns. Aditya collaborated with team members to ensure the smooth operation of the cashier station and timely service to customers, demonstrating effective teamwork and customer service skills.
|
10 |
+
|
11 |
+
During his undergraduate years, Aditya served as a Library Assistant at SRM Institute of Science & Technology in Chennai, TN, from August 2018 to August 2020. In this role, he demonstrated organizational skills by assembling class notebooks through digital printing and efficiently managing inventory of copy paper and office supplies. Aditya also handled pick-up and drop-off of department mail, responded to department emails and phone calls promptly, and acted as a backup for receiving materials, showcasing flexibility and reliability.
|
12 |
+
|
13 |
+
Aditya's skills include proficiency in Microsoft Office Suite (Word, Excel, Outlook) and data entry expertise. He is known for effective communication and customer service skills, along with a collaborative approach to teamwork. Additionally, he has experience with copier and shipping using FedEx, further showcasing his versatility.
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
As a Web Developer Intern at Impetus, Aditya not only demonstrated technical acumen but also showcased his ability to lead and innovate. Spearheading PHP-centric designs and overseeing CMS systems underscored his commitment to creating user-centric and flexible solutions. The automation of workflows with SharePoint, leading to a substantial reduction in operational overhead, showcases his efficiency in streamlining processes and improving overall organizational productivity.
|
18 |
+
|
19 |
+
Aditya's academic pursuits, including pursuing a Master’s of Science in Computer Science at Florida State University and completing a Bachelor of Technology in Computer Science Engineering at SRM University, reflect his dedication to continuous learning and academic excellence. His diverse skill set, ranging from programming languages to full-stack development technologies and data analysis tools, positions him as a well-rounded and adaptable professional.
|
20 |
+
|
21 |
+
In addition to his technical prowess, Aditya's achievements in offset, documentation, servers, supervision, leadership, and information technology underscore his ability to not only excel in technical roles but also contribute meaningfully to the broader aspects of project management and team leadership.
|
22 |
+
|
23 |
+
Aditya Sugandhi's journey as a Software Engineer is characterized by a unique blend of technical excellence, innovation, and a holistic understanding of the broader implications of software development. His commitment to pushing boundaries, both in academic research and industry settings, makes him a valuable asset to any organization seeking a dynamic and forward-thinking software engineer.
|
24 |
+
|
25 |
+
|
26 |
+
Aditya's friends are SAS, Hunterr, MF, and Rocco; he usually hangs out with them and has fun.
|
27 |
+
Aditya call his friends Seemanth as nigga, or N bhai!
|
28 |
+
Who is SAS : SAS aka Seemanth Sagar, his friends refer to him as olema, N bhai, jai shreemanth Nigger, OG Nigger,Nigga king!!
|
data/Resume_Vishwam_Shah_Back_end.pdf
ADDED
Binary file (140 kB). View file
|
|
data/mf.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Vishwam Shah is a highly motivated and skilled Computer Science professional currently pursuing a Master of Science degree in Computer Science at Florida State University, with a GPA of 3.91. He completed his Bachelor of Science in Computer Engineering from Gujarat Technological University in India, achieving a GPA of 3.9. His academic coursework and projects have focused on diverse areas such as Problem Solving, AI, Data Science, Cryptography, IoT, and Bioinformatics.
|
2 |
+
|
3 |
+
In terms of professional experience, Vishwam has a strong background as a Full Stack Developer & DevOps Engineer at MaMo Technolabs LLP in Gujarat, India. He managed a team of four developers, served as a SCRUM master, and successfully launched multiple innovative products. His expertise includes utilizing a variety of technologies such as NodeJS, MongoDB, ExpressJS, ReactJS, AngularJS, AWS, PHP, C++, Dart, Flutter, and more. Vishwam demonstrated proficiency in integrating RESTful APIs, optimizing UIs, and deploying applications on cloud architecture to achieve accelerated page load times.
|
4 |
+
|
5 |
+
Vishwam also has international experience as a Full Stack Intern at Paul Mason Consulting Limited in the UK and India, where he contributed to reducing voucher upload time and improved software deployment processes through continuous integration/continuous delivery (CI/CD) pipelines. Additionally, he served as a Back-End Intern at Akash Technolabs, contributing to the development of an interactive website with authentication APIs and CRUD operations.
|
6 |
+
|
7 |
+
In the academic realm, Vishwam served as a Researcher in the Department of Psychology – Neuroscience at Florida State University, where he utilized technologies such as MATLAB, fMRIPrep, FreeSurfer, and more. His contributions included spearheading custom MATLAB scripts for raw fMRI data preprocessing and enhancing data quality by preprocessing datasets.
|
8 |
+
|
9 |
+
As a Mentor at Women in Computer Science (WiCs), Vishwam played a pivotal role in empowering female students' participation in computer science. He architected the curriculum and mentored over 20 students, boosting their technical skills through hands-on workshops on full-stack development, AWS, and DevOps.
|
10 |
+
|
11 |
+
Vishwam's technical skills span various languages and frameworks, including C++, AJAX, Firebase, Docker, HTML, CSS, Bootstrap, and more. He is proficient in cloud platforms such as Google Cloud Platform (GCP) and Amazon Web Services (AWS). His development and collaboration tools expertise includes Git, Trello, Notion, ClickUp, JIRA, and more.
|
12 |
+
|
13 |
+
In the realm of certifications, Vishwam has completed certifications in Google Cloud Platform (GCP) Fundamentals, Essential Google Cloud Infrastructure, and Programming for Everybody (Getting Started with Python). He has also completed a certification in Python Data Structures.
|
14 |
+
|
15 |
+
Vishwam's project experience includes Medical Image Segmentation using Python, TensorFlow, Keras, PyTorch, and OpenCV, where he applied a U-Net model for segmenting cell nuclei in microscopic images. He has also worked on optimizing K-Core Decomposition for Large-Scale Networks using Java, Perl, GraphChi, and WebGraph, achieving linear time complexity with less than 1% update rate within 20 iterations.
|
16 |
+
|
17 |
+
Overall, Vishwam Shah possesses a well-rounded skill set, combining academic excellence, hands-on professional experience, and a strong commitment to mentoring and collaborative learning environments.
|
dataloader.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
import os
|
3 |
+
from haystack import Pipeline
|
4 |
+
from haystack.components.embedders import SentenceTransformersDocumentEmbedder,SentenceTransformersTextEmbedder
|
5 |
+
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
|
6 |
+
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
|
7 |
+
from haystack.components.routers import FileTypeRouter
|
8 |
+
from haystack.components.joiners import DocumentJoiner
|
9 |
+
from haystack.components.writers import DocumentWriter
|
10 |
+
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
|
11 |
+
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
|
12 |
+
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
13 |
+
class DataLoader:
    """Index the files found in ``<cwd>/data`` into a Haystack document store.

    Two stores are created up front: a ``ChromaDocumentStore`` and an
    ``InMemoryDocumentStore``. ``dataloader()`` fills the former and
    ``InMemory_dataloader()`` the latter; both run the same ingestion
    pipeline (route by MIME type -> convert -> join -> clean -> split ->
    embed -> write) and differ only in the store they write to.
    """

    # MIME types the router forwards to a converter; anything else in the
    # data directory is not connected to a converter and so is not indexed.
    _MIME_TYPES = ("text/plain", "application/pdf")

    # Sentence-transformers model used to embed the document chunks.
    _EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

    # Splitter settings: chunks of 250 sentences with a 30-sentence overlap.
    _SPLIT_BY = "sentence"
    _SPLIT_LENGTH = 250
    _SPLIT_OVERLAP = 30

    def __init__(self):
        """Create the (initially empty) Chroma and in-memory document stores."""
        self.chroma_store = ChromaDocumentStore()
        self.InMemory_store = InMemoryDocumentStore()

    def dataloader(self):
        """Ingest ``<cwd>/data`` into the Chroma store and return that store."""
        return self._index_into(self.chroma_store)

    def InMemory_dataloader(self):
        """Ingest ``<cwd>/data`` into the in-memory store and return that store."""
        return self._index_into(self.InMemory_store)

    def get_chroma_store(self):
        """Return the Chroma document store (populated only after ``dataloader()``)."""
        return self.chroma_store

    def get_InMemory_store(self):
        """Return the in-memory store (populated only after ``InMemory_dataloader()``)."""
        return self.InMemory_store

    @staticmethod
    def _list_source_files():
        """Return the paths (as strings) of the regular files in ``<cwd>/data``.

        The directory is resolved against the current working directory, not
        against this module's location. Sub-directories are skipped and the
        result is sorted so the ingest order is deterministic.

        Raises:
            FileNotFoundError: if ``<cwd>/data`` does not exist.
        """
        data_path = Path.cwd() / "data"
        return sorted(str(entry) for entry in data_path.iterdir() if entry.is_file())

    def _index_into(self, document_store):
        """Build the ingestion pipeline, run it, and return ``document_store``.

        Args:
            document_store: the store the ``DocumentWriter`` writes the
                embedded chunks to.

        NOTE(review): every call re-reads and re-writes the whole data
        directory; confirm the store's duplicate policy copes with a second
        call on the same instance.
        """
        pipeline = Pipeline()

        # Components, in the order the documents flow through them.
        pipeline.add_component("FileTypeRouter", FileTypeRouter(mime_types=list(self._MIME_TYPES)))
        pipeline.add_component("TextFileConverter", TextFileToDocument())
        pipeline.add_component("PdfFileConverter", PyPDFToDocument())
        pipeline.add_component("Joiner", DocumentJoiner())
        pipeline.add_component("Cleaner", DocumentCleaner())
        pipeline.add_component(
            "Splitter",
            DocumentSplitter(
                split_by=self._SPLIT_BY,
                split_length=self._SPLIT_LENGTH,
                split_overlap=self._SPLIT_OVERLAP,
            ),
        )
        pipeline.add_component(
            "Embedder",
            SentenceTransformersDocumentEmbedder(model=self._EMBEDDING_MODEL),
        )
        pipeline.add_component("Writer", DocumentWriter(document_store=document_store))

        # Each router output feeds its converter; both converters merge in
        # the joiner, after which the flow is strictly linear.
        pipeline.connect("FileTypeRouter.text/plain", "TextFileConverter.sources")
        pipeline.connect("FileTypeRouter.application/pdf", "PdfFileConverter.sources")
        pipeline.connect("TextFileConverter.documents", "Joiner.documents")
        pipeline.connect("PdfFileConverter.documents", "Joiner.documents")
        pipeline.connect("Joiner.documents", "Cleaner.documents")
        pipeline.connect("Cleaner.documents", "Splitter.documents")
        pipeline.connect("Splitter.documents", "Embedder.documents")
        pipeline.connect("Embedder.documents", "Writer.documents")

        pipeline.run({"FileTypeRouter": {"sources": self._list_source_files()}})
        return document_store
|
106 |
+
|
env.yaml
ADDED
@@ -0,0 +1,349 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: RAGAPP
|
2 |
+
channels:
|
3 |
+
- conda-forge
|
4 |
+
- pytorch
|
5 |
+
- nvidia
|
6 |
+
- defaults
|
7 |
+
dependencies:
|
8 |
+
- _libgcc_mutex=0.1=main
|
9 |
+
- _openmp_mutex=5.1=1_gnu
|
10 |
+
- abseil-cpp=20211102.0=hd4dd3e8_0
|
11 |
+
- aiohttp=3.9.3=py310h5eee18b_0
|
12 |
+
- arrow-cpp=14.0.2=h374c478_1
|
13 |
+
- asttokens=2.0.5=pyhd3eb1b0_0
|
14 |
+
- async-timeout=4.0.3=py310h06a4308_0
|
15 |
+
- aws-c-auth=0.6.19=h5eee18b_0
|
16 |
+
- aws-c-cal=0.5.20=hdbd6064_0
|
17 |
+
- aws-c-common=0.8.5=h5eee18b_0
|
18 |
+
- aws-c-compression=0.2.16=h5eee18b_0
|
19 |
+
- aws-c-event-stream=0.2.15=h6a678d5_0
|
20 |
+
- aws-c-http=0.6.25=h5eee18b_0
|
21 |
+
- aws-c-io=0.13.10=h5eee18b_0
|
22 |
+
- aws-c-mqtt=0.7.13=h5eee18b_0
|
23 |
+
- aws-c-s3=0.1.51=hdbd6064_0
|
24 |
+
- aws-c-sdkutils=0.1.6=h5eee18b_0
|
25 |
+
- aws-checksums=0.1.13=h5eee18b_0
|
26 |
+
- aws-crt-cpp=0.18.16=h6a678d5_0
|
27 |
+
- aws-sdk-cpp=1.10.55=h721c034_0
|
28 |
+
- blas=1.0=mkl
|
29 |
+
- blinker=1.6.2=py310h06a4308_0
|
30 |
+
- boost-cpp=1.82.0=hdb19cb5_2
|
31 |
+
- bottleneck=1.3.7=py310ha9d4c09_0
|
32 |
+
- brotli=1.0.9=h5eee18b_7
|
33 |
+
- brotli-bin=1.0.9=h5eee18b_7
|
34 |
+
- brotli-python=1.0.9=py310h6a678d5_7
|
35 |
+
- bzip2=1.0.8=h7b6447c_0
|
36 |
+
- c-ares=1.19.1=h5eee18b_0
|
37 |
+
- ca-certificates=2023.12.12=h06a4308_0
|
38 |
+
- certifi=2024.2.2=py310h06a4308_0
|
39 |
+
- cffi=1.16.0=py310h5eee18b_0
|
40 |
+
- charset-normalizer=2.0.4=pyhd3eb1b0_0
|
41 |
+
- click=8.1.7=py310h06a4308_0
|
42 |
+
- comm=0.1.2=py310h06a4308_0
|
43 |
+
- cryptography=42.0.2=py310hdda0065_0
|
44 |
+
- cuda-cudart=11.8.89=0
|
45 |
+
- cuda-cupti=11.8.87=0
|
46 |
+
- cuda-libraries=11.8.0=0
|
47 |
+
- cuda-nvrtc=11.8.89=0
|
48 |
+
- cuda-nvtx=11.8.86=0
|
49 |
+
- cuda-runtime=11.8.0=0
|
50 |
+
- cycler=0.11.0=pyhd3eb1b0_0
|
51 |
+
- cyrus-sasl=2.1.28=h52b45da_1
|
52 |
+
- datasets=2.12.0=py310h06a4308_0
|
53 |
+
- dbus=1.13.18=hb2f20db_0
|
54 |
+
- debugpy=1.6.7=py310h6a678d5_0
|
55 |
+
- decorator=5.1.1=pyhd3eb1b0_0
|
56 |
+
- dill=0.3.6=py310h06a4308_0
|
57 |
+
- exceptiongroup=1.2.0=py310h06a4308_0
|
58 |
+
- executing=0.8.3=pyhd3eb1b0_0
|
59 |
+
- expat=2.5.0=h6a678d5_0
|
60 |
+
- ffmpeg=4.3=hf484d3e_0
|
61 |
+
- filelock=3.13.1=py310h06a4308_0
|
62 |
+
- fontconfig=2.14.1=h4c34cd2_2
|
63 |
+
- fonttools=4.25.0=pyhd3eb1b0_0
|
64 |
+
- freetype=2.12.1=h4a9f257_0
|
65 |
+
- gflags=2.2.2=h6a678d5_1
|
66 |
+
- glib=2.78.4=h6a678d5_0
|
67 |
+
- glib-tools=2.78.4=h6a678d5_0
|
68 |
+
- glog=0.5.0=h6a678d5_1
|
69 |
+
- gmp=6.2.1=h295c915_3
|
70 |
+
- gmpy2=2.1.2=py310heeb90bb_0
|
71 |
+
- gnutls=3.6.15=he1e5248_0
|
72 |
+
- google-auth-oauthlib=0.4.4=pyhd3eb1b0_0
|
73 |
+
- grpc-cpp=1.48.2=he1ff14a_1
|
74 |
+
- gst-plugins-base=1.14.1=h6a678d5_1
|
75 |
+
- gstreamer=1.14.1=h5eee18b_1
|
76 |
+
- huggingface_hub=0.20.3=py310h06a4308_0
|
77 |
+
- icu=73.1=h6a678d5_0
|
78 |
+
- idna=3.4=py310h06a4308_0
|
79 |
+
- intel-openmp=2023.1.0=hdb19cb5_46306
|
80 |
+
- ipykernel=6.28.0=py310h06a4308_0
|
81 |
+
- ipython=8.20.0=py310h06a4308_0
|
82 |
+
- jedi=0.18.1=py310h06a4308_1
|
83 |
+
- jinja2=3.1.3=py310h06a4308_0
|
84 |
+
- jpeg=9e=h5eee18b_1
|
85 |
+
- jupyter_client=8.6.0=py310h06a4308_0
|
86 |
+
- jupyter_core=5.5.0=py310h06a4308_0
|
87 |
+
- kiwisolver=1.4.4=py310h6a678d5_0
|
88 |
+
- krb5=1.20.1=h143b758_1
|
89 |
+
- lame=3.100=h7b6447c_0
|
90 |
+
- lcms2=2.12=h3be6417_0
|
91 |
+
- ld_impl_linux-64=2.38=h1181459_1
|
92 |
+
- lerc=3.0=h295c915_0
|
93 |
+
- libboost=1.82.0=h109eef0_2
|
94 |
+
- libbrotlicommon=1.0.9=h5eee18b_7
|
95 |
+
- libbrotlidec=1.0.9=h5eee18b_7
|
96 |
+
- libbrotlienc=1.0.9=h5eee18b_7
|
97 |
+
- libclang=14.0.6=default_hc6dbbc7_1
|
98 |
+
- libclang13=14.0.6=default_he11475f_1
|
99 |
+
- libcublas=11.11.3.6=0
|
100 |
+
- libcufft=10.9.0.58=0
|
101 |
+
- libcufile=1.8.1.2=0
|
102 |
+
- libcups=2.4.2=h2d74bed_1
|
103 |
+
- libcurand=10.3.4.107=0
|
104 |
+
- libcurl=8.5.0=h251f7ec_0
|
105 |
+
- libcusolver=11.4.1.48=0
|
106 |
+
- libcusparse=11.7.5.86=0
|
107 |
+
- libdeflate=1.17=h5eee18b_1
|
108 |
+
- libedit=3.1.20230828=h5eee18b_0
|
109 |
+
- libev=4.33=h7f8727e_1
|
110 |
+
- libevent=2.1.12=hdbd6064_1
|
111 |
+
- libffi=3.4.4=h6a678d5_0
|
112 |
+
- libgcc-ng=11.2.0=h1234567_1
|
113 |
+
- libglib=2.78.4=hdc74915_0
|
114 |
+
- libgomp=11.2.0=h1234567_1
|
115 |
+
- libiconv=1.16=h7f8727e_2
|
116 |
+
- libidn2=2.3.4=h5eee18b_0
|
117 |
+
- libjpeg-turbo=2.0.0=h9bf148f_0
|
118 |
+
- libllvm14=14.0.6=hdb19cb5_3
|
119 |
+
- libnghttp2=1.57.0=h2d74bed_0
|
120 |
+
- libnpp=11.8.0.86=0
|
121 |
+
- libnvjpeg=11.9.0.86=0
|
122 |
+
- libpng=1.6.39=h5eee18b_0
|
123 |
+
- libpq=12.17=hdbd6064_0
|
124 |
+
- libprotobuf=3.20.3=he621ea3_0
|
125 |
+
- libsodium=1.0.18=h7b6447c_0
|
126 |
+
- libssh2=1.10.0=hdbd6064_2
|
127 |
+
- libstdcxx-ng=11.2.0=h1234567_1
|
128 |
+
- libtasn1=4.19.0=h5eee18b_0
|
129 |
+
- libthrift=0.15.0=h1795dd8_2
|
130 |
+
- libtiff=4.5.1=h6a678d5_0
|
131 |
+
- libunistring=0.9.10=h27cfd23_0
|
132 |
+
- libuuid=1.41.5=h5eee18b_0
|
133 |
+
- libwebp-base=1.3.2=h5eee18b_0
|
134 |
+
- libxcb=1.15=h7f8727e_0
|
135 |
+
- libxkbcommon=1.0.1=h5eee18b_1
|
136 |
+
- libxml2=2.10.4=hf1b16e4_1
|
137 |
+
- llvm-openmp=14.0.6=h9e868ea_0
|
138 |
+
- lz4-c=1.9.4=h6a678d5_0
|
139 |
+
- matplotlib=3.5.1=py310h06a4308_1
|
140 |
+
- matplotlib-base=3.5.1=py310ha18d171_1
|
141 |
+
- matplotlib-inline=0.1.6=py310h06a4308_0
|
142 |
+
- mkl=2023.1.0=h213fc3f_46344
|
143 |
+
- mkl-service=2.4.0=py310h5eee18b_1
|
144 |
+
- mkl_fft=1.3.8=py310h5eee18b_0
|
145 |
+
- mkl_random=1.2.4=py310hdb19cb5_0
|
146 |
+
- mpc=1.1.0=h10f8cd9_1
|
147 |
+
- mpfr=4.0.2=hb69a4c5_1
|
148 |
+
- mpmath=1.3.0=py310h06a4308_0
|
149 |
+
- multiprocess=0.70.14=py310h06a4308_0
|
150 |
+
- munkres=1.1.4=py_0
|
151 |
+
- mysql=5.7.24=h721c034_2
|
152 |
+
- ncurses=6.4=h6a678d5_0
|
153 |
+
- nest-asyncio=1.6.0=py310h06a4308_0
|
154 |
+
- nettle=3.7.3=hbbd107a_1
|
155 |
+
- networkx=3.1=py310h06a4308_0
|
156 |
+
- numexpr=2.8.7=py310h85018f9_0
|
157 |
+
- numpy=1.26.4=py310h5f9d8c6_0
|
158 |
+
- numpy-base=1.26.4=py310hb5e798b_0
|
159 |
+
- oauthlib=3.2.2=py310h06a4308_0
|
160 |
+
- openh264=2.1.1=h4ff587b_0
|
161 |
+
- openjpeg=2.4.0=h3ad879b_0
|
162 |
+
- openssl=3.0.13=h7f8727e_0
|
163 |
+
- orc=1.7.4=hb3bc3d3_1
|
164 |
+
- parso=0.8.3=pyhd3eb1b0_0
|
165 |
+
- pcre2=10.42=hebb0a14_0
|
166 |
+
- pexpect=4.8.0=pyhd3eb1b0_3
|
167 |
+
- pillow=10.2.0=py310h5eee18b_0
|
168 |
+
- pip=23.3.1=py310h06a4308_0
|
169 |
+
- platformdirs=3.10.0=py310h06a4308_0
|
170 |
+
- ply=3.11=py310h06a4308_0
|
171 |
+
- prompt-toolkit=3.0.43=py310h06a4308_0
|
172 |
+
- prompt_toolkit=3.0.43=hd3eb1b0_0
|
173 |
+
- psutil=5.9.0=py310h5eee18b_0
|
174 |
+
- ptyprocess=0.7.0=pyhd3eb1b0_2
|
175 |
+
- pure_eval=0.2.2=pyhd3eb1b0_0
|
176 |
+
- pyarrow=14.0.2=py310h1eedbd7_0
|
177 |
+
- pycparser=2.21=pyhd3eb1b0_0
|
178 |
+
- pygments=2.15.1=py310h06a4308_1
|
179 |
+
- pyjwt=2.4.0=py310h06a4308_0
|
180 |
+
- pyopenssl=24.0.0=py310h06a4308_0
|
181 |
+
- pyparsing=3.0.9=py310h06a4308_0
|
182 |
+
- pyqt=5.15.10=py310h6a678d5_0
|
183 |
+
- pyqt5-sip=12.13.0=py310h5eee18b_0
|
184 |
+
- pysocks=1.7.1=py310h06a4308_0
|
185 |
+
- python=3.10.13=h955ad1f_0
|
186 |
+
- python-dateutil=2.8.2=pyhd3eb1b0_0
|
187 |
+
- python-tzdata=2023.3=pyhd3eb1b0_0
|
188 |
+
- python-xxhash=2.0.2=py310h5eee18b_1
|
189 |
+
- pytorch-cuda=11.8=h7e8668a_5
|
190 |
+
- pytorch-mutex=1.0=cuda
|
191 |
+
- pyyaml=6.0.1=py310h5eee18b_0
|
192 |
+
- pyzmq=25.1.2=py310h6a678d5_0
|
193 |
+
- qt-main=5.15.2=h53bd1ea_10
|
194 |
+
- re2=2022.04.01=h295c915_0
|
195 |
+
- readline=8.2=h5eee18b_0
|
196 |
+
- regex=2023.10.3=py310h5eee18b_0
|
197 |
+
- requests=2.31.0=py310h06a4308_1
|
198 |
+
- responses=0.13.3=pyhd3eb1b0_0
|
199 |
+
- s2n=1.3.27=hdbd6064_0
|
200 |
+
- safetensors=0.4.2=py310ha89cbab_0
|
201 |
+
- setuptools=68.2.2=py310h06a4308_0
|
202 |
+
- sip=6.7.12=py310h6a678d5_0
|
203 |
+
- six=1.16.0=pyhd3eb1b0_1
|
204 |
+
- snappy=1.1.10=h6a678d5_1
|
205 |
+
- sqlite=3.41.2=h5eee18b_0
|
206 |
+
- stack_data=0.2.0=pyhd3eb1b0_0
|
207 |
+
- sympy=1.12=py310h06a4308_0
|
208 |
+
- tbb=2021.8.0=hdb19cb5_0
|
209 |
+
- tk=8.6.12=h1ccaba5_0
|
210 |
+
- tokenizers=0.15.1=py310h22610ee_0
|
211 |
+
- tomli=2.0.1=py310h06a4308_0
|
212 |
+
- torchaudio=2.2.1=py310_cu118
|
213 |
+
- torchvision=0.17.1=py310_cu118
|
214 |
+
- tornado=6.3.3=py310h5eee18b_0
|
215 |
+
- traitlets=5.7.1=py310h06a4308_0
|
216 |
+
- transformers=4.38.1=pyhd8ed1ab_0
|
217 |
+
- urllib3=2.1.0=py310h06a4308_1
|
218 |
+
- utf8proc=2.6.1=h5eee18b_1
|
219 |
+
- wcwidth=0.2.5=pyhd3eb1b0_0
|
220 |
+
- wheel=0.41.2=py310h06a4308_0
|
221 |
+
- xxhash=0.8.0=h7f8727e_3
|
222 |
+
- xz=5.4.5=h5eee18b_0
|
223 |
+
- yaml=0.2.5=h7b6447c_0
|
224 |
+
- zeromq=4.3.5=h6a678d5_0
|
225 |
+
- zlib=1.2.13=h5eee18b_0
|
226 |
+
- zstd=1.5.5=hc292b87_0
|
227 |
+
- pip:
|
228 |
+
- accelerate==0.27.2
|
229 |
+
- aiosignal==1.3.1
|
230 |
+
- annotated-types==0.6.0
|
231 |
+
- anyio==4.3.0
|
232 |
+
- asgiref==3.7.2
|
233 |
+
- attrs==23.2.0
|
234 |
+
- backoff==2.2.1
|
235 |
+
- bcrypt==4.1.2
|
236 |
+
- boilerpy3==1.0.7
|
237 |
+
- cachetools==5.3.2
|
238 |
+
- chroma-haystack==0.13.0
|
239 |
+
- chroma-hnswlib==0.7.3
|
240 |
+
- chromadb==0.4.19
|
241 |
+
- cmake==3.28.3
|
242 |
+
- coloredlogs==15.0.1
|
243 |
+
- deprecated==1.2.14
|
244 |
+
- diskcache==5.6.3
|
245 |
+
- distro==1.9.0
|
246 |
+
- elastic-transport==8.12.0
|
247 |
+
- elasticsearch==8.12.1
|
248 |
+
- elasticsearch-haystack==0.3.0
|
249 |
+
- fastapi==0.110.0
|
250 |
+
- flask==3.0.2
|
251 |
+
- flatbuffers==23.5.26
|
252 |
+
- frozenlist==1.4.1
|
253 |
+
- fsspec==2024.2.0
|
254 |
+
- google-auth==2.28.1
|
255 |
+
- googleapis-common-protos==1.62.0
|
256 |
+
- grpcio==1.62.0
|
257 |
+
- haystack-ai==2.0.0b8
|
258 |
+
- haystack-bm25==1.0.2
|
259 |
+
- httpcore==1.0.4
|
260 |
+
- httptools==0.6.1
|
261 |
+
- httpx==0.27.0
|
262 |
+
- humanfriendly==10.0
|
263 |
+
- importlib-metadata==6.11.0
|
264 |
+
- importlib-resources==6.1.2
|
265 |
+
- instructor-embedders-haystack==0.4.0
|
266 |
+
- instructorembedding==1.0.1
|
267 |
+
- itsdangerous==2.1.2
|
268 |
+
- joblib==1.3.2
|
269 |
+
- jsonlines==4.0.0
|
270 |
+
- jsonschema==4.21.1
|
271 |
+
- jsonschema-specifications==2023.12.1
|
272 |
+
- kubernetes==29.0.0
|
273 |
+
- lazy-imports==0.3.1
|
274 |
+
- lit==17.0.6
|
275 |
+
- llama-cpp-haystack==0.2.1
|
276 |
+
- llama-cpp-python==0.2.50
|
277 |
+
- markupsafe==2.1.5
|
278 |
+
- mistral-haystack==0.0.1
|
279 |
+
- mmh3==4.1.0
|
280 |
+
- monotonic==1.6
|
281 |
+
- more-itertools==10.2.0
|
282 |
+
- multidict==6.0.5
|
283 |
+
- nltk==3.8.1
|
284 |
+
- nvidia-cublas-cu11==11.10.3.66
|
285 |
+
- nvidia-cuda-cupti-cu11==11.7.101
|
286 |
+
- nvidia-cuda-nvrtc-cu11==11.7.99
|
287 |
+
- nvidia-cuda-runtime-cu11==11.7.99
|
288 |
+
- nvidia-cudnn-cu11==8.5.0.96
|
289 |
+
- nvidia-cufft-cu11==10.9.0.58
|
290 |
+
- nvidia-curand-cu11==10.2.10.91
|
291 |
+
- nvidia-cusolver-cu11==11.4.0.1
|
292 |
+
- nvidia-cusparse-cu11==11.7.4.91
|
293 |
+
- nvidia-nccl-cu11==2.14.3
|
294 |
+
- nvidia-nvtx-cu11==11.7.91
|
295 |
+
- onnxruntime==1.17.1
|
296 |
+
- openai==1.12.0
|
297 |
+
- opentelemetry-api==1.23.0
|
298 |
+
- opentelemetry-exporter-otlp-proto-common==1.23.0
|
299 |
+
- opentelemetry-exporter-otlp-proto-grpc==1.23.0
|
300 |
+
- opentelemetry-instrumentation==0.44b0
|
301 |
+
- opentelemetry-instrumentation-asgi==0.44b0
|
302 |
+
- opentelemetry-instrumentation-fastapi==0.44b0
|
303 |
+
- opentelemetry-proto==1.23.0
|
304 |
+
- opentelemetry-sdk==1.23.0
|
305 |
+
- opentelemetry-semantic-conventions==0.44b0
|
306 |
+
- opentelemetry-util-http==0.44b0
|
307 |
+
- orjson==3.9.15
|
308 |
+
- overrides==7.7.0
|
309 |
+
- packaging==23.2
|
310 |
+
- pandas==2.2.1
|
311 |
+
- posthog==3.4.2
|
312 |
+
- protobuf==3.19.6
|
313 |
+
- pulsar-client==3.4.0
|
314 |
+
- pyasn1==0.5.1
|
315 |
+
- pyasn1-modules==0.3.0
|
316 |
+
- pydantic==2.6.2
|
317 |
+
- pydantic-core==2.16.3
|
318 |
+
- pypdf==4.0.2
|
319 |
+
- pypika==0.48.9
|
320 |
+
- python-dotenv==1.0.1
|
321 |
+
- pytz==2024.1
|
322 |
+
- referencing==0.33.0
|
323 |
+
- requests-oauthlib==1.3.1
|
324 |
+
- rich==13.7.0
|
325 |
+
- rpds-py==0.18.0
|
326 |
+
- rsa==4.9
|
327 |
+
- scikit-learn==1.4.1.post1
|
328 |
+
- scipy==1.12.0
|
329 |
+
- sentence-transformers==2.2.2
|
330 |
+
- starlette==0.36.3
|
331 |
+
- tenacity==8.2.3
|
332 |
+
- threadpoolctl==3.3.0
|
333 |
+
- toolz==0.12.1
|
334 |
+
- torch==2.0.1
|
335 |
+
- tqdm==4.66.2
|
336 |
+
- triton==2.0.0
|
337 |
+
- typer==0.9.0
|
338 |
+
- typing-extensions==4.10.0
|
339 |
+
- tzdata==2024.1
|
340 |
+
- uvicorn==0.27.1
|
341 |
+
- uvloop==0.19.0
|
342 |
+
- watchfiles==0.21.0
|
343 |
+
- websocket-client==1.7.0
|
344 |
+
- websockets==12.0
|
345 |
+
- werkzeug==3.0.1
|
346 |
+
- wrapt==1.16.0
|
347 |
+
- yarl==1.9.4
|
348 |
+
- zipp==3.17.0
|
349 |
+
prefix: /conda/asugandhi/miniconda3/envs/RAGAPP
|
output_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{'llm': {'meta': [{'id': 'cmpl-acea8357-35de-4b59-b91d-f7a5a6db7df8', 'object': 'text_completion', 'created': 1708917529, 'model': 'openchat-3.5-1210.Q3_K_S.ggml', 'choices': [{'text': ' Numerical weather prediction (NWP) models use scientific laws and equations to', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 8186, 'completion_tokens': 16, 'total_tokens': 8202}}]}, 'answer_builder': {'answers': [GeneratedAnswer(data=' Numerical weather prediction (NWP) models use scientific laws and equations to', query='What are the key differences between GraphCast and traditional numerical weather prediction (NWP) models? write in at least 10000 words', documents=[Document(id=7e1267428582f9f3323ba16dc7fe4db1771b409537ae46b94bdeb4005888b9a5, content: 'GraphCast: Learning skillful medium-range
|
2 |
+
global weather forecasting
|
3 |
+
Remi Lam*,1, Alvaro Sanchez-Gon...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/2212.12794.pdf', 'source_id': 'aa504618a25e65b870dde2fe288f395a44ff6a05c640fa7a2e6c5a5d3a9a44ef'}, score: 0.6383349895477295, embedding: vector of size 384)], meta={})]}}
|
output_results.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{'llm': {'meta': [{'id': 'cmpl-256e8372-b43f-4fa9-8a91-156338e3ed5f', 'object': 'text_completion', 'created': 1708966691, 'model': 'openchat-3.5-1210.Q3_K_S.ggml', 'choices': [{'text': '\nAditya Sugandhi is known to have a group of close', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 3347, 'completion_tokens': 16, 'total_tokens': 3363}}]}, 'answer_builder': {'answers': [GeneratedAnswer(data='\nAditya Sugandhi is known to have a group of close', query="who are Aditya's friends?", documents=[Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.3431967496871948, embedding: vector of size 384), Document(id=11f7061bb8c56ae79965f1ba0d1a0283188dc031309394e1a03470d5d72207a9, content: 'Aditya Sugandhi is a seasoned Software Engineer with a rich background and diverse skill set, encomp...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_test.txt', 'source_id': 'c85a2287836cae980897693decb5e9d07e80f60b7c96b4e542ef3057e11fc228'}, score: 1.3858964443206787, embedding: vector of size 384), Document(id=b9679ae3e33c58d9299d929f03d3b6f868d81dcd0fb7197d59e38c1962a4f92d, content: 'Vishwam Shah is a highly motivated and skilled Computer Science professional currently pursuing a Ma...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/mf.txt', 'source_id': '6d425f2fa8ce25e5d4b7890423744220600079b727b22e39b514f70d4660eab5'}, score: 1.7688608169555664, embedding: vector of size 384), Document(id=a6ad41c3febd74d1f6825aac59c2d6dd7589ae8088bb3b449ea239c97d6f1b1c, content: ' . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 18
|
2 |
+
1.2 HRES . . . . . . . . . . . . . ....', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/2212.12794.pdf', 'source_id': 'aa504618a25e65b870dde2fe288f395a44ff6a05c640fa7a2e6c5a5d3a9a44ef'}, score: 1.8065273761749268, embedding: vector of size 384), Document(id=21cdf14f25359517ba11fd718fafc4d245bff87411a165314b7e814a05924234, content: ' . . . . . . . . . . . . . . . . . . . . . . . . . . 25
|
3 |
+
3.2 Architecture overview . . . . . . . . . ...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/2212.12794.pdf', 'source_id': 'aa504618a25e65b870dde2fe288f395a44ff6a05c640fa7a2e6c5a5d3a9a44ef'}, score: 1.8349415063858032, embedding: vector of size 384)], meta={})]}}
|
rag_model.ipynb
ADDED
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": []
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": 14,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [
|
15 |
+
{
|
16 |
+
"name": "stdout",
|
17 |
+
"output_type": "stream",
|
18 |
+
"text": [
|
19 |
+
"bert_load_from_file: gguf version = 2\n",
|
20 |
+
"bert_load_from_file: gguf alignment = 32\n",
|
21 |
+
"bert_load_from_file: gguf data offset = 695552\n",
|
22 |
+
"bert_load_from_file: model name = BERT\n",
|
23 |
+
"bert_load_from_file: model architecture = bert\n",
|
24 |
+
"bert_load_from_file: model file type = 1\n",
|
25 |
+
"bert_load_from_file: bert tokenizer vocab = 30522\n",
|
26 |
+
"[0.01552767027169466, 0.08103805035352707, -0.12307794392108917, 0.09815496951341629, 0.023653453215956688, -0.06102974712848663, 0.07934562116861343, 0.02745242230594158, -0.028132867068052292, 0.03221212700009346, 0.12919503450393677, 0.0025996030308306217, -0.04139482602477074, -0.06577245146036148, -0.014648980461061, 0.015588296577334404, -0.08434717357158661, -0.07182654738426208, 0.014775916934013367, -0.07444048672914505, 0.0590442530810833, 0.04814479872584343, 0.06639457494020462, 0.008800982497632504, -0.017847837880253792, -0.020949387922883034, -0.026810096576809883, 0.026885343715548515, -0.0764176994562149, -0.057069629430770874, 0.039454489946365356, 0.06288687884807587, 0.036681558936834335, 0.03875448554754257, 0.09926188737154007, 0.07691209763288498, -0.0007747725467197597, -0.05224066600203514, -0.06268111616373062, -0.00026997251552529633, 0.06668399274349213, -0.10031015425920486, -0.00970512256026268, -0.01601257175207138, -0.03624574467539787, -0.10884801298379898, -0.027961881831288338, -0.02198118157684803, 0.011900517158210278, -0.005993946921080351, -0.08890494704246521, -0.01797824539244175, -0.040237877517938614, -0.049093399196863174, -0.019428042694926262, -0.005168401636183262, 0.032794076949357986, -0.03235733509063721, -0.0705694779753685, -0.0941174328327179, -0.051176246255636215, 0.08234924077987671, -0.020688237622380257, 0.026870127767324448, -0.031070750206708908, 0.021878499537706375, -0.06237325817346573, 0.07108485698699951, 0.0030630987603217363, -0.06985890865325928, -0.05954312905669212, -0.05837850645184517, -0.09073222428560257, 0.005469962954521179, -0.021687401458621025, 0.0314265601336956, -0.025661440566182137, -0.0495171844959259, 0.0394166000187397, -0.029094435274600983, -0.018130596727132797, -0.04031619802117348, 0.08927112817764282, 0.00014257561997510493, -0.026646623387932777, 0.06340110301971436, 0.07394086569547653, 0.014260515570640564, -0.023962723091244698, -0.06585869938135147, 
0.04496406018733978, 0.04277855530381203, 0.008617856539785862, 0.0665624663233757, 0.026723850518465042, 0.01059289276599884, 0.011615158058702946, -0.04054207354784012, -0.04994109272956848, 0.10845799744129181, 0.036834508180618286, 0.045918650925159454, -0.05060620605945587, 0.11201019585132599, -0.11668886244297028, -0.01581607758998871, 0.0960628017783165, -0.0488315187394619, 0.024895356968045235, -0.04963228479027748, -0.03182365745306015, -0.004189752042293549, -0.022618744522333145, -0.020297333598136902, 0.010558796115219593, -0.03451183810830116, -0.08592583984136581, 0.07002798467874527, -0.0014977692626416683, -0.020605681464076042, 0.0009889955399557948, -0.06769613176584244, -0.016587721183896065, -0.03945926949381828, 0.027652334421873093, -0.0037252188194543123, 4.02796795242466e-05, 2.496357863577944e-34, -0.019553543999791145, -0.006931365933269262, 0.05519813671708107, 0.030014386400580406, -0.027222076430916786, -0.0040949187241494656, 0.028509650379419327, 0.0003461719024926424, -0.07768791913986206, 0.026781603693962097, -0.021593185141682625, -0.043786026537418365, 0.03954899311065674, -0.029267827048897743, 0.03505752608180046, 0.005345764569938183, -0.01677117310464382, 0.08446278423070908, 0.05020565167069435, 0.041258785873651505, 0.03950535133481026, 0.05992049351334572, 0.004634900484234095, -0.0946483463048935, -0.028090720996260643, -0.03398402780294418, -0.02709619328379631, -0.04133094474673271, -0.005644459743052721, 0.032718855887651443, 0.010113613680005074, -0.02065439336001873, -0.016786033287644386, 0.03233509510755539, -0.06616782397031784, 0.029395416378974915, -0.00663745915517211, -0.06478383392095566, -0.09521140158176422, -0.010280981659889221, -0.03638819605112076, -0.007304533384740353, 0.13017326593399048, -0.06668204814195633, -0.012214419431984425, 0.09507791697978973, -0.0009454676182940602, 0.045288313180208206, 0.061766546219587326, 0.06407830119132996, -0.06472055613994598, 0.02868455834686756, 
0.014445719309151173, 0.03761356323957443, 0.04157082363963127, 0.007912926375865936, -0.028237026184797287, -0.048911020159721375, 0.05634745582938194, 0.0031706185545772314, 0.024482648819684982, -0.0926365926861763, -0.028224240988492966, 0.01816745474934578, -0.0009234159952029586, -0.06061384454369545, 0.02713773585855961, -0.0657828152179718, 0.06030780076980591, 0.05763610824942589, -0.0024990146048367023, -0.031143246218562126, 0.014573169872164726, 0.05780758708715439, -0.005530690308660269, -0.024387281388044357, 0.025631394237279892, 0.04571927711367607, -0.07182186841964722, 0.02106345444917679, 0.047523558139801025, -0.025845326483249664, 0.04639439284801483, -0.0461527556180954, 0.06309600919485092, 0.002871520584449172, -0.019818803295493126, -0.01131194643676281, 0.04196448624134064, -0.017453346401453018, -0.043370626866817474, 0.06779050827026367, -0.11423997581005096, -0.007464131806045771, 0.07379034906625748, -1.0159212682046505e-33, 0.04116467386484146, -0.02187393046915531, -0.06464317440986633, -0.04831999912858009, 0.054312679916620255, -0.04359174892306328, 0.10390615463256836, -0.008244805969297886, 0.02429776079952717, 0.08679671585559845, 0.03324231505393982, -0.04018168896436691, 0.023248450830578804, -0.11267966777086258, 0.027334723621606827, -0.018510276451706886, -0.015763893723487854, -0.06620948016643524, -0.029428796842694283, 0.024292776361107826, -0.0836699977517128, 0.06186313182115555, 0.00979425199329853, 0.0149845527485013, 0.02952435240149498, -0.01609259471297264, 0.06341543793678284, 0.025381680577993393, -0.07650972157716751, -0.08898097276687622, 0.0543917752802372, 0.029732191935181618, -0.12705901265144348, 0.11817684024572372, 0.05331788584589958, -0.03143112361431122, 0.0274629145860672, 0.007251844275742769, -0.031150249764323235, 0.0817786380648613, 0.01751711592078209, 0.07238985598087311, -0.006944955326616764, -0.0723976194858551, 0.034229815006256104, -0.003155543003231287, 0.011516829021275043, 
-0.06810746341943741, 0.09528303891420364, -0.03101549670100212, 0.04598725214600563, -0.032259490340948105, 0.07952931523323059, 0.011015753261744976, 0.07233146578073502, 0.04757140204310417, 0.07436589896678925, 0.03568919375538826, -0.05899377539753914, -0.07132003456354141, 0.02570781111717224, 0.05620163306593895, 0.029458558186888695, 0.07280883193016052, 0.014483439736068249, -0.09305085241794586, 0.04503859579563141, -0.07544805109500885, 0.04793871194124222, -0.0066075995564460754, -0.027827860787510872, -0.07631555944681168, -0.05412726849317551, 0.056384310126304626, 0.056813593953847885, 0.06885606050491333, -0.001682625850662589, -0.021189114078879356, -0.004618695937097073, -0.04061309993267059, 0.10019382834434509, -0.030752010643482208, 0.036137741059064865, 0.035284142941236496, 0.022952962666749954, 0.0072324820794165134, 0.0515342652797699, 0.020784474909305573, 0.005023692734539509, 0.019894951954483986, 0.05247249826788902, 0.020828237757086754, -0.010321374982595444, 0.0026851524598896503, 0.0014503364218398929, -1.771797109029194e-08, -0.07890938222408295, -0.10603849589824677, -0.04075992852449417, 0.07047312706708908, -0.053525179624557495, 0.028504792600870132, -0.01275587547570467, -0.04736935719847679, -0.044071078300476074, -0.016645105555653572, -0.04981076717376709, -0.010642158798873425, 0.017387278378009796, 0.015506042167544365, -0.02702799066901207, -0.06912237405776978, -0.006346073932945728, 0.048564061522483826, 0.019542649388313293, -0.10184305161237717, -0.02131459303200245, 0.002071274910122156, 0.06019570678472519, -0.04933277890086174, -0.023822331801056862, 0.061753757297992706, 0.03395755961537361, 0.035142987966537476, 0.04514467716217041, -0.04209870100021362, 0.051735058426856995, -0.010264404118061066, 0.010600893758237362, -0.04388001188635826, 0.048436664044857025, 0.09170644730329514, 0.0874226912856102, 0.02946961112320423, -0.0049003129824995995, 0.03189241513609886, -0.05068569630384445, 0.04898029565811157, 
0.06254067271947861, -0.021246548742055893, 0.041442159563302994, -0.04294992610812187, -0.11569153517484665, -0.029132820665836334, 0.027501607313752174, -0.11903877556324005, -0.0024651181884109974, -0.019488628953695297, 0.032330770045518875, 0.014155727811157703, -0.019860858097672462, -0.03563971444964409, 0.03158700466156006, 0.04575197398662567, -0.04244818910956383, 0.007442069705575705, 0.12420977652072906, -0.0006733344052918255, 0.0338529571890831, -0.03671126440167427]\n"
|
27 |
+
]
|
28 |
+
}
|
29 |
+
],
|
30 |
+
"source": [
|
31 |
+
"from gpt4all import GPT4All, Embed4All\n",
|
32 |
+
"text = 'Aditya_test.txt'\n",
|
33 |
+
"embedder = Embed4All()\n",
|
34 |
+
"output = embedder.embed(text)\n",
|
35 |
+
"print(output)"
|
36 |
+
]
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"cell_type": "code",
|
40 |
+
"execution_count": 15,
|
41 |
+
"metadata": {},
|
42 |
+
"outputs": [],
|
43 |
+
"source": [
|
44 |
+
"import langchain_community as lcc\n",
|
45 |
+
"from langchain_community.chat_models import ChatHuggingFace\n",
|
46 |
+
"\n",
|
47 |
+
"local_llm = 'NousResearch/Yarn-Mistral-7b-128k'\n",
|
48 |
+
"llm = ChatOllama(model=local_llm, temperature=0)"
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "code",
|
53 |
+
"execution_count": 13,
|
54 |
+
"metadata": {},
|
55 |
+
"outputs": [
|
56 |
+
{
|
57 |
+
"name": "stdout",
|
58 |
+
"output_type": "stream",
|
59 |
+
"text": [
|
60 |
+
"bert_load_from_file: gguf version = 2\n",
|
61 |
+
"bert_load_from_file: gguf alignment = 32\n",
|
62 |
+
"bert_load_from_file: gguf data offset = 695552\n",
|
63 |
+
"bert_load_from_file: model name = BERT\n",
|
64 |
+
"bert_load_from_file: model architecture = bert\n",
|
65 |
+
"bert_load_from_file: model file type = 1\n",
|
66 |
+
"bert_load_from_file: bert tokenizer vocab = 30522\n"
|
67 |
+
]
|
68 |
+
}
|
69 |
+
],
|
70 |
+
"source": [
|
71 |
+
"from langchain_community.embeddings import GPT4AllEmbeddings\n",
|
72 |
+
"\n",
|
73 |
+
"embedder = GPT4AllEmbeddings()"
|
74 |
+
]
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"cell_type": "code",
|
78 |
+
"execution_count": 37,
|
79 |
+
"metadata": {},
|
80 |
+
"outputs": [
|
81 |
+
{
|
82 |
+
"ename": "AttributeError",
|
83 |
+
"evalue": "'dict' object has no attribute 'page_content'",
|
84 |
+
"output_type": "error",
|
85 |
+
"traceback": [
|
86 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
87 |
+
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
88 |
+
"Cell \u001b[0;32mIn[37], line 13\u001b[0m\n\u001b[1;32m 10\u001b[0m adjusted_documents \u001b[38;5;241m=\u001b[39m [{\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpage_content\u001b[39m\u001b[38;5;124m'\u001b[39m: doc[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m'\u001b[39m], \u001b[38;5;124m'\u001b[39m\u001b[38;5;124membedding\u001b[39m\u001b[38;5;124m'\u001b[39m: doc[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124membedding\u001b[39m\u001b[38;5;124m'\u001b[39m]} \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# Then, attempt to create the vector store with the adjusted document format\u001b[39;00m\n\u001b[0;32m---> 13\u001b[0m vectorstore \u001b[38;5;241m=\u001b[39m \u001b[43mChroma\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_documents\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43mdocuments\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madjusted_documents\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrag-chroma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43membedder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 18\u001b[0m retriever \u001b[38;5;241m=\u001b[39m vectorstore\u001b[38;5;241m.\u001b[39mas_retriever()\n",
|
89 |
+
"File \u001b[0;32m~/.local/share/virtualenvs/LLM_Playground-SHCTkmIS/lib/python3.11/site-packages/langchain_community/vectorstores/chroma.py:776\u001b[0m, in \u001b[0;36mChroma.from_documents\u001b[0;34m(cls, documents, embedding, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)\u001b[0m\n\u001b[1;32m 745\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 746\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfrom_documents\u001b[39m(\n\u001b[1;32m 747\u001b[0m \u001b[38;5;28mcls\u001b[39m: Type[Chroma],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 756\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 757\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Chroma:\n\u001b[1;32m 758\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Create a Chroma vectorstore from a list of documents.\u001b[39;00m\n\u001b[1;32m 759\u001b[0m \n\u001b[1;32m 760\u001b[0m \u001b[38;5;124;03m If a persist_directory is specified, the collection will be persisted there.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 774\u001b[0m \u001b[38;5;124;03m Chroma: Chroma vectorstore.\u001b[39;00m\n\u001b[1;32m 775\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 776\u001b[0m texts \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[43mdoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpage_content\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdoc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 777\u001b[0m metadatas \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mmetadata \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 778\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_texts(\n\u001b[1;32m 779\u001b[0m texts\u001b[38;5;241m=\u001b[39mtexts,\n\u001b[1;32m 780\u001b[0m embedding\u001b[38;5;241m=\u001b[39membedding,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 788\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 789\u001b[0m )\n",
|
90 |
+
"File \u001b[0;32m~/.local/share/virtualenvs/LLM_Playground-SHCTkmIS/lib/python3.11/site-packages/langchain_community/vectorstores/chroma.py:776\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 745\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 746\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfrom_documents\u001b[39m(\n\u001b[1;32m 747\u001b[0m \u001b[38;5;28mcls\u001b[39m: Type[Chroma],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 756\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 757\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Chroma:\n\u001b[1;32m 758\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Create a Chroma vectorstore from a list of documents.\u001b[39;00m\n\u001b[1;32m 759\u001b[0m \n\u001b[1;32m 760\u001b[0m \u001b[38;5;124;03m If a persist_directory is specified, the collection will be persisted there.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 774\u001b[0m \u001b[38;5;124;03m Chroma: Chroma vectorstore.\u001b[39;00m\n\u001b[1;32m 775\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 776\u001b[0m texts \u001b[38;5;241m=\u001b[39m [\u001b[43mdoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpage_content\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 777\u001b[0m metadatas \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mmetadata \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 778\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_texts(\n\u001b[1;32m 779\u001b[0m texts\u001b[38;5;241m=\u001b[39mtexts,\n\u001b[1;32m 780\u001b[0m embedding\u001b[38;5;241m=\u001b[39membedding,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 788\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 
789\u001b[0m )\n",
|
91 |
+
"\u001b[0;31mAttributeError\u001b[0m: 'dict' object has no attribute 'page_content'"
|
92 |
+
]
|
93 |
+
}
|
94 |
+
],
|
95 |
+
"source": [
|
96 |
+
"from langchain_community.vectorstores import Chroma\n",
|
97 |
+
"\n",
|
98 |
+
"# Example of preparing 'documents' variable (assuming each document is a string in a list)\n",
|
99 |
+
"# Here you would convert each text document into an embedding and prepare it as needed\n",
|
100 |
+
"\n",
|
101 |
+
"# Assuming 'embedder.embed(doc_text)' returns a numeric vector for each document\n",
|
102 |
+
"documents = [{'text': doc_text, 'embedding': embedder.embed(doc_text)} for doc_text in documents_list]\n",
|
103 |
+
"\n",
|
104 |
+
"# If Chroma expects a 'page_content' attribute, adjust your dictionaries accordingly\n",
|
105 |
+
"adjusted_documents = [{'page_content': doc['text'], 'embedding': doc['embedding']} for doc in documents]\n",
|
106 |
+
"\n",
|
107 |
+
"# Then, attempt to create the vector store with the adjusted document format\n",
|
108 |
+
"vectorstore = Chroma.from_documents(\n",
|
109 |
+
" documents=adjusted_documents,\n",
|
110 |
+
" collection_name=\"rag-chroma\",\n",
|
111 |
+
" embedding=embedder,\n",
|
112 |
+
")\n",
|
113 |
+
"retriever = vectorstore.as_retriever()\n"
|
114 |
+
]
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"cell_type": "code",
|
118 |
+
"execution_count": 16,
|
119 |
+
"metadata": {},
|
120 |
+
"outputs": [],
|
121 |
+
"source": [
|
122 |
+
"# Assuming 'query' is defined and TextLoader is set up\n",
|
123 |
+
"query = \"who is Aditya\"\n",
|
124 |
+
"documents = TextLoader.load_documents(query)\n"
|
125 |
+
]
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"cell_type": "code",
|
129 |
+
"execution_count": 27,
|
130 |
+
"metadata": {},
|
131 |
+
"outputs": [
|
132 |
+
{
|
133 |
+
"ename": "ImportError",
|
134 |
+
"evalue": "cannot import name 'Rag' from 'langchain.llms' (/Users/adityasugandhi/.local/share/virtualenvs/LLM_Playground-SHCTkmIS/lib/python3.11/site-packages/langchain/llms/__init__.py)",
|
135 |
+
"output_type": "error",
|
136 |
+
"traceback": [
|
137 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
138 |
+
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
|
139 |
+
"Cell \u001b[0;32mIn[27], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mllms\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Rag\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Initialize RAG model (ensure you have a compatible model loaded)\u001b[39;00m\n\u001b[1;32m 4\u001b[0m rag_model \u001b[38;5;241m=\u001b[39m Rag()\n",
|
140 |
+
"\u001b[0;31mImportError\u001b[0m: cannot import name 'Rag' from 'langchain.llms' (/Users/adityasugandhi/.local/share/virtualenvs/LLM_Playground-SHCTkmIS/lib/python3.11/site-packages/langchain/llms/__init__.py)"
|
141 |
+
]
|
142 |
+
}
|
143 |
+
],
|
144 |
+
"source": [
|
145 |
+
"from langchain_community.llms import Rag\n",
|
146 |
+
"\n",
|
147 |
+
"# Initialize RAG model (ensure you have a compatible model loaded)\n",
|
148 |
+
"rag_model = Rag()\n",
|
149 |
+
"\n",
|
150 |
+
"# Example function to generate answers using RAG and the retrieved documents\n",
|
151 |
+
"def generate_answer(rag_model, query, documents):\n",
|
152 |
+
" # Convert documents to a format suitable for the model, if necessary\n",
|
153 |
+
" context = ' '.join(documents) # Simplified; you might need a more sophisticated approach\n",
|
154 |
+
" \n",
|
155 |
+
" # Generate an answer using the RAG model\n",
|
156 |
+
" answer = rag_model.generate(query, context, \n",
|
157 |
+
" generation_kwargs={\"max_length\": 256, \"temperature\": 0.7})\n",
|
158 |
+
" return answer\n",
|
159 |
+
"\n",
|
160 |
+
"# Generate an answer for the query using retrieved documents as context\n",
|
161 |
+
"answer = generate_answer(rag_model, query, documents)\n",
|
162 |
+
"print(\"Generated Answer:\", answer)\n"
|
163 |
+
]
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"cell_type": "code",
|
167 |
+
"execution_count": 21,
|
168 |
+
"metadata": {},
|
169 |
+
"outputs": [
|
170 |
+
{
|
171 |
+
"name": "stdout",
|
172 |
+
"output_type": "stream",
|
173 |
+
"text": [
|
174 |
+
"His previous role as a Software Engineer at Aspire Systems in Chennai, India, showcases Aditya's versatility in both backend and frontend development. Leading the redesign of a Life Insurance Company's architecture, he prioritized low latency and high throughput, emphasizing a customer-centric approach. Aditya engineered 20 SOAP APIs for responsive patient data management, collaborated on front-end enhancements, and implemented secure payment gateways and Single Sign-On for authentication. His contribution to debugging strategies, real-time log analysis with Splunk, and CI/CD pipelines with Jenkins further underscore his commitment to optimizing system performance.\n"
|
175 |
+
]
|
176 |
+
}
|
177 |
+
],
|
178 |
+
"source": [
|
179 |
+
"# Example structure for fine-tuning (high-level and simplified)\n",
|
180 |
+
"from langchain.training import train_model\n",
|
181 |
+
"\n",
|
182 |
+
"# Define your training dataset\n",
|
183 |
+
"training_data = [(\"Question 1\", \"Answer 1\"), (\"Question 2\", \"Answer 2\"), ...]\n",
|
184 |
+
"\n",
|
185 |
+
"# Train (fine-tune) the model\n",
|
186 |
+
"train_model(rag_model, training_data, epochs=5, learning_rate=1e-5)\n"
|
187 |
+
]
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"cell_type": "code",
|
191 |
+
"execution_count": 28,
|
192 |
+
"metadata": {},
|
193 |
+
"outputs": [
|
194 |
+
{
|
195 |
+
"name": "stderr",
|
196 |
+
"output_type": "stream",
|
197 |
+
"text": [
|
198 |
+
"/Users/adityasugandhi/.local/share/virtualenvs/LLM_Playground-SHCTkmIS/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
199 |
+
" from .autonotebook import tqdm as notebook_tqdm\n",
|
200 |
+
"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
|
201 |
+
]
|
202 |
+
}
|
203 |
+
],
|
204 |
+
"source": [
|
205 |
+
"from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration\n",
|
206 |
+
"\n",
|
207 |
+
"tokenizer = RagTokenizer.from_pretrained(\"facebook/rag-token-base\")\n",
|
208 |
+
"retriever = RagRetriever.from_pretrained(\"facebook/rag-token-base\")\n",
|
209 |
+
"generator = RagTokenForGeneration.from_pretrained(\"facebook/rag-token-base\")\n",
|
210 |
+
"\n",
|
211 |
+
"\n",
|
212 |
+
"def generate_answer(tokenizer, retriever, generator, query, documents):\n",
|
213 |
+
" inputs = tokenizer(query, documents, return_tensors=\"pt\", padding=\"max_length\", max_length=256, truncation=True)\n",
|
214 |
+
" input_ids = inputs[\"input_ids\"]\n",
|
215 |
+
" attention_mask = inputs[\"attention_mask\"]\n",
|
216 |
+
" doc_scores = retriever(input_ids, attention_mask)\n",
|
217 |
+
" context_input_ids = input_ids.new_full((input_ids.shape[0], 1), tokenizer.context_id, dtype=torch.long)\n",
|
218 |
+
" context_attention_mask = input_ids.new_full(context_input_ids.shape, 1)\n",
|
219 |
+
" generator_input_ids = torch.cat([context_input_ids, input_ids], dim=1)\n",
|
220 |
+
" generator_attention_mask = torch.cat([context_attention_mask, attention_mask], dim=1)\n",
|
221 |
+
" outputs = generator.generate(generator_input_ids, attention_mask=generator_attention_mask, doc_scores=doc_scores)\n",
|
222 |
+
" return tokenizer.batch_decode(outputs, skip_special_tokens=True)"
|
223 |
+
]
|
224 |
+
},
|
225 |
+
{
|
226 |
+
"cell_type": "code",
|
227 |
+
"execution_count": 4,
|
228 |
+
"metadata": {},
|
229 |
+
"outputs": [
|
230 |
+
{
|
231 |
+
"ename": "ModuleNotFoundError",
|
232 |
+
"evalue": "No module named 'haystack.indexing'",
|
233 |
+
"output_type": "error",
|
234 |
+
"traceback": [
|
235 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
236 |
+
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
237 |
+
"Cell \u001b[0;32mIn[4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtimeit\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhaystack\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mindexing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcleaning\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m clean_wiki_text\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhaystack\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mindexing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mio\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m open_file, fetch_archive_from_http\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhaystack\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mindexing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m convert_files_to_dicts, fetch_archive_from_http\n",
|
238 |
+
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'haystack.indexing'"
|
239 |
+
]
|
240 |
+
}
|
241 |
+
],
|
242 |
+
"source": [
|
243 |
+
"import os\n",
|
244 |
+
"import timeit\n",
|
245 |
+
"# from haystack.indexing.cleaning import clean_wiki_text\n",
|
246 |
+
"# from haystack.indexing.io import open_file, fetch_archive_from_http\n",
|
247 |
+
"# from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http\n",
|
248 |
+
"from haystack.preprocessor.cleaning import clean_whitespace, clean_html, clean_preprocessor,clean_wiki_text\n",
|
249 |
+
"from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http\n",
|
250 |
+
"from haystack.preprocessor import PreProcessor\n",
|
251 |
+
"from haystack.document_store import InMemoryDocumentStore, WeaviateDocumentStore\n",
|
252 |
+
"from haystack.retriever.dense import EmbeddingRetriever\n",
|
253 |
+
"from haystack.utils import print_answers\n",
|
254 |
+
"\n",
|
255 |
+
"def run_ingest():\n",
|
256 |
+
" # Update DATA_PATH to include \"Aditya_train.txt\"\n",
|
257 |
+
" data_file = \"Aditya_train.txt\"\n",
|
258 |
+
" DATA_PATH = os.path.join(cfg.DATA_PATH, data_file)\n",
|
259 |
+
" \n",
|
260 |
+
" # Ensure the file exists\n",
|
261 |
+
" if os.path.isfile(DATA_PATH):\n",
|
262 |
+
" start = timeit.default_timer()\n",
|
263 |
+
"\n",
|
264 |
+
" vector_store = WeaviateDocumentStore(host=cfg.WEAVIATE_HOST,\n",
|
265 |
+
" port=cfg.WEAVIATE_PORT,\n",
|
266 |
+
" embedding_dim=cfg.WEAVIATE_EMBEDDING_DIM)\n",
|
267 |
+
"\n",
|
268 |
+
" # Convert text files to dictionaries\n",
|
269 |
+
" raw_docs = convert_files_to_dicts(dir_path=DATA_PATH, clean_func=clean_wiki_text, split_paragraphs=True)\n",
|
270 |
+
"\n",
|
271 |
+
" # Convert to desired format\n",
|
272 |
+
" final_doc = []\n",
|
273 |
+
" for doc in raw_docs:\n",
|
274 |
+
" new_doc = {\n",
|
275 |
+
" 'content': doc['text'],\n",
|
276 |
+
" 'meta': {'name': doc['name']}\n",
|
277 |
+
" }\n",
|
278 |
+
" final_doc.append(new_doc)\n",
|
279 |
+
"\n",
|
280 |
+
" preprocessor = PreProcessor(\n",
|
281 |
+
" clean_empty_lines=True,\n",
|
282 |
+
" clean_whitespace=False,\n",
|
283 |
+
" clean_header_footer=False,\n",
|
284 |
+
" split_by=\"word\",\n",
|
285 |
+
" language=\"en\",\n",
|
286 |
+
" split_length=cfg.PRE_PROCESSOR_SPLIT_LENGTH,\n",
|
287 |
+
" split_overlap=cfg.PRE_PROCESSOR_SPLIT_OVERLAP,\n",
|
288 |
+
" split_respect_sentence_boundary=True,\n",
|
289 |
+
" )\n",
|
290 |
+
"\n",
|
291 |
+
" preprocessed_docs = preprocessor.process(final_doc)\n",
|
292 |
+
" vector_store.write_documents(preprocessed_docs)\n",
|
293 |
+
"\n",
|
294 |
+
" retriever = EmbeddingRetriever(\n",
|
295 |
+
" document_store=vector_store,\n",
|
296 |
+
" embedding_model=cfg.EMBEDDINGS\n",
|
297 |
+
" )\n",
|
298 |
+
" vector_store.update_embeddings(retriever)\n",
|
299 |
+
"\n",
|
300 |
+
" end = timeit.default_timer()\n",
|
301 |
+
" print(f\"Time to prepare embeddings: {end - start}\")\n",
|
302 |
+
" else:\n",
|
303 |
+
" print(f\"File {data_file} not found in the specified DATA_PATH.\")\n"
|
304 |
+
]
|
305 |
+
}
|
306 |
+
],
|
307 |
+
"metadata": {
|
308 |
+
"kernelspec": {
|
309 |
+
"display_name": "Langchain",
|
310 |
+
"language": "python",
|
311 |
+
"name": "python3"
|
312 |
+
},
|
313 |
+
"language_info": {
|
314 |
+
"codemirror_mode": {
|
315 |
+
"name": "ipython",
|
316 |
+
"version": 3
|
317 |
+
},
|
318 |
+
"file_extension": ".py",
|
319 |
+
"mimetype": "text/x-python",
|
320 |
+
"name": "python",
|
321 |
+
"nbconvert_exporter": "python",
|
322 |
+
"pygments_lexer": "ipython3",
|
323 |
+
"version": "3.11.0"
|
324 |
+
}
|
325 |
+
},
|
326 |
+
"nbformat": 4,
|
327 |
+
"nbformat_minor": 2
|
328 |
+
}
|
req.txt
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
annotated-types==0.6.0
|
2 |
+
anyio==4.3.0
|
3 |
+
appnope==0.1.4
|
4 |
+
asgiref==3.7.2
|
5 |
+
asttokens==2.4.1
|
6 |
+
backoff==2.2.1
|
7 |
+
bcrypt==4.1.2
|
8 |
+
blinker==1.7.0
|
9 |
+
boilerpy3==1.0.7
|
10 |
+
cachetools==5.3.3
|
11 |
+
certifi==2024.2.2
|
12 |
+
charset-normalizer==3.3.2
|
13 |
+
chroma-haystack==0.15.0
|
14 |
+
chroma-hnswlib==0.7.3
|
15 |
+
chromadb==0.4.19
|
16 |
+
click==8.1.7
|
17 |
+
coloredlogs==15.0.1
|
18 |
+
comm==0.2.2
|
19 |
+
debugpy==1.8.1
|
20 |
+
decorator==5.1.1
|
21 |
+
Deprecated==1.2.14
|
22 |
+
distro==1.9.0
|
23 |
+
exceptiongroup==1.2.0
|
24 |
+
executing==2.0.1
|
25 |
+
fastapi==0.110.0
|
26 |
+
filelock==3.13.1
|
27 |
+
Flask==3.0.2
|
28 |
+
flatbuffers==24.3.7
|
29 |
+
fsspec==2024.2.0
|
30 |
+
google-auth==2.28.2
|
31 |
+
googleapis-common-protos==1.63.0
|
32 |
+
grpcio==1.62.1
|
33 |
+
h11==0.14.0
|
34 |
+
haystack-ai==2.0.0
|
35 |
+
haystack-bm25==1.0.2
|
36 |
+
httpcore==1.0.4
|
37 |
+
httptools==0.6.1
|
38 |
+
httpx==0.27.0
|
39 |
+
huggingface-hub==0.21.4
|
40 |
+
humanfriendly==10.0
|
41 |
+
idna==3.6
|
42 |
+
importlib-metadata==6.11.0
|
43 |
+
importlib_resources==6.3.0
|
44 |
+
ipykernel==6.29.3
|
45 |
+
ipython==8.18.1
|
46 |
+
itsdangerous==2.1.2
|
47 |
+
jedi==0.19.1
|
48 |
+
Jinja2==3.1.3
|
49 |
+
joblib==1.3.2
|
50 |
+
jupyter_client==8.6.1
|
51 |
+
jupyter_core==5.7.2
|
52 |
+
kubernetes==29.0.0
|
53 |
+
lazy-imports==0.3.1
|
54 |
+
MarkupSafe==2.1.5
|
55 |
+
matplotlib-inline==0.1.6
|
56 |
+
mmh3==4.1.0
|
57 |
+
monotonic==1.6
|
58 |
+
more-itertools==10.2.0
|
59 |
+
mpmath==1.3.0
|
60 |
+
nest-asyncio==1.6.0
|
61 |
+
networkx==3.2.1
|
62 |
+
numpy==1.26.4
|
63 |
+
oauthlib==3.2.2
|
64 |
+
onnxruntime==1.16.3
|
65 |
+
openai==1.14.0
|
66 |
+
opentelemetry-api==1.23.0
|
67 |
+
opentelemetry-exporter-otlp-proto-common==1.23.0
|
68 |
+
opentelemetry-exporter-otlp-proto-grpc==1.23.0
|
69 |
+
opentelemetry-instrumentation==0.44b0
|
70 |
+
opentelemetry-instrumentation-asgi==0.44b0
|
71 |
+
opentelemetry-instrumentation-fastapi==0.44b0
|
72 |
+
opentelemetry-proto==1.23.0
|
73 |
+
opentelemetry-sdk==1.23.0
|
74 |
+
opentelemetry-semantic-conventions==0.44b0
|
75 |
+
opentelemetry-util-http==0.44b0
|
76 |
+
overrides==7.7.0
|
77 |
+
packaging==24.0
|
78 |
+
pandas==2.2.1
|
79 |
+
parso==0.8.3
|
80 |
+
pexpect==4.9.0
|
81 |
+
pillow==10.2.0
|
82 |
+
platformdirs==4.2.0
|
83 |
+
posthog==3.5.0
|
84 |
+
prompt-toolkit==3.0.43
|
85 |
+
protobuf==4.25.3
|
86 |
+
psutil==5.9.8
|
87 |
+
ptyprocess==0.7.0
|
88 |
+
pulsar-client==3.4.0
|
89 |
+
pure-eval==0.2.2
|
90 |
+
pyasn1==0.5.1
|
91 |
+
pyasn1-modules==0.3.0
|
92 |
+
pydantic==2.6.4
|
93 |
+
pydantic_core==2.16.3
|
94 |
+
Pygments==2.17.2
|
95 |
+
pypdf==4.1.0
|
96 |
+
PyPika==0.48.9
|
97 |
+
python-dateutil==2.9.0.post0
|
98 |
+
python-dotenv==1.0.1
|
99 |
+
pytz==2024.1
|
100 |
+
PyYAML==6.0.1
|
101 |
+
pyzmq==25.1.2
|
102 |
+
regex==2023.12.25
|
103 |
+
requests==2.31.0
|
104 |
+
requests-oauthlib==1.4.0
|
105 |
+
rsa==4.9
|
106 |
+
safetensors==0.4.2
|
107 |
+
scikit-learn==1.4.1.post1
|
108 |
+
scipy==1.12.0
|
109 |
+
sentence-transformers==2.5.1
|
110 |
+
six==1.16.0
|
111 |
+
sniffio==1.3.1
|
112 |
+
stack-data==0.6.3
|
113 |
+
starlette==0.36.3
|
114 |
+
sympy==1.12
|
115 |
+
tenacity==8.2.3
|
116 |
+
threadpoolctl==3.3.0
|
117 |
+
tokenizers==0.15.2
|
118 |
+
torch==2.2.1
|
119 |
+
tornado==6.4
|
120 |
+
tqdm==4.66.2
|
121 |
+
traitlets==5.14.2
|
122 |
+
transformers==4.38.2
|
123 |
+
typer==0.9.0
|
124 |
+
typing_extensions==4.10.0
|
125 |
+
tzdata==2024.1
|
126 |
+
urllib3==2.2.1
|
127 |
+
uvicorn==0.28.0
|
128 |
+
uvloop==0.19.0
|
129 |
+
watchfiles==0.21.0
|
130 |
+
wcwidth==0.2.13
|
131 |
+
websocket-client==1.7.0
|
132 |
+
websockets==12.0
|
133 |
+
Werkzeug==3.0.1
|
134 |
+
wrapt==1.16.0
|
135 |
+
zipp==3.18.0
|
test.ipynb
ADDED
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"For text TextFileToDocument\n",
|
8 |
+
"for pdf PyPDFToDocument"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "code",
|
13 |
+
"execution_count": null,
|
14 |
+
"metadata": {},
|
15 |
+
"outputs": [
|
16 |
+
{
|
17 |
+
"name": "stdout",
|
18 |
+
"output_type": "stream",
|
19 |
+
"text": [
|
20 |
+
"/unity/f2/asugandhi/Downloads/LLM_Playground\n",
|
21 |
+
"\n"
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"ename": "ValueError",
|
26 |
+
"evalue": "Input batch_size not found in component PdfFileConverter.",
|
27 |
+
"output_type": "error",
|
28 |
+
"traceback": [
|
29 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
30 |
+
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
31 |
+
"Cell \u001b[0;32mIn[21], line 27\u001b[0m\n\u001b[1;32m 25\u001b[0m pipeline\u001b[38;5;241m.\u001b[39mconnect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPdfFileConverter\u001b[39m\u001b[38;5;124m\"\u001b[39m,\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPdfwriter_chroma\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 26\u001b[0m pipeline\u001b[38;5;241m.\u001b[39mconnect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTextFileConverter\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwriter_chroma\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 27\u001b[0m \u001b[43mpipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 28\u001b[0m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPdfFileConverter\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msources\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfile_paths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbatch_size\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 29\u001b[0m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mTextFileConverter\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msources\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfile_paths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbatch_size\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 30\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 33\u001b[0m querying \u001b[38;5;241m=\u001b[39m Pipeline()\n\u001b[1;32m 34\u001b[0m reader \u001b[38;5;241m=\u001b[39m ExtractiveReader(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeepset/roberta-base-squad2-distilled\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
32 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/site-packages/haystack/core/pipeline/pipeline.py:688\u001b[0m, in \u001b[0;36mPipeline.run\u001b[0;34m(self, data, debug)\u001b[0m\n\u001b[1;32m 682\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 683\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInputs \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m were not matched to any component inputs, please check your run parameters.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 684\u001b[0m \u001b[38;5;28mlist\u001b[39m(unresolved_inputs\u001b[38;5;241m.\u001b[39mkeys()),\n\u001b[1;32m 685\u001b[0m )\n\u001b[1;32m 687\u001b[0m \u001b[38;5;66;03m# Raise if input is malformed in some way\u001b[39;00m\n\u001b[0;32m--> 688\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 689\u001b[0m \u001b[38;5;66;03m# NOTE: The above NOTE and TODO are technically not true.\u001b[39;00m\n\u001b[1;32m 690\u001b[0m \u001b[38;5;66;03m# This implementation of run supports only the first format, but the second format is actually\u001b[39;00m\n\u001b[1;32m 691\u001b[0m \u001b[38;5;66;03m# never received by this method. It's handled by the `run()` method of the `Pipeline` class\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 695\u001b[0m \u001b[38;5;66;03m# deepcopying the inputs prevents the Pipeline run logic from being altered unexpectedly\u001b[39;00m\n\u001b[1;32m 696\u001b[0m \u001b[38;5;66;03m# when the same input reference is passed to multiple components.\u001b[39;00m\n\u001b[1;32m 697\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m component_name, component_inputs \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mitems():\n",
|
33 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/site-packages/haystack/core/pipeline/pipeline.py:594\u001b[0m, in \u001b[0;36mPipeline._validate_input\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m input_name \u001b[38;5;129;01min\u001b[39;00m component_inputs\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m input_name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m instance\u001b[38;5;241m.\u001b[39m__haystack_input__\u001b[38;5;241m.\u001b[39m_sockets_dict:\n\u001b[0;32m--> 594\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInput \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minput_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in component \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 596\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m component_name \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgraph\u001b[38;5;241m.\u001b[39mnodes:\n\u001b[1;32m 597\u001b[0m instance \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgraph\u001b[38;5;241m.\u001b[39mnodes[component_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstance\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
|
34 |
+
"\u001b[0;31mValueError\u001b[0m: Input batch_size not found in component PdfFileConverter."
|
35 |
+
]
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"source": [
|
39 |
+
"import os\n",
|
40 |
+
"from haystack import Pipeline, Document\n",
|
41 |
+
"from haystack.components.converters import TextFileToDocument, PyPDFToDocument\n",
|
42 |
+
"from haystack.components.writers import DocumentWriter\n",
|
43 |
+
"from haystack.components.readers import ExtractiveReader\n",
|
44 |
+
"from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n",
|
45 |
+
"from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
|
46 |
+
"from pathlib import Path\n",
|
47 |
+
"HERE = Path(os.getcwd())\n",
|
48 |
+
"print(HERE)\n",
|
49 |
+
"\n",
|
50 |
+
"data_path = HERE / \"data\"\n",
|
51 |
+
"file_paths = [data_path / Path(name) for name in os.listdir(\"data\")]\n",
|
52 |
+
"print()\n",
|
53 |
+
"chroma_store = ChromaDocumentStore()\n",
|
54 |
+
"# Resolve the absolute path\n",
|
55 |
+
"# absolute_file_path = file_path.resolve()\n",
|
56 |
+
"# print(absolute_file_path)\n",
|
57 |
+
"pipeline = Pipeline()\n",
|
58 |
+
"pipeline.add_component(\"PdfFileConverter\", PyPDFToDocument())\n",
|
59 |
+
"pipeline.add_component(\"TextFileConverter\", TextFileToDocument())\n",
|
60 |
+
"pipeline.add_component(\"Pdfwriter_chroma\", DocumentWriter(document_store=chroma_store))\n",
|
61 |
+
"pipeline.add_component(\"writer_chroma\", DocumentWriter(document_store=chroma_store))\n",
|
62 |
+
"\n",
|
63 |
+
"pipeline.connect(\"PdfFileConverter\",\"Pdfwriter_chroma\")\n",
|
64 |
+
"pipeline.connect(\"TextFileConverter\", \"writer_chroma\")\n",
|
65 |
+
"pipeline.run(\n",
|
66 |
+
" {\"PdfFileConverter\": {\"sources\": file_paths, \"batch_size\": 1}},\n",
|
67 |
+
" {\"TextFileConverter\": {\"sources\": file_paths, \"batch_size\": 1}},\n",
|
68 |
+
")\n",
|
69 |
+
" \n",
|
70 |
+
" \n",
|
71 |
+
"querying = Pipeline()\n",
|
72 |
+
"reader = ExtractiveReader(model=\"deepset/roberta-base-squad2-distilled\")\n",
|
73 |
+
"querying.add_component(\"retriever\", ChromaQueryTextRetriever(chroma_store))\n",
|
74 |
+
"querying.add_component(\"reader\",reader)\n",
|
75 |
+
"results = querying.run({\"retriever\": {\"query\": \"Vishwam\", \"top_k\": 3}})\n"
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"cell_type": "code",
|
80 |
+
"execution_count": 3,
|
81 |
+
"metadata": {},
|
82 |
+
"outputs": [
|
83 |
+
{
|
84 |
+
"name": "stdout",
|
85 |
+
"output_type": "stream",
|
86 |
+
"text": [
|
87 |
+
"/unity/f2/asugandhi/Downloads/LLM_Playground\n",
|
88 |
+
"{'reader': {'answers': [ExtractedAnswer(query='Who is Aditya?', score=0.6858945488929749, data='Software Engineer', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=31, end=48), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.627069890499115, data='Sugandhi', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=7, end=15), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.5672385096549988, data='Software Engineer', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=4616, end=4633), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.5219605565071106, data='software engineer', 
document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=4961, end=4978), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.5016087889671326, data='Sugandhi', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=4592, end=4600), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.44805991649627686, data='Web Developer Intern', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=3343, end=3363), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.0066661882226549205, data=None, document=None, context=None, document_offset=None, context_offset=None, meta={})]}}\n"
|
89 |
+
]
|
90 |
+
}
|
91 |
+
],
|
92 |
+
"source": [
|
93 |
+
"from pathlib import Path\n",
|
94 |
+
"import os\n",
|
95 |
+
"from haystack import Pipeline\n",
|
96 |
+
"from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
|
97 |
+
"from haystack.components.converters import PyPDFToDocument, TextFileToDocument\n",
|
98 |
+
"from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n",
|
99 |
+
"from haystack.components.readers import ExtractiveReader\n",
|
100 |
+
"from haystack.components.routers import FileTypeRouter\n",
|
101 |
+
"from haystack.components.joiners import DocumentJoiner\n",
|
102 |
+
"from haystack.components.writers import DocumentWriter\n",
|
103 |
+
"from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n",
|
104 |
+
"from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
|
105 |
+
"\n",
|
106 |
+
"HERE = Path(os.getcwd())\n",
|
107 |
+
"print(HERE)\n",
|
108 |
+
"\n",
|
109 |
+
"data_path = HERE / \"data\"\n",
|
110 |
+
"file_paths = [str(data_path / name) for name in os.listdir(data_path)]\n",
|
111 |
+
"\n",
|
112 |
+
"chroma_store = ChromaDocumentStore()\n",
|
113 |
+
"\n",
|
114 |
+
"pipeline = Pipeline()\n",
|
115 |
+
"pipeline.add_component(\"FileTypeRouter\", FileTypeRouter(mime_types=[\"text/plain\", \"application/pdf\"]))\n",
|
116 |
+
"pipeline.add_component(\"TextFileConverter\", TextFileToDocument())\n",
|
117 |
+
"pipeline.add_component(\"PdfFileConverter\", PyPDFToDocument())\n",
|
118 |
+
"pipeline.add_component(\"Joiner\", DocumentJoiner())\n",
|
119 |
+
"pipeline.add_component(\"Cleaner\", DocumentCleaner())\n",
|
120 |
+
"pipeline.add_component(\"Splitter\", DocumentSplitter(split_by=\"sentence\", split_length=250, split_overlap=30))\n",
|
121 |
+
"# pipeline.add_component(\"Embedder\", SentenceTransformersDocumentEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\"))\n",
|
122 |
+
"pipeline.add_component(\"Writer\", DocumentWriter(document_store=chroma_store))\n",
|
123 |
+
"\n",
|
124 |
+
"pipeline.connect(\"FileTypeRouter.text/plain\", \"TextFileConverter.sources\")\n",
|
125 |
+
"pipeline.connect(\"FileTypeRouter.application/pdf\", \"PdfFileConverter.sources\")\n",
|
126 |
+
"pipeline.connect(\"TextFileConverter.documents\", \"Joiner.documents\")\n",
|
127 |
+
"pipeline.connect(\"PdfFileConverter.documents\", \"Joiner.documents\")\n",
|
128 |
+
"pipeline.connect(\"Joiner.documents\", \"Cleaner.documents\")\n",
|
129 |
+
"pipeline.connect(\"Cleaner.documents\", \"Splitter.documents\")\n",
|
130 |
+
"pipeline.connect(\"Splitter.documents\", \"Writer.documents\")\n",
|
131 |
+
"# pipeline.connect(\"Embedder.documents\", \"Writer.documents\")\n",
|
132 |
+
"\n",
|
133 |
+
"pipeline.run(\n",
|
134 |
+
" {\"FileTypeRouter\": {\"sources\": file_paths}},\n",
|
135 |
+
")\n",
|
136 |
+
"\n",
|
137 |
+
"# Querying pipeline\n",
|
138 |
+
"querying = Pipeline()\n",
|
139 |
+
"reader = ExtractiveReader(model=\"deepset/roberta-base-squad2-distilled\")\n",
|
140 |
+
"querying.add_component(\"retriever\", ChromaQueryTextRetriever(chroma_store))\n",
|
141 |
+
"querying.add_component(\"reader\", reader)\n",
|
142 |
+
"querying.connect(\"retriever\", \"reader\")\n",
|
143 |
+
"query = \"Who is Aditya?\"\n",
|
144 |
+
"input_data = {\n",
|
145 |
+
" \"retriever\": {\"query\": query, \"top_k\": 1},\n",
|
146 |
+
" \"reader\": {\"query\": query},\n",
|
147 |
+
" # Use 'max_tokens' instead of 'max_new_tokens'\n",
|
148 |
+
" }\n",
|
149 |
+
"results = querying.run(input_data)\n",
|
150 |
+
"print(results)\n"
|
151 |
+
]
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"cell_type": "markdown",
|
155 |
+
"metadata": {},
|
156 |
+
"source": [
|
157 |
+
"# DON'T RUN"
|
158 |
+
]
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"cell_type": "code",
|
162 |
+
"execution_count": null,
|
163 |
+
"metadata": {},
|
164 |
+
"outputs": [],
|
165 |
+
"source": []
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"cell_type": "code",
|
169 |
+
"execution_count": 7,
|
170 |
+
"metadata": {},
|
171 |
+
"outputs": [
|
172 |
+
{
|
173 |
+
"name": "stdout",
|
174 |
+
"output_type": "stream",
|
175 |
+
"text": [
|
176 |
+
"who is Aditya?\n",
|
177 |
+
"{'llm': {'replies': ['Aditya Sugandhi is a Software Engineer with a strong foundation in both theoretical knowledge and practical application, known for his commitment to excellence, passion for technological advancements, and dedication to pushing boundaries in software development. He has experience in various roles such as a Research Assistant, Full Stack Developer, Customer Service Executive, and Web Developer Intern. Aditya is currently pursuing a Master’s of Science in Computer Science at Florida State University and holds a Bachelor of Technology in Computer Science Engineering from SRM University. He is characterized by technical excellence, innovation, and a holistic understanding of software development. Aditya enjoys spending time with his friends SAS, Hunterr, MF, and Rocco.'], 'meta': [{'model': 'gpt-3.5-turbo-0125', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 138, 'prompt_tokens': 917, 'total_tokens': 1055}}]}}\n"
|
178 |
+
]
|
179 |
+
}
|
180 |
+
],
|
181 |
+
"source": [
|
182 |
+
"from haystack import Pipeline\n",
|
183 |
+
"from haystack.utils import Secret\n",
|
184 |
+
"from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
|
185 |
+
"from haystack.components.readers import ExtractiveReader\n",
|
186 |
+
"from haystack.components.generators import GPTGenerator\n",
|
187 |
+
"from haystack.components.builders.prompt_builder import PromptBuilder\n",
|
188 |
+
"from haystack.components.generators import OpenAIGenerator\n",
|
189 |
+
"\n",
|
190 |
+
"template = \"\"\"\n",
|
191 |
+
"Answer all the questions in the following format and based on Aditya.\n",
|
192 |
+
"\n",
|
193 |
+
"Context:\n",
|
194 |
+
"{% for doc in documents %}\n",
|
195 |
+
" {{ doc.content }}\n",
|
196 |
+
"{% endfor %}\n",
|
197 |
+
"Question: {{question}}\n",
|
198 |
+
"Answer:\n",
|
199 |
+
"\"\"\"\n",
|
200 |
+
"\n",
|
201 |
+
"prompt_builder = PromptBuilder(template=template)\n",
|
202 |
+
"retriever = ChromaQueryTextRetriever(document_store = chroma_store)\n",
|
203 |
+
"# OpenAI generator that answers from the retrieved context (the ExtractiveReader defined below is not wired into this pipeline)\n",
|
204 |
+
"api_key = Secret.from_env_var(\"OPENAI_API_KEY\")\n",
|
205 |
+
"llm = OpenAIGenerator(model=\"gpt-3.5-turbo-0125\",api_key=api_key)\n",
|
206 |
+
"reader = ExtractiveReader(model=\"deepset/roberta-base-squad2-distilled\")\n",
|
207 |
+
"\n",
|
208 |
+
"extractive_qa_pipeline = Pipeline()\n",
|
209 |
+
"extractive_qa_pipeline.add_component(\"retriever\", retriever)\n",
|
210 |
+
"# extractive_qa_pipeline.add_component(\"reader\",reader)\n",
|
211 |
+
"extractive_qa_pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n",
|
212 |
+
"extractive_qa_pipeline.add_component(\"llm\", llm)\n",
|
213 |
+
"\n",
|
214 |
+
"\n",
|
215 |
+
"# extractive_qa_pipeline.connect(\"retriever\", \"reader\")\n",
|
216 |
+
"extractive_qa_pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n",
|
217 |
+
"extractive_qa_pipeline.connect(\"prompt_builder\", \"llm\")\n",
|
218 |
+
"\n",
|
219 |
+
"\n",
|
220 |
+
"query = \"who is Aditya?\"\n",
|
221 |
+
"print(query)\n",
|
222 |
+
"# Define the input data for the pipeline components\n",
|
223 |
+
"input_data = {\n",
|
224 |
+
" \"retriever\": {\"query\": query, \"top_k\": 1},\n",
|
225 |
+
" \"prompt_builder\": {\"question\": query},\n",
|
226 |
+
" # Use 'max_tokens' instead of 'max_new_tokens'\n",
|
227 |
+
"}\n",
|
228 |
+
"\n",
|
229 |
+
"# Run the pipeline with the updated input data\n",
|
230 |
+
"results = extractive_qa_pipeline.run(input_data)\n",
|
231 |
+
"print(results)"
|
232 |
+
]
|
233 |
+
},
|
234 |
+
{
|
235 |
+
"cell_type": "code",
|
236 |
+
"execution_count": null,
|
237 |
+
"metadata": {},
|
238 |
+
"outputs": [],
|
239 |
+
"source": []
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": 20,
|
244 |
+
"metadata": {},
|
245 |
+
"outputs": [
|
246 |
+
{
|
247 |
+
"name": "stderr",
|
248 |
+
"output_type": "stream",
|
249 |
+
"text": [
|
250 |
+
"llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from openchat-3.5-1210.Q3_K_S.ggml (version GGUF V3 (latest))\n",
|
251 |
+
"llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
|
252 |
+
"llama_model_loader: - kv 0: general.architecture str = llama\n",
|
253 |
+
"llama_model_loader: - kv 1: general.name str = openchat_openchat-3.5-1210\n",
|
254 |
+
"llama_model_loader: - kv 2: llama.context_length u32 = 8192\n",
|
255 |
+
"llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
|
256 |
+
"llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
|
257 |
+
"llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n",
|
258 |
+
"llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
|
259 |
+
"llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
|
260 |
+
"llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n",
|
261 |
+
"llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
|
262 |
+
"llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n",
|
263 |
+
"llama_model_loader: - kv 11: general.file_type u32 = 11\n",
|
264 |
+
"llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
|
265 |
+
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32002] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
|
266 |
+
"llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32002] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
|
267 |
+
"llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32002] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
|
268 |
+
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
|
269 |
+
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000\n",
|
270 |
+
"llama_model_loader: - kv 18: tokenizer.ggml.padding_token_id u32 = 0\n",
|
271 |
+
"llama_model_loader: - kv 19: tokenizer.ggml.add_bos_token bool = true\n",
|
272 |
+
"llama_model_loader: - kv 20: tokenizer.ggml.add_eos_token bool = false\n",
|
273 |
+
"llama_model_loader: - kv 21: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...\n",
|
274 |
+
"llama_model_loader: - kv 22: general.quantization_version u32 = 2\n",
|
275 |
+
"llama_model_loader: - type f32: 65 tensors\n",
|
276 |
+
"llama_model_loader: - type q3_K: 225 tensors\n",
|
277 |
+
"llama_model_loader: - type q6_K: 1 tensors\n",
|
278 |
+
"llm_load_vocab: special tokens definition check successful ( 261/32002 ).\n",
|
279 |
+
"llm_load_print_meta: format = GGUF V3 (latest)\n",
|
280 |
+
"llm_load_print_meta: arch = llama\n",
|
281 |
+
"llm_load_print_meta: vocab type = SPM\n",
|
282 |
+
"llm_load_print_meta: n_vocab = 32002\n",
|
283 |
+
"llm_load_print_meta: n_merges = 0\n",
|
284 |
+
"llm_load_print_meta: n_ctx_train = 8192\n",
|
285 |
+
"llm_load_print_meta: n_embd = 4096\n",
|
286 |
+
"llm_load_print_meta: n_head = 32\n",
|
287 |
+
"llm_load_print_meta: n_head_kv = 8\n",
|
288 |
+
"llm_load_print_meta: n_layer = 32\n",
|
289 |
+
"llm_load_print_meta: n_rot = 128\n",
|
290 |
+
"llm_load_print_meta: n_embd_head_k = 128\n",
|
291 |
+
"llm_load_print_meta: n_embd_head_v = 128\n",
|
292 |
+
"llm_load_print_meta: n_gqa = 4\n",
|
293 |
+
"llm_load_print_meta: n_embd_k_gqa = 1024\n",
|
294 |
+
"llm_load_print_meta: n_embd_v_gqa = 1024\n",
|
295 |
+
"llm_load_print_meta: f_norm_eps = 0.0e+00\n",
|
296 |
+
"llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
|
297 |
+
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
|
298 |
+
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
|
299 |
+
"llm_load_print_meta: n_ff = 14336\n",
|
300 |
+
"llm_load_print_meta: n_expert = 0\n",
|
301 |
+
"llm_load_print_meta: n_expert_used = 0\n",
|
302 |
+
"llm_load_print_meta: rope scaling = linear\n",
|
303 |
+
"llm_load_print_meta: freq_base_train = 10000.0\n",
|
304 |
+
"llm_load_print_meta: freq_scale_train = 1\n",
|
305 |
+
"llm_load_print_meta: n_yarn_orig_ctx = 8192\n",
|
306 |
+
"llm_load_print_meta: rope_finetuned = unknown\n",
|
307 |
+
"llm_load_print_meta: model type = 7B\n",
|
308 |
+
"llm_load_print_meta: model ftype = Q3_K - Small\n",
|
309 |
+
"llm_load_print_meta: model params = 7.24 B\n",
|
310 |
+
"llm_load_print_meta: model size = 2.95 GiB (3.50 BPW) \n",
|
311 |
+
"llm_load_print_meta: general.name = openchat_openchat-3.5-1210\n",
|
312 |
+
"llm_load_print_meta: BOS token = 1 '<s>'\n",
|
313 |
+
"llm_load_print_meta: EOS token = 32000 '<|end_of_turn|>'\n",
|
314 |
+
"llm_load_print_meta: UNK token = 0 '<unk>'\n",
|
315 |
+
"llm_load_print_meta: PAD token = 0 '<unk>'\n",
|
316 |
+
"llm_load_print_meta: LF token = 13 '<0x0A>'\n",
|
317 |
+
"llm_load_tensors: ggml ctx size = 0.56 MiB\n",
|
318 |
+
"llm_load_tensors: offloading 32 repeating layers to GPU\n",
|
319 |
+
"llm_load_tensors: offloading non-repeating layers to GPU\n",
|
320 |
+
"llm_load_tensors: offloaded 33/33 layers to GPU\n",
|
321 |
+
"llm_load_tensors: CPU buffer size = 53.71 MiB\n",
|
322 |
+
"llm_load_tensors: CUDA0 buffer size = 804.66 MiB\n",
|
323 |
+
"llm_load_tensors: CUDA1 buffer size = 715.25 MiB\n",
|
324 |
+
"llm_load_tensors: CUDA2 buffer size = 715.25 MiB\n",
|
325 |
+
"llm_load_tensors: CUDA3 buffer size = 728.40 MiB\n",
|
326 |
+
".................................................................................................\n",
|
327 |
+
"llama_new_context_with_model: n_ctx = 10000\n",
|
328 |
+
"llama_new_context_with_model: freq_base = 10000.0\n",
|
329 |
+
"llama_new_context_with_model: freq_scale = 1\n",
|
330 |
+
"llama_kv_cache_init: CUDA0 KV buffer size = 351.56 MiB\n",
|
331 |
+
"llama_kv_cache_init: CUDA1 KV buffer size = 312.50 MiB\n",
|
332 |
+
"llama_kv_cache_init: CUDA2 KV buffer size = 312.50 MiB\n",
|
333 |
+
"llama_kv_cache_init: CUDA3 KV buffer size = 273.44 MiB\n",
|
334 |
+
"llama_new_context_with_model: KV self size = 1250.00 MiB, K (f16): 625.00 MiB, V (f16): 625.00 MiB\n"
|
335 |
+
]
|
336 |
+
},
|
337 |
+
{
|
338 |
+
"ename": "",
|
339 |
+
"evalue": "",
|
340 |
+
"output_type": "error",
|
341 |
+
"traceback": [
|
342 |
+
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
|
343 |
+
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
|
344 |
+
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
|
345 |
+
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
346 |
+
]
|
347 |
+
}
|
348 |
+
],
|
349 |
+
"source": [
|
350 |
+
"from haystack import Pipeline\n",
|
351 |
+
"from haystack.utils import Secret\n",
|
352 |
+
"from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
|
353 |
+
"from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator\n",
|
354 |
+
"from haystack.components.readers import ExtractiveReader\n",
|
355 |
+
"from haystack.components.generators import GPTGenerator\n",
|
356 |
+
"from haystack.components.builders.prompt_builder import PromptBuilder\n",
|
357 |
+
"from haystack.components.builders.answer_builder import AnswerBuilder\n",
|
358 |
+
"from haystack.components.generators import OpenAIGenerator\n",
|
359 |
+
"\n",
|
360 |
+
"\n",
|
361 |
+
"\n",
|
362 |
+
"\n",
|
363 |
+
"template = \"\"\"\n",
|
364 |
+
"Answer all the questions in the following format and based on Aditya \n",
|
365 |
+
"and if not found generate answer accordingly using the given information.\n",
|
366 |
+
"\n",
|
367 |
+
"Context:\n",
|
368 |
+
"{% for doc in documents %}\n",
|
369 |
+
"{{ doc.content }}\n",
|
370 |
+
"{% endfor %}\n",
|
371 |
+
"Question: {{question}}\n",
|
372 |
+
"Answer:\n",
|
373 |
+
"\"\"\"\n",
|
374 |
+
"\n",
|
375 |
+
"prompt_builder = PromptBuilder(template=template)\n",
|
376 |
+
"retriever = ChromaQueryTextRetriever(document_store = chroma_store)\n",
|
377 |
+
"#ExtractiveReader to extract answers from the relevant context\n",
|
378 |
+
"\n",
|
379 |
+
"llm = LlamaCppGenerator(\n",
|
380 |
+
"model_path=\"openchat-3.5-1210.Q3_K_S.ggml\", \n",
|
381 |
+
"n_ctx=10000,\n",
|
382 |
+
"n_batch=256,\n",
|
383 |
+
"model_kwargs={\"n_gpu_layers\": -1},\n",
|
384 |
+
"generation_kwargs={\"max_tokens\": 250, \"temperature\": 0.9},\n",
|
385 |
+
")\n",
|
386 |
+
"\n",
|
387 |
+
"reader = ExtractiveReader(model=\"deepset/roberta-base-squad2-distilled\",)\n",
|
388 |
+
"\n",
|
389 |
+
"extractive_qa_pipeline = Pipeline()\n",
|
390 |
+
"extractive_qa_pipeline.add_component(\"retriever\", ChromaQueryTextRetriever(chroma_store))\n",
|
391 |
+
"# extractive_qa_pipeline.add_component(\"reader\",reader)\n",
|
392 |
+
"extractive_qa_pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n",
|
393 |
+
"extractive_qa_pipeline.add_component(\"llm\", llm)\n",
|
394 |
+
"extractive_qa_pipeline.add_component(instance=AnswerBuilder(), name=\"answer_builder\")\n",
|
395 |
+
"\n",
|
396 |
+
"# extractive_qa_pipeline.connect(\"retriever.documents\", \"reader\")\n",
|
397 |
+
"extractive_qa_pipeline.connect(\"retriever\", \"prompt_builder.documents\") \n",
|
398 |
+
"extractive_qa_pipeline.connect(\"prompt_builder\", \"llm\")\n",
|
399 |
+
"extractive_qa_pipeline.connect(\"llm.replies\", \"answer_builder.replies\")\n",
|
400 |
+
"extractive_qa_pipeline.connect(\"retriever\", \"answer_builder.documents\")\n",
|
401 |
+
"\n",
|
402 |
+
"query = \"who is Aditya did Aditya Pursued his Masters from?\"\n",
|
403 |
+
"\n",
|
404 |
+
"# Define the input data for the pipeline components\n",
|
405 |
+
"input_data = {\n",
|
406 |
+
" \"retriever\": {\"query\": query, \"top_k\": 3},\n",
|
407 |
+
" # \"reader\": {\"query\": query},\n",
|
408 |
+
" \"prompt_builder\": {\"question\": query},\n",
|
409 |
+
" \"answer_builder\": {\"query\": query},\n",
|
410 |
+
" # Use 'max_tokens' instead of 'max_new_tokens'\n",
|
411 |
+
"}\n",
|
412 |
+
"\n",
|
413 |
+
"# Run the pipeline with the updated input data\n",
|
414 |
+
"results = extractive_qa_pipeline.run(input_data)\n",
|
415 |
+
"\n",
|
416 |
+
" \n",
|
417 |
+
" "
|
418 |
+
]
|
419 |
+
},
|
420 |
+
{
|
421 |
+
"cell_type": "code",
|
422 |
+
"execution_count": 19,
|
423 |
+
"metadata": {},
|
424 |
+
"outputs": [
|
425 |
+
{
|
426 |
+
"name": "stdout",
|
427 |
+
"output_type": "stream",
|
428 |
+
"text": [
|
429 |
+
" Aditya pursued his Masters from Florida State University.\n"
|
430 |
+
]
|
431 |
+
}
|
432 |
+
],
|
433 |
+
"source": [
|
434 |
+
"# Assuming results is the dictionary containing the output\n",
|
435 |
+
"generated_content = results['llm']['meta'][0]['choices'][0]['text']\n",
|
436 |
+
"#print(results)\n",
|
437 |
+
"# Print the generated content\n",
|
438 |
+
"print(generated_content)\n"
|
439 |
+
]
|
440 |
+
},
|
441 |
+
{
|
442 |
+
"cell_type": "code",
|
443 |
+
"execution_count": null,
|
444 |
+
"metadata": {},
|
445 |
+
"outputs": [],
|
446 |
+
"source": []
|
447 |
+
}
|
448 |
+
],
|
449 |
+
"metadata": {
|
450 |
+
"kernelspec": {
|
451 |
+
"display_name": "RAGAPP",
|
452 |
+
"language": "python",
|
453 |
+
"name": "python3"
|
454 |
+
},
|
455 |
+
"language_info": {
|
456 |
+
"codemirror_mode": {
|
457 |
+
"name": "ipython",
|
458 |
+
"version": 3
|
459 |
+
},
|
460 |
+
"file_extension": ".py",
|
461 |
+
"mimetype": "text/x-python",
|
462 |
+
"name": "python",
|
463 |
+
"nbconvert_exporter": "python",
|
464 |
+
"pygments_lexer": "ipython3",
|
465 |
+
"version": "3.10.13"
|
466 |
+
}
|
467 |
+
},
|
468 |
+
"nbformat": 4,
|
469 |
+
"nbformat_minor": 2
|
470 |
+
}
|
test2.ipynb
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stderr",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"/Users/adityasugandhi/Documents/GitHub/LLM_Playground/.newenv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
13 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
14 |
+
]
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"name": "stdout",
|
18 |
+
"output_type": "stream",
|
19 |
+
"text": [
|
20 |
+
"/Users/adityasugandhi/Documents/GitHub/LLM_Playground\n"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"name": "stderr",
|
25 |
+
"output_type": "stream",
|
26 |
+
"text": [
|
27 |
+
"Batches: 100%|██████████| 1/1 [00:03<00:00, 3.22s/it]\n",
|
28 |
+
"/Users/adityasugandhi/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:11<00:00, 7.14MiB/s]\n"
|
29 |
+
]
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"name": "stdout",
|
33 |
+
"output_type": "stream",
|
34 |
+
"text": [
|
35 |
+
"{'retriever': {'documents': [Document(id=fee80856fdb487fb694c739e089614d733502a7bd6d8b192f29ed6dad2088f44, content: 'Vishwam Shah is a highly motivated and skilled Computer Science professional currently pursuing a Ma...', meta: {'file_path': '/Users/adityasugandhi/Documents/GitHub/LLM_Playground/data/mf.txt', 'source_id': '99393e97120fcb9e88daa2d490060e9a91385ae63c7890d12b351978c02d3d93'}, score: 1.0066444873809814, embedding: vector of size 384), Document(id=e700bf2b5df175311a60ca00ffb6ed77b65b09c4221a2466b68e4802d90a831a, content: 'VISHWAM SHAH\n",
|
36 |
+
"Tallahassee, FL |[email protected] |+1 (850) 666 - 0095 |https://www.linkedin.com/...', meta: {'file_path': '/Users/adityasugandhi/Documents/GitHub/LLM_Playground/data/Resume_Vishwam_Shah_Back_end.pdf', 'source_id': 'd23089ee94ea955eb9ef0045999019220184668c96631b25686fc002722e8753'}, score: 1.5628944635391235, embedding: vector of size 384), Document(id=299afa7bfc84e7700fd38b178933ab2bf3a67b09298662651b173af03fde7968, content: ' The\n",
|
37 |
+
"“ECMWF Parameter ID” column is a ECMWF’s numeric label, and can be used to construct the URL fo...', meta: {'file_path': '/Users/adityasugandhi/Documents/GitHub/LLM_Playground/data/2212.12794.pdf', 'source_id': '314ee646f1f3143cad0677f2cdf057f1d625e5f2a1891449011557e1f75249d5'}, score: 1.6514018774032593, embedding: vector of size 384)]}}\n"
|
38 |
+
]
|
39 |
+
}
|
40 |
+
],
|
41 |
+
"source": [
|
42 |
+
"from pathlib import Path\n",
|
43 |
+
"import os\n",
|
44 |
+
"from haystack import Pipeline\n",
|
45 |
+
"from haystack.components.embedders import SentenceTransformersDocumentEmbedder,SentenceTransformersTextEmbedder\n",
|
46 |
+
"from haystack.components.converters import PyPDFToDocument, TextFileToDocument\n",
|
47 |
+
"from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n",
|
48 |
+
"from haystack.components.routers import FileTypeRouter\n",
|
49 |
+
"from haystack.components.joiners import DocumentJoiner\n",
|
50 |
+
"from haystack.components.writers import DocumentWriter\n",
|
51 |
+
"from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n",
|
52 |
+
"from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
|
53 |
+
"\n",
|
54 |
+
"HERE = Path(os.getcwd())\n",
|
55 |
+
"print(HERE)\n",
|
56 |
+
"\n",
|
57 |
+
"data_path = HERE / \"data\"\n",
|
58 |
+
"file_paths = [str(data_path / name) for name in os.listdir(data_path)]\n",
|
59 |
+
"\n",
|
60 |
+
"chroma_store = ChromaDocumentStore()\n",
|
61 |
+
"\n",
|
62 |
+
"pipeline = Pipeline()\n",
|
63 |
+
"pipeline.add_component(\"FileTypeRouter\", FileTypeRouter(mime_types=[\"text/plain\", \"application/pdf\"]))\n",
|
64 |
+
"pipeline.add_component(\"TextFileConverter\", TextFileToDocument())\n",
|
65 |
+
"pipeline.add_component(\"PdfFileConverter\", PyPDFToDocument())\n",
|
66 |
+
"\n",
|
67 |
+
"pipeline.add_component(\"Joiner\", DocumentJoiner())\n",
|
68 |
+
"pipeline.add_component(\"Cleaner\", DocumentCleaner())\n",
|
69 |
+
"pipeline.add_component(\"Splitter\", DocumentSplitter(split_by=\"sentence\", split_length=250, split_overlap=30))\n",
|
70 |
+
"# pipeline.add_component(\"TextEmbedder\", SentenceTransformersTextEmbedder())\n",
|
71 |
+
"pipeline.add_component(\"Embedder\", SentenceTransformersDocumentEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\"))\n",
|
72 |
+
"\n",
|
73 |
+
"pipeline.add_component(\"Writer\", DocumentWriter(document_store=chroma_store))\n",
|
74 |
+
"\n",
|
75 |
+
"pipeline.connect(\"FileTypeRouter.text/plain\", \"TextFileConverter.sources\")\n",
|
76 |
+
"pipeline.connect(\"FileTypeRouter.application/pdf\", \"PdfFileConverter.sources\")\n",
|
77 |
+
"pipeline.connect(\"TextFileConverter.documents\", \"Joiner.documents\")\n",
|
78 |
+
"pipeline.connect(\"PdfFileConverter.documents\", \"Joiner.documents\")\n",
|
79 |
+
"pipeline.connect(\"Joiner.documents\", \"Cleaner.documents\")\n",
|
80 |
+
"pipeline.connect(\"Cleaner.documents\", \"Splitter.documents\")\n",
|
81 |
+
"pipeline.connect(\"Splitter.documents\", \"Embedder.documents\")\n",
|
82 |
+
"# pipeline.connect(\"TextEmbedder.embeddings\", \"Embedder.documents\")\n",
|
83 |
+
"pipeline.connect(\"Embedder.documents\", \"Writer.documents\")\n",
|
84 |
+
"\n",
|
85 |
+
"pipeline.run(\n",
|
86 |
+
" {\"FileTypeRouter\": {\"sources\": file_paths}},\n",
|
87 |
+
")\n",
|
88 |
+
"\n",
|
89 |
+
"# Querying pipeline\n",
|
90 |
+
"querying = Pipeline()\n",
|
91 |
+
"querying.add_component(\"retriever\", ChromaQueryTextRetriever(chroma_store))\n",
|
92 |
+
"results = querying.run({\"retriever\": {\"query\": \"Vishwam\", \"top_k\": 3}})\n",
|
93 |
+
"print(results)\n"
|
94 |
+
]
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"cell_type": "markdown",
|
98 |
+
"metadata": {},
|
99 |
+
"source": [
|
100 |
+
"# Information Retriever"
|
101 |
+
]
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"cell_type": "code",
|
105 |
+
"execution_count": 4,
|
106 |
+
"metadata": {},
|
107 |
+
"outputs": [
|
108 |
+
{
|
109 |
+
"name": "stdout",
|
110 |
+
"output_type": "stream",
|
111 |
+
"text": [
|
112 |
+
"{'retriever': {'documents': [Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.1221085786819458, embedding: vector of size 384), Document(id=11f7061bb8c56ae79965f1ba0d1a0283188dc031309394e1a03470d5d72207a9, content: 'Aditya Sugandhi is a seasoned Software Engineer with a rich background and diverse skill set, encomp...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_test.txt', 'source_id': 'c85a2287836cae980897693decb5e9d07e80f60b7c96b4e542ef3057e11fc228'}, score: 1.2236461639404297, embedding: vector of size 384), Document(id=a6ad41c3febd74d1f6825aac59c2d6dd7589ae8088bb3b449ea239c97d6f1b1c, content: ' . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 18\n",
|
113 |
+
"1.2 HRES . . . . . . . . . . . . . ....', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/2212.12794.pdf', 'source_id': 'aa504618a25e65b870dde2fe288f395a44ff6a05c640fa7a2e6c5a5d3a9a44ef'}, score: 1.6584246158599854, embedding: vector of size 384)]}}\n"
|
114 |
+
]
|
115 |
+
}
|
116 |
+
],
|
117 |
+
"source": [
|
118 |
+
"# # Querying pipeline\n",
|
119 |
+
"# querying = Pipeline()\n",
|
120 |
+
"# querying.add_component(\"retriever\", ChromaQueryTextRetriever(chroma_store))\n",
|
121 |
+
"# results = querying.run({\"retriever\": {\"query\": \"Aditya\", \"top_k\": 3}})\n",
|
122 |
+
"# print(results)\n"
|
123 |
+
]
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"cell_type": "code",
|
127 |
+
"execution_count": 28,
|
128 |
+
"metadata": {},
|
129 |
+
"outputs": [
|
130 |
+
{
|
131 |
+
"ename": "AttributeError",
|
132 |
+
"evalue": "'str' object has no attribute 'resolve_value'",
|
133 |
+
"output_type": "error",
|
134 |
+
"traceback": [
|
135 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
136 |
+
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
137 |
+
"Cell \u001b[0;32mIn[28], line 29\u001b[0m\n\u001b[1;32m 25\u001b[0m api_key \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39menviron\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOPENAI_API_KEY\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m#ExtractiveReader to extract answers from the relevant context\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m# api_key = Secret.from_token(\"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\")\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mOpenAIGenerator\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgpt-3.5-turbo-0125\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43mapi_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mapi_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 30\u001b[0m reader \u001b[38;5;241m=\u001b[39m ExtractiveReader(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeepset/roberta-base-squad2-distilled\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 32\u001b[0m extractive_qa_pipeline \u001b[38;5;241m=\u001b[39m Pipeline()\n",
|
138 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/site-packages/haystack/core/component/component.py:122\u001b[0m, in \u001b[0;36mComponentMeta.__call__\u001b[0;34m(cls, *args, **kwargs)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;124;03mThis method is called when clients instantiate a Component and\u001b[39;00m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;124;03mruns before __new__ and __init__.\u001b[39;00m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# This will call __new__ then __init__, giving us back the Component instance\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m instance \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;66;03m# Before returning, we have the chance to modify the newly created\u001b[39;00m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;66;03m# Component instance, so we take the chance and set up the I/O sockets\u001b[39;00m\n\u001b[1;32m 126\u001b[0m \n\u001b[1;32m 127\u001b[0m \u001b[38;5;66;03m# If `component.set_output_types()` was called in the component constructor,\u001b[39;00m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;66;03m# `__haystack_output__` is already populated, no need to do anything.\u001b[39;00m\n\u001b[1;32m 129\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(instance, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__haystack_output__\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 130\u001b[0m 
\u001b[38;5;66;03m# If that's not the case, we need to populate `__haystack_output__`\u001b[39;00m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;66;03m# We deepcopy the content of the cache to transfer ownership from the class method\u001b[39;00m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;66;03m# to the actual instance, so that different instances of the same class won't share this data.\u001b[39;00m\n",
|
139 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/site-packages/haystack/components/generators/openai.py:103\u001b[0m, in \u001b[0;36mOpenAIGenerator.__init__\u001b[0;34m(self, api_key, model, streaming_callback, api_base_url, organization, system_prompt, generation_kwargs)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapi_base_url \u001b[38;5;241m=\u001b[39m api_base_url\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39morganization \u001b[38;5;241m=\u001b[39m organization\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclient \u001b[38;5;241m=\u001b[39m OpenAI(api_key\u001b[38;5;241m=\u001b[39m\u001b[43mapi_key\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolve_value\u001b[49m(), organization\u001b[38;5;241m=\u001b[39morganization, base_url\u001b[38;5;241m=\u001b[39mapi_base_url)\n",
|
140 |
+
"\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'resolve_value'"
|
141 |
+
]
|
142 |
+
}
|
143 |
+
],
|
144 |
+
"source": [
|
145 |
+
"from dotenv import load_dotenv\n",
|
146 |
+
"\n",
|
147 |
+
"load_dotenv() \n",
|
148 |
+
"from haystack import Pipeline\n",
|
149 |
+
"from haystack.utils import Secret\n",
|
150 |
+
"from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
|
151 |
+
"from haystack.components.readers import ExtractiveReader\n",
|
152 |
+
"from haystack.components.generators import GPTGenerator\n",
|
153 |
+
"from haystack.components.builders.prompt_builder import PromptBuilder\n",
|
154 |
+
"from haystack.components.generators import OpenAIGenerator\n",
|
155 |
+
"\n",
|
156 |
+
"template = \"\"\"\n",
|
157 |
+
" ` Answer the question using the provided context based on Aditya.\n",
|
158 |
+
"\n",
|
159 |
+
" Context:\n",
|
160 |
+
" {% for context in answers %}\n",
|
161 |
+
" {{ context }}\n",
|
162 |
+
" {% endfor %}\n",
|
163 |
+
" Question: {{question}}\n",
|
164 |
+
" Answer:\n",
|
165 |
+
" \"\"\"\n",
|
166 |
+
"\n",
|
167 |
+
"prompt_builder = PromptBuilder(template=template)\n",
|
168 |
+
"retriever = ChromaQueryTextRetriever(document_store = chroma_store)\n",
|
169 |
+
"api_key = os.environ.get(\"OPENAI_API_KEY\")\n",
|
170 |
+
"\n",
|
171 |
+
"#ExtractiveReader to extract answers from the relevant context\n",
|
172 |
+
"api_key = Secret.from_token(api_key)\n",
|
173 |
+
"llm = OpenAIGenerator(model=\"gpt-3.5-turbo-0125\",api_key=api_key)\n",
|
174 |
+
"reader = ExtractiveReader(model=\"deepset/roberta-base-squad2-distilled\")\n",
|
175 |
+
"\n",
|
176 |
+
"extractive_qa_pipeline = Pipeline()\n",
|
177 |
+
"extractive_qa_pipeline.add_component(\"retriever\", retriever)\n",
|
178 |
+
"extractive_qa_pipeline.add_component('reader', reader)\n",
|
179 |
+
"extractive_qa_pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n",
|
180 |
+
"extractive_qa_pipeline.add_component(\"llm\", llm)\n",
|
181 |
+
"\n",
|
182 |
+
"extractive_qa_pipeline.connect(\"retriever.documents\", \"reader.documents\")\n",
|
183 |
+
"extractive_qa_pipeline.connect(\"reader.answers\", \"prompt_builder.answers\")\n",
|
184 |
+
"extractive_qa_pipeline.connect(\"prompt_builder\", \"llm\")\n",
|
185 |
+
"\n",
|
186 |
+
"\n",
|
187 |
+
"query = \"what is Aditya Pursuing ?\"\n",
|
188 |
+
"print(query)\n",
|
189 |
+
"# Define the input data for the pipeline components\n",
|
190 |
+
"input_data = {\n",
|
191 |
+
" \"retriever\": {\"query\": query, \"top_k\": 2},\n",
|
192 |
+
" \"reader\": {\"query\": query, \"top_k\": 2},\n",
|
193 |
+
" \"prompt_builder\": {\"question\": query},\n",
|
194 |
+
" # Use 'max_tokens' instead of 'max_new_tokens'\n",
|
195 |
+
"}\n",
|
196 |
+
"\n",
|
197 |
+
"# Run the pipeline with the updated input data\n",
|
198 |
+
"results = extractive_qa_pipeline.run(input_data)\n",
|
199 |
+
"print(results)"
|
200 |
+
]
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"cell_type": "code",
|
204 |
+
"execution_count": 5,
|
205 |
+
"metadata": {},
|
206 |
+
"outputs": [
|
207 |
+
{
|
208 |
+
"ename": "TypeError",
|
209 |
+
"evalue": "isinstance() arg 2 must be a type, a tuple of types, or a union",
|
210 |
+
"output_type": "error",
|
211 |
+
"traceback": [
|
212 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
213 |
+
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
214 |
+
"Cell \u001b[0;32mIn[5], line 9\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__dict__\u001b[39m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mdefault(obj)\n\u001b[0;32m----> 9\u001b[0m json_results \u001b[38;5;241m=\u001b[39m \u001b[43mjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdumps\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExtractedAnswerEncoder\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28mprint\u001b[39m(json_results)\n",
|
215 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/__init__.py:238\u001b[0m, in \u001b[0;36mdumps\u001b[0;34m(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 233\u001b[0m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;241m=\u001b[39m JSONEncoder\n\u001b[1;32m 234\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 235\u001b[0m \u001b[43m \u001b[49m\u001b[43mskipkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipkeys\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mensure_ascii\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mensure_ascii\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 236\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_circular\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_circular\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mallow_nan\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallow_nan\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 237\u001b[0m \u001b[43m \u001b[49m\u001b[43mseparators\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mseparators\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdefault\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msort_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort_keys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m--> 238\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n",
|
216 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:201\u001b[0m, in \u001b[0;36mJSONEncoder.encode\u001b[0;34m(self, o)\u001b[0m\n\u001b[1;32m 199\u001b[0m chunks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miterencode(o, _one_shot\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(chunks, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)):\n\u001b[0;32m--> 201\u001b[0m chunks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mchunks\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(chunks)\n",
|
217 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:431\u001b[0m, in \u001b[0;36m_make_iterencode.<locals>._iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _iterencode_list(o, _current_indent_level)\n\u001b[1;32m 430\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(o, \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 431\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _iterencode_dict(o, _current_indent_level)\n\u001b[1;32m 432\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 433\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m markers \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
|
218 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:405\u001b[0m, in \u001b[0;36m_make_iterencode.<locals>._iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 404\u001b[0m chunks \u001b[38;5;241m=\u001b[39m _iterencode(value, _current_indent_level)\n\u001b[0;32m--> 405\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m chunks\n\u001b[1;32m 406\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m newline_indent \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 407\u001b[0m _current_indent_level \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
|
219 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:405\u001b[0m, in \u001b[0;36m_make_iterencode.<locals>._iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 404\u001b[0m chunks \u001b[38;5;241m=\u001b[39m _iterencode(value, _current_indent_level)\n\u001b[0;32m--> 405\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m chunks\n\u001b[1;32m 406\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m newline_indent \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 407\u001b[0m _current_indent_level \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
|
220 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:325\u001b[0m, in \u001b[0;36m_make_iterencode.<locals>._iterencode_list\u001b[0;34m(lst, _current_indent_level)\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 324\u001b[0m chunks \u001b[38;5;241m=\u001b[39m _iterencode(value, _current_indent_level)\n\u001b[0;32m--> 325\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m chunks\n\u001b[1;32m 326\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m newline_indent \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 327\u001b[0m _current_indent_level \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
|
221 |
+
"File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:438\u001b[0m, in \u001b[0;36m_make_iterencode.<locals>._iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCircular reference detected\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 437\u001b[0m markers[markerid] \u001b[38;5;241m=\u001b[39m o\n\u001b[0;32m--> 438\u001b[0m o \u001b[38;5;241m=\u001b[39m \u001b[43m_default\u001b[49m\u001b[43m(\u001b[49m\u001b[43mo\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _iterencode(o, _current_indent_level)\n\u001b[1;32m 440\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m markers \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
|
222 |
+
"Cell \u001b[0;32mIn[5], line 5\u001b[0m, in \u001b[0;36mExtractedAnswerEncoder.default\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdefault\u001b[39m(\u001b[38;5;28mself\u001b[39m, obj):\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28;43misinstance\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresults\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Convert ExtractedAnswer to a dictionary\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__dict__\u001b[39m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mdefault(obj)\n",
|
223 |
+
"\u001b[0;31mTypeError\u001b[0m: isinstance() arg 2 must be a type, a tuple of types, or a union"
|
224 |
+
]
|
225 |
+
}
|
226 |
+
],
|
227 |
+
"source": [
|
228 |
+
"import json\n",
|
229 |
+
"\n",
|
230 |
+
"class ExtractedAnswerEncoder(json.JSONEncoder):\n",
|
231 |
+
" def default(self, obj):\n",
|
232 |
+
" if isinstance(obj, results):\n",
|
233 |
+
" # Convert ExtractedAnswer to a dictionary\n",
|
234 |
+
" return obj.__dict__\n",
|
235 |
+
" return super().default(obj)\n",
|
236 |
+
"json_results = json.dumps(results, indent=2, cls=ExtractedAnswerEncoder)\n",
|
237 |
+
"\n",
|
238 |
+
"print(json_results)"
|
239 |
+
]
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": null,
|
244 |
+
"metadata": {},
|
245 |
+
"outputs": [],
|
246 |
+
"source": [
|
247 |
+
"p"
|
248 |
+
]
|
249 |
+
}
|
250 |
+
],
|
251 |
+
"metadata": {
|
252 |
+
"kernelspec": {
|
253 |
+
"display_name": "RAGAPP",
|
254 |
+
"language": "python",
|
255 |
+
"name": "python3"
|
256 |
+
},
|
257 |
+
"language_info": {
|
258 |
+
"codemirror_mode": {
|
259 |
+
"name": "ipython",
|
260 |
+
"version": 3
|
261 |
+
},
|
262 |
+
"file_extension": ".py",
|
263 |
+
"mimetype": "text/x-python",
|
264 |
+
"name": "python",
|
265 |
+
"nbconvert_exporter": "python",
|
266 |
+
"pygments_lexer": "ipython3",
|
267 |
+
"version": "3.9.13"
|
268 |
+
}
|
269 |
+
},
|
270 |
+
"nbformat": 4,
|
271 |
+
"nbformat_minor": 2
|
272 |
+
}
|
test_trainer/runs/.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
test_trainer/runs/Feb22_22-15-01_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658123.bfs-v13-skynet.coaps.fsu.edu.3062760.0
ADDED
Binary file (4.54 kB). View file
|
|
test_trainer/runs/Feb22_22-17-41_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658261.bfs-v13-skynet.coaps.fsu.edu.3062760.1
ADDED
Binary file (4.54 kB). View file
|
|
test_trainer/runs/Feb22_22-17-41_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658535.bfs-v13-skynet.coaps.fsu.edu.3062760.2
ADDED
Binary file (4.54 kB). View file
|
|
test_trainer/runs/Feb22_22-24-50_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658690.bfs-v13-skynet.coaps.fsu.edu.3062760.3
ADDED
Binary file (4.54 kB). View file
|
|
utils/ExtractQA.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from haystack.pipelines import DocumentSearchPipeline, ExtractiveQAPipeline
|
2 |
+
from haystack.nodes import JoinDocuments
|
3 |
+
from haystack import Pipeline
|
4 |
+
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
def ExtracQA(reader, retriever, query, retriever_top_k=3, reader_top_k=5):
    """Answer a single query with a retriever + reader extractive QA pipeline.

    Args:
        reader: Reader node handed to ``ExtractiveQAPipeline``.
        retriever: Retriever node handed to ``ExtractiveQAPipeline``.
        query: Question text forwarded to the pipeline's ``run`` call.
        retriever_top_k: ``top_k`` sent to the retriever node. Defaults to 3,
            the value that used to be hard-coded.
        reader_top_k: ``top_k`` sent to the reader node. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        The result object produced by ``ExtractiveQAPipeline.run``.
    """
    qa_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

    # Per-node parameters are matched against the pipeline's node names.
    # ExtractiveQAPipeline names its nodes "Retriever" and "Reader", so the
    # keys must use that exact capitalisation; the lowercase spellings used
    # previously do not refer to any node.
    node_params = {
        "Retriever": {"top_k": retriever_top_k},
        "Reader": {"top_k": reader_top_k},
    }
    return qa_pipeline.run(query=query, params=node_params)
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
def MultipleRetriever(reader,es_retriever,dpr_retriever,query):
    """Run a query through two retrievers, join their documents, then read.

    Args:
        reader: Reader node registered as ``"QAReader"``.
        es_retriever: Retriever node registered as ``"ESRetriever"``.
        dpr_retriever: Retriever node registered as ``"DPRRetriever"``.
        query: Question text forwarded to the pipeline's ``run`` call.

    Returns:
        The result object produced by ``Pipeline.run``.
    """
    ensemble = Pipeline()

    # Both retrievers hang directly off the query input.
    retriever_nodes = (
        ("ESRetriever", es_retriever),
        ("DPRRetriever", dpr_retriever),
    )
    for node_name, node in retriever_nodes:
        ensemble.add_node(component=node, name=node_name, inputs=["Query"])

    # Merge the two result lists before handing them to the reader.
    joiner = JoinDocuments(join_mode="concatenate")
    ensemble.add_node(
        component=joiner,
        name="JoinResults",
        inputs=["ESRetriever", "DPRRetriever"],
    )
    ensemble.add_node(component=reader, name="QAReader", inputs=["JoinResults"])

    run_params = {
        "ESRetriever": {"top_k": 10},
        "DPRRetriever": {"top_k": 10},
        "QAReader": {"top_k": 5},
    }
    return ensemble.run(query=query, params=run_params)
|
27 |
+
|
utils/__init__.py
ADDED
File without changes
|
utils/dataloader.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
from haystack import Pipeline
|
5 |
+
from haystack.components.converters import TextFileToDocument
|
6 |
+
from haystack.components.writers import DocumentWriter
|
7 |
+
|
8 |
+
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
|
13 |
+
|
14 |
+
def load_data():
    """Index every entry of the ``data`` directory into a new Chroma store.

    The directory is resolved relative to the current working directory.
    Each entry is passed to a ``TextFileToDocument`` converter whose output
    is written into a ``ChromaDocumentStore`` created with default arguments.

    Returns:
        The ``ChromaDocumentStore`` instance the documents were written to.
    """
    sources = [Path("data", entry) for entry in os.listdir("data")]

    # Original author's note: Chroma is used in-memory, so the very same
    # store instance has to be handed on to whatever pipeline queries it.
    store = ChromaDocumentStore()

    converter = TextFileToDocument()
    writer = DocumentWriter(store)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_component("converter", converter)
    indexing_pipeline.add_component("writer", writer)
    indexing_pipeline.connect("converter", "writer")

    indexing_pipeline.run({"converter": {"sources": sources}})
    return store
|
27 |
+
|
28 |
+
|
29 |
+
|
utils/prompt_builder.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Builds a retrieval-augmented querying pipeline at import time:
# Chroma retriever -> prompt builder -> hosted text-generation model.
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.components.generators import HuggingFaceTGIGenerator
from haystack.components.builders import PromptBuilder
# NOTE(review): ConversationSummaryMemory is not used anywhere below, and
# `haystack.agents` looks like a different Haystack API generation than the
# `haystack.components` imports -- confirm this import resolves.
from haystack.agents.memory import ConversationSummaryMemory
from dataloader import load_data
# This line was truncated to `from hayst` (a SyntaxError); `Pipeline` is the
# one name used below that no other import provides.
from haystack import Pipeline

# Jinja template: every retrieved document's content is inlined as context,
# followed by the user's query.
prompt = """
Answer the query based on the provided context for Aditya.
If the context does not contain the answer, say 'Answer not found'.
Context:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
query: {{query}}
Answer:
"""
prompt_builder = PromptBuilder(template=prompt)

llm = HuggingFaceTGIGenerator(model="mistralai/Mixtral-8x7B-Instruct-v0.1")
llm.warm_up()
# load_data() indexes the data directory and returns the store to query.
retriever = ChromaQueryTextRetriever(load_data())

querying = Pipeline()
querying.add_component("retriever", retriever)
querying.add_component("prompt_builder", prompt_builder)
querying.add_component("llm", llm)

# Retrieved documents fill the template's `documents` variable; the rendered
# prompt is then sent to the generator.
querying.connect("retriever.documents", "prompt_builder.documents")
querying.connect("prompt_builder", "llm")
|