adityasugandhi commited on
Commit
919910a
·
1 Parent(s): 95e4875
Dockerfile CHANGED
@@ -1,11 +1,12 @@
1
- # Use an existing Docker image as a base
2
- FROM python:3.10
3
 
4
- # Set the working directory inside the container
5
  WORKDIR /app
6
 
7
- # Copy the requirements file into the container at /app
8
- COPY requirements.txt /requirements.txt
9
 
10
- # Install the dependencies
11
- RUN pip install --no-cache-dir -r requirements.txt
 
 
 
 
 
1
+ FROM python:3.9.5-slim
 
2
 
 
3
  WORKDIR /app
4
 
5
+ COPY ./req.txt /app/req.txt
 
6
 
7
+ RUN pip install --upgrade pip && \
8
+ pip install --no-cache-dir -r req.txt
9
+
10
+ COPY . /app
11
+
12
+ CMD ["python", "app.py"]
Front-End Design.docx ADDED
Binary file (14.2 kB). View file
 
Inferencer.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack import Pipeline
2
+ from haystack.utils import Secret
3
+ from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
4
+ # from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator
5
+ from haystack.components.readers import ExtractiveReader
6
+ # from haystack.components.generators import GPTGenerator
7
+ from haystack.components.builders.prompt_builder import PromptBuilder
8
+ from haystack.components.builders.answer_builder import AnswerBuilder
9
+ from haystack.components.generators import OpenAIGenerator
10
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
11
+ from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
12
+ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
13
+ from dataloader import DataLoader
14
+ from dotenv import load_dotenv
15
+ import os
16
+ load_dotenv() # Load variables from .env file
17
+
18
+
19
+ chroma_store_loader = DataLoader()
20
class Inferncer:
    """Question answering over the document stores owned by the module-level loader.

    NOTE: the class name is misspelled ("Inferncer") but is kept as-is because
    other modules import it under this exact name.
    """

    def __init__(self):
        """Bind the Chroma and in-memory stores exposed by ``chroma_store_loader``."""
        self.chroma_store = chroma_store_loader.chroma_store
        self.InMemory_store = chroma_store_loader.InMemory_store

    def OpenAI(self, query):
        """Answer *query* via retriever -> extractive reader -> prompt -> OpenAI LLM.

        Args:
            query: The user's question. It is passed to the retriever, the
                reader and the prompt builder.

        Returns:
            Whatever ``Pipeline.run`` returns for this pipeline (returned
            unchanged).

        Raises:
            ValueError: If the ``OPENAI_API_KEY`` environment variable is
                unset or empty.
        """
        # SECURITY: the key is read from the environment only. An earlier
        # revision embedded a literal key in this method; that key has been
        # published in version control and must be revoked.
        # Checked first so a misconfigured deployment fails before any model
        # is loaded.
        raw_key = os.environ.get("OPENAI_API_KEY")
        if not raw_key:
            raise ValueError(
                "OPENAI_API_KEY is not set; define it in the environment or in the .env file."
            )

        # Jinja template rendered by the PromptBuilder; ``answers`` is fed by
        # the reader and ``question`` by the caller's input below.
        template = """

Utilize the provided context related to Aditya Sugandhi to answer the question. If the answer is not explicitly available in the given information, generate a response using the Language Model (LLM). Optimize the process for clarity and efficiency.
Context:
{% for context in answers %}
{{ context }}
{% endfor %}
Question: {{question}}
Answer:
"""

        prompt_builder = PromptBuilder(template=template)
        retriever = ChromaQueryTextRetriever(document_store=self.chroma_store)
        llm = OpenAIGenerator(
            model="gpt-3.5-turbo-0125",
            api_key=Secret.from_token(raw_key),
        )
        # ExtractiveReader extracts answers from the retrieved documents.
        reader = ExtractiveReader(model="deepset/roberta-base-squad2-distilled")

        extractive_qa_pipeline = Pipeline()
        extractive_qa_pipeline.add_component("retriever", retriever)
        extractive_qa_pipeline.add_component("reader", reader)
        extractive_qa_pipeline.add_component(instance=prompt_builder, name="prompt_builder")
        extractive_qa_pipeline.add_component("llm", llm)

        # Data flow: retrieved documents -> reader answers -> prompt -> LLM.
        extractive_qa_pipeline.connect("retriever.documents", "reader.documents")
        extractive_qa_pipeline.connect("reader.answers", "prompt_builder.answers")
        extractive_qa_pipeline.connect("prompt_builder", "llm")

        # Per-component inputs for this run.
        input_data = {
            "retriever": {"query": query, "top_k": 2},
            "reader": {"query": query, "top_k": 2},
            "prompt_builder": {"question": query},
        }

        return extractive_qa_pipeline.run(input_data)
73
+
74
+ # def LlamaCpp(self,query):
75
+ # template = """
76
+ # ` Answer the question using the provided context based on Aditya.
77
+
78
+ # Context:
79
+ # {% for doc in documents %}
80
+ # {{ doc.content }}
81
+ # {% endfor %}
82
+ # Question: {{question}}
83
+ # Answer:
84
+ # """
85
+ # self.InMemory_store = chroma_store_loader.InMemory_dataloader()
86
+ # prompt_builder = PromptBuilder(template=template)
87
+ # retriever = InMemoryEmbeddingRetriever(document_store = self.InMemory_store)
88
+ # #ExtractiveReader to extract answers from the relevant context
89
+
90
+ # llm = LlamaCppGenerator(
91
+ # model_path="openchat-3.5-1210.Q3_K_S.ggml",
92
+ # n_ctx=30000,
93
+ # n_batch=256,
94
+ # model_kwargs={"n_gpu_layers": 2, "main_gpu": 1},
95
+ # generation_kwargs={"max_tokens": 250, "temperature": 0.7},
96
+ # )
97
+ # llm.warm_up()
98
+
99
+ # # reader = ExtractiveReader(model="deepset/roberta-base-squad2-distilled",)
100
+ # extractive_qa_pipeline = Pipeline()
101
+ # text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
102
+ # extractive_qa_pipeline.add_component('text_embedder', text_embedder)
103
+ # extractive_qa_pipeline.add_component("retriever", retriever)
104
+ # # extractive_qa_pipeline.add_component("reader",reader)
105
+
106
+ # extractive_qa_pipeline.add_component(instance=prompt_builder, name="prompt_builder")
107
+ # extractive_qa_pipeline.add_component("llm", llm)
108
+ # # extractive_qa_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")
109
+
110
+ # # extractive_qa_pipeline.connect("retriever.documents", "reader")
111
+ # extractive_qa_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
112
+ # extractive_qa_pipeline.connect("retriever.documents", "prompt_builder.documents")
113
+ # extractive_qa_pipeline.connect("prompt_builder", "llm")
114
+ # # extractive_qa_pipeline.connect("llm.replies", "answer_builder.replies")
115
+ # # extractive_qa_pipeline.connect("retriever", "answer_builder.documents")
116
+
117
+ # # Define the input data for the pipeline components
118
+ # input_data = {
119
+ # "text_embedder": {"text": query},
120
+ # # "retriever": {"query": query, "top_k": 3},
121
+ # # "reader": {"query": query},
122
+ # "prompt_builder": {"question": query},
123
+ # # "answer_builder": {"query": query},
124
+ # # Use 'max_tokens' instead of 'max_new_tokens'
125
+ # }
126
+
127
+ # # Run the pipeline with the updated input data
128
+ # results = extractive_qa_pipeline.run(input_data)
129
+ # return results
130
+
131
+
132
+
133
+ # #{
134
+ # "error": "Cannot connect 'text_embedder' with 'retriever': no matching connections available.\n'text_embedder':\n - embedding: List[float]\n'retriever':\n - query: str (available)\n - _: Optional[Dict[str, Any]] (available)\n - top_k: Optional[int] (available)"
135
+ # }
136
+
137
+
138
+
Personal LLM by stranzersweb.docx ADDED
Binary file (13.4 kB). View file
 
Pipfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [[source]]
2
+ url = "https://pypi.org/simple"
3
+ verify_ssl = true
4
+ name = "pypi"
5
+
6
+ [packages]
7
+
8
+ [dev-packages]
9
+
10
+ [requires]
11
+ python_version = "3.9"
Readme.Md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Personal LLM by StranzersWeb Inc: System Design with Flask Backend
2
+
3
+ ## Introduction
4
+
5
+ StranzersWeb Inc proudly presents its innovative Personal Large Language Model (LLM) application, employing Flask as the backend service to harness the capabilities of AI and Large Language Models for precise Question and Answer inference. The integration of Haystack-ai ensures efficient custom database management, enhancing the overall accuracy of data retrieval.
6
+
7
+ ## System Workflow
8
+
9
+ 1. **Dataset Loading:**
10
+ - Flask handles the backend service responsible for loading the dataset into a Document store (ChromaStore, InMemoryStore, or Elastic Store).
11
+ - Efficient storage and retrieval are facilitated by Flask's capabilities.
12
+
13
+ 2. **Embedding Conversion:**
14
+ - The Haystack-ai controller, integrated with Flask, takes charge of converting the dataset into embeddings.
15
+ - Flask manages the communication between the application and Haystack-ai, ensuring a smooth embedding conversion process.
16
+
17
+ 3. **Haystack Pipeline Components:**
18
+ - **Retriever:**
19
+ - Flask manages the Retriever component, retrieving a list of relevant data based on user queries.
20
+ - **Reader:**
21
+ - The Reader component, under Flask's control, scans documents to identify the best context-match for queries.
22
+ - **Prompt Builder:**
23
+ - Flask oversees the generation of prompts by the Prompt Builder component based on the context provided by the Reader.
24
+ - **LLM (Large Language Model):**
25
+ - Flask integrates with the Large Language Model to utilize its powerful inference capabilities for generating desired outputs.
26
+
27
+ ## Key Features
28
+
29
+ 1. **Pinpoint Data Retrieval:**
30
+ - Flask, in conjunction with Haystack-ai libraries, ensures accurate data retrieval.
31
+ - Pre-processing with Flask enhances the efficiency of the Large Language Model, leading to precise responses.
32
+
33
+ 2. **Flexible Document Stores:**
34
+ - Users can select from various Document stores (ChromaStore, InMemoryStore, or Elastic Store) based on preferences, all seamlessly managed by Flask.
35
+
36
+ 3. **Streamlined Inferencing Pipeline:**
37
+ - Flask orchestrates the seamless collaboration of Haystack pipeline components, ensuring an efficient and streamlined inferencing process.
38
+ - The integration leads to faster response times and an improved user experience.
39
+
40
+ ## LLM Application System Design
41
+
42
+ 1. **Flask Backend:**
43
+ - Manages the backend services using Flask, providing a robust foundation for handling HTTP requests and serving API endpoints.
44
+ - Integration with Haystack-ai and other components for efficient communication.
45
+
46
+ 2. **Frontend Integration:**
47
+ - User-friendly interface for interacting with the application.
48
+ - Communicates with Flask backend through API calls for smooth user experience.
49
+
50
+ 3. **Scalability and Performance:**
51
+ - Deployed on cloud infrastructure with Flask's capabilities for scalability.
52
+ - Load balancing and auto-scaling to handle varying loads effectively.
53
+
54
+ 4. **Security and Privacy:**
55
+ - Flask incorporates robust security measures to protect user data and ensure privacy.
56
+ - Implements encryption for communication channels and secure storage practices.
57
+
58
+ ## System Design
59
+
60
+ ![System Design](SystemDesign.png)
SystemDesign.png ADDED
__pycache__/Inferencer.cpython-310.pyc ADDED
Binary file (2.74 kB). View file
 
__pycache__/dataloader.cpython-310.pyc ADDED
Binary file (3.46 kB). View file
 
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, jsonify, request
2
+ from Inferencer import Inferncer
3
+ from dataloader import DataLoader
4
+ import logging
5
# Flask application object; the route handlers below register on it.
app = Flask(__name__)

# Directory that uploaded documents are written to.
UPLOAD_FOLDER = './data/'
app.config.update(UPLOAD_FOLDER=UPLOAD_FOLDER)

# Shared service objects used by the route handlers.
inferencer = Inferncer()
data_loader = DataLoader()

# Application logging: DEBUG and above is written to app.log.
log_format = "%(asctime)s [%(levelname)s] - %(message)s"
logging.basicConfig(
    filename="app.log",
    level=logging.DEBUG,
    format=log_format,
)
logger = logging.getLogger(__name__)

# Document stores start out unset; the /sync handler assigns them and the
# /ask handler refuses to run while chroma_store is still None.
chroma_store = None
in_memory_store = None
24
+
25
@app.route("/")
def home():
    """Return the plain-text greeting served at the root URL."""
    greeting = "Welcome to the Flask app!"
    return greeting
28
+
29
@app.route('/upload', methods=['POST'])
def upload_document():
    """Save the file sent in the ``file`` multipart field into the upload folder.

    Returns:
        A ``(json, status)`` pair: ``{"message": ...}`` with 200 on success,
        ``{"error": ...}`` with 400 when the file or its name is missing or
        invalid, and ``{"error": ...}`` with 500 when saving fails.
    """
    # Imported here because this module has no top-level ``import os``; the
    # original call to ``os.path.join`` therefore raised NameError on every
    # upload.
    import os

    try:
        uploaded = request.files.get('file')
        if uploaded is None:
            return jsonify({"error": "No file provided"}), 400

        if not uploaded.filename:
            return jsonify({"error": "No file selected"}), 400

        # Keep only the last path component so a client-supplied name such as
        # "../../app.py" cannot write outside the upload folder. Backslashes
        # are normalised first so Windows-style names are handled as well.
        safe_name = os.path.basename(uploaded.filename.replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            return jsonify({"error": "Invalid file name"}), 400

        target_dir = app.config['UPLOAD_FOLDER']
        os.makedirs(target_dir, exist_ok=True)
        uploaded.save(os.path.join(target_dir, safe_name))
        return jsonify({"message": "File uploaded successfully"}), 200

    except Exception as e:
        # Request boundary: log the traceback and report a server error
        # instead of answering 200 with an error body.
        logger.exception("File upload failed")
        return jsonify({"error": str(e)}), 500
45
+
46
@app.route("/sync", methods=["POST"])
def sync_and_run_dataloader():
    """Run both data loaders and publish their stores in the module globals.

    Returns:
        ``{"message": ..., "result": "success"}`` when both loaders finish,
        or ``({"error": ...}, 500)`` when either one raises.
    """
    global chroma_store, in_memory_store

    try:
        # Optionally, authentication or other checks can be added here.
        loaded_chroma = data_loader.dataloader()
        loaded_in_memory = data_loader.InMemory_dataloader()
    except Exception as e:
        logger.exception("Data sync failed")
        return jsonify({"error": str(e)}), 500

    # Assign only after BOTH loaders succeeded, so a failure in the second
    # one does not leave chroma_store set (which /ask reads as "initialised").
    chroma_store = loaded_chroma
    in_memory_store = loaded_in_memory

    return jsonify({"message": "DataLoader executed successfully", "result": "success"})
61
+
62
@app.route("/ask", methods=["POST"])
def ask_question():
    """Answer the question in the JSON body with the requested model.

    Expected body: ``{"question": <str>, "model": "OpenAI" | "LlamaCpp"}``.

    Returns:
        ``{"results": ...}`` on success; otherwise ``({"error": ...}, status)``
        with 400 for a malformed request, 409 when the stores have not been
        synced yet, 501 when the model has no implementation, and 500 for
        failures during inference.
    """
    # silent=True yields None instead of raising for a missing or invalid
    # JSON body; the original then failed on ``None.get``.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    query = data.get("question", "")
    model = data.get("model", "")

    if not query:
        return jsonify({"error": "No question provided"}), 400

    if chroma_store is None:
        return jsonify({"error": "Chroma store not initialized. Run sync_and_run_dataloader first."}), 409

    # Explicit allow-list: the model name comes from the client and must
    # never be used to reach arbitrary attributes of the inferencer.
    if model not in ("OpenAI", "LlamaCpp"):
        return jsonify({"error": f"Invalid model specified: {model}"}), 400

    # An allowed model may still have no implementation on the inferencer,
    # so report that clearly rather than surfacing an AttributeError.
    run_model = getattr(inferencer, model, None)
    if run_model is None:
        return jsonify({"error": f"Model not available: {model}"}), 501

    try:
        results = run_model(query=query)
    except Exception as e:
        logger.exception("Inference failed for model %s", model)
        return jsonify({"error": str(e)}), 500

    return jsonify({"results": results})
83
+
84
if __name__ == "__main__":
    # Local import: this module has no top-level ``import os``.
    import os

    # Debug mode enables the interactive Werkzeug debugger, which allows code
    # execution from the browser, so it is opt-in (FLASK_DEBUG=1) rather than
    # hard-coded on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes")

    # Host and port default to Flask's own defaults. Set FLASK_RUN_HOST=0.0.0.0
    # when running in a container so the server is reachable from outside it.
    app.run(
        host=os.environ.get("FLASK_RUN_HOST", "127.0.0.1"),
        port=int(os.environ.get("FLASK_RUN_PORT", "5000")),
        debug=debug_enabled,
    )
data/Aditya_test.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Aditya Sugandhi is a seasoned Software Engineer with a rich background and diverse skill set, encompassing more than three years of industry experience. Currently pursuing a Master’s of Science in Computer Science at Florida State University, Aditya has consistently demonstrated a passion for innovation and a strong commitment to driving technical excellence.
2
+
3
+ In his role as a Research Assistant at the Department of Scientific Computing at FSU, Aditya has been actively involved in conducting in-depth analysis for Monsoon Forecast Prediction. His work spans a century's worth of data, focusing on variables like Salinity, Surface Temperature, and Surface-to-Air Temperature. Utilizing Apache Spark for efficient data handling and transformation, Aditya leveraged Spark's distributed computing capabilities to process vast datasets in parallel, resulting in a remarkable 30% reduction in overall training time for machine learning models. This experience highlights his proficiency in handling big data and implementing cutting-edge technologies for scientific research.
4
+
5
+ His previous role as a Software Engineer at Aspire Systems in Chennai, India, showcases Aditya's versatility in both backend and frontend development. Leading the redesign of a Life Insurance Company's architecture, he prioritized low latency and high throughput, emphasizing a customer-centric approach. Aditya engineered 20 SOAP APIs for responsive patient data management, collaborated on front-end enhancements, and implemented secure payment gateways and Single Sign-On for authentication. His contribution to debugging strategies, real-time log analysis with Splunk, and CI/CD pipelines with Jenkins further underscore his commitment to optimizing system performance.
6
+
7
+ Aditya's experience extends to client-facing roles, where he addressed varied client needs in occupational health. His solution-oriented approach ensured compliance and security, crafting robust solutions with custom analytic modules and dynamic dashboards powered by Kafka-streams. These efforts transformed insurance dynamics, fortifying coverage in medical, life, and pet domains with enhanced security.
8
+
9
+ As a Web Developer Intern at Impetus in Indore, India, Aditya showcased his creativity by innovating a PHP-centric design using the Laravel framework. His proficiency extended to managing diverse CMS systems and successfully migrating Joomla versions, highlighting his adaptability and problem-solving skills.
10
+
11
+ Aditya's academic journey includes a Bachelor of Technology in Computer Science Engineering from SRM University in Chennai, India. Additionally, his publication on "Review Classification & False Feedback Detection using different ML algorithms" reflects his engagement in academic research.
12
+
13
+ His skill set is expansive, covering a wide array of programming languages including Ruby, JavaScript, Go, Kotlin, PHP, C#, Swift, R, TypeScript, Scala, SQL, Rust, C/C++, Java, and Python. Aditya is a Full-Stack developer with expertise in React.js, Node.js, Git, MySQL, PostgreSQL, Scala, Kafka, Docker, GraphQL, AWS, RDMS, TypeScript, and data analysis tools such as Pandas, Scikit-learn, TensorFlow, NumPy, SciPy, Keras, Algorithmic Design, OpenCV, MapWindow, and GIS Tools.
14
+
15
+ Aditya has further demonstrated his commitment to continuous learning and professional development by obtaining certifications such as "ChatGPT Prompt Engineering for Developers" by DeepLearning.ai and engaging with topics like Code, Production Environments, Distributed Systems, Product Managers, Code Reviews, Google Cloud, Load Testing, Kubernetes, Testing, Uptime, Azure, Envoy, and C++.
16
+
17
+ Aditya is Dating Crystal
data/Aditya_train.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence, a passion for technological advancements, and a strong foundation in both theoretical knowledge and practical application.
2
+
3
+ During his tenure as a Research Assistant at the Department of Scientific Computing at Florida State University, Aditya not only demonstrated technical prowess but also showcased his ability to contribute meaningfully to academic research. His work on the EOAS/pyutils library, marked by the implementation of mathematical optimization and algorithms, reflects a keen understanding of optimizing software for efficiency. The achievement of reducing complexity from O(n^2) to O(n) is a testament to his problem-solving skills and dedication to pushing the boundaries of what is possible in software development.
4
+
5
+ In the realm of Full Stack Development at Aspire Systems, Aditya exhibited a holistic approach to software architecture. His role in revamping the software architecture for a leading UK-based Life Insurance Company not only resulted in a significant increase in online appointments and customer engagement but also showcased his strategic thinking in incorporating technologies like Kafka, Redis, and NoSQL databases for efficient data processing. The integration of RESTful APIs, along with frontend enhancements and the implementation of secure payment gateways, highlights his versatility in both backend and frontend development.
6
+
7
+ Aditya's client-facing responsibilities further underscore his ability to translate complex technical solutions into user-friendly, client-centric outcomes. His attention to detail in addressing client needs, considering variations, mandates, and security clearances, speaks to his understanding of the real-world implications of software solutions in diverse contexts.
8
+
9
+ Aditya served as a Customer Service Executive at Pollo Tropical in Tallahassee, FL, from August 2022 to August 2023. In this role, he contributed to creating a positive dining experience for customers by greeting them warmly and processing orders accurately through the point-of-sale (POS) system. His role also involved responding to customer inquiries, providing product information, making recommendations, and addressing concerns. Aditya collaborated with team members to ensure the smooth operation of the cashier station and timely service to customers, demonstrating effective teamwork and customer service skills.
10
+
11
+ During his undergraduate years, Aditya served as a Library Assistant at SRM Institute of Science & Technology in Chennai, TN, from August 2018 to August 2020. In this role, he demonstrated organizational skills by assembling class notebooks through digital printing and efficiently managing inventory of copy paper and office supplies. Aditya also handled pick-up and drop-off of department mail, responded to department emails and phone calls promptly, and acted as a backup for receiving materials, showcasing flexibility and reliability.
12
+
13
+ Aditya's skills include proficiency in Microsoft Office Suite (Word, Excel, Outlook) and data entry expertise. He is known for effective communication and customer service skills, along with a collaborative approach to teamwork. Additionally, he has experience with copier and shipping using FedEx, further showcasing his versatility.
14
+
15
+
16
+
17
+ As a Web Developer Intern at Impetus, Aditya not only demonstrated technical acumen but also showcased his ability to lead and innovate. Spearheading PHP-centric designs and overseeing CMS systems underscored his commitment to creating user-centric and flexible solutions. The automation of workflows with SharePoint, leading to a substantial reduction in operational overhead, showcases his efficiency in streamlining processes and improving overall organizational productivity.
18
+
19
+ Aditya's academic pursuits, including pursuing a Master’s of Science in Computer Science at Florida State University and completing a Bachelor of Technology in Computer Science Engineering at SRM University, reflect his dedication to continuous learning and academic excellence. His diverse skill set, ranging from programming languages to full-stack development technologies and data analysis tools, positions him as a well-rounded and adaptable professional.
20
+
21
+ In addition to his technical prowess, Aditya's achievements in offset, documentation, servers, supervision, leadership, and information technology underscore his ability to not only excel in technical roles but also contribute meaningfully to the broader aspects of project management and team leadership.
22
+
23
+ Aditya Sugandhi's journey as a Software Engineer is characterized by a unique blend of technical excellence, innovation, and a holistic understanding of the broader implications of software development. His commitment to pushing boundaries, both in academic research and industry settings, makes him a valuable asset to any organization seeking a dynamic and forward-thinking software engineer.
24
+
25
+
26
+ Aditya's friends are SAS, Hunterr, MF, and Rocco; he usually hangs out with them and has fun.
27
+ Aditya call his friends Seemanth as nigga, or N bhai!
28
+ Who is SAS : SAS aka Seemanth Sagar, his friends refer to him as olema, N bhai, jai shreemanth Nigger, OG Nigger,Nigga king!!
data/Resume_Vishwam_Shah_Back_end.pdf ADDED
Binary file (140 kB). View file
 
data/mf.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Vishwam Shah is a highly motivated and skilled Computer Science professional currently pursuing a Master of Science degree in Computer Science at Florida State University, with a GPA of 3.91. He completed his Bachelor of Science in Computer Engineering from Gujarat Technological University in India, achieving a GPA of 3.9. His academic coursework and projects have focused on diverse areas such as Problem Solving, AI, Data Science, Cryptography, IoT, and Bioinformatics.
2
+
3
+ In terms of professional experience, Vishwam has a strong background as a Full Stack Developer & DevOps Engineer at MaMo Technolabs LLP in Gujarat, India. He managed a team of four developers, served as a SCRUM master, and successfully launched multiple innovative products. His expertise includes utilizing a variety of technologies such as NodeJS, MongoDB, ExpressJS, ReactJS, AngularJS, AWS, PHP, C++, Dart, Flutter, and more. Vishwam demonstrated proficiency in integrating RESTful APIs, optimizing UIs, and deploying applications on cloud architecture to achieve accelerated page load times.
4
+
5
+ Vishwam also has international experience as a Full Stack Intern at Paul Mason Consulting Limited in the UK and India, where he contributed to reducing voucher upload time and improved software deployment processes through continuous integration/continuous delivery (CI/CD) pipelines. Additionally, he served as a Back-End Intern at Akash Technolabs, contributing to the development of an interactive website with authentication APIs and CRUD operations.
6
+
7
+ In the academic realm, Vishwam served as a Researcher in the Department of Psychology – Neuroscience at Florida State University, where he utilized technologies such as MATLAB, fMRIPrep, FreeSurfer, and more. His contributions included spearheading custom MATLAB scripts for raw fMRI data preprocessing and enhancing data quality by preprocessing datasets.
8
+
9
+ As a Mentor at Women in Computer Science (WiCs), Vishwam played a pivotal role in empowering female students' participation in computer science. He architected the curriculum and mentored over 20 students, boosting their technical skills through hands-on workshops on full-stack development, AWS, and DevOps.
10
+
11
+ Vishwam's technical skills span various languages and frameworks, including C++, AJAX, Firebase, Docker, HTML, CSS, Bootstrap, and more. He is proficient in cloud platforms such as Google Cloud Platform (GCP) and Amazon Web Services (AWS). His development and collaboration tools expertise includes Git, Trello, Notion, ClickUp, JIRA, and more.
12
+
13
+ In the realm of certifications, Vishwam has completed certifications in Google Cloud Platform (GCP) Fundamentals, Essential Google Cloud Infrastructure, and Programming for Everybody (Getting Started with Python). He has also completed a certification in Python Data Structures.
14
+
15
+ Vishwam's project experience includes Medical Image Segmentation using Python, TensorFlow, Keras, PyTorch, and OpenCV, where he applied a U-Net model for segmenting cell nuclei in microscopic images. He has also worked on optimizing K-Core Decomposition for Large-Scale Networks using Java, Perl, GraphChi, and WebGraph, achieving linear time complexity with less than 1% update rate within 20 iterations.
16
+
17
+ Overall, Vishwam Shah possesses a well-rounded skill set, combining academic excellence, hands-on professional experience, and a strong commitment to mentoring and collaborative learning environments.
dataloader.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import os
3
+ from haystack import Pipeline
4
+ from haystack.components.embedders import SentenceTransformersDocumentEmbedder,SentenceTransformersTextEmbedder
5
+ from haystack.components.converters import PyPDFToDocument, TextFileToDocument
6
+ from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
7
+ from haystack.components.routers import FileTypeRouter
8
+ from haystack.components.joiners import DocumentJoiner
9
+ from haystack.components.writers import DocumentWriter
10
+ from haystack_integrations.document_stores.chroma import ChromaDocumentStore
11
+ from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
12
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
13
class DataLoader:
    """Index the files in ``<cwd>/data`` into a Chroma and an in-memory store."""

    def __init__(self):
        """Create one empty store of each kind."""
        self.chroma_store = ChromaDocumentStore()
        self.InMemory_store = InMemoryDocumentStore()

    def _index_data_dir(self, document_store):
        """Convert, clean, split and embed every file in ``data``, then write to *document_store*.

        Shared by ``dataloader`` and ``InMemory_dataloader``, which previously
        carried two identical copies of this pipeline and differed only in
        the store handed to the writer.
        """
        # The data directory is resolved against the current working
        # directory, not against this file's location.
        data_path = Path(os.getcwd()) / "data"
        file_paths = [str(data_path / name) for name in os.listdir(data_path)]

        pipeline = Pipeline()
        pipeline.add_component("FileTypeRouter", FileTypeRouter(mime_types=["text/plain", "application/pdf"]))
        pipeline.add_component("TextFileConverter", TextFileToDocument())
        pipeline.add_component("PdfFileConverter", PyPDFToDocument())
        pipeline.add_component("Joiner", DocumentJoiner())
        pipeline.add_component("Cleaner", DocumentCleaner())
        pipeline.add_component("Splitter", DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30))
        pipeline.add_component("Embedder", SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"))
        pipeline.add_component("Writer", DocumentWriter(document_store=document_store))

        # Router fans out by MIME type; both converters feed the joiner, then
        # the documents flow linearly to the writer.
        connections = (
            ("FileTypeRouter.text/plain", "TextFileConverter.sources"),
            ("FileTypeRouter.application/pdf", "PdfFileConverter.sources"),
            ("TextFileConverter.documents", "Joiner.documents"),
            ("PdfFileConverter.documents", "Joiner.documents"),
            ("Joiner.documents", "Cleaner.documents"),
            ("Cleaner.documents", "Splitter.documents"),
            ("Splitter.documents", "Embedder.documents"),
            ("Embedder.documents", "Writer.documents"),
        )
        for sender, receiver in connections:
            pipeline.connect(sender, receiver)

        pipeline.run(
            {"FileTypeRouter": {"sources": file_paths}},
        )

    def dataloader(self):
        """Index the data directory into the Chroma store and return that store."""
        self._index_data_dir(self.chroma_store)
        return self.chroma_store

    def InMemory_dataloader(self):
        """Index the data directory into the in-memory store and return that store."""
        self._index_data_dir(self.InMemory_store)
        return self.InMemory_store

    def get_chroma_store(self):
        """Return the Chroma store without running any indexing."""
        return self.chroma_store

    def get_InMemory_store(self):
        """Return the in-memory store without running any indexing."""
        return self.InMemory_store
106
+
env.yaml ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: RAGAPP
2
+ channels:
3
+ - conda-forge
4
+ - pytorch
5
+ - nvidia
6
+ - defaults
7
+ dependencies:
8
+ - _libgcc_mutex=0.1=main
9
+ - _openmp_mutex=5.1=1_gnu
10
+ - abseil-cpp=20211102.0=hd4dd3e8_0
11
+ - aiohttp=3.9.3=py310h5eee18b_0
12
+ - arrow-cpp=14.0.2=h374c478_1
13
+ - asttokens=2.0.5=pyhd3eb1b0_0
14
+ - async-timeout=4.0.3=py310h06a4308_0
15
+ - aws-c-auth=0.6.19=h5eee18b_0
16
+ - aws-c-cal=0.5.20=hdbd6064_0
17
+ - aws-c-common=0.8.5=h5eee18b_0
18
+ - aws-c-compression=0.2.16=h5eee18b_0
19
+ - aws-c-event-stream=0.2.15=h6a678d5_0
20
+ - aws-c-http=0.6.25=h5eee18b_0
21
+ - aws-c-io=0.13.10=h5eee18b_0
22
+ - aws-c-mqtt=0.7.13=h5eee18b_0
23
+ - aws-c-s3=0.1.51=hdbd6064_0
24
+ - aws-c-sdkutils=0.1.6=h5eee18b_0
25
+ - aws-checksums=0.1.13=h5eee18b_0
26
+ - aws-crt-cpp=0.18.16=h6a678d5_0
27
+ - aws-sdk-cpp=1.10.55=h721c034_0
28
+ - blas=1.0=mkl
29
+ - blinker=1.6.2=py310h06a4308_0
30
+ - boost-cpp=1.82.0=hdb19cb5_2
31
+ - bottleneck=1.3.7=py310ha9d4c09_0
32
+ - brotli=1.0.9=h5eee18b_7
33
+ - brotli-bin=1.0.9=h5eee18b_7
34
+ - brotli-python=1.0.9=py310h6a678d5_7
35
+ - bzip2=1.0.8=h7b6447c_0
36
+ - c-ares=1.19.1=h5eee18b_0
37
+ - ca-certificates=2023.12.12=h06a4308_0
38
+ - certifi=2024.2.2=py310h06a4308_0
39
+ - cffi=1.16.0=py310h5eee18b_0
40
+ - charset-normalizer=2.0.4=pyhd3eb1b0_0
41
+ - click=8.1.7=py310h06a4308_0
42
+ - comm=0.1.2=py310h06a4308_0
43
+ - cryptography=42.0.2=py310hdda0065_0
44
+ - cuda-cudart=11.8.89=0
45
+ - cuda-cupti=11.8.87=0
46
+ - cuda-libraries=11.8.0=0
47
+ - cuda-nvrtc=11.8.89=0
48
+ - cuda-nvtx=11.8.86=0
49
+ - cuda-runtime=11.8.0=0
50
+ - cycler=0.11.0=pyhd3eb1b0_0
51
+ - cyrus-sasl=2.1.28=h52b45da_1
52
+ - datasets=2.12.0=py310h06a4308_0
53
+ - dbus=1.13.18=hb2f20db_0
54
+ - debugpy=1.6.7=py310h6a678d5_0
55
+ - decorator=5.1.1=pyhd3eb1b0_0
56
+ - dill=0.3.6=py310h06a4308_0
57
+ - exceptiongroup=1.2.0=py310h06a4308_0
58
+ - executing=0.8.3=pyhd3eb1b0_0
59
+ - expat=2.5.0=h6a678d5_0
60
+ - ffmpeg=4.3=hf484d3e_0
61
+ - filelock=3.13.1=py310h06a4308_0
62
+ - fontconfig=2.14.1=h4c34cd2_2
63
+ - fonttools=4.25.0=pyhd3eb1b0_0
64
+ - freetype=2.12.1=h4a9f257_0
65
+ - gflags=2.2.2=h6a678d5_1
66
+ - glib=2.78.4=h6a678d5_0
67
+ - glib-tools=2.78.4=h6a678d5_0
68
+ - glog=0.5.0=h6a678d5_1
69
+ - gmp=6.2.1=h295c915_3
70
+ - gmpy2=2.1.2=py310heeb90bb_0
71
+ - gnutls=3.6.15=he1e5248_0
72
+ - google-auth-oauthlib=0.4.4=pyhd3eb1b0_0
73
+ - grpc-cpp=1.48.2=he1ff14a_1
74
+ - gst-plugins-base=1.14.1=h6a678d5_1
75
+ - gstreamer=1.14.1=h5eee18b_1
76
+ - huggingface_hub=0.20.3=py310h06a4308_0
77
+ - icu=73.1=h6a678d5_0
78
+ - idna=3.4=py310h06a4308_0
79
+ - intel-openmp=2023.1.0=hdb19cb5_46306
80
+ - ipykernel=6.28.0=py310h06a4308_0
81
+ - ipython=8.20.0=py310h06a4308_0
82
+ - jedi=0.18.1=py310h06a4308_1
83
+ - jinja2=3.1.3=py310h06a4308_0
84
+ - jpeg=9e=h5eee18b_1
85
+ - jupyter_client=8.6.0=py310h06a4308_0
86
+ - jupyter_core=5.5.0=py310h06a4308_0
87
+ - kiwisolver=1.4.4=py310h6a678d5_0
88
+ - krb5=1.20.1=h143b758_1
89
+ - lame=3.100=h7b6447c_0
90
+ - lcms2=2.12=h3be6417_0
91
+ - ld_impl_linux-64=2.38=h1181459_1
92
+ - lerc=3.0=h295c915_0
93
+ - libboost=1.82.0=h109eef0_2
94
+ - libbrotlicommon=1.0.9=h5eee18b_7
95
+ - libbrotlidec=1.0.9=h5eee18b_7
96
+ - libbrotlienc=1.0.9=h5eee18b_7
97
+ - libclang=14.0.6=default_hc6dbbc7_1
98
+ - libclang13=14.0.6=default_he11475f_1
99
+ - libcublas=11.11.3.6=0
100
+ - libcufft=10.9.0.58=0
101
+ - libcufile=1.8.1.2=0
102
+ - libcups=2.4.2=h2d74bed_1
103
+ - libcurand=10.3.4.107=0
104
+ - libcurl=8.5.0=h251f7ec_0
105
+ - libcusolver=11.4.1.48=0
106
+ - libcusparse=11.7.5.86=0
107
+ - libdeflate=1.17=h5eee18b_1
108
+ - libedit=3.1.20230828=h5eee18b_0
109
+ - libev=4.33=h7f8727e_1
110
+ - libevent=2.1.12=hdbd6064_1
111
+ - libffi=3.4.4=h6a678d5_0
112
+ - libgcc-ng=11.2.0=h1234567_1
113
+ - libglib=2.78.4=hdc74915_0
114
+ - libgomp=11.2.0=h1234567_1
115
+ - libiconv=1.16=h7f8727e_2
116
+ - libidn2=2.3.4=h5eee18b_0
117
+ - libjpeg-turbo=2.0.0=h9bf148f_0
118
+ - libllvm14=14.0.6=hdb19cb5_3
119
+ - libnghttp2=1.57.0=h2d74bed_0
120
+ - libnpp=11.8.0.86=0
121
+ - libnvjpeg=11.9.0.86=0
122
+ - libpng=1.6.39=h5eee18b_0
123
+ - libpq=12.17=hdbd6064_0
124
+ - libprotobuf=3.20.3=he621ea3_0
125
+ - libsodium=1.0.18=h7b6447c_0
126
+ - libssh2=1.10.0=hdbd6064_2
127
+ - libstdcxx-ng=11.2.0=h1234567_1
128
+ - libtasn1=4.19.0=h5eee18b_0
129
+ - libthrift=0.15.0=h1795dd8_2
130
+ - libtiff=4.5.1=h6a678d5_0
131
+ - libunistring=0.9.10=h27cfd23_0
132
+ - libuuid=1.41.5=h5eee18b_0
133
+ - libwebp-base=1.3.2=h5eee18b_0
134
+ - libxcb=1.15=h7f8727e_0
135
+ - libxkbcommon=1.0.1=h5eee18b_1
136
+ - libxml2=2.10.4=hf1b16e4_1
137
+ - llvm-openmp=14.0.6=h9e868ea_0
138
+ - lz4-c=1.9.4=h6a678d5_0
139
+ - matplotlib=3.5.1=py310h06a4308_1
140
+ - matplotlib-base=3.5.1=py310ha18d171_1
141
+ - matplotlib-inline=0.1.6=py310h06a4308_0
142
+ - mkl=2023.1.0=h213fc3f_46344
143
+ - mkl-service=2.4.0=py310h5eee18b_1
144
+ - mkl_fft=1.3.8=py310h5eee18b_0
145
+ - mkl_random=1.2.4=py310hdb19cb5_0
146
+ - mpc=1.1.0=h10f8cd9_1
147
+ - mpfr=4.0.2=hb69a4c5_1
148
+ - mpmath=1.3.0=py310h06a4308_0
149
+ - multiprocess=0.70.14=py310h06a4308_0
150
+ - munkres=1.1.4=py_0
151
+ - mysql=5.7.24=h721c034_2
152
+ - ncurses=6.4=h6a678d5_0
153
+ - nest-asyncio=1.6.0=py310h06a4308_0
154
+ - nettle=3.7.3=hbbd107a_1
155
+ - networkx=3.1=py310h06a4308_0
156
+ - numexpr=2.8.7=py310h85018f9_0
157
+ - numpy=1.26.4=py310h5f9d8c6_0
158
+ - numpy-base=1.26.4=py310hb5e798b_0
159
+ - oauthlib=3.2.2=py310h06a4308_0
160
+ - openh264=2.1.1=h4ff587b_0
161
+ - openjpeg=2.4.0=h3ad879b_0
162
+ - openssl=3.0.13=h7f8727e_0
163
+ - orc=1.7.4=hb3bc3d3_1
164
+ - parso=0.8.3=pyhd3eb1b0_0
165
+ - pcre2=10.42=hebb0a14_0
166
+ - pexpect=4.8.0=pyhd3eb1b0_3
167
+ - pillow=10.2.0=py310h5eee18b_0
168
+ - pip=23.3.1=py310h06a4308_0
169
+ - platformdirs=3.10.0=py310h06a4308_0
170
+ - ply=3.11=py310h06a4308_0
171
+ - prompt-toolkit=3.0.43=py310h06a4308_0
172
+ - prompt_toolkit=3.0.43=hd3eb1b0_0
173
+ - psutil=5.9.0=py310h5eee18b_0
174
+ - ptyprocess=0.7.0=pyhd3eb1b0_2
175
+ - pure_eval=0.2.2=pyhd3eb1b0_0
176
+ - pyarrow=14.0.2=py310h1eedbd7_0
177
+ - pycparser=2.21=pyhd3eb1b0_0
178
+ - pygments=2.15.1=py310h06a4308_1
179
+ - pyjwt=2.4.0=py310h06a4308_0
180
+ - pyopenssl=24.0.0=py310h06a4308_0
181
+ - pyparsing=3.0.9=py310h06a4308_0
182
+ - pyqt=5.15.10=py310h6a678d5_0
183
+ - pyqt5-sip=12.13.0=py310h5eee18b_0
184
+ - pysocks=1.7.1=py310h06a4308_0
185
+ - python=3.10.13=h955ad1f_0
186
+ - python-dateutil=2.8.2=pyhd3eb1b0_0
187
+ - python-tzdata=2023.3=pyhd3eb1b0_0
188
+ - python-xxhash=2.0.2=py310h5eee18b_1
189
+ - pytorch-cuda=11.8=h7e8668a_5
190
+ - pytorch-mutex=1.0=cuda
191
+ - pyyaml=6.0.1=py310h5eee18b_0
192
+ - pyzmq=25.1.2=py310h6a678d5_0
193
+ - qt-main=5.15.2=h53bd1ea_10
194
+ - re2=2022.04.01=h295c915_0
195
+ - readline=8.2=h5eee18b_0
196
+ - regex=2023.10.3=py310h5eee18b_0
197
+ - requests=2.31.0=py310h06a4308_1
198
+ - responses=0.13.3=pyhd3eb1b0_0
199
+ - s2n=1.3.27=hdbd6064_0
200
+ - safetensors=0.4.2=py310ha89cbab_0
201
+ - setuptools=68.2.2=py310h06a4308_0
202
+ - sip=6.7.12=py310h6a678d5_0
203
+ - six=1.16.0=pyhd3eb1b0_1
204
+ - snappy=1.1.10=h6a678d5_1
205
+ - sqlite=3.41.2=h5eee18b_0
206
+ - stack_data=0.2.0=pyhd3eb1b0_0
207
+ - sympy=1.12=py310h06a4308_0
208
+ - tbb=2021.8.0=hdb19cb5_0
209
+ - tk=8.6.12=h1ccaba5_0
210
+ - tokenizers=0.15.1=py310h22610ee_0
211
+ - tomli=2.0.1=py310h06a4308_0
212
+ - torchaudio=2.2.1=py310_cu118
213
+ - torchvision=0.17.1=py310_cu118
214
+ - tornado=6.3.3=py310h5eee18b_0
215
+ - traitlets=5.7.1=py310h06a4308_0
216
+ - transformers=4.38.1=pyhd8ed1ab_0
217
+ - urllib3=2.1.0=py310h06a4308_1
218
+ - utf8proc=2.6.1=h5eee18b_1
219
+ - wcwidth=0.2.5=pyhd3eb1b0_0
220
+ - wheel=0.41.2=py310h06a4308_0
221
+ - xxhash=0.8.0=h7f8727e_3
222
+ - xz=5.4.5=h5eee18b_0
223
+ - yaml=0.2.5=h7b6447c_0
224
+ - zeromq=4.3.5=h6a678d5_0
225
+ - zlib=1.2.13=h5eee18b_0
226
+ - zstd=1.5.5=hc292b87_0
227
+ - pip:
228
+ - accelerate==0.27.2
229
+ - aiosignal==1.3.1
230
+ - annotated-types==0.6.0
231
+ - anyio==4.3.0
232
+ - asgiref==3.7.2
233
+ - attrs==23.2.0
234
+ - backoff==2.2.1
235
+ - bcrypt==4.1.2
236
+ - boilerpy3==1.0.7
237
+ - cachetools==5.3.2
238
+ - chroma-haystack==0.13.0
239
+ - chroma-hnswlib==0.7.3
240
+ - chromadb==0.4.19
241
+ - cmake==3.28.3
242
+ - coloredlogs==15.0.1
243
+ - deprecated==1.2.14
244
+ - diskcache==5.6.3
245
+ - distro==1.9.0
246
+ - elastic-transport==8.12.0
247
+ - elasticsearch==8.12.1
248
+ - elasticsearch-haystack==0.3.0
249
+ - fastapi==0.110.0
250
+ - flask==3.0.2
251
+ - flatbuffers==23.5.26
252
+ - frozenlist==1.4.1
253
+ - fsspec==2024.2.0
254
+ - google-auth==2.28.1
255
+ - googleapis-common-protos==1.62.0
256
+ - grpcio==1.62.0
257
+ - haystack-ai==2.0.0b8
258
+ - haystack-bm25==1.0.2
259
+ - httpcore==1.0.4
260
+ - httptools==0.6.1
261
+ - httpx==0.27.0
262
+ - humanfriendly==10.0
263
+ - importlib-metadata==6.11.0
264
+ - importlib-resources==6.1.2
265
+ - instructor-embedders-haystack==0.4.0
266
+ - instructorembedding==1.0.1
267
+ - itsdangerous==2.1.2
268
+ - joblib==1.3.2
269
+ - jsonlines==4.0.0
270
+ - jsonschema==4.21.1
271
+ - jsonschema-specifications==2023.12.1
272
+ - kubernetes==29.0.0
273
+ - lazy-imports==0.3.1
274
+ - lit==17.0.6
275
+ - llama-cpp-haystack==0.2.1
276
+ - llama-cpp-python==0.2.50
277
+ - markupsafe==2.1.5
278
+ - mistral-haystack==0.0.1
279
+ - mmh3==4.1.0
280
+ - monotonic==1.6
281
+ - more-itertools==10.2.0
282
+ - multidict==6.0.5
283
+ - nltk==3.8.1
284
+ - nvidia-cublas-cu11==11.10.3.66
285
+ - nvidia-cuda-cupti-cu11==11.7.101
286
+ - nvidia-cuda-nvrtc-cu11==11.7.99
287
+ - nvidia-cuda-runtime-cu11==11.7.99
288
+ - nvidia-cudnn-cu11==8.5.0.96
289
+ - nvidia-cufft-cu11==10.9.0.58
290
+ - nvidia-curand-cu11==10.2.10.91
291
+ - nvidia-cusolver-cu11==11.4.0.1
292
+ - nvidia-cusparse-cu11==11.7.4.91
293
+ - nvidia-nccl-cu11==2.14.3
294
+ - nvidia-nvtx-cu11==11.7.91
295
+ - onnxruntime==1.17.1
296
+ - openai==1.12.0
297
+ - opentelemetry-api==1.23.0
298
+ - opentelemetry-exporter-otlp-proto-common==1.23.0
299
+ - opentelemetry-exporter-otlp-proto-grpc==1.23.0
300
+ - opentelemetry-instrumentation==0.44b0
301
+ - opentelemetry-instrumentation-asgi==0.44b0
302
+ - opentelemetry-instrumentation-fastapi==0.44b0
303
+ - opentelemetry-proto==1.23.0
304
+ - opentelemetry-sdk==1.23.0
305
+ - opentelemetry-semantic-conventions==0.44b0
306
+ - opentelemetry-util-http==0.44b0
307
+ - orjson==3.9.15
308
+ - overrides==7.7.0
309
+ - packaging==23.2
310
+ - pandas==2.2.1
311
+ - posthog==3.4.2
312
+ - protobuf==3.19.6
313
+ - pulsar-client==3.4.0
314
+ - pyasn1==0.5.1
315
+ - pyasn1-modules==0.3.0
316
+ - pydantic==2.6.2
317
+ - pydantic-core==2.16.3
318
+ - pypdf==4.0.2
319
+ - pypika==0.48.9
320
+ - python-dotenv==1.0.1
321
+ - pytz==2024.1
322
+ - referencing==0.33.0
323
+ - requests-oauthlib==1.3.1
324
+ - rich==13.7.0
325
+ - rpds-py==0.18.0
326
+ - rsa==4.9
327
+ - scikit-learn==1.4.1.post1
328
+ - scipy==1.12.0
329
+ - sentence-transformers==2.2.2
330
+ - starlette==0.36.3
331
+ - tenacity==8.2.3
332
+ - threadpoolctl==3.3.0
333
+ - toolz==0.12.1
334
+ - torch==2.0.1
335
+ - tqdm==4.66.2
336
+ - triton==2.0.0
337
+ - typer==0.9.0
338
+ - typing-extensions==4.10.0
339
+ - tzdata==2024.1
340
+ - uvicorn==0.27.1
341
+ - uvloop==0.19.0
342
+ - watchfiles==0.21.0
343
+ - websocket-client==1.7.0
344
+ - websockets==12.0
345
+ - werkzeug==3.0.1
346
+ - wrapt==1.16.0
347
+ - yarl==1.9.4
348
+ - zipp==3.17.0
349
+ prefix: /conda/asugandhi/miniconda3/envs/RAGAPP
output_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {'llm': {'meta': [{'id': 'cmpl-acea8357-35de-4b59-b91d-f7a5a6db7df8', 'object': 'text_completion', 'created': 1708917529, 'model': 'openchat-3.5-1210.Q3_K_S.ggml', 'choices': [{'text': ' Numerical weather prediction (NWP) models use scientific laws and equations to', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 8186, 'completion_tokens': 16, 'total_tokens': 8202}}]}, 'answer_builder': {'answers': [GeneratedAnswer(data=' Numerical weather prediction (NWP) models use scientific laws and equations to', query='What are the key differences between GraphCast and traditional numerical weather prediction (NWP) models? write in at least 10000 words', documents=[Document(id=7e1267428582f9f3323ba16dc7fe4db1771b409537ae46b94bdeb4005888b9a5, content: 'GraphCast: Learning skillful medium-range
2
+ global weather forecasting
3
+ Remi Lam*,1, Alvaro Sanchez-Gon...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/2212.12794.pdf', 'source_id': 'aa504618a25e65b870dde2fe288f395a44ff6a05c640fa7a2e6c5a5d3a9a44ef'}, score: 0.6383349895477295, embedding: vector of size 384)], meta={})]}}
output_results.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {'llm': {'meta': [{'id': 'cmpl-256e8372-b43f-4fa9-8a91-156338e3ed5f', 'object': 'text_completion', 'created': 1708966691, 'model': 'openchat-3.5-1210.Q3_K_S.ggml', 'choices': [{'text': '\nAditya Sugandhi is known to have a group of close', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 3347, 'completion_tokens': 16, 'total_tokens': 3363}}]}, 'answer_builder': {'answers': [GeneratedAnswer(data='\nAditya Sugandhi is known to have a group of close', query="who are Aditya's friends?", documents=[Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.3431967496871948, embedding: vector of size 384), Document(id=11f7061bb8c56ae79965f1ba0d1a0283188dc031309394e1a03470d5d72207a9, content: 'Aditya Sugandhi is a seasoned Software Engineer with a rich background and diverse skill set, encomp...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_test.txt', 'source_id': 'c85a2287836cae980897693decb5e9d07e80f60b7c96b4e542ef3057e11fc228'}, score: 1.3858964443206787, embedding: vector of size 384), Document(id=b9679ae3e33c58d9299d929f03d3b6f868d81dcd0fb7197d59e38c1962a4f92d, content: 'Vishwam Shah is a highly motivated and skilled Computer Science professional currently pursuing a Ma...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/mf.txt', 'source_id': '6d425f2fa8ce25e5d4b7890423744220600079b727b22e39b514f70d4660eab5'}, score: 1.7688608169555664, embedding: vector of size 384), Document(id=a6ad41c3febd74d1f6825aac59c2d6dd7589ae8088bb3b449ea239c97d6f1b1c, content: ' . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 18
2
+ 1.2 HRES . . . . . . . . . . . . . ....', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/2212.12794.pdf', 'source_id': 'aa504618a25e65b870dde2fe288f395a44ff6a05c640fa7a2e6c5a5d3a9a44ef'}, score: 1.8065273761749268, embedding: vector of size 384), Document(id=21cdf14f25359517ba11fd718fafc4d245bff87411a165314b7e814a05924234, content: ' . . . . . . . . . . . . . . . . . . . . . . . . . . 25
3
+ 3.2 Architecture overview . . . . . . . . . ...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/2212.12794.pdf', 'source_id': 'aa504618a25e65b870dde2fe288f395a44ff6a05c640fa7a2e6c5a5d3a9a44ef'}, score: 1.8349415063858032, embedding: vector of size 384)], meta={})]}}
rag_model.ipynb ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 14,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stdout",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "bert_load_from_file: gguf version = 2\n",
20
+ "bert_load_from_file: gguf alignment = 32\n",
21
+ "bert_load_from_file: gguf data offset = 695552\n",
22
+ "bert_load_from_file: model name = BERT\n",
23
+ "bert_load_from_file: model architecture = bert\n",
24
+ "bert_load_from_file: model file type = 1\n",
25
+ "bert_load_from_file: bert tokenizer vocab = 30522\n",
26
+ "[0.01552767027169466, 0.08103805035352707, -0.12307794392108917, 0.09815496951341629, 0.023653453215956688, -0.06102974712848663, 0.07934562116861343, 0.02745242230594158, -0.028132867068052292, 0.03221212700009346, 0.12919503450393677, 0.0025996030308306217, -0.04139482602477074, -0.06577245146036148, -0.014648980461061, 0.015588296577334404, -0.08434717357158661, -0.07182654738426208, 0.014775916934013367, -0.07444048672914505, 0.0590442530810833, 0.04814479872584343, 0.06639457494020462, 0.008800982497632504, -0.017847837880253792, -0.020949387922883034, -0.026810096576809883, 0.026885343715548515, -0.0764176994562149, -0.057069629430770874, 0.039454489946365356, 0.06288687884807587, 0.036681558936834335, 0.03875448554754257, 0.09926188737154007, 0.07691209763288498, -0.0007747725467197597, -0.05224066600203514, -0.06268111616373062, -0.00026997251552529633, 0.06668399274349213, -0.10031015425920486, -0.00970512256026268, -0.01601257175207138, -0.03624574467539787, -0.10884801298379898, -0.027961881831288338, -0.02198118157684803, 0.011900517158210278, -0.005993946921080351, -0.08890494704246521, -0.01797824539244175, -0.040237877517938614, -0.049093399196863174, -0.019428042694926262, -0.005168401636183262, 0.032794076949357986, -0.03235733509063721, -0.0705694779753685, -0.0941174328327179, -0.051176246255636215, 0.08234924077987671, -0.020688237622380257, 0.026870127767324448, -0.031070750206708908, 0.021878499537706375, -0.06237325817346573, 0.07108485698699951, 0.0030630987603217363, -0.06985890865325928, -0.05954312905669212, -0.05837850645184517, -0.09073222428560257, 0.005469962954521179, -0.021687401458621025, 0.0314265601336956, -0.025661440566182137, -0.0495171844959259, 0.0394166000187397, -0.029094435274600983, -0.018130596727132797, -0.04031619802117348, 0.08927112817764282, 0.00014257561997510493, -0.026646623387932777, 0.06340110301971436, 0.07394086569547653, 0.014260515570640564, -0.023962723091244698, -0.06585869938135147, 
0.04496406018733978, 0.04277855530381203, 0.008617856539785862, 0.0665624663233757, 0.026723850518465042, 0.01059289276599884, 0.011615158058702946, -0.04054207354784012, -0.04994109272956848, 0.10845799744129181, 0.036834508180618286, 0.045918650925159454, -0.05060620605945587, 0.11201019585132599, -0.11668886244297028, -0.01581607758998871, 0.0960628017783165, -0.0488315187394619, 0.024895356968045235, -0.04963228479027748, -0.03182365745306015, -0.004189752042293549, -0.022618744522333145, -0.020297333598136902, 0.010558796115219593, -0.03451183810830116, -0.08592583984136581, 0.07002798467874527, -0.0014977692626416683, -0.020605681464076042, 0.0009889955399557948, -0.06769613176584244, -0.016587721183896065, -0.03945926949381828, 0.027652334421873093, -0.0037252188194543123, 4.02796795242466e-05, 2.496357863577944e-34, -0.019553543999791145, -0.006931365933269262, 0.05519813671708107, 0.030014386400580406, -0.027222076430916786, -0.0040949187241494656, 0.028509650379419327, 0.0003461719024926424, -0.07768791913986206, 0.026781603693962097, -0.021593185141682625, -0.043786026537418365, 0.03954899311065674, -0.029267827048897743, 0.03505752608180046, 0.005345764569938183, -0.01677117310464382, 0.08446278423070908, 0.05020565167069435, 0.041258785873651505, 0.03950535133481026, 0.05992049351334572, 0.004634900484234095, -0.0946483463048935, -0.028090720996260643, -0.03398402780294418, -0.02709619328379631, -0.04133094474673271, -0.005644459743052721, 0.032718855887651443, 0.010113613680005074, -0.02065439336001873, -0.016786033287644386, 0.03233509510755539, -0.06616782397031784, 0.029395416378974915, -0.00663745915517211, -0.06478383392095566, -0.09521140158176422, -0.010280981659889221, -0.03638819605112076, -0.007304533384740353, 0.13017326593399048, -0.06668204814195633, -0.012214419431984425, 0.09507791697978973, -0.0009454676182940602, 0.045288313180208206, 0.061766546219587326, 0.06407830119132996, -0.06472055613994598, 0.02868455834686756, 
0.014445719309151173, 0.03761356323957443, 0.04157082363963127, 0.007912926375865936, -0.028237026184797287, -0.048911020159721375, 0.05634745582938194, 0.0031706185545772314, 0.024482648819684982, -0.0926365926861763, -0.028224240988492966, 0.01816745474934578, -0.0009234159952029586, -0.06061384454369545, 0.02713773585855961, -0.0657828152179718, 0.06030780076980591, 0.05763610824942589, -0.0024990146048367023, -0.031143246218562126, 0.014573169872164726, 0.05780758708715439, -0.005530690308660269, -0.024387281388044357, 0.025631394237279892, 0.04571927711367607, -0.07182186841964722, 0.02106345444917679, 0.047523558139801025, -0.025845326483249664, 0.04639439284801483, -0.0461527556180954, 0.06309600919485092, 0.002871520584449172, -0.019818803295493126, -0.01131194643676281, 0.04196448624134064, -0.017453346401453018, -0.043370626866817474, 0.06779050827026367, -0.11423997581005096, -0.007464131806045771, 0.07379034906625748, -1.0159212682046505e-33, 0.04116467386484146, -0.02187393046915531, -0.06464317440986633, -0.04831999912858009, 0.054312679916620255, -0.04359174892306328, 0.10390615463256836, -0.008244805969297886, 0.02429776079952717, 0.08679671585559845, 0.03324231505393982, -0.04018168896436691, 0.023248450830578804, -0.11267966777086258, 0.027334723621606827, -0.018510276451706886, -0.015763893723487854, -0.06620948016643524, -0.029428796842694283, 0.024292776361107826, -0.0836699977517128, 0.06186313182115555, 0.00979425199329853, 0.0149845527485013, 0.02952435240149498, -0.01609259471297264, 0.06341543793678284, 0.025381680577993393, -0.07650972157716751, -0.08898097276687622, 0.0543917752802372, 0.029732191935181618, -0.12705901265144348, 0.11817684024572372, 0.05331788584589958, -0.03143112361431122, 0.0274629145860672, 0.007251844275742769, -0.031150249764323235, 0.0817786380648613, 0.01751711592078209, 0.07238985598087311, -0.006944955326616764, -0.0723976194858551, 0.034229815006256104, -0.003155543003231287, 0.011516829021275043, 
-0.06810746341943741, 0.09528303891420364, -0.03101549670100212, 0.04598725214600563, -0.032259490340948105, 0.07952931523323059, 0.011015753261744976, 0.07233146578073502, 0.04757140204310417, 0.07436589896678925, 0.03568919375538826, -0.05899377539753914, -0.07132003456354141, 0.02570781111717224, 0.05620163306593895, 0.029458558186888695, 0.07280883193016052, 0.014483439736068249, -0.09305085241794586, 0.04503859579563141, -0.07544805109500885, 0.04793871194124222, -0.0066075995564460754, -0.027827860787510872, -0.07631555944681168, -0.05412726849317551, 0.056384310126304626, 0.056813593953847885, 0.06885606050491333, -0.001682625850662589, -0.021189114078879356, -0.004618695937097073, -0.04061309993267059, 0.10019382834434509, -0.030752010643482208, 0.036137741059064865, 0.035284142941236496, 0.022952962666749954, 0.0072324820794165134, 0.0515342652797699, 0.020784474909305573, 0.005023692734539509, 0.019894951954483986, 0.05247249826788902, 0.020828237757086754, -0.010321374982595444, 0.0026851524598896503, 0.0014503364218398929, -1.771797109029194e-08, -0.07890938222408295, -0.10603849589824677, -0.04075992852449417, 0.07047312706708908, -0.053525179624557495, 0.028504792600870132, -0.01275587547570467, -0.04736935719847679, -0.044071078300476074, -0.016645105555653572, -0.04981076717376709, -0.010642158798873425, 0.017387278378009796, 0.015506042167544365, -0.02702799066901207, -0.06912237405776978, -0.006346073932945728, 0.048564061522483826, 0.019542649388313293, -0.10184305161237717, -0.02131459303200245, 0.002071274910122156, 0.06019570678472519, -0.04933277890086174, -0.023822331801056862, 0.061753757297992706, 0.03395755961537361, 0.035142987966537476, 0.04514467716217041, -0.04209870100021362, 0.051735058426856995, -0.010264404118061066, 0.010600893758237362, -0.04388001188635826, 0.048436664044857025, 0.09170644730329514, 0.0874226912856102, 0.02946961112320423, -0.0049003129824995995, 0.03189241513609886, -0.05068569630384445, 0.04898029565811157, 
0.06254067271947861, -0.021246548742055893, 0.041442159563302994, -0.04294992610812187, -0.11569153517484665, -0.029132820665836334, 0.027501607313752174, -0.11903877556324005, -0.0024651181884109974, -0.019488628953695297, 0.032330770045518875, 0.014155727811157703, -0.019860858097672462, -0.03563971444964409, 0.03158700466156006, 0.04575197398662567, -0.04244818910956383, 0.007442069705575705, 0.12420977652072906, -0.0006733344052918255, 0.0338529571890831, -0.03671126440167427]\n"
27
+ ]
28
+ }
29
+ ],
30
+ "source": [
31
+ "from gpt4all import GPT4All, Embed4All\n",
32
+ "text = 'Aditya_test.txt'\n",
33
+ "embedder = Embed4All()\n",
34
+ "output = embedder.embed(text)\n",
35
+ "print(output)"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 15,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "import langchain_community as lcc\n",
45
+ "from langchain_community.chat_models import ChatHuggingFace\n",
46
+ "\n",
47
+ "local_llm = 'NousResearch/Yarn-Mistral-7b-128k'\n",
48
+ "llm = ChatOllama(model=local_llm, temperature=0)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 13,
54
+ "metadata": {},
55
+ "outputs": [
56
+ {
57
+ "name": "stdout",
58
+ "output_type": "stream",
59
+ "text": [
60
+ "bert_load_from_file: gguf version = 2\n",
61
+ "bert_load_from_file: gguf alignment = 32\n",
62
+ "bert_load_from_file: gguf data offset = 695552\n",
63
+ "bert_load_from_file: model name = BERT\n",
64
+ "bert_load_from_file: model architecture = bert\n",
65
+ "bert_load_from_file: model file type = 1\n",
66
+ "bert_load_from_file: bert tokenizer vocab = 30522\n"
67
+ ]
68
+ }
69
+ ],
70
+ "source": [
71
+ "from langchain_community.embeddings import GPT4AllEmbeddings\n",
72
+ "\n",
73
+ "embedder = GPT4AllEmbeddings()"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 37,
79
+ "metadata": {},
80
+ "outputs": [
81
+ {
82
+ "ename": "AttributeError",
83
+ "evalue": "'dict' object has no attribute 'page_content'",
84
+ "output_type": "error",
85
+ "traceback": [
86
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
87
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
88
+ "Cell \u001b[0;32mIn[37], line 13\u001b[0m\n\u001b[1;32m 10\u001b[0m adjusted_documents \u001b[38;5;241m=\u001b[39m [{\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpage_content\u001b[39m\u001b[38;5;124m'\u001b[39m: doc[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m'\u001b[39m], \u001b[38;5;124m'\u001b[39m\u001b[38;5;124membedding\u001b[39m\u001b[38;5;124m'\u001b[39m: doc[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124membedding\u001b[39m\u001b[38;5;124m'\u001b[39m]} \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# Then, attempt to create the vector store with the adjusted document format\u001b[39;00m\n\u001b[0;32m---> 13\u001b[0m vectorstore \u001b[38;5;241m=\u001b[39m \u001b[43mChroma\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_documents\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43mdocuments\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madjusted_documents\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrag-chroma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43membedder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 18\u001b[0m retriever \u001b[38;5;241m=\u001b[39m vectorstore\u001b[38;5;241m.\u001b[39mas_retriever()\n",
89
+ "File \u001b[0;32m~/.local/share/virtualenvs/LLM_Playground-SHCTkmIS/lib/python3.11/site-packages/langchain_community/vectorstores/chroma.py:776\u001b[0m, in \u001b[0;36mChroma.from_documents\u001b[0;34m(cls, documents, embedding, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)\u001b[0m\n\u001b[1;32m 745\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 746\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfrom_documents\u001b[39m(\n\u001b[1;32m 747\u001b[0m \u001b[38;5;28mcls\u001b[39m: Type[Chroma],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 756\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 757\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Chroma:\n\u001b[1;32m 758\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Create a Chroma vectorstore from a list of documents.\u001b[39;00m\n\u001b[1;32m 759\u001b[0m \n\u001b[1;32m 760\u001b[0m \u001b[38;5;124;03m If a persist_directory is specified, the collection will be persisted there.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 774\u001b[0m \u001b[38;5;124;03m Chroma: Chroma vectorstore.\u001b[39;00m\n\u001b[1;32m 775\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 776\u001b[0m texts \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[43mdoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpage_content\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdoc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 777\u001b[0m metadatas \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mmetadata \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 778\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_texts(\n\u001b[1;32m 779\u001b[0m texts\u001b[38;5;241m=\u001b[39mtexts,\n\u001b[1;32m 780\u001b[0m embedding\u001b[38;5;241m=\u001b[39membedding,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 788\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 789\u001b[0m )\n",
90
+ "File \u001b[0;32m~/.local/share/virtualenvs/LLM_Playground-SHCTkmIS/lib/python3.11/site-packages/langchain_community/vectorstores/chroma.py:776\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 745\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 746\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfrom_documents\u001b[39m(\n\u001b[1;32m 747\u001b[0m \u001b[38;5;28mcls\u001b[39m: Type[Chroma],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 756\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 757\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Chroma:\n\u001b[1;32m 758\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Create a Chroma vectorstore from a list of documents.\u001b[39;00m\n\u001b[1;32m 759\u001b[0m \n\u001b[1;32m 760\u001b[0m \u001b[38;5;124;03m If a persist_directory is specified, the collection will be persisted there.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 774\u001b[0m \u001b[38;5;124;03m Chroma: Chroma vectorstore.\u001b[39;00m\n\u001b[1;32m 775\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 776\u001b[0m texts \u001b[38;5;241m=\u001b[39m [\u001b[43mdoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpage_content\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 777\u001b[0m metadatas \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mmetadata \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 778\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_texts(\n\u001b[1;32m 779\u001b[0m texts\u001b[38;5;241m=\u001b[39mtexts,\n\u001b[1;32m 780\u001b[0m embedding\u001b[38;5;241m=\u001b[39membedding,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 788\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 
789\u001b[0m )\n",
91
+ "\u001b[0;31mAttributeError\u001b[0m: 'dict' object has no attribute 'page_content'"
92
+ ]
93
+ }
94
+ ],
95
+ "source": [
96
+ "from langchain_community.vectorstores import Chroma\n",
97
+ "\n",
98
+ "# Example of preparing 'documents' variable (assuming each document is a string in a list)\n",
99
+ "# Here you would convert each text document into an embedding and prepare it as needed\n",
100
+ "\n",
101
+ "# Assuming 'embedder.embed(doc_text)' returns a numeric vector for each document\n",
102
+ "documents = [{'text': doc_text, 'embedding': embedder.embed(doc_text)} for doc_text in documents_list]\n",
103
+ "\n",
104
+ "# If Chroma expects a 'page_content' attribute, adjust your dictionaries accordingly\n",
105
+ "adjusted_documents = [{'page_content': doc['text'], 'embedding': doc['embedding']} for doc in documents]\n",
106
+ "\n",
107
+ "# Then, attempt to create the vector store with the adjusted document format\n",
108
+ "vectorstore = Chroma.from_documents(\n",
109
+ " documents=adjusted_documents,\n",
110
+ " collection_name=\"rag-chroma\",\n",
111
+ " embedding=embedder,\n",
112
+ ")\n",
113
+ "retriever = vectorstore.as_retriever()\n"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 16,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "# Assuming 'query' is defined and TextLoader is set up\n",
123
+ "query = \"who is Aditya\"\n",
124
+ "documents = TextLoader.load_documents(query)\n"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 27,
130
+ "metadata": {},
131
+ "outputs": [
132
+ {
133
+ "ename": "ImportError",
134
+ "evalue": "cannot import name 'Rag' from 'langchain.llms' (/Users/adityasugandhi/.local/share/virtualenvs/LLM_Playground-SHCTkmIS/lib/python3.11/site-packages/langchain/llms/__init__.py)",
135
+ "output_type": "error",
136
+ "traceback": [
137
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
138
+ "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
139
+ "Cell \u001b[0;32mIn[27], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mllms\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Rag\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Initialize RAG model (ensure you have a compatible model loaded)\u001b[39;00m\n\u001b[1;32m 4\u001b[0m rag_model \u001b[38;5;241m=\u001b[39m Rag()\n",
140
+ "\u001b[0;31mImportError\u001b[0m: cannot import name 'Rag' from 'langchain.llms' (/Users/adityasugandhi/.local/share/virtualenvs/LLM_Playground-SHCTkmIS/lib/python3.11/site-packages/langchain/llms/__init__.py)"
141
+ ]
142
+ }
143
+ ],
144
+ "source": [
145
+ "from langchain_community.llms import Rag\n",
146
+ "\n",
147
+ "# Initialize RAG model (ensure you have a compatible model loaded)\n",
148
+ "rag_model = Rag()\n",
149
+ "\n",
150
+ "# Example function to generate answers using RAG and the retrieved documents\n",
151
+ "def generate_answer(rag_model, query, documents):\n",
152
+ " # Convert documents to a format suitable for the model, if necessary\n",
153
+ " context = ' '.join(documents) # Simplified; you might need a more sophisticated approach\n",
154
+ " \n",
155
+ " # Generate an answer using the RAG model\n",
156
+ " answer = rag_model.generate(query, context, \n",
157
+ " generation_kwargs={\"max_length\": 256, \"temperature\": 0.7})\n",
158
+ " return answer\n",
159
+ "\n",
160
+ "# Generate an answer for the query using retrieved documents as context\n",
161
+ "answer = generate_answer(rag_model, query, documents)\n",
162
+ "print(\"Generated Answer:\", answer)\n"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 21,
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "name": "stdout",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "His previous role as a Software Engineer at Aspire Systems in Chennai, India, showcases Aditya's versatility in both backend and frontend development. Leading the redesign of a Life Insurance Company's architecture, he prioritized low latency and high throughput, emphasizing a customer-centric approach. Aditya engineered 20 SOAP APIs for responsive patient data management, collaborated on front-end enhancements, and implemented secure payment gateways and Single Sign-On for authentication. His contribution to debugging strategies, real-time log analysis with Splunk, and CI/CD pipelines with Jenkins further underscore his commitment to optimizing system performance.\n"
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "# Example structure for fine-tuning (high-level and simplified)\n",
180
+ "from langchain.training import train_model\n",
181
+ "\n",
182
+ "# Define your training dataset\n",
183
+ "training_data = [(\"Question 1\", \"Answer 1\"), (\"Question 2\", \"Answer 2\"), ...]\n",
184
+ "\n",
185
+ "# Train (fine-tune) the model\n",
186
+ "train_model(rag_model, training_data, epochs=5, learning_rate=1e-5)\n"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 28,
192
+ "metadata": {},
193
+ "outputs": [
194
+ {
195
+ "name": "stderr",
196
+ "output_type": "stream",
197
+ "text": [
198
+ "/Users/adityasugandhi/.local/share/virtualenvs/LLM_Playground-SHCTkmIS/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
199
+ " from .autonotebook import tqdm as notebook_tqdm\n",
200
+ "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
201
+ ]
202
+ }
203
+ ],
204
+ "source": [
205
+ "from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration\n",
206
+ "\n",
207
+ "tokenizer = RagTokenizer.from_pretrained(\"facebook/rag-token-base\")\n",
208
+ "retriever = RagRetriever.from_pretrained(\"facebook/rag-token-base\")\n",
209
+ "generator = RagTokenForGeneration.from_pretrained(\"facebook/rag-token-base\")\n",
210
+ "\n",
211
+ "\n",
212
+ "def generate_answer(tokenizer, retriever, generator, query, documents):\n",
213
+ " inputs = tokenizer(query, documents, return_tensors=\"pt\", padding=\"max_length\", max_length=256, truncation=True)\n",
214
+ " input_ids = inputs[\"input_ids\"]\n",
215
+ " attention_mask = inputs[\"attention_mask\"]\n",
216
+ " doc_scores = retriever(input_ids, attention_mask)\n",
217
+ " context_input_ids = input_ids.new_full((input_ids.shape[0], 1), tokenizer.context_id, dtype=torch.long)\n",
218
+ " context_attention_mask = input_ids.new_full(context_input_ids.shape, 1)\n",
219
+ " generator_input_ids = torch.cat([context_input_ids, input_ids], dim=1)\n",
220
+ " generator_attention_mask = torch.cat([context_attention_mask, attention_mask], dim=1)\n",
221
+ " outputs = generator.generate(generator_input_ids, attention_mask=generator_attention_mask, doc_scores=doc_scores)\n",
222
+ " return tokenizer.batch_decode(outputs, skip_special_tokens=True)"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 4,
228
+ "metadata": {},
229
+ "outputs": [
230
+ {
231
+ "ename": "ModuleNotFoundError",
232
+ "evalue": "No module named 'haystack.indexing'",
233
+ "output_type": "error",
234
+ "traceback": [
235
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
236
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
237
+ "Cell \u001b[0;32mIn[4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtimeit\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhaystack\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mindexing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcleaning\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m clean_wiki_text\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhaystack\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mindexing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mio\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m open_file, fetch_archive_from_http\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhaystack\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mindexing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m convert_files_to_dicts, fetch_archive_from_http\n",
238
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'haystack.indexing'"
239
+ ]
240
+ }
241
+ ],
242
+ "source": [
243
+ "import os\n",
244
+ "import timeit\n",
245
+ "# from haystack.indexing.cleaning import clean_wiki_text\n",
246
+ "# from haystack.indexing.io import open_file, fetch_archive_from_http\n",
247
+ "# from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http\n",
248
+ "from haystack.preprocessor.cleaning import clean_whitespace, clean_html, clean_preprocessor,clean_wiki_text\n",
249
+ "from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http\n",
250
+ "from haystack.preprocessor import PreProcessor\n",
251
+ "from haystack.document_store import InMemoryDocumentStore, WeaviateDocumentStore\n",
252
+ "from haystack.retriever.dense import EmbeddingRetriever\n",
253
+ "from haystack.utils import print_answers\n",
254
+ "\n",
255
+ "def run_ingest():\n",
256
+ " # Update DATA_PATH to include \"Aditya_train.txt\"\n",
257
+ " data_file = \"Aditya_train.txt\"\n",
258
+ " DATA_PATH = os.path.join(cfg.DATA_PATH, data_file)\n",
259
+ " \n",
260
+ " # Ensure the file exists\n",
261
+ " if os.path.isfile(DATA_PATH):\n",
262
+ " start = timeit.default_timer()\n",
263
+ "\n",
264
+ " vector_store = WeaviateDocumentStore(host=cfg.WEAVIATE_HOST,\n",
265
+ " port=cfg.WEAVIATE_PORT,\n",
266
+ " embedding_dim=cfg.WEAVIATE_EMBEDDING_DIM)\n",
267
+ "\n",
268
+ " # Convert text files to dictionaries\n",
269
+ " raw_docs = convert_files_to_dicts(dir_path=DATA_PATH, clean_func=clean_wiki_text, split_paragraphs=True)\n",
270
+ "\n",
271
+ " # Convert to desired format\n",
272
+ " final_doc = []\n",
273
+ " for doc in raw_docs:\n",
274
+ " new_doc = {\n",
275
+ " 'content': doc['text'],\n",
276
+ " 'meta': {'name': doc['name']}\n",
277
+ " }\n",
278
+ " final_doc.append(new_doc)\n",
279
+ "\n",
280
+ " preprocessor = PreProcessor(\n",
281
+ " clean_empty_lines=True,\n",
282
+ " clean_whitespace=False,\n",
283
+ " clean_header_footer=False,\n",
284
+ " split_by=\"word\",\n",
285
+ " language=\"en\",\n",
286
+ " split_length=cfg.PRE_PROCESSOR_SPLIT_LENGTH,\n",
287
+ " split_overlap=cfg.PRE_PROCESSOR_SPLIT_OVERLAP,\n",
288
+ " split_respect_sentence_boundary=True,\n",
289
+ " )\n",
290
+ "\n",
291
+ " preprocessed_docs = preprocessor.process(final_doc)\n",
292
+ " vector_store.write_documents(preprocessed_docs)\n",
293
+ "\n",
294
+ " retriever = EmbeddingRetriever(\n",
295
+ " document_store=vector_store,\n",
296
+ " embedding_model=cfg.EMBEDDINGS\n",
297
+ " )\n",
298
+ " vector_store.update_embeddings(retriever)\n",
299
+ "\n",
300
+ " end = timeit.default_timer()\n",
301
+ " print(f\"Time to prepare embeddings: {end - start}\")\n",
302
+ " else:\n",
303
+ " print(f\"File {data_file} not found in the specified DATA_PATH.\")\n"
304
+ ]
305
+ }
306
+ ],
307
+ "metadata": {
308
+ "kernelspec": {
309
+ "display_name": "Langchain",
310
+ "language": "python",
311
+ "name": "python3"
312
+ },
313
+ "language_info": {
314
+ "codemirror_mode": {
315
+ "name": "ipython",
316
+ "version": 3
317
+ },
318
+ "file_extension": ".py",
319
+ "mimetype": "text/x-python",
320
+ "name": "python",
321
+ "nbconvert_exporter": "python",
322
+ "pygments_lexer": "ipython3",
323
+ "version": "3.11.0"
324
+ }
325
+ },
326
+ "nbformat": 4,
327
+ "nbformat_minor": 2
328
+ }
req.txt ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.6.0
2
+ anyio==4.3.0
3
+ appnope==0.1.4
4
+ asgiref==3.7.2
5
+ asttokens==2.4.1
6
+ backoff==2.2.1
7
+ bcrypt==4.1.2
8
+ blinker==1.7.0
9
+ boilerpy3==1.0.7
10
+ cachetools==5.3.3
11
+ certifi==2024.2.2
12
+ charset-normalizer==3.3.2
13
+ chroma-haystack==0.15.0
14
+ chroma-hnswlib==0.7.3
15
+ chromadb==0.4.19
16
+ click==8.1.7
17
+ coloredlogs==15.0.1
18
+ comm==0.2.2
19
+ debugpy==1.8.1
20
+ decorator==5.1.1
21
+ Deprecated==1.2.14
22
+ distro==1.9.0
23
+ exceptiongroup==1.2.0
24
+ executing==2.0.1
25
+ fastapi==0.110.0
26
+ filelock==3.13.1
27
+ Flask==3.0.2
28
+ flatbuffers==24.3.7
29
+ fsspec==2024.2.0
30
+ google-auth==2.28.2
31
+ googleapis-common-protos==1.63.0
32
+ grpcio==1.62.1
33
+ h11==0.14.0
34
+ haystack-ai==2.0.0
35
+ haystack-bm25==1.0.2
36
+ httpcore==1.0.4
37
+ httptools==0.6.1
38
+ httpx==0.27.0
39
+ huggingface-hub==0.21.4
40
+ humanfriendly==10.0
41
+ idna==3.6
42
+ importlib-metadata==6.11.0
43
+ importlib_resources==6.3.0
44
+ ipykernel==6.29.3
45
+ ipython==8.18.1
46
+ itsdangerous==2.1.2
47
+ jedi==0.19.1
48
+ Jinja2==3.1.3
49
+ joblib==1.3.2
50
+ jupyter_client==8.6.1
51
+ jupyter_core==5.7.2
52
+ kubernetes==29.0.0
53
+ lazy-imports==0.3.1
54
+ MarkupSafe==2.1.5
55
+ matplotlib-inline==0.1.6
56
+ mmh3==4.1.0
57
+ monotonic==1.6
58
+ more-itertools==10.2.0
59
+ mpmath==1.3.0
60
+ nest-asyncio==1.6.0
61
+ networkx==3.2.1
62
+ numpy==1.26.4
63
+ oauthlib==3.2.2
64
+ onnxruntime==1.16.3
65
+ openai==1.14.0
66
+ opentelemetry-api==1.23.0
67
+ opentelemetry-exporter-otlp-proto-common==1.23.0
68
+ opentelemetry-exporter-otlp-proto-grpc==1.23.0
69
+ opentelemetry-instrumentation==0.44b0
70
+ opentelemetry-instrumentation-asgi==0.44b0
71
+ opentelemetry-instrumentation-fastapi==0.44b0
72
+ opentelemetry-proto==1.23.0
73
+ opentelemetry-sdk==1.23.0
74
+ opentelemetry-semantic-conventions==0.44b0
75
+ opentelemetry-util-http==0.44b0
76
+ overrides==7.7.0
77
+ packaging==24.0
78
+ pandas==2.2.1
79
+ parso==0.8.3
80
+ pexpect==4.9.0
81
+ pillow==10.2.0
82
+ platformdirs==4.2.0
83
+ posthog==3.5.0
84
+ prompt-toolkit==3.0.43
85
+ protobuf==4.25.3
86
+ psutil==5.9.8
87
+ ptyprocess==0.7.0
88
+ pulsar-client==3.4.0
89
+ pure-eval==0.2.2
90
+ pyasn1==0.5.1
91
+ pyasn1-modules==0.3.0
92
+ pydantic==2.6.4
93
+ pydantic_core==2.16.3
94
+ Pygments==2.17.2
95
+ pypdf==4.1.0
96
+ PyPika==0.48.9
97
+ python-dateutil==2.9.0.post0
98
+ python-dotenv==1.0.1
99
+ pytz==2024.1
100
+ PyYAML==6.0.1
101
+ pyzmq==25.1.2
102
+ regex==2023.12.25
103
+ requests==2.31.0
104
+ requests-oauthlib==1.4.0
105
+ rsa==4.9
106
+ safetensors==0.4.2
107
+ scikit-learn==1.4.1.post1
108
+ scipy==1.12.0
109
+ sentence-transformers==2.5.1
110
+ six==1.16.0
111
+ sniffio==1.3.1
112
+ stack-data==0.6.3
113
+ starlette==0.36.3
114
+ sympy==1.12
115
+ tenacity==8.2.3
116
+ threadpoolctl==3.3.0
117
+ tokenizers==0.15.2
118
+ torch==2.2.1
119
+ tornado==6.4
120
+ tqdm==4.66.2
121
+ traitlets==5.14.2
122
+ transformers==4.38.2
123
+ typer==0.9.0
124
+ typing_extensions==4.10.0
125
+ tzdata==2024.1
126
+ urllib3==2.2.1
127
+ uvicorn==0.28.0
128
+ uvloop==0.19.0
129
+ watchfiles==0.21.0
130
+ wcwidth==0.2.13
131
+ websocket-client==1.7.0
132
+ websockets==12.0
133
+ Werkzeug==3.0.1
134
+ wrapt==1.16.0
135
+ zipp==3.18.0
test.ipynb ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "For text TextFileToDocument\n",
8
+ "for pdf PyPDFToDocument"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "metadata": {},
15
+ "outputs": [
16
+ {
17
+ "name": "stdout",
18
+ "output_type": "stream",
19
+ "text": [
20
+ "/unity/f2/asugandhi/Downloads/LLM_Playground\n",
21
+ "\n"
22
+ ]
23
+ },
24
+ {
25
+ "ename": "ValueError",
26
+ "evalue": "Input batch_size not found in component PdfFileConverter.",
27
+ "output_type": "error",
28
+ "traceback": [
29
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
30
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
31
+ "Cell \u001b[0;32mIn[21], line 27\u001b[0m\n\u001b[1;32m 25\u001b[0m pipeline\u001b[38;5;241m.\u001b[39mconnect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPdfFileConverter\u001b[39m\u001b[38;5;124m\"\u001b[39m,\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPdfwriter_chroma\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 26\u001b[0m pipeline\u001b[38;5;241m.\u001b[39mconnect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTextFileConverter\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwriter_chroma\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 27\u001b[0m \u001b[43mpipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 28\u001b[0m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPdfFileConverter\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msources\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfile_paths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbatch_size\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 29\u001b[0m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mTextFileConverter\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msources\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfile_paths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbatch_size\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 30\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 33\u001b[0m querying \u001b[38;5;241m=\u001b[39m Pipeline()\n\u001b[1;32m 34\u001b[0m reader \u001b[38;5;241m=\u001b[39m ExtractiveReader(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeepset/roberta-base-squad2-distilled\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
32
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/site-packages/haystack/core/pipeline/pipeline.py:688\u001b[0m, in \u001b[0;36mPipeline.run\u001b[0;34m(self, data, debug)\u001b[0m\n\u001b[1;32m 682\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 683\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInputs \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m were not matched to any component inputs, please check your run parameters.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 684\u001b[0m \u001b[38;5;28mlist\u001b[39m(unresolved_inputs\u001b[38;5;241m.\u001b[39mkeys()),\n\u001b[1;32m 685\u001b[0m )\n\u001b[1;32m 687\u001b[0m \u001b[38;5;66;03m# Raise if input is malformed in some way\u001b[39;00m\n\u001b[0;32m--> 688\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 689\u001b[0m \u001b[38;5;66;03m# NOTE: The above NOTE and TODO are technically not true.\u001b[39;00m\n\u001b[1;32m 690\u001b[0m \u001b[38;5;66;03m# This implementation of run supports only the first format, but the second format is actually\u001b[39;00m\n\u001b[1;32m 691\u001b[0m \u001b[38;5;66;03m# never received by this method. It's handled by the `run()` method of the `Pipeline` class\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 695\u001b[0m \u001b[38;5;66;03m# deepcopying the inputs prevents the Pipeline run logic from being altered unexpectedly\u001b[39;00m\n\u001b[1;32m 696\u001b[0m \u001b[38;5;66;03m# when the same input reference is passed to multiple components.\u001b[39;00m\n\u001b[1;32m 697\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m component_name, component_inputs \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mitems():\n",
33
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/site-packages/haystack/core/pipeline/pipeline.py:594\u001b[0m, in \u001b[0;36mPipeline._validate_input\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m input_name \u001b[38;5;129;01min\u001b[39;00m component_inputs\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m input_name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m instance\u001b[38;5;241m.\u001b[39m__haystack_input__\u001b[38;5;241m.\u001b[39m_sockets_dict:\n\u001b[0;32m--> 594\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInput \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minput_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in component \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 596\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m component_name \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgraph\u001b[38;5;241m.\u001b[39mnodes:\n\u001b[1;32m 597\u001b[0m instance \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgraph\u001b[38;5;241m.\u001b[39mnodes[component_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstance\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
34
+ "\u001b[0;31mValueError\u001b[0m: Input batch_size not found in component PdfFileConverter."
35
+ ]
36
+ }
37
+ ],
38
+ "source": [
39
+ "import os\n",
40
+ "from haystack import Pipeline, Document\n",
41
+ "from haystack.components.converters import TextFileToDocument, PyPDFToDocument\n",
42
+ "from haystack.components.writers import DocumentWriter\n",
43
+ "from haystack.components.readers import ExtractiveReader\n",
44
+ "from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n",
45
+ "from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
46
+ "from pathlib import Path\n",
47
+ "HERE = Path(os.getcwd())\n",
48
+ "print(HERE)\n",
49
+ "\n",
50
+ "data_path = HERE / \"data\"\n",
51
+ "file_paths = [data_path / Path(name) for name in os.listdir(\"data\")]\n",
52
+ "print()\n",
53
+ "chroma_store = ChromaDocumentStore()\n",
54
+ "# Resolve the absolute path\n",
55
+ "# absolute_file_path = file_path.resolve()\n",
56
+ "# print(absolute_file_path)\n",
57
+ "pipeline = Pipeline()\n",
58
+ "pipeline.add_component(\"PdfFileConverter\", PyPDFToDocument())\n",
59
+ "pipeline.add_component(\"TextFileConverter\", TextFileToDocument())\n",
60
+ "pipeline.add_component(\"Pdfwriter_chroma\", DocumentWriter(document_store=chroma_store))\n",
61
+ "pipeline.add_component(\"writer_chroma\", DocumentWriter(document_store=chroma_store))\n",
62
+ "\n",
63
+ "pipeline.connect(\"PdfFileConverter\",\"Pdfwriter_chroma\")\n",
64
+ "pipeline.connect(\"TextFileConverter\", \"writer_chroma\")\n",
65
+ "pipeline.run(\n",
66
+ " {\"PdfFileConverter\": {\"sources\": file_paths, \"batch_size\": 1}},\n",
67
+ " {\"TextFileConverter\": {\"sources\": file_paths, \"batch_size\": 1}},\n",
68
+ ")\n",
69
+ " \n",
70
+ " \n",
71
+ "querying = Pipeline()\n",
72
+ "reader = ExtractiveReader(model=\"deepset/roberta-base-squad2-distilled\")\n",
73
+ "querying.add_component(\"retriever\", ChromaQueryTextRetriever(chroma_store))\n",
74
+ "querying.add_component(\"reader\",reader)\n",
75
+ "results = querying.run({\"retriever\": {\"query\": \"Vishwam\", \"top_k\": 3}})\n"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 3,
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "name": "stdout",
85
+ "output_type": "stream",
86
+ "text": [
87
+ "/unity/f2/asugandhi/Downloads/LLM_Playground\n",
88
+ "{'reader': {'answers': [ExtractedAnswer(query='Who is Aditya?', score=0.6858945488929749, data='Software Engineer', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=31, end=48), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.627069890499115, data='Sugandhi', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=7, end=15), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.5672385096549988, data='Software Engineer', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=4616, end=4633), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.5219605565071106, data='software engineer', 
document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=4961, end=4978), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.5016087889671326, data='Sugandhi', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=4592, end=4600), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.44805991649627686, data='Web Developer Intern', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=3343, end=3363), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.0066661882226549205, data=None, document=None, context=None, document_offset=None, context_offset=None, meta={})]}}\n"
89
+ ]
90
+ }
91
+ ],
92
+ "source": [
93
+ "from pathlib import Path\n",
94
+ "import os\n",
95
+ "from haystack import Pipeline\n",
96
+ "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
97
+ "from haystack.components.converters import PyPDFToDocument, TextFileToDocument\n",
98
+ "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n",
99
+ "from haystack.components.readers import ExtractiveReader\n",
100
+ "from haystack.components.routers import FileTypeRouter\n",
101
+ "from haystack.components.joiners import DocumentJoiner\n",
102
+ "from haystack.components.writers import DocumentWriter\n",
103
+ "from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n",
104
+ "from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
105
+ "\n",
106
+ "HERE = Path(os.getcwd())\n",
107
+ "print(HERE)\n",
108
+ "\n",
109
+ "data_path = HERE / \"data\"\n",
110
+ "file_paths = [str(data_path / name) for name in os.listdir(data_path)]\n",
111
+ "\n",
112
+ "chroma_store = ChromaDocumentStore()\n",
113
+ "\n",
114
+ "pipeline = Pipeline()\n",
115
+ "pipeline.add_component(\"FileTypeRouter\", FileTypeRouter(mime_types=[\"text/plain\", \"application/pdf\"]))\n",
116
+ "pipeline.add_component(\"TextFileConverter\", TextFileToDocument())\n",
117
+ "pipeline.add_component(\"PdfFileConverter\", PyPDFToDocument())\n",
118
+ "pipeline.add_component(\"Joiner\", DocumentJoiner())\n",
119
+ "pipeline.add_component(\"Cleaner\", DocumentCleaner())\n",
120
+ "pipeline.add_component(\"Splitter\", DocumentSplitter(split_by=\"sentence\", split_length=250, split_overlap=30))\n",
121
+ "# pipeline.add_component(\"Embedder\", SentenceTransformersDocumentEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\"))\n",
122
+ "pipeline.add_component(\"Writer\", DocumentWriter(document_store=chroma_store))\n",
123
+ "\n",
124
+ "pipeline.connect(\"FileTypeRouter.text/plain\", \"TextFileConverter.sources\")\n",
125
+ "pipeline.connect(\"FileTypeRouter.application/pdf\", \"PdfFileConverter.sources\")\n",
126
+ "pipeline.connect(\"TextFileConverter.documents\", \"Joiner.documents\")\n",
127
+ "pipeline.connect(\"PdfFileConverter.documents\", \"Joiner.documents\")\n",
128
+ "pipeline.connect(\"Joiner.documents\", \"Cleaner.documents\")\n",
129
+ "pipeline.connect(\"Cleaner.documents\", \"Splitter.documents\")\n",
130
+ "pipeline.connect(\"Splitter.documents\", \"Writer.documents\")\n",
131
+ "# pipeline.connect(\"Embedder.documents\", \"Writer.documents\")\n",
132
+ "\n",
133
+ "pipeline.run(\n",
134
+ " {\"FileTypeRouter\": {\"sources\": file_paths}},\n",
135
+ ")\n",
136
+ "\n",
137
+ "# Querying pipeline\n",
138
+ "querying = Pipeline()\n",
139
+ "reader = ExtractiveReader(model=\"deepset/roberta-base-squad2-distilled\")\n",
140
+ "querying.add_component(\"retriever\", ChromaQueryTextRetriever(chroma_store))\n",
141
+ "querying.add_component(\"reader\", reader)\n",
142
+ "querying.connect(\"retriever\", \"reader\")\n",
143
+ "query = \"Who is Aditya?\"\n",
144
+ "input_data = {\n",
145
+ " \"retriever\": {\"query\": query, \"top_k\": 1},\n",
146
+ " \"reader\": {\"query\": query},\n",
147
+ " # Use 'max_tokens' instead of 'max_new_tokens'\n",
148
+ " }\n",
149
+ "results = querying.run(input_data)\n",
150
+ "print(results)\n"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "markdown",
155
+ "metadata": {},
156
+ "source": [
157
+ "# DON'T RUN"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": null,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": []
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 7,
170
+ "metadata": {},
171
+ "outputs": [
172
+ {
173
+ "name": "stdout",
174
+ "output_type": "stream",
175
+ "text": [
176
+ "who is Aditya?\n",
177
+ "{'llm': {'replies': ['Aditya Sugandhi is a Software Engineer with a strong foundation in both theoretical knowledge and practical application, known for his commitment to excellence, passion for technological advancements, and dedication to pushing boundaries in software development. He has experience in various roles such as a Research Assistant, Full Stack Developer, Customer Service Executive, and Web Developer Intern. Aditya is currently pursuing a Master’s of Science in Computer Science at Florida State University and holds a Bachelor of Technology in Computer Science Engineering from SRM University. He is characterized by technical excellence, innovation, and a holistic understanding of software development. Aditya enjoys spending time with his friends SAS, Hunterr, MF, and Rocco.'], 'meta': [{'model': 'gpt-3.5-turbo-0125', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 138, 'prompt_tokens': 917, 'total_tokens': 1055}}]}}\n"
178
+ ]
179
+ }
180
+ ],
181
+ "source": [
182
+ "from haystack import Pipeline\n",
183
+ "from haystack.utils import Secret\n",
184
+ "from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
185
+ "from haystack.components.readers import ExtractiveReader\n",
186
+ "from haystack.components.generators import GPTGenerator\n",
187
+ "from haystack.components.builders.prompt_builder import PromptBuilder\n",
188
+ "from haystack.components.generators import OpenAIGenerator\n",
189
+ "\n",
190
+ "template = \"\"\"\n",
191
+ "Answer all the questions in the following format and based on Aditya.\n",
192
+ "\n",
193
+ "Context:\n",
194
+ "{% for doc in documents %}\n",
195
+ " {{ doc.content }}\n",
196
+ "{% endfor %}\n",
197
+ "Question: {{question}}\n",
198
+ "Answer:\n",
199
+ "\"\"\"\n",
200
+ "\n",
201
+ "prompt_builder = PromptBuilder(template=template)\n",
202
+ "retriever = ChromaQueryTextRetriever(document_store = chroma_store)\n",
203
+ "#ExtractiveReader to extract answers from the relevant context\n",
204
+ "api_key = Secret.from_env_var(\"OPENAI_API_KEY\")\n",
205
+ "llm = OpenAIGenerator(model=\"gpt-3.5-turbo-0125\",api_key=api_key)\n",
206
+ "reader = ExtractiveReader(model=\"deepset/roberta-base-squad2-distilled\")\n",
207
+ "\n",
208
+ "extractive_qa_pipeline = Pipeline()\n",
209
+ "extractive_qa_pipeline.add_component(\"retriever\", retriever)\n",
210
+ "# extractive_qa_pipeline.add_component(\"reader\",reader)\n",
211
+ "extractive_qa_pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n",
212
+ "extractive_qa_pipeline.add_component(\"llm\", llm)\n",
213
+ "\n",
214
+ "\n",
215
+ "# extractive_qa_pipeline.connect(\"retriever\", \"reader\")\n",
216
+ "extractive_qa_pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n",
217
+ "extractive_qa_pipeline.connect(\"prompt_builder\", \"llm\")\n",
218
+ "\n",
219
+ "\n",
220
+ "query = \"who is Aditya?\"\n",
221
+ "print(query)\n",
222
+ "# Define the input data for the pipeline components\n",
223
+ "input_data = {\n",
224
+ " \"retriever\": {\"query\": query, \"top_k\": 1},\n",
225
+ " \"prompt_builder\": {\"question\": query},\n",
226
+ " # Use 'max_tokens' instead of 'max_new_tokens'\n",
227
+ "}\n",
228
+ "\n",
229
+ "# Run the pipeline with the updated input data\n",
230
+ "results = extractive_qa_pipeline.run(input_data)\n",
231
+ "print(results)"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "metadata": {},
238
+ "outputs": [],
239
+ "source": []
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 20,
244
+ "metadata": {},
245
+ "outputs": [
246
+ {
247
+ "name": "stderr",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from openchat-3.5-1210.Q3_K_S.ggml (version GGUF V3 (latest))\n",
251
+ "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
252
+ "llama_model_loader: - kv 0: general.architecture str = llama\n",
253
+ "llama_model_loader: - kv 1: general.name str = openchat_openchat-3.5-1210\n",
254
+ "llama_model_loader: - kv 2: llama.context_length u32 = 8192\n",
255
+ "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
256
+ "llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
257
+ "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n",
258
+ "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
259
+ "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
260
+ "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n",
261
+ "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
262
+ "llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n",
263
+ "llama_model_loader: - kv 11: general.file_type u32 = 11\n",
264
+ "llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
265
+ "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32002] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
266
+ "llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32002] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
267
+ "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32002] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
268
+ "llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
269
+ "llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000\n",
270
+ "llama_model_loader: - kv 18: tokenizer.ggml.padding_token_id u32 = 0\n",
271
+ "llama_model_loader: - kv 19: tokenizer.ggml.add_bos_token bool = true\n",
272
+ "llama_model_loader: - kv 20: tokenizer.ggml.add_eos_token bool = false\n",
273
+ "llama_model_loader: - kv 21: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...\n",
274
+ "llama_model_loader: - kv 22: general.quantization_version u32 = 2\n",
275
+ "llama_model_loader: - type f32: 65 tensors\n",
276
+ "llama_model_loader: - type q3_K: 225 tensors\n",
277
+ "llama_model_loader: - type q6_K: 1 tensors\n",
278
+ "llm_load_vocab: special tokens definition check successful ( 261/32002 ).\n",
279
+ "llm_load_print_meta: format = GGUF V3 (latest)\n",
280
+ "llm_load_print_meta: arch = llama\n",
281
+ "llm_load_print_meta: vocab type = SPM\n",
282
+ "llm_load_print_meta: n_vocab = 32002\n",
283
+ "llm_load_print_meta: n_merges = 0\n",
284
+ "llm_load_print_meta: n_ctx_train = 8192\n",
285
+ "llm_load_print_meta: n_embd = 4096\n",
286
+ "llm_load_print_meta: n_head = 32\n",
287
+ "llm_load_print_meta: n_head_kv = 8\n",
288
+ "llm_load_print_meta: n_layer = 32\n",
289
+ "llm_load_print_meta: n_rot = 128\n",
290
+ "llm_load_print_meta: n_embd_head_k = 128\n",
291
+ "llm_load_print_meta: n_embd_head_v = 128\n",
292
+ "llm_load_print_meta: n_gqa = 4\n",
293
+ "llm_load_print_meta: n_embd_k_gqa = 1024\n",
294
+ "llm_load_print_meta: n_embd_v_gqa = 1024\n",
295
+ "llm_load_print_meta: f_norm_eps = 0.0e+00\n",
296
+ "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
297
+ "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
298
+ "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
299
+ "llm_load_print_meta: n_ff = 14336\n",
300
+ "llm_load_print_meta: n_expert = 0\n",
301
+ "llm_load_print_meta: n_expert_used = 0\n",
302
+ "llm_load_print_meta: rope scaling = linear\n",
303
+ "llm_load_print_meta: freq_base_train = 10000.0\n",
304
+ "llm_load_print_meta: freq_scale_train = 1\n",
305
+ "llm_load_print_meta: n_yarn_orig_ctx = 8192\n",
306
+ "llm_load_print_meta: rope_finetuned = unknown\n",
307
+ "llm_load_print_meta: model type = 7B\n",
308
+ "llm_load_print_meta: model ftype = Q3_K - Small\n",
309
+ "llm_load_print_meta: model params = 7.24 B\n",
310
+ "llm_load_print_meta: model size = 2.95 GiB (3.50 BPW) \n",
311
+ "llm_load_print_meta: general.name = openchat_openchat-3.5-1210\n",
312
+ "llm_load_print_meta: BOS token = 1 '<s>'\n",
313
+ "llm_load_print_meta: EOS token = 32000 '<|end_of_turn|>'\n",
314
+ "llm_load_print_meta: UNK token = 0 '<unk>'\n",
315
+ "llm_load_print_meta: PAD token = 0 '<unk>'\n",
316
+ "llm_load_print_meta: LF token = 13 '<0x0A>'\n",
317
+ "llm_load_tensors: ggml ctx size = 0.56 MiB\n",
318
+ "llm_load_tensors: offloading 32 repeating layers to GPU\n",
319
+ "llm_load_tensors: offloading non-repeating layers to GPU\n",
320
+ "llm_load_tensors: offloaded 33/33 layers to GPU\n",
321
+ "llm_load_tensors: CPU buffer size = 53.71 MiB\n",
322
+ "llm_load_tensors: CUDA0 buffer size = 804.66 MiB\n",
323
+ "llm_load_tensors: CUDA1 buffer size = 715.25 MiB\n",
324
+ "llm_load_tensors: CUDA2 buffer size = 715.25 MiB\n",
325
+ "llm_load_tensors: CUDA3 buffer size = 728.40 MiB\n",
326
+ ".................................................................................................\n",
327
+ "llama_new_context_with_model: n_ctx = 10000\n",
328
+ "llama_new_context_with_model: freq_base = 10000.0\n",
329
+ "llama_new_context_with_model: freq_scale = 1\n",
330
+ "llama_kv_cache_init: CUDA0 KV buffer size = 351.56 MiB\n",
331
+ "llama_kv_cache_init: CUDA1 KV buffer size = 312.50 MiB\n",
332
+ "llama_kv_cache_init: CUDA2 KV buffer size = 312.50 MiB\n",
333
+ "llama_kv_cache_init: CUDA3 KV buffer size = 273.44 MiB\n",
334
+ "llama_new_context_with_model: KV self size = 1250.00 MiB, K (f16): 625.00 MiB, V (f16): 625.00 MiB\n"
335
+ ]
336
+ },
337
+ {
338
+ "ename": "",
339
+ "evalue": "",
340
+ "output_type": "error",
341
+ "traceback": [
342
+ "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
343
+ "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
344
+ "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
345
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
346
+ ]
347
+ }
348
+ ],
349
+ "source": [
350
+ "from haystack import Pipeline\n",
351
+ "from haystack.utils import Secret\n",
352
+ "from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
353
+ "from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator\n",
354
+ "from haystack.components.readers import ExtractiveReader\n",
355
+ "from haystack.components.generators import GPTGenerator\n",
356
+ "from haystack.components.builders.prompt_builder import PromptBuilder\n",
357
+ "from haystack.components.builders.answer_builder import AnswerBuilder\n",
358
+ "from haystack.components.generators import OpenAIGenerator\n",
359
+ "\n",
360
+ "\n",
361
+ "\n",
362
+ "\n",
363
+ "template = \"\"\"\n",
364
+ "Answer all the questions in the following format and based on Aditya \n",
365
+ "and if not found generate answer accordingly using the given information.\n",
366
+ "\n",
367
+ "Context:\n",
368
+ "{% for doc in documents %}\n",
369
+ "{{ doc.content }}\n",
370
+ "{% endfor %}\n",
371
+ "Question: {{question}}\n",
372
+ "Answer:\n",
373
+ "\"\"\"\n",
374
+ "\n",
375
+ "prompt_builder = PromptBuilder(template=template)\n",
376
+ "retriever = ChromaQueryTextRetriever(document_store = chroma_store)\n",
377
+ "#ExtractiveReader to extract answers from the relevant context\n",
378
+ "\n",
379
+ "llm = LlamaCppGenerator(\n",
380
+ "model_path=\"openchat-3.5-1210.Q3_K_S.ggml\", \n",
381
+ "n_ctx=10000,\n",
382
+ "n_batch=256,\n",
383
+ "model_kwargs={\"n_gpu_layers\": -1},\n",
384
+ "generation_kwargs={\"max_tokens\": 250, \"temperature\": 0.9},\n",
385
+ ")\n",
386
+ "\n",
387
+ "reader = ExtractiveReader(model=\"deepset/roberta-base-squad2-distilled\",)\n",
388
+ "\n",
389
+ "extractive_qa_pipeline = Pipeline()\n",
390
+ "extractive_qa_pipeline.add_component(\"retriever\", ChromaQueryTextRetriever(chroma_store))\n",
391
+ "# extractive_qa_pipeline.add_component(\"reader\",reader)\n",
392
+ "extractive_qa_pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n",
393
+ "extractive_qa_pipeline.add_component(\"llm\", llm)\n",
394
+ "extractive_qa_pipeline.add_component(instance=AnswerBuilder(), name=\"answer_builder\")\n",
395
+ "\n",
396
+ "# extractive_qa_pipeline.connect(\"retriever.documents\", \"reader\")\n",
397
+ "extractive_qa_pipeline.connect(\"retriever\", \"prompt_builder.documents\") \n",
398
+ "extractive_qa_pipeline.connect(\"prompt_builder\", \"llm\")\n",
399
+ "extractive_qa_pipeline.connect(\"llm.replies\", \"answer_builder.replies\")\n",
400
+ "extractive_qa_pipeline.connect(\"retriever\", \"answer_builder.documents\")\n",
401
+ "\n",
402
+ "query = \"who is Aditya did Aditya Pursued his Masters from?\"\n",
403
+ "\n",
404
+ "# Define the input data for the pipeline components\n",
405
+ "input_data = {\n",
406
+ " \"retriever\": {\"query\": query, \"top_k\": 3},\n",
407
+ " # \"reader\": {\"query\": query},\n",
408
+ " \"prompt_builder\": {\"question\": query},\n",
409
+ " \"answer_builder\": {\"query\": query},\n",
410
+ " # Use 'max_tokens' instead of 'max_new_tokens'\n",
411
+ "}\n",
412
+ "\n",
413
+ "# Run the pipeline with the updated input data\n",
414
+ "results = extractive_qa_pipeline.run(input_data)\n",
415
+ "\n",
416
+ " \n",
417
+ " "
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": 19,
423
+ "metadata": {},
424
+ "outputs": [
425
+ {
426
+ "name": "stdout",
427
+ "output_type": "stream",
428
+ "text": [
429
+ " Aditya pursued his Masters from Florida State University.\n"
430
+ ]
431
+ }
432
+ ],
433
+ "source": [
434
+ "# Assuming results is the dictionary containing the output\n",
435
+ "generated_content = results['llm']['meta'][0]['choices'][0]['text']\n",
436
+ "#print(results)\n",
437
+ "# Print the generated content\n",
438
+ "print(generated_content)\n"
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "code",
443
+ "execution_count": null,
444
+ "metadata": {},
445
+ "outputs": [],
446
+ "source": []
447
+ }
448
+ ],
449
+ "metadata": {
450
+ "kernelspec": {
451
+ "display_name": "RAGAPP",
452
+ "language": "python",
453
+ "name": "python3"
454
+ },
455
+ "language_info": {
456
+ "codemirror_mode": {
457
+ "name": "ipython",
458
+ "version": 3
459
+ },
460
+ "file_extension": ".py",
461
+ "mimetype": "text/x-python",
462
+ "name": "python",
463
+ "nbconvert_exporter": "python",
464
+ "pygments_lexer": "ipython3",
465
+ "version": "3.10.13"
466
+ }
467
+ },
468
+ "nbformat": 4,
469
+ "nbformat_minor": 2
470
+ }
test2.ipynb ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/Users/adityasugandhi/Documents/GitHub/LLM_Playground/.newenv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ },
16
+ {
17
+ "name": "stdout",
18
+ "output_type": "stream",
19
+ "text": [
20
+ "/Users/adityasugandhi/Documents/GitHub/LLM_Playground\n"
21
+ ]
22
+ },
23
+ {
24
+ "name": "stderr",
25
+ "output_type": "stream",
26
+ "text": [
27
+ "Batches: 100%|██████████| 1/1 [00:03<00:00, 3.22s/it]\n",
28
+ "/Users/adityasugandhi/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:11<00:00, 7.14MiB/s]\n"
29
+ ]
30
+ },
31
+ {
32
+ "name": "stdout",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "{'retriever': {'documents': [Document(id=fee80856fdb487fb694c739e089614d733502a7bd6d8b192f29ed6dad2088f44, content: 'Vishwam Shah is a highly motivated and skilled Computer Science professional currently pursuing a Ma...', meta: {'file_path': '/Users/adityasugandhi/Documents/GitHub/LLM_Playground/data/mf.txt', 'source_id': '99393e97120fcb9e88daa2d490060e9a91385ae63c7890d12b351978c02d3d93'}, score: 1.0066444873809814, embedding: vector of size 384), Document(id=e700bf2b5df175311a60ca00ffb6ed77b65b09c4221a2466b68e4802d90a831a, content: 'VISHWAM SHAH\n",
36
+ "Tallahassee, FL |[email protected] |+1 (850) 666 - 0095 |https://www.linkedin.com/...', meta: {'file_path': '/Users/adityasugandhi/Documents/GitHub/LLM_Playground/data/Resume_Vishwam_Shah_Back_end.pdf', 'source_id': 'd23089ee94ea955eb9ef0045999019220184668c96631b25686fc002722e8753'}, score: 1.5628944635391235, embedding: vector of size 384), Document(id=299afa7bfc84e7700fd38b178933ab2bf3a67b09298662651b173af03fde7968, content: ' The\n",
37
+ "“ECMWF Parameter ID” column is a ECMWF’s numeric label, and can be used to construct the URL fo...', meta: {'file_path': '/Users/adityasugandhi/Documents/GitHub/LLM_Playground/data/2212.12794.pdf', 'source_id': '314ee646f1f3143cad0677f2cdf057f1d625e5f2a1891449011557e1f75249d5'}, score: 1.6514018774032593, embedding: vector of size 384)]}}\n"
38
+ ]
39
+ }
40
+ ],
41
+ "source": [
42
+ "from pathlib import Path\n",
43
+ "import os\n",
44
+ "from haystack import Pipeline\n",
45
+ "from haystack.components.embedders import SentenceTransformersDocumentEmbedder,SentenceTransformersTextEmbedder\n",
46
+ "from haystack.components.converters import PyPDFToDocument, TextFileToDocument\n",
47
+ "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n",
48
+ "from haystack.components.routers import FileTypeRouter\n",
49
+ "from haystack.components.joiners import DocumentJoiner\n",
50
+ "from haystack.components.writers import DocumentWriter\n",
51
+ "from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n",
52
+ "from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
53
+ "\n",
54
+ "HERE = Path(os.getcwd())\n",
55
+ "print(HERE)\n",
56
+ "\n",
57
+ "data_path = HERE / \"data\"\n",
58
+ "file_paths = [str(data_path / name) for name in os.listdir(data_path)]\n",
59
+ "\n",
60
+ "chroma_store = ChromaDocumentStore()\n",
61
+ "\n",
62
+ "pipeline = Pipeline()\n",
63
+ "pipeline.add_component(\"FileTypeRouter\", FileTypeRouter(mime_types=[\"text/plain\", \"application/pdf\"]))\n",
64
+ "pipeline.add_component(\"TextFileConverter\", TextFileToDocument())\n",
65
+ "pipeline.add_component(\"PdfFileConverter\", PyPDFToDocument())\n",
66
+ "\n",
67
+ "pipeline.add_component(\"Joiner\", DocumentJoiner())\n",
68
+ "pipeline.add_component(\"Cleaner\", DocumentCleaner())\n",
69
+ "pipeline.add_component(\"Splitter\", DocumentSplitter(split_by=\"sentence\", split_length=250, split_overlap=30))\n",
70
+ "# pipeline.add_component(\"TextEmbedder\", SentenceTransformersTextEmbedder())\n",
71
+ "pipeline.add_component(\"Embedder\", SentenceTransformersDocumentEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\"))\n",
72
+ "\n",
73
+ "pipeline.add_component(\"Writer\", DocumentWriter(document_store=chroma_store))\n",
74
+ "\n",
75
+ "pipeline.connect(\"FileTypeRouter.text/plain\", \"TextFileConverter.sources\")\n",
76
+ "pipeline.connect(\"FileTypeRouter.application/pdf\", \"PdfFileConverter.sources\")\n",
77
+ "pipeline.connect(\"TextFileConverter.documents\", \"Joiner.documents\")\n",
78
+ "pipeline.connect(\"PdfFileConverter.documents\", \"Joiner.documents\")\n",
79
+ "pipeline.connect(\"Joiner.documents\", \"Cleaner.documents\")\n",
80
+ "pipeline.connect(\"Cleaner.documents\", \"Splitter.documents\")\n",
81
+ "pipeline.connect(\"Splitter.documents\", \"Embedder.documents\")\n",
82
+ "# pipeline.connect(\"TextEmbedder.embeddings\", \"Embedder.documents\")\n",
83
+ "pipeline.connect(\"Embedder.documents\", \"Writer.documents\")\n",
84
+ "\n",
85
+ "pipeline.run(\n",
86
+ " {\"FileTypeRouter\": {\"sources\": file_paths}},\n",
87
+ ")\n",
88
+ "\n",
89
+ "# Querying pipeline\n",
90
+ "querying = Pipeline()\n",
91
+ "querying.add_component(\"retriever\", ChromaQueryTextRetriever(chroma_store))\n",
92
+ "results = querying.run({\"retriever\": {\"query\": \"Vishwam\", \"top_k\": 3}})\n",
93
+ "print(results)\n"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "markdown",
98
+ "metadata": {},
99
+ "source": [
100
+ "# Information Retriever"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 4,
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "name": "stdout",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "{'retriever': {'documents': [Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.1221085786819458, embedding: vector of size 384), Document(id=11f7061bb8c56ae79965f1ba0d1a0283188dc031309394e1a03470d5d72207a9, content: 'Aditya Sugandhi is a seasoned Software Engineer with a rich background and diverse skill set, encomp...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_test.txt', 'source_id': 'c85a2287836cae980897693decb5e9d07e80f60b7c96b4e542ef3057e11fc228'}, score: 1.2236461639404297, embedding: vector of size 384), Document(id=a6ad41c3febd74d1f6825aac59c2d6dd7589ae8088bb3b449ea239c97d6f1b1c, content: ' . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 18\n",
113
+ "1.2 HRES . . . . . . . . . . . . . ....', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/2212.12794.pdf', 'source_id': 'aa504618a25e65b870dde2fe288f395a44ff6a05c640fa7a2e6c5a5d3a9a44ef'}, score: 1.6584246158599854, embedding: vector of size 384)]}}\n"
114
+ ]
115
+ }
116
+ ],
117
+ "source": [
118
+ "# # Querying pipeline\n",
119
+ "# querying = Pipeline()\n",
120
+ "# querying.add_component(\"retriever\", ChromaQueryTextRetriever(chroma_store))\n",
121
+ "# results = querying.run({\"retriever\": {\"query\": \"Aditya\", \"top_k\": 3}})\n",
122
+ "# print(results)\n"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 28,
128
+ "metadata": {},
129
+ "outputs": [
130
+ {
131
+ "ename": "AttributeError",
132
+ "evalue": "'str' object has no attribute 'resolve_value'",
133
+ "output_type": "error",
134
+ "traceback": [
135
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
136
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
137
+ "Cell \u001b[0;32mIn[28], line 29\u001b[0m\n\u001b[1;32m 25\u001b[0m api_key \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39menviron\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOPENAI_API_KEY\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m#ExtractiveReader to extract answers from the relevant context\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m# api_key = Secret.from_token(\"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\")\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mOpenAIGenerator\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgpt-3.5-turbo-0125\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43mapi_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mapi_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 30\u001b[0m reader \u001b[38;5;241m=\u001b[39m ExtractiveReader(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeepset/roberta-base-squad2-distilled\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 32\u001b[0m extractive_qa_pipeline \u001b[38;5;241m=\u001b[39m Pipeline()\n",
138
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/site-packages/haystack/core/component/component.py:122\u001b[0m, in \u001b[0;36mComponentMeta.__call__\u001b[0;34m(cls, *args, **kwargs)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;124;03mThis method is called when clients instantiate a Component and\u001b[39;00m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;124;03mruns before __new__ and __init__.\u001b[39;00m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# This will call __new__ then __init__, giving us back the Component instance\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m instance \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;66;03m# Before returning, we have the chance to modify the newly created\u001b[39;00m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;66;03m# Component instance, so we take the chance and set up the I/O sockets\u001b[39;00m\n\u001b[1;32m 126\u001b[0m \n\u001b[1;32m 127\u001b[0m \u001b[38;5;66;03m# If `component.set_output_types()` was called in the component constructor,\u001b[39;00m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;66;03m# `__haystack_output__` is already populated, no need to do anything.\u001b[39;00m\n\u001b[1;32m 129\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(instance, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__haystack_output__\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 130\u001b[0m 
\u001b[38;5;66;03m# If that's not the case, we need to populate `__haystack_output__`\u001b[39;00m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;66;03m# We deepcopy the content of the cache to transfer ownership from the class method\u001b[39;00m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;66;03m# to the actual instance, so that different instances of the same class won't share this data.\u001b[39;00m\n",
139
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/site-packages/haystack/components/generators/openai.py:103\u001b[0m, in \u001b[0;36mOpenAIGenerator.__init__\u001b[0;34m(self, api_key, model, streaming_callback, api_base_url, organization, system_prompt, generation_kwargs)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapi_base_url \u001b[38;5;241m=\u001b[39m api_base_url\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39morganization \u001b[38;5;241m=\u001b[39m organization\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclient \u001b[38;5;241m=\u001b[39m OpenAI(api_key\u001b[38;5;241m=\u001b[39m\u001b[43mapi_key\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolve_value\u001b[49m(), organization\u001b[38;5;241m=\u001b[39morganization, base_url\u001b[38;5;241m=\u001b[39mapi_base_url)\n",
140
+ "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'resolve_value'"
141
+ ]
142
+ }
143
+ ],
144
+ "source": [
145
+ "from dotenv import load_dotenv\n",
146
+ "\n",
147
+ "load_dotenv() \n",
148
+ "from haystack import Pipeline\n",
149
+ "from haystack.utils import Secret\n",
150
+ "from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever\n",
151
+ "from haystack.components.readers import ExtractiveReader\n",
152
+ "from haystack.components.generators import GPTGenerator\n",
153
+ "from haystack.components.builders.prompt_builder import PromptBuilder\n",
154
+ "from haystack.components.generators import OpenAIGenerator\n",
155
+ "\n",
156
+ "template = \"\"\"\n",
157
+ " Answer the question using the provided context based on Aditya.\n",
158
+ "\n",
159
+ " Context:\n",
160
+ " {% for context in answers %}\n",
161
+ " {{ context }}\n",
162
+ " {% endfor %}\n",
163
+ " Question: {{question}}\n",
164
+ " Answer:\n",
165
+ " \"\"\"\n",
166
+ "\n",
167
+ "prompt_builder = PromptBuilder(template=template)\n",
168
+ "retriever = ChromaQueryTextRetriever(document_store = chroma_store)\n",
169
+ "api_key = os.environ.get(\"OPENAI_API_KEY\")\n",
170
+ "\n",
171
+ "#ExtractiveReader to extract answers from the relevant context\n",
172
+ "api_key = Secret.from_token(api_key)\n",
173
+ "llm = OpenAIGenerator(model=\"gpt-3.5-turbo-0125\",api_key=api_key)\n",
174
+ "reader = ExtractiveReader(model=\"deepset/roberta-base-squad2-distilled\")\n",
175
+ "\n",
176
+ "extractive_qa_pipeline = Pipeline()\n",
177
+ "extractive_qa_pipeline.add_component(\"retriever\", retriever)\n",
178
+ "extractive_qa_pipeline.add_component('reader', reader)\n",
179
+ "extractive_qa_pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n",
180
+ "extractive_qa_pipeline.add_component(\"llm\", llm)\n",
181
+ "\n",
182
+ "extractive_qa_pipeline.connect(\"retriever.documents\", \"reader.documents\")\n",
183
+ "extractive_qa_pipeline.connect(\"reader.answers\", \"prompt_builder.answers\")\n",
184
+ "extractive_qa_pipeline.connect(\"prompt_builder\", \"llm\")\n",
185
+ "\n",
186
+ "\n",
187
+ "query = \"what is Aditya Pursuing ?\"\n",
188
+ "print(query)\n",
189
+ "# Define the input data for the pipeline components\n",
190
+ "input_data = {\n",
191
+ " \"retriever\": {\"query\": query, \"top_k\": 2},\n",
192
+ " \"reader\": {\"query\": query, \"top_k\": 2},\n",
193
+ " \"prompt_builder\": {\"question\": query},\n",
194
+ " # Use 'max_tokens' instead of 'max_new_tokens'\n",
195
+ "}\n",
196
+ "\n",
197
+ "# Run the pipeline with the updated input data\n",
198
+ "results = extractive_qa_pipeline.run(input_data)\n",
199
+ "print(results)"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": 5,
205
+ "metadata": {},
206
+ "outputs": [
207
+ {
208
+ "ename": "TypeError",
209
+ "evalue": "isinstance() arg 2 must be a type, a tuple of types, or a union",
210
+ "output_type": "error",
211
+ "traceback": [
212
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
213
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
214
+ "Cell \u001b[0;32mIn[5], line 9\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__dict__\u001b[39m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mdefault(obj)\n\u001b[0;32m----> 9\u001b[0m json_results \u001b[38;5;241m=\u001b[39m \u001b[43mjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdumps\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExtractedAnswerEncoder\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28mprint\u001b[39m(json_results)\n",
215
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/__init__.py:238\u001b[0m, in \u001b[0;36mdumps\u001b[0;34m(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 233\u001b[0m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;241m=\u001b[39m JSONEncoder\n\u001b[1;32m 234\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 235\u001b[0m \u001b[43m \u001b[49m\u001b[43mskipkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipkeys\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mensure_ascii\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mensure_ascii\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 236\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_circular\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_circular\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mallow_nan\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallow_nan\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 237\u001b[0m \u001b[43m \u001b[49m\u001b[43mseparators\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mseparators\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdefault\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msort_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort_keys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m--> 238\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n",
216
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:201\u001b[0m, in \u001b[0;36mJSONEncoder.encode\u001b[0;34m(self, o)\u001b[0m\n\u001b[1;32m 199\u001b[0m chunks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miterencode(o, _one_shot\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(chunks, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)):\n\u001b[0;32m--> 201\u001b[0m chunks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mchunks\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(chunks)\n",
217
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:431\u001b[0m, in \u001b[0;36m_make_iterencode.<locals>._iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _iterencode_list(o, _current_indent_level)\n\u001b[1;32m 430\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(o, \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 431\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _iterencode_dict(o, _current_indent_level)\n\u001b[1;32m 432\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 433\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m markers \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
218
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:405\u001b[0m, in \u001b[0;36m_make_iterencode.<locals>._iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 404\u001b[0m chunks \u001b[38;5;241m=\u001b[39m _iterencode(value, _current_indent_level)\n\u001b[0;32m--> 405\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m chunks\n\u001b[1;32m 406\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m newline_indent \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 407\u001b[0m _current_indent_level \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
219
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:405\u001b[0m, in \u001b[0;36m_make_iterencode.<locals>._iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 404\u001b[0m chunks \u001b[38;5;241m=\u001b[39m _iterencode(value, _current_indent_level)\n\u001b[0;32m--> 405\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m chunks\n\u001b[1;32m 406\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m newline_indent \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 407\u001b[0m _current_indent_level \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
220
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:325\u001b[0m, in \u001b[0;36m_make_iterencode.<locals>._iterencode_list\u001b[0;34m(lst, _current_indent_level)\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 324\u001b[0m chunks \u001b[38;5;241m=\u001b[39m _iterencode(value, _current_indent_level)\n\u001b[0;32m--> 325\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m chunks\n\u001b[1;32m 326\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m newline_indent \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 327\u001b[0m _current_indent_level \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
221
+ "File \u001b[0;32m/conda/asugandhi/miniconda3/envs/RAGAPP/lib/python3.10/json/encoder.py:438\u001b[0m, in \u001b[0;36m_make_iterencode.<locals>._iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCircular reference detected\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 437\u001b[0m markers[markerid] \u001b[38;5;241m=\u001b[39m o\n\u001b[0;32m--> 438\u001b[0m o \u001b[38;5;241m=\u001b[39m \u001b[43m_default\u001b[49m\u001b[43m(\u001b[49m\u001b[43mo\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _iterencode(o, _current_indent_level)\n\u001b[1;32m 440\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m markers \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
222
+ "Cell \u001b[0;32mIn[5], line 5\u001b[0m, in \u001b[0;36mExtractedAnswerEncoder.default\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdefault\u001b[39m(\u001b[38;5;28mself\u001b[39m, obj):\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28;43misinstance\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresults\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Convert ExtractedAnswer to a dictionary\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__dict__\u001b[39m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mdefault(obj)\n",
223
+ "\u001b[0;31mTypeError\u001b[0m: isinstance() arg 2 must be a type, a tuple of types, or a union"
224
+ ]
225
+ }
226
+ ],
227
+ "source": [
228
+ "import json\n",
229
+ "\n",
230
+ "class ExtractedAnswerEncoder(json.JSONEncoder):\n",
231
+ " def default(self, obj):\n",
232
+ " if hasattr(obj, \"__dict__\"):\n",
233
+ " # Convert ExtractedAnswer to a dictionary\n",
234
+ " return obj.__dict__\n",
235
+ " return super().default(obj)\n",
236
+ "json_results = json.dumps(results, indent=2, cls=ExtractedAnswerEncoder)\n",
237
+ "\n",
238
+ "print(json_results)"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": null,
244
+ "metadata": {},
245
+ "outputs": [],
246
+ "source": [
247
+ "p"
248
+ ]
249
+ }
250
+ ],
251
+ "metadata": {
252
+ "kernelspec": {
253
+ "display_name": "RAGAPP",
254
+ "language": "python",
255
+ "name": "python3"
256
+ },
257
+ "language_info": {
258
+ "codemirror_mode": {
259
+ "name": "ipython",
260
+ "version": 3
261
+ },
262
+ "file_extension": ".py",
263
+ "mimetype": "text/x-python",
264
+ "name": "python",
265
+ "nbconvert_exporter": "python",
266
+ "pygments_lexer": "ipython3",
267
+ "version": "3.9.13"
268
+ }
269
+ },
270
+ "nbformat": 4,
271
+ "nbformat_minor": 2
272
+ }
test_trainer/runs/.DS_Store ADDED
Binary file (8.2 kB). View file
 
test_trainer/runs/Feb22_22-15-01_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658123.bfs-v13-skynet.coaps.fsu.edu.3062760.0 ADDED
Binary file (4.54 kB). View file
 
test_trainer/runs/Feb22_22-17-41_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658261.bfs-v13-skynet.coaps.fsu.edu.3062760.1 ADDED
Binary file (4.54 kB). View file
 
test_trainer/runs/Feb22_22-17-41_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658535.bfs-v13-skynet.coaps.fsu.edu.3062760.2 ADDED
Binary file (4.54 kB). View file
 
test_trainer/runs/Feb22_22-24-50_bfs-v13-skynet.coaps.fsu.edu/events.out.tfevents.1708658690.bfs-v13-skynet.coaps.fsu.edu.3062760.3 ADDED
Binary file (4.54 kB). View file
 
utils/ExtractQA.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.pipelines import DocumentSearchPipeline, ExtractiveQAPipeline
2
+ from haystack.nodes import JoinDocuments
3
+ from haystack import Pipeline
4
+
5
+
6
+
7
+
8
def ExtracQA(reader, retriever, query, retriever_top_k=3, reader_top_k=5):
    """Answer ``query`` with a retriever -> reader extractive QA pipeline.

    Args:
        reader: Reader node handed to ``ExtractiveQAPipeline``.
        retriever: Retriever node handed to ``ExtractiveQAPipeline``.
        query: Question text to run through the pipeline.
        retriever_top_k: ``top_k`` passed to the retriever node. Defaults to
            3, the value that used to be hard-coded.
        reader_top_k: ``top_k`` passed to the reader node. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        Whatever ``ExtractiveQAPipeline.run`` returns for the query.
    """
    qa_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)
    # ExtractiveQAPipeline registers its nodes as "Retriever" and "Reader";
    # run() rejects parameter keys that do not name a node, so the keys must
    # use that exact capitalisation.
    return qa_pipeline.run(
        query=query,
        params={
            "Retriever": {"top_k": retriever_top_k},
            "Reader": {"top_k": reader_top_k},
        },
    )
14
+
15
+
16
+
17
+
18
def MultipleRetriever(reader, es_retriever, dpr_retriever, query,
                      es_top_k=10, dpr_top_k=10, reader_top_k=5):
    """Answer ``query`` using two retrievers whose results feed one reader.

    Both retrievers receive the query, their outputs are combined by a
    ``JoinDocuments(join_mode="concatenate")`` node, and the reader runs on
    the joined result.

    Args:
        reader: Reader node added as ``"QAReader"``.
        es_retriever: Retriever node added as ``"ESRetriever"``.
        dpr_retriever: Retriever node added as ``"DPRRetriever"``.
        query: Question text to run through the pipeline.
        es_top_k: ``top_k`` for ``"ESRetriever"`` (default 10, as before).
        dpr_top_k: ``top_k`` for ``"DPRRetriever"`` (default 10, as before).
        reader_top_k: ``top_k`` for ``"QAReader"`` (default 5, as before).

    Returns:
        Whatever ``Pipeline.run`` returns for the query.
    """
    pipeline = Pipeline()
    # Both retrievers hang off the "Query" root so each sees the raw query.
    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
    pipeline.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
    pipeline.add_node(
        component=JoinDocuments(join_mode="concatenate"),
        name="JoinResults",
        inputs=["ESRetriever", "DPRRetriever"],
    )
    pipeline.add_node(component=reader, name="QAReader", inputs=["JoinResults"])

    # Parameter keys must match the node names registered above.
    return pipeline.run(
        query=query,
        params={
            "ESRetriever": {"top_k": es_top_k},
            "DPRRetriever": {"top_k": dpr_top_k},
            "QAReader": {"top_k": reader_top_k},
        },
    )
27
+
utils/__init__.py ADDED
File without changes
utils/dataloader.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from haystack import Pipeline
5
+ from haystack.components.converters import TextFileToDocument
6
+ from haystack.components.writers import DocumentWriter
7
+
8
+ from haystack_integrations.document_stores.chroma import ChromaDocumentStore
9
+
10
+
11
+
12
+
13
+
14
def load_data(data_dir="data"):
    """Index the files found in ``data_dir`` into a new Chroma document store.

    Args:
        data_dir: Directory whose files are converted and written to the
            store. Defaults to ``"data"``, the location that used to be
            hard-coded.

    Returns:
        The ``ChromaDocumentStore`` instance the indexing pipeline wrote to.

    Raises:
        OSError: If ``data_dir`` cannot be listed (for example
            ``FileNotFoundError`` when it does not exist).
    """
    # Keep only regular, non-hidden files: sub-directories and OS metadata
    # such as ``.DS_Store`` are not text documents and should not reach the
    # converter. Sorting makes the indexing order independent of the order
    # in which the filesystem lists entries.
    file_paths = sorted(
        entry
        for entry in Path(data_dir).iterdir()
        if entry.is_file() and not entry.name.startswith(".")
    )

    # The store is created here and returned, so callers query the very same
    # instance that the indexing pipeline below populates.
    document_store = ChromaDocumentStore()

    indexing = Pipeline()
    indexing.add_component("converter", TextFileToDocument())
    indexing.add_component("writer", DocumentWriter(document_store))
    indexing.connect("converter", "writer")
    indexing.run({"converter": {"sources": file_paths}})

    return document_store
27
+
28
+
29
+
utils/prompt_builder.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Assemble the ``querying`` pipeline: Chroma retriever -> prompt builder -> LLM.

Importing this module builds every component at module level, including the
call to ``load_data()`` and ``llm.warm_up()``.
"""
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.components.generators import HuggingFaceTGIGenerator
from haystack.components.builders import PromptBuilder
# NOTE(review): ``haystack.agents.memory`` is not used below and looks like a
# Haystack 1.x module path -- confirm it exists in the installed version.
from haystack.agents.memory import ConversationSummaryMemory
from dataloader import load_data
# This line was truncated to ``from hayst`` (a SyntaxError); ``Pipeline`` is
# the one name used below that no other import provides.
from haystack import Pipeline

# Jinja template rendered by the prompt builder: ``documents`` is wired from
# the retriever below, ``query`` is supplied when the pipeline is run.
prompt = """
Answer the query based on the provided context for Aditya.
If the context does not contain the answer, say 'Answer not found'.
Context:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
query: {{query}}
Answer:
"""
prompt_builder = PromptBuilder(template=prompt)

llm = HuggingFaceTGIGenerator(model="mistralai/Mixtral-8x7B-Instruct-v0.1")
# Warm the generator up once, before it is added to the pipeline.
llm.warm_up()
# The retriever queries the document store that load_data() has just filled.
retriever = ChromaQueryTextRetriever(load_data())

querying = Pipeline()
querying.add_component("retriever", retriever)
querying.add_component("prompt_builder", prompt_builder)
querying.add_component("llm", llm)

# Retrieved documents fill the template; the rendered prompt goes to the LLM.
querying.connect("retriever.documents", "prompt_builder.documents")
querying.connect("prompt_builder", "llm")