Adarsh-aot committed
Commit 5e75742 · verified · 1 Parent(s): b145729

Update app.py

Files changed (1)
  1. app.py +99 -96
app.py CHANGED
@@ -1,96 +1,99 @@
- # import csv
-
- # # Load sample data (a restaurant menu of items)
- # with open('./data.csv') as file:
- #     lines = csv.reader(file)
-
- #     # Store the name of the menu items in this array. In Chroma, a "document" is a string, e.g. a name, sentence, or paragraph.
- #     documents = []
-
- #     # Store the corresponding menu item IDs in this array.
- #     metadatas = []
-
- #     # Each "document" needs a unique ID. This is like the primary key of a relational database. We'll start at 1 and increment from there.
- #     ids = []
- #     id = 1
-
- #     # Loop through each line and populate the 3 arrays.
- #     for i, line in enumerate(lines):
- #         if i == 0:
- #             # Skip the first row (the column headers)
- #             continue
-
- #         documents.append(line[0])
- #         metadatas.append({"item_id": line[1]})
- #         ids.append(str(id))
- #         id += 1
-
-
- import chromadb
- from chromadb.utils import embedding_functions
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
- from transformers import pipeline
- from langchain.llms import HuggingFacePipeline
- import torch
- # Instantiate chromadb instance. Data is stored in memory only.
- # chroma_client = chromadb.Client()
-
- # Instantiate chromadb instance. Data is stored on disk (a folder named 'vector_db' will be created in the same folder as this file).
- chroma_client = chromadb.PersistentClient(path="vector_db")
-
- # Select the embedding model to use.
- # A list of model names can be found here: https://www.sbert.net/docs/pretrained_models.html
- sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")
-
- # Use this to delete the database.
- # chroma_client.delete_collection(name="my_collection")
-
- # Create the collection, aka vector database, or use it if it already exists. Specify the model to use for the embedding.
- collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)
-
-
- # collection.add(
- #     documents=documents,
- #     metadatas=metadatas,
- #     ids=ids
- # )
-
-
-
- results = collection.query(
-     query_texts=["director"],
-     n_results=1,
-     include=['documents', 'distances', 'metadatas']
- )
- print(results['metadatas'])
-
-
-
- tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
- model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")
-
- pipe = pipeline(
-     "text2text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     max_length=512
- )
-
- local_llm = HuggingFacePipeline(pipeline=pipe)
-
-
- context = results['documents'][0][0]
- question = "director job"
-
-
- l = f"""
- Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know; don't try to make up an answer.
-
- {context}
-
- Question: {question}
- Helpful Answer:
- """
-
-
- print(local_llm(l))
+
+ import streamlit as st
+ import csv
+ import chromadb
+ from chromadb.utils import embedding_functions
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from transformers import pipeline
+ from langchain.llms import HuggingFacePipeline
+
+ # Load sample data (a restaurant menu of items)
+ # with open('./data.csv') as file:
+ #     lines = csv.reader(file)
+
+ #     # Store the name of the menu items in this array. In Chroma, a "document" is a string, e.g. a name, sentence, or paragraph.
+ #     documents = []
+
+ #     # Store the corresponding menu item IDs in this array.
+ #     metadatas = []
+
+ #     # Each "document" needs a unique ID. This is like the primary key of a relational database. We'll start at 1 and increment from there.
+ #     ids = []
+ #     id = 1
+
+ #     # Loop through each line and populate the 3 arrays.
+ #     for i, line in enumerate(lines):
+ #         if i == 0:
+ #             # Skip the first row (the column headers)
+ #             continue
+
+ #         documents.append(line[0])
+ #         metadatas.append({"item_id": line[1]})
+ #         ids.append(str(id))
+ #         id += 1
+
+ # Instantiate chromadb instance. Data is stored on disk (a folder named 'vector_db' will be created in the same folder as this file).
+ chroma_client = chromadb.PersistentClient(path="vector_db")
+
+ # Select the embedding model to use.
+ # A list of model names can be found here: https://www.sbert.net/docs/pretrained_models.html
+ sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")
+
+ # Use this to delete the database.
+ # chroma_client.delete_collection(name="my_collection")
+
+ # Create the collection, aka vector database, or use it if it already exists. Specify the model to use for the embedding.
+ collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)
+
+ # Add the data to the collection
+ # collection.add(
+ #     documents=documents,
+ #     metadatas=metadatas,
+ #     ids=ids
+ # )
+
+ # Streamlit app layout
+ st.title("ChromaDB and HuggingFace Pipeline Integration")
+
+ query = st.text_input("Enter your query:", value="director")
+
+ if st.button("Search"):
+     results = collection.query(
+         query_texts=[query],
+         n_results=1,
+         include=['documents', 'distances', 'metadatas']
+     )
+     st.write("Query Results:")
+     st.write(results['metadatas'])
+
+     if results['documents']:
+         context = results['documents'][0][0]
+         st.write("Context:")
+         st.write(context)
+         tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
+         model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")
+
+         pipe = pipeline(
+             "text2text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             max_length=512
+         )
+
+         local_llm = HuggingFacePipeline(pipeline=pipe)
+
+         l = f"""
+ Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know; don't try to make up an answer.
+
+ {context}
+
+ Question: {query}
+ Helpful Answer:
+ """
+
+         answer = local_llm(l)
+         st.write("Answer:")
+         st.write(answer)
+
+
+
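
For reference, here is a minimal ingestion sketch that populates the vector database this app queries. It assumes a data.csv in the same folder whose first column is the document text and whose second column is the corresponding item ID, as the commented-out loader above implies; run it once before starting the app.

import csv
import chromadb
from chromadb.utils import embedding_functions

# Use the same on-disk client, embedding model, and collection name as app.py.
chroma_client = chromadb.PersistentClient(path="vector_db")
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)

documents, metadatas, ids = [], [], []
with open('./data.csv') as file:
    for i, line in enumerate(csv.reader(file)):
        if i == 0:
            continue  # Skip the header row.
        documents.append(line[0])               # Document text.
        metadatas.append({"item_id": line[1]})  # Corresponding item ID.
        ids.append(str(i))                      # Unique ID per document.

collection.add(documents=documents, metadatas=metadatas, ids=ids)

With the collection populated, the app is launched with streamlit run app.py.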