import csv

import streamlit as st
import chromadb
from chromadb.utils import embedding_functions
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline

# Streamlit UI elements
st.title("ChromaDB and HuggingFace Pipeline Integration")
query = st.text_input("Enter your query:", value="director")
# Read the documents and their metadata from the CSV file
documents = []
metadatas = []
ids = []

with open('./data.csv', encoding="utf-8") as file:
    reader = csv.reader(file)
    for i, line in enumerate(reader):
        # Skip the header row
        if i == 0:
            continue
        documents.append(line[0])
        metadatas.append({"item_id": line[1]})
        ids.append(str(i))

# Initialize a persistent ChromaDB client backed by the local "db" directory
chroma_client = chromadb.PersistentClient(path="db")

# Embed documents and queries with a SentenceTransformer model
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# Get or create the collection that stores the documents
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)

# Upsert instead of add so Streamlit reruns don't fail on duplicate IDs
collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)


# When the user clicks Search, retrieve the closest document and answer with the LLM
if st.button("Search"):
    # Query the collection for the single most similar document
    results = collection.query(
        query_texts=[query],
        n_results=1,
        include=['documents', 'distances', 'metadatas']
    )
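    # results maps each field to a list of lists: one inner list per query text,
    # each holding up to n_results entries.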

    st.write("Query Results:")
    st.write(results['metadatas'])

    # Log the structure of results
    st.write("Results Structure:")
    st.write(results)

    if 'documents' in results and results['documents']:
        # Check if the structure of results['documents'] is as expected
        if len(results['documents']) > 0 and isinstance(results['documents'][0], list) and len(results['documents'][0]) > 0:
            context = results['documents'][0][0]
            st.write("Context:")
            st.write(context)

            # Load tokenizer and model
            tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
            model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")
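            # Note: the tokenizer and model are reloaded on every search click; wrapping
            # this load in a function decorated with st.cache_resource would avoid the
            # repeated loads (assumes a Streamlit version that provides st.cache_resource).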

            # Create pipeline
            pipe = pipeline(
                "text2text-generation",
                model=model,
                tokenizer=tokenizer,
                max_length=512
            )

            # Wrap the transformers pipeline so it can be used as a LangChain LLM
            local_llm = HuggingFacePipeline(pipeline=pipe)

            # Build the prompt from the retrieved context and the user's query
            prompt = f"""
            Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

            {context}

            Question: {query}
            Helpful Answer:
            """

            # Generate the answer with the local model
            answer = local_llm.invoke(prompt)
            st.write("Answer:")
            st.write(answer)
        else:
            st.write("No valid context found in the results.")
    else:
        st.write("No documents found for the query.")