Qdrant-knowledge

Sleeping

App Files Files Community

jonathanjordan21 committed on Apr 21

Commit

109534c

verified ·

1 Parent(s): 20469f9

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -28

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ from qdrant_client import QdrantClient
 from qdrant_client.http.models import Distance, VectorParams
 from langchain_qdrant import FastEmbedSparse, QdrantVectorStore, RetrievalMode
 from qdrant_client import QdrantClient, models
 from qdrant_client.http.models import Distance, SparseVectorParams, VectorParams
@@ -17,54 +19,124 @@ from pydantic import BaseModel, Field
 class Data(BaseModel):
     items: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(..., description="Either a dictionary or a list of dictionaries.")
 document_1 = Document(
-    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
-    metadata={"source": "tweet"},
 )
 document_2 = Document(
-    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees Fahrenheit.",
-    metadata={"source": "news"},
 )
 document_3 = Document(
-    page_content="Building an exciting new project with LangChain - come check it out!",
-    metadata={"source": "tweet"},
 )
 document_4 = Document(
-    page_content="Robbers broke into the city bank and stole $1 million in cash.",
-    metadata={"source": "news"},
 )
 document_5 = Document(
-    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
-    metadata={"source": "tweet"},
 )
 document_6 = Document(
-    page_content="Is the new iPhone worth the price? Read this review to find out.",
-    metadata={"source": "website"},
 )
 document_7 = Document(
-    page_content="The top 10 soccer players in the world right now.",
-    metadata={"source": "website"},
 )
 document_8 = Document(
-    page_content="LangGraph is the best framework for building stateful, agentic applications!",
-    metadata={"source": "tweet"},
 )
 document_9 = Document(
-    page_content="The stock market is down 500 points today due to fears of a recession.",
-    metadata={"source": "news"},
 )
 document_10 = Document(
-    page_content="I have a bad feeling I am going to get deleted :(",
-    metadata={"source": "tweet"},
 )
 documents = [
@@ -79,29 +151,31 @@ documents = [
     document_9,
     document_10,
 ]
-uuids = [str(uuid4()) for _ in range(len(documents))]
 docs = documents
-sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")
 client = QdrantClient(path="tmp/langchain_qdrant")
 # Create a collection with sparse vectors
 client.create_collection(
     collection_name="my_documents",
-    vectors_config={"dense": VectorParams(size=3072, distance=Distance.COSINE)},
-    sparse_vectors_config={
-        "sparse": SparseVectorParams(index=models.SparseIndexParams(on_disk=False))
-    },
 )
 qdrant = QdrantVectorStore(
     client=client,
     collection_name="my_documents",
     sparse_embedding=sparse_embeddings,
-    retrieval_mode=RetrievalMode.SPARSE,
-    sparse_vector_name="sparse",
 )

 from qdrant_client.http.models import Distance, VectorParams
 from langchain_qdrant import FastEmbedSparse, QdrantVectorStore, RetrievalMode
+from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
 from qdrant_client import QdrantClient, models
 from qdrant_client.http.models import Distance, SparseVectorParams, VectorParams
 class Data(BaseModel):
     items: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(..., description="Either a dictionary or a list of dictionaries.")
+# document_1 = Document(
+#     page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
+#     metadata={"source": "tweet"},
+# )
+# document_2 = Document(
+#     page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees Fahrenheit.",
+#     metadata={"source": "news"},
+# )
+# document_3 = Document(
+#     page_content="Building an exciting new project with LangChain - come check it out!",
+#     metadata={"source": "tweet"},
+# )
+# document_4 = Document(
+#     page_content="Robbers broke into the city bank and stole $1 million in cash.",
+#     metadata={"source": "news"},
+# )
+# document_5 = Document(
+#     page_content="Wow! That was an amazing movie. I can't wait to see it again.",
+#     metadata={"source": "tweet"},
+# )
+# document_6 = Document(
+#     page_content="Is the new iPhone worth the price? Read this review to find out.",
+#     metadata={"source": "website"},
+# )
+# document_7 = Document(
+#     page_content="The top 10 soccer players in the world right now.",
+#     metadata={"source": "website"},
+# )
+# document_8 = Document(
+#     page_content="LangGraph is the best framework for building stateful, agentic applications!",
+#     metadata={"source": "tweet"},
+# )
+# document_9 = Document(
+#     page_content="The stock market is down 500 points today due to fears of a recession.",
+#     metadata={"source": "news"},
+# )
+# document_10 = Document(
+#     page_content="I have a bad feeling I am going to get deleted :(",
+#     metadata={"source": "tweet"},
+# )
+# documents = [
+#     document_1,
+#     document_2,
+#     document_3,
+#     document_4,
+#     document_5,
+#     document_6,
+#     document_7,
+#     document_8,
+#     document_9,
+#     document_10,
+# ]
+# uuids = [str(uuid4()) for _ in range(len(documents))]
+# docs = documents
+# from uuid import uuid4
+# from langchain.schema import Document
 document_1 = Document(
+    page_content="Aduan: Saya tidak bisa login ke sistem e-learning.\nJawaban: Kami menemukan bahwa akun Anda terkunci setelah tiga kali gagal login. Kami telah membuka kunci akun dan menyarankan Anda untuk melakukan reset password menggunakan fitur 'Lupa Kata Sandi'.",
+    metadata={"source": "Aduan"},
 )
 document_2 = Document(
+    page_content="Request: Mohon bantuannya untuk mendapatkan akses ke folder tim di server.\nJawaban: Kami telah menambahkan akun Anda ke grup pengguna 'Tim IT' di Active Directory. Akses ke folder sekarang dapat dilakukan setelah Anda login ulang.",
+    metadata={"source": "Request"},
 )
 document_3 = Document(
+    page_content="Incident: Laporan printer di lantai 3 tidak bisa mencetak.\nJawaban: Kami melakukan restart pada spooler service di perangkat printer dan membersihkan antrian cetak yang bermasalah. Printer sudah kembali normal.",
+    metadata={"source": "Incident"},
 )
 document_4 = Document(
+    page_content="Aduan: Email saya sering masuk ke folder spam penerima.\nJawaban: Kami periksa konfigurasi SPF, DKIM, dan DMARC pada domain Anda. Ternyata ada konfigurasi SPF yang tidak lengkap. Kami telah memperbaikinya dan hasil pengujian sudah menunjukkan pengiriman email berjalan normal.",
+    metadata={"source": "Aduan"},
 )
 document_5 = Document(
+    page_content="Request: Saya membutuhkan instalasi software AutoCAD untuk proyek desain.\nJawaban: Kami telah mengunduh versi terbaru dari situs resmi AutoDesk dan melakukan instalasi di laptop Anda. Lisensi telah diaktivasi menggunakan akun universitas.",
+    metadata={"source": "Request"},
 )
 document_6 = Document(
+    page_content="Incident: Sistem ERP tidak bisa mengakses modul keuangan sejak pagi.\nJawaban: Kami temukan bahwa service database MySQL berhenti secara tiba-tiba. Service telah kami nyalakan kembali dan modul keuangan kini dapat diakses kembali.",
+    metadata={"source": "Incident"},
 )
 document_7 = Document(
+    page_content="Aduan: Aplikasi mobile sering crash saat dibuka.\nJawaban: Kami analisis log error dan menemukan bug pada fitur notifikasi. Kami telah melakukan patch pada versi 1.2.3 dan memperbarui aplikasi Anda melalui MDM.",
+    metadata={"source": "Aduan"},
 )
 document_8 = Document(
+    page_content="Request: Mohon dibuatkan email dinas baru untuk staf baru di departemen HR.\nJawaban: Email telah dibuat dengan format [email protected] dan password default. Informasi login telah kami kirimkan melalui email pribadi yang terdaftar.",
+    metadata={"source": "Request"},
 )
 document_9 = Document(
+    page_content="Incident: Koneksi internet putus-putus di gedung B.\nJawaban: Kami lakukan pengecekan router dan mengganti kabel jaringan yang rusak di lantai 2. Koneksi kini stabil dan normal.",
+    metadata={"source": "Incident"},
 )
 document_10 = Document(
+    page_content="Aduan: Layar laptop saya berkedip-kedip.\nJawaban: Masalah disebabkan oleh driver grafis yang tidak kompatibel. Kami telah menginstal versi driver yang sesuai dengan perangkat Anda dan masalah layar sudah tidak muncul lagi.",
+    metadata={"source": "Aduan"},
 )
 documents = [
     document_9,
     document_10,
 ]
+uuids = [str(uuid4()) for _ in range(len(documents))]
 docs = documents
+# sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")
+sparse_embeddings = FastEmbedEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
 client = QdrantClient(path="tmp/langchain_qdrant")
 # Create a collection with sparse vectors
 client.create_collection(
     collection_name="my_documents",
+    # vectors_config={"dense": VectorParams(size=3072, distance=Distance.COSINE)},
+    # sparse_vectors_config={
+    #     "sparse": SparseVectorParams(index=models.SparseIndexParams(on_disk=False))
+    # },
 )
 qdrant = QdrantVectorStore(
     client=client,
     collection_name="my_documents",
     sparse_embedding=sparse_embeddings,
+    # retrieval_mode=RetrievalMode.SPARSE,
+    # sparse_vector_name="sparse",
 )