Spaces:
Sleeping
Sleeping
AFischer1985
committed on
Update run.py
Browse files
run.py
CHANGED
@@ -111,41 +111,49 @@ def split_with_overlap(text,chunk_size=3500, overlap=700):
|
|
111 |
def add_doc(path):
|
112 |
print("def add_doc!")
|
113 |
print(path)
|
114 |
-
|
|
|
115 |
doc=convertPDF(path)
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
|
|
|
|
|
|
120 |
client = chromadb.PersistentClient(path="output/general_knowledge")
|
121 |
print(str(client.list_collections()))
|
122 |
#global collection
|
123 |
dbName="test"
|
124 |
-
if("name="+dbName in str(client.list_collections())):
|
125 |
-
client.delete_collection(name=dbName)
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
|
|
|
|
|
|
|
|
149 |
return(collection)
|
150 |
|
151 |
#split_with_overlap("test me if you can",2,1)
|
|
|
111 |
def add_doc(path):
    """Index an uploaded PDF into the persistent "test" Chroma collection.

    When *path* ends in ``.pdf`` (case-insensitive) and exists on disk, the
    file is converted with ``convertPDF``; the first five entries of
    ``doc[0]`` (presumably the per-page texts -- confirm against
    ``convertPDF``) are joined with blank lines, split into overlapping chunks
    by ``split_with_overlap`` and added to the collection in batches.

    The collection is opened (or created with cosine distance) regardless of
    whether a PDF was supplied, so callers always get a collection back.

    Args:
        path: File-system path of the uploaded document.

    Returns:
        The Chroma collection named "test".
    """
    print("def add_doc!")
    print(path)

    # Set once a PDF has been converted and its text is waiting to be indexed.
    anhang = False
    if str.lower(path).endswith(".pdf") and os.path.exists(path):
        doc = convertPDF(path)
        if len(doc[0]) > 5:
            gr.Info("PDF uploaded, start Indexing excerpt (first 5 pages)!")
        else:
            gr.Info("PDF uploaded, start Indexing!")
        # Only the first five entries are indexed, whatever the document size.
        doc = "\n\n".join(doc[0][0:5])
        anhang = True

    client = chromadb.PersistentClient(path="output/general_knowledge")
    print(str(client.list_collections()))
    dbName = "test"
    # NOTE(review): existence is detected by substring match on the printed
    # collection list, so a collection called e.g. "test2" would also match.
    # Kept as-is because the list's element type depends on the chromadb
    # version in use.
    if ("name=" + dbName) not in str(client.list_collections()):
        collection = client.create_collection(
            name=dbName,
            embedding_function=embeddingModel,
            metadata={"hnsw:space": "cosine"},
        )
    else:
        collection = client.get_collection(
            name=dbName, embedding_function=embeddingModel
        )

    if anhang:
        corpus = split_with_overlap(doc, 3500, 700)
        print(len(corpus))
        then = datetime.now()
        existing_ids = collection.get(include=[])["ids"]
        print(len(existing_ids))
        # NOTE(review): chunks are only added while the collection is empty,
        # so a second upload is not indexed -- confirm this is intended.
        if len(existing_ids) == 0:
            chunkSize = 40000
            # Ceiling division. The former round(n / chunkSize + 0.5) rounds
            # halves to even and therefore produced an extra, empty batch
            # whenever n was an odd multiple of chunkSize.
            total_batches = (len(corpus) + chunkSize - 1) // chunkSize
            for i, start in enumerate(range(0, len(corpus), chunkSize)):
                print("embed batch " + str(i) + " of " + str(total_batches))
                batch = corpus[start:start + chunkSize]
                # Chroma-unique IDs: 1-based, offset by the entries already
                # stored in the collection.
                ids = [
                    str(start + offset + len(existing_ids) + 1)
                    for offset in range(len(batch))
                ]
                collection.add(
                    documents=batch,
                    ids=ids,
                    metadatas=[{"date": "2024-10-10"} for _ in batch],
                )
                print("finished batch " + str(i) + " of " + str(total_batches))
        now = datetime.now()
        gr.Info("Indexing complete!")
        # Timing note from the author: too many GB for sentences (GPU);
        # 0:00:10.375087 for chunks.
        print(now - then)
    return collection
|
158 |
|
159 |
#split_with_overlap("test me if you can",2,1)
|