AFischer1985 committed on
Commit
0182410
·
verified ·
1 Parent(s): de6ea12

Update run.py

Browse files
Files changed (1) hide show
  1. run.py +38 -30
run.py CHANGED
@@ -111,41 +111,49 @@ def split_with_overlap(text,chunk_size=3500, overlap=700):
# Pre-change ("-" side) rendering of add_doc from the diff view. Bare numbers
# are the old file's line numbers, lines starting with "-" were removed by this
# commit, and the original indentation was lost in rendering.
# Purpose: convert the PDF at `path` to text, rebuild the "test" collection in
# the persistent Chroma store, embed the text in overlapping chunks, and return
# the collection.
111
  def add_doc(path):
112
  print("def add_doc!")
113
  print(path)
# Only paths ending in ".pdf" (case-insensitive) are converted; anything else
# just shows an error toast.
# NOTE(review): on the non-PDF path `doc` is never assigned, so the later
# split_with_overlap(doc, ...) call looks like it would raise
# UnboundLocalError - confirm against the real indentation.
114
- if(str.lower(path).endswith(".pdf")):
 
115
  doc=convertPDF(path)
116
- doc="\n\n".join(doc[0])
117
- gr.Info("PDF uploaded, start Indexing!")
118
- else:
119
- gr.Info("Error: Only pdfs are accepted!")
 
 
 
120
  client = chromadb.PersistentClient(path="output/general_knowledge")
121
  print(str(client.list_collections()))
122
  #global collection
123
  dbName="test"
# An existing collection named dbName is deleted and recreated with cosine
# distance. Existence is detected by a substring match on the printed
# collection list.
124
- if("name="+dbName in str(client.list_collections())):
125
- client.delete_collection(name=dbName)
126
- collection = client.create_collection(
127
- dbName,
128
- embedding_function=embeddingModel,
129
- metadata={"hnsw:space": "cosine"})
# The text is split with the arguments 3500 and 700 (named chunk_size and
# overlap in the hunk header) and added in batches of up to chunkSize
# documents, but only when the collection holds no ids yet.
130
- corpus=split_with_overlap(doc,3500,700)
131
- print(len(corpus))
132
- then = datetime.now()
133
- x=collection.get(include=[])["ids"]
134
- print(len(x))
135
- if(len(x)==0):
136
- chunkSize=40000
# NOTE(review): round(n/chunkSize+0.5) is used as a ceiling, but Python's
# round() rounds halves to even, so an exact odd multiple of chunkSize would
# produce one extra, empty batch - confirm.
137
- for i in range(round(len(corpus)/chunkSize+0.5)): #0 is first batch, 3 is last (incomplete) batch given 133497 texts
138
- print("embed batch "+str(i)+" of "+str(round(len(corpus)/chunkSize+0.5)))
139
- ids=list(range(i*chunkSize,(i*chunkSize+chunkSize)))
140
- batch=corpus[i*chunkSize:(i*chunkSize+chunkSize)]
141
- textIDs=[str(id) for id in ids[0:len(batch)]]
142
- ids=[str(id+len(x)+1) for id in ids[0:len(batch)]] # id refers to chromadb-unique ID
143
- collection.add(documents=batch, ids=ids,
144
- metadatas=[{"date": str("2024-10-10")} for b in batch]) #"textID":textIDs, "id":ids,
145
- print("finished batch "+str(i)+" of "+str(round(len(corpus)/40000+0.5)))
146
- now = datetime.now()
147
- gr.Info(f"Indexing complete!")
148
- print(now-then) #too many GB for sentences (GPU), or 0:00:10.375087 for chunks
 
 
 
 
149
  return(collection)
150
 
151
  #split_with_overlap("test me if you can",2,1)
 
def add_doc(path):
    """Index an uploaded PDF into the persistent "test" Chroma collection.

    If ``path`` is a string ending in ".pdf" (case-insensitive) that exists on
    disk, the file is converted with ``convertPDF`` and at most the first five
    entries of its first result (presumably one string per page - confirm
    against ``convertPDF``) are joined, split into overlapping chunks and
    embedded.  Any other ``path`` (including ``None``) skips indexing and just
    opens the collection.

    Chunks are only added while the collection is still empty; an already
    populated collection is returned unchanged.

    Args:
        path: File-system path of the uploaded document, or ``None``.

    Returns:
        The Chroma collection named "test" (created on first use with cosine
        distance).
    """
    print("def add_doc!")
    print(path)

    max_pages = 5
    doc = None  # stays None when there is nothing to index
    # isinstance guard: a cleared upload may arrive as None, which would make
    # the suffix check raise instead of simply skipping the indexing step.
    if (isinstance(path, str)
            and path.lower().endswith(".pdf")
            and os.path.exists(path)):
        pages = convertPDF(path)[0]
        if len(pages) > max_pages:
            gr.Info("PDF uploaded, start Indexing excerpt (first 5 pages)!")
        else:
            gr.Info("PDF uploaded, start Indexing!")
        doc = "\n\n".join(pages[:max_pages])

    client = chromadb.PersistentClient(path="output/general_knowledge")
    print(str(client.list_collections()))
    # get_or_create avoids matching "name=test" as a substring of the printed
    # collection list, which also matches names such as "test2" and depends
    # on the repr format of list_collections().
    collection = client.get_or_create_collection(
        name="test",
        embedding_function=embeddingModel,
        metadata={"hnsw:space": "cosine"},
    )

    if doc is not None:
        corpus = split_with_overlap(doc, 3500, 700)
        print(len(corpus))
        then = datetime.now()
        existing_ids = collection.get(include=[])["ids"]
        print(len(existing_ids))
        # NOTE(review): a non-empty collection is left untouched, so only the
        # first uploaded document is ever indexed - confirm this is intended.
        if not existing_ids:
            batch_size = 40000
            # Integer ceiling division; round(x + 0.5) rounds halves to even
            # and yields an extra empty batch for odd exact multiples.
            n_batches = -(-len(corpus) // batch_size)
            for i in range(n_batches):
                print("embed batch " + str(i) + " of " + str(n_batches))
                start = i * batch_size
                batch = corpus[start:start + batch_size]
                # Chroma ids are 1-based running numbers stored as strings.
                ids = [str(start + offset + 1) for offset in range(len(batch))]
                collection.add(
                    documents=batch,
                    ids=ids,
                    metadatas=[{"date": "2024-10-10"} for _ in batch],
                )
                print("finished batch " + str(i) + " of " + str(n_batches))
        now = datetime.now()
        gr.Info("Indexing complete!")
        print(now - then)  # e.g. 0:00:10.375087 for chunk-level embedding
    return collection
158
 
159
  #split_with_overlap("test me if you can",2,1)