Update streamlit_langchain_chat/dataset.py
streamlit_langchain_chat/dataset.py
CHANGED
@@ -98,8 +98,6 @@ def parse_docx(path, citation, key, chunk_chars=2000, overlap=50):
         return [], []
 
 
-# TODO: if you add a connector with the format loader = ... ; data = loader.load();
-# you will be able to use all of langchain's connectors
 # https://langchain.readthedocs.io/en/stable/modules/document_loaders/examples/pdf.html
 def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
     pdfFileObj = open(path, "rb")
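The removed TODO pointed at LangChain's document-loader connectors (the readthedocs link above survives as context). A minimal sketch of the `loader = ...; data = loader.load()` pattern it described, assuming the `PyPDFLoader` import path of the LangChain 0.0.x releases this repo targets; the `parse_with_langchain` adapter is hypothetical, not code from this commit:

```python
# Sketch of the "loader = ...; data = loader.load()" pattern from the
# removed TODO. PyPDFLoader exists in LangChain 0.0.x; the adapter shape
# (parse_with_langchain) is an illustration, not the repo's actual code.
from langchain.document_loaders import PyPDFLoader

def parse_with_langchain(path, citation, key, chunk_chars=2000, overlap=50):
    loader = PyPDFLoader(path)
    data = loader.load()  # list of Documents, one per page
    texts = [d.page_content for d in data]
    metadatas = [dict(citation=citation, dockey=key, key=key)] * len(texts)
    return texts, metadatas  # chunking by chunk_chars/overlap would still apply
```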
@@ -111,9 +109,6 @@ def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
     for i, page in enumerate(pdfReader.pages):
         split += page.extract_text()
         pages.append(str(i + 1))
-        # split could be so long it needs to be split
-        # into multiple chunks. Or it could be so short
-        # that it needs to be combined with the next chunk.
         while len(split) > chunk_chars:
             splits.append(split[:chunk_chars])
             # pretty formatting of pages (e.g. 1-3, 4, 5-7)
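The loop above windows the accumulated page text into `chunk_chars`-sized pieces. A self-contained sketch of that windowing, assuming the leftover text is advanced by `chunk_chars - overlap` so consecutive chunks share a little context (the page-range bookkeeping of the real loop is omitted):

```python
# Standalone sketch of fixed-size chunking with overlap. The advance by
# chunk_chars - overlap is an assumption about code outside this hunk.
def chunk_text(text: str, chunk_chars: int = 2000, overlap: int = 50) -> list[str]:
    chunks = []
    while len(text) > chunk_chars:
        chunks.append(text[:chunk_chars])      # emit a full window
        text = text[chunk_chars - overlap:]    # keep `overlap` chars of context
    if text:
        chunks.append(text)                    # short remainder, if any
    return chunks
```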
@@ -184,7 +179,6 @@ def parse_txt(path, citation, key, chunk_chars=2000, overlap=50, html=False):
         doc = f.read()
     if html:
         doc = html2text(doc)
-    # I have no idea why, but the texts are not split correctly
     text_splitter = TextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
     texts = text_splitter.split_text(doc)
     return texts, [dict(citation=citation, dockey=key, key=key)] * len(texts)
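The removed comment complained that the texts were not split correctly. If `TextSplitter` here resolves to LangChain's `CharacterTextSplitter`, that behavior is expected: it splits on a separator (`"\n\n"` by default) before applying `chunk_size`, so chunks can come out larger or smaller than requested. A sketch of the usual fix, assuming the same chunk parameters:

```python
# RecursiveCharacterTextSplitter falls back through a list of separators
# ("\n\n", "\n", " ", "") and stays much closer to chunk_size -- one
# plausible fix for the removed "not split correctly" complaint.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
texts = text_splitter.split_text(doc)  # doc: the raw string read above
```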
@@ -289,8 +283,8 @@ def read_source(path: str = None,
         return parse_pptx(path, citation, key, chunk_chars, overlap)
     elif path.startswith("http://") or path.startswith("https://"):
         return parse_url(path, citation, key, chunk_chars, overlap)
-    #
-    #
+    # WIP
+    #else:
     #     return parse_code_txt(path, citation, key, chunk_chars, overlap)
     else:
         raise "unknown extension"
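One note on the unchanged context line: in Python 3, `raise "unknown extension"` never reports that message; it fails with `TypeError: exceptions must derive from BaseException`. A one-line fix, sketched here rather than part of this commit:

```python
# Raising a proper exception class preserves the intended error message.
raise ValueError(f"unknown extension: {path}")
```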
@@ -510,20 +504,6 @@ class Dataset:
             lambda x, y: x + y, [doc["metadata"] for doc in self.docs.values()], []
         )
 
-        # TODO: when the index already exists, update it instead of deleting it
-        # index_name = "langchain-demo1"
-        # if index_name in pinecone.list_indexes():
-        #     self.index_docstore = pinecone.Index(index_name)
-        #     vectors = []
-        #     for text, metadata in zip(texts, metadatas):
-        #         # embed = <we would still need to know which embedding the existing index was built with>
-        #     self.index_docstore.upsert(vectors=vectors)
-        # else:
-        #     if openai.api_type == 'azure':
-        #         self.index_docstore = Pinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
-        #     else:
-        #         self.index_docstore = OriginalPinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
-
         index_name = "langchain-demo1"
 
         # if the index exists, delete it
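The deleted block was a commented-out TODO: reuse an existing Pinecone index instead of recreating it. A hedged sketch of that idea using the same pinecone-client calls the comments referenced; the UUID vector ids and the assumption that `embedding` matches whatever embedding originally built the index are mine, not the repo's:

```python
# Sketch of the removed TODO: upsert into an existing index instead of
# deleting it. pinecone-client v2 API, as in the commented block.
import uuid

import pinecone
from langchain.vectorstores import Pinecone

index_name = "langchain-demo1"
if index_name in pinecone.list_indexes():
    index = pinecone.Index(index_name)
    vectors = [
        (str(uuid.uuid4()), values, metadata)  # (id, embedding, metadata) tuples
        for values, metadata in zip(embedding.embed_documents(texts), metadatas)
    ]
    index.upsert(vectors=vectors)  # update in place rather than recreate
else:
    index = Pinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
```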
@@ -593,7 +573,6 @@ class Dataset:
                 break
         if OPERATING_MODE == "debug":
             print(f"time to make all relevant summaries: {time.time() - init_summary_time:.2f} [s]")
-            # the last character is not printed because it is a \n
             print(partial_summary_time[:-1])
         context_str = "\n\n".join(
             [f"{citation}: {summary_of_chunked_text}"
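For context on the `[:-1]`: the removed comment explained it drops the trailing newline of the accumulated timing string. A sketch of the assumed shape of `partial_summary_time`, which is built outside this hunk (`chunks` and `summarize` are placeholders):

```python
import time

# Hypothetical accumulation: one "<elapsed> [s]" line per summarized chunk,
# so the string always ends in "\n"; [:-1] trims that final newline.
partial_summary_time = ""
for chunk in chunks:
    start = time.time()
    summarize(chunk)
    partial_summary_time += f"{time.time() - start:.2f} [s]\n"

print(partial_summary_time[:-1])  # same effect: partial_summary_time.rstrip("\n")
```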
@@ -693,7 +672,7 @@ class Dataset:
                 'total_tokens': cb.total_tokens
             })
 
-        # it still happens
+        # it still happens ulol
         if "(Foo2012)" in answer_text:
            answer_text = answer_text.replace("(Foo2012)", "")
        for key, citation, summary, text in answer.packages:
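The literal `replace` only catches the exact `(Foo2012)` dummy citation. A slightly more general sketch using a regex; the placeholder-name list is an assumption to tune, not the repo's code:

```python
import re

# Assumed shape of the hallucinated dummy citation: a placeholder name plus
# a 4-digit year in parentheses, e.g. "(Foo2012)". Extend the alternation
# with whatever placeholders the prompt actually uses.
DUMMY_CITATION = re.compile(r"\((?:Foo|Example|Author)\d{4}\)\s*")

answer_text = DUMMY_CITATION.sub("", answer_text)
```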