hlydecker committed
Commit 892178c · 1 Parent(s): 55a9545

Update streamlit_langchain_chat/dataset.py

Files changed (1)
  1. streamlit_langchain_chat/dataset.py +3 -24
streamlit_langchain_chat/dataset.py CHANGED
@@ -98,8 +98,6 @@ def parse_docx(path, citation, key, chunk_chars=2000, overlap=50):
     return [], []


-# TODO: if you add a connector with the shape loader = ... ; data = loader.load();
-# you can plug in all of langchain's connectors
 # https://langchain.readthedocs.io/en/stable/modules/document_loaders/examples/pdf.html
 def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
     pdfFileObj = open(path, "rb")
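
Note: the removed TODO pointed at LangChain's generic connector pattern (loader = ...; data = loader.load()). A minimal sketch of that pattern, assuming the classic langchain package and its PyPDFLoader; the file path is illustrative:

    # Sketch of the generic LangChain loader shape the removed TODO described.
    # PyPDFLoader is one of many interchangeable document_loaders connectors.
    from langchain.document_loaders import PyPDFLoader

    loader = PyPDFLoader("example.pdf")  # illustrative path
    data = loader.load()                 # returns a list of Document objects
    for doc in data:
        print(doc.metadata, len(doc.page_content))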
@@ -111,9 +109,6 @@ def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
     for i, page in enumerate(pdfReader.pages):
         split += page.extract_text()
         pages.append(str(i + 1))
-        # split could be so long it needs to be split
-        # into multiple chunks. Or it could be so short
-        # that it needs to be combined with the next chunk.
         while len(split) > chunk_chars:
             splits.append(split[:chunk_chars])
             # pretty formatting of pages (e.g. 1-3, 4, 5-7)
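
The loop above does fixed-size chunking with a character overlap. A self-contained sketch of the same idea (the sample input is purely illustrative; the defaults mirror the function's chunk_chars=2000, overlap=50):

    # Standalone sketch of fixed-size chunking with overlap, mirroring the
    # while-loop in parse_pdf; the sample string is purely illustrative.
    def chunk_text(text: str, chunk_chars: int = 2000, overlap: int = 50) -> list:
        chunks = []
        while len(text) > chunk_chars:
            chunks.append(text[:chunk_chars])
            # carry the last `overlap` characters into the next chunk
            text = text[chunk_chars - overlap:]
        if text:
            chunks.append(text)
        return chunks

    print(len(chunk_text("x" * 5000)))  # 3 chunks for a 5000-char string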
@@ -184,7 +179,6 @@ def parse_txt(path, citation, key, chunk_chars=2000, overlap=50, html=False):
         doc = f.read()
     if html:
         doc = html2text(doc)
-    # yo, no idea why but the texts are not split correctly
     text_splitter = TextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
     texts = text_splitter.split_text(doc)
     return texts, [dict(citation=citation, dockey=key, key=key)] * len(texts)
@@ -289,8 +283,8 @@ def read_source(path: str = None,
         return parse_pptx(path, citation, key, chunk_chars, overlap)
     elif path.startswith("http://") or path.startswith("https://"):
         return parse_url(path, citation, key, chunk_chars, overlap)
-    # TODO: add more connectors
-    # else:
+    # WIP
+    # else:
     #     return parse_code_txt(path, citation, key, chunk_chars, overlap)
     else:
         raise "unknown extension"
@@ -510,20 +504,6 @@ class Dataset:
             lambda x, y: x + y, [doc["metadata"] for doc in self.docs.values()], []
         )

-        # TODO: when the index already exists, update it instead of deleting it
-        # index_name = "langchain-demo1"
-        # if index_name in pinecone.list_indexes():
-        #     self.index_docstore = pinecone.Index(index_name)
-        #     vectors = []
-        #     for text, metadata in zip(texts, metadatas):
-        #         # embed = <would need to know which embedding the existing index was built with>
-        #     self.index_docstore.upsert(vectors=vectors)
-        # else:
-        #     if openai.api_type == 'azure':
-        #         self.index_docstore = Pinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
-        #     else:
-        #         self.index_docstore = OriginalPinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
-
         index_name = "langchain-demo1"

         # if the index exists, delete it
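
The deleted block sketched updating an existing Pinecone index rather than recreating it, while noting it would need to know which embedding built the existing index. A hedged sketch of that upsert path against the classic pinecone client; embed_fn is a hypothetical callable that must match the index's original embedding:

    # Sketch of the "update instead of delete" path from the removed TODO.
    # embed_fn is hypothetical and must produce vectors compatible with the
    # embedding model the existing index was built with.
    import pinecone

    index_name = "langchain-demo1"
    if index_name in pinecone.list_indexes():
        index = pinecone.Index(index_name)
        vectors = [
            (str(i), embed_fn(text), metadata)  # (id, values, metadata)
            for i, (text, metadata) in enumerate(zip(texts, metadatas))
        ]
        index.upsert(vectors=vectors)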
@@ -593,7 +573,6 @@ class Dataset:
                 break
         if OPERATING_MODE == "debug":
             print(f"time to make all relevant summaries: {time.time() - init_summary_time:.2f} [s]")
-            # the last character is not printed because it is a \n
             print(partial_summary_time[:-1])
         context_str = "\n\n".join(
             [f"{citation}: {summary_of_chunked_text}"
@@ -693,7 +672,7 @@ class Dataset:
                 'total_tokens': cb.total_tokens
             })

-        # it still happens lol
+        # it still happens ulol
         if "(Foo2012)" in answer_text:
             answer_text = answer_text.replace("(Foo2012)", "")
         for key, citation, summary, text in answer.packages:
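
The literal replace above only strips the exact token "(Foo2012)". A regex would catch any hallucinated citation of that author-plus-year shape; the pattern here is an assumption about the format, not something the code ships:

    # Hypothetical generalization: strip any "(NameYYYY)"-shaped citation
    # instead of one hard-coded token.
    import re
    answer_text = re.sub(r"\(\w+\d{4}\)", "", answer_text)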
 