Rauhan committed on
Commit
9245bf5
1 Parent(s): ea8ad26

DEBUG: base64 -> plain text

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. functions.py +3 -6
app.py CHANGED
@@ -329,7 +329,7 @@ async def loadText(addTextConfig: AddText):
329
  "output": text,
330
  "source": "Text"
331
  }
332
- numTokens = len(" ".join([base64.b64decode(text[x].encode("utf-8")).decode("utf-8") for x in text]).translate(str.maketrans('', '', string.punctuation)).split(" "))
333
  dct = json.dumps(dct, indent=1).encode("utf-8")
334
  fileName = createDataSourceName(sourceName="Text")
335
  response = supabase.storage.from_("ConversAI").upload(file=dct, path=f"{fileName}_data.json")
 
329
  "output": text,
330
  "source": "Text"
331
  }
332
+ numTokens = len(" ".join([text[x] for x in text]).translate(str.maketrans('', '', string.punctuation)).split(" "))
333
  dct = json.dumps(dct, indent=1).encode("utf-8")
334
  fileName = createDataSourceName(sourceName="Text")
335
  response = supabase.storage.from_("ConversAI").upload(file=dct, path=f"{fileName}_data.json")
functions.py CHANGED
@@ -297,7 +297,7 @@ def getTextFromImagePDF(pdfBytes):
297
  return cleanText(text = text)
298
 
299
  allImages = convert_from_bytes(pdfBytes)
300
- texts = [base64.b64encode(getText(image).encode("utf-8")).decode("utf-8") for image in allImages]
301
  return {x + 1: y for x, y in enumerate(texts)}
302
 
303
 
@@ -313,7 +313,6 @@ def getTranscript(urls: str):
313
  except:
314
  doc = ""
315
  texts.append(doc)
316
- texts = [base64.b64encode(text.encode("utf-8")).decode("utf-8") for text in texts]
317
  return {x: y for x, y in zip(urls, texts)}
318
 
319
 
@@ -331,8 +330,7 @@ def analyzeData(query, dataframe):
331
 
332
 
333
  def extractTextFromPage(page):
334
- text = cleanText(text = page.get_text())
335
- return base64.b64encode(text.encode("utf-8")).decode("utf-8")
336
 
337
 
338
  def extractTextFromPdf(pdf_path):
@@ -349,8 +347,7 @@ def extractTextFromUrl(url):
349
  response.raise_for_status()
350
  html = response.text
351
  soup = BeautifulSoup(html, 'lxml')
352
- text = cleanText(text = soup.get_text(separator=' ', strip=True))
353
- return base64.b64encode(text.encode("utf-8")).decode("utf-8")
354
 
355
 
356
  def extractTextFromUrlList(urls):
 
297
  return cleanText(text = text)
298
 
299
  allImages = convert_from_bytes(pdfBytes)
300
+ texts = [getText(image) for image in allImages]
301
  return {x + 1: y for x, y in enumerate(texts)}
302
 
303
 
 
313
  except:
314
  doc = ""
315
  texts.append(doc)
 
316
  return {x: y for x, y in zip(urls, texts)}
317
 
318
 
 
330
 
331
 
332
  def extractTextFromPage(page):
333
+ return cleanText(text = page.get_text())
 
334
 
335
 
336
  def extractTextFromPdf(pdf_path):
 
347
  response.raise_for_status()
348
  html = response.text
349
  soup = BeautifulSoup(html, 'lxml')
350
+ return cleanText(text = soup.get_text(separator=' ', strip=True))
 
351
 
352
 
353
  def extractTextFromUrlList(urls):