Spaces:

suatatan
/

reportio

Sleeping

reportio / core /parsing.py

Suat ATAN

first commit

9177215 11 months ago

3.4 kB

	from io import BytesIO
	from typing import List, Any, Optional
	import re

	import docx2txt
	from langchain.docstore.document import Document
	import fitz
	from hashlib import md5

	from abc import abstractmethod, ABC
	from copy import deepcopy


	class File(ABC):
	"""Represents an uploaded file comprised of Documents"""

	def __init__(
	self,
	name: str,
	id: str,
	metadata: Optional[dict[str, Any]] = None,
	docs: Optional[List[Document]] = None,
	):
	self.name = name
	self.id = id
	self.metadata = metadata or {}
	self.docs = docs or []

	@classmethod
	@abstractmethod
	def from_bytes(cls, file: BytesIO) -> "File":
	"""Creates a File from a BytesIO object"""

	def __repr__(self) -> str:
	return (
	f"File(name={self.name}, id={self.id},"
	" metadata={self.metadata}, docs={self.docs})"
	)

	def __str__(self) -> str:
	return f"File(name={self.name}, id={self.id}, metadata={self.metadata})"

	def copy(self) -> "File":
	"""Create a deep copy of this File"""
	return self.__class__(
	name=self.name,
	id=self.id,
	metadata=deepcopy(self.metadata),
	docs=deepcopy(self.docs),
	)


	def strip_consecutive_newlines(text: str) -> str:
	"""Strips consecutive newlines from a string
	possibly with whitespace in between
	"""
	return re.sub(r"\s\n\s", "\n", text)


	class DocxFile(File):
	@classmethod
	def from_bytes(cls, file: BytesIO) -> "DocxFile":
	text = docx2txt.process(file)
	text = strip_consecutive_newlines(text)
	doc = Document(page_content=text.strip())
	doc.metadata["source"] = "p-1"
	return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])


	class PdfFile(File):
	@classmethod
	def from_bytes(cls, file: BytesIO) -> "PdfFile":
	pdf = fitz.open(stream=file.read(), filetype="pdf") # type: ignore
	docs = []
	for i, page in enumerate(pdf):
	text = page.get_text(sort=True)
	text = strip_consecutive_newlines(text)
	doc = Document(page_content=text.strip())
	doc.metadata["page"] = i + 1
	doc.metadata["source"] = f"p-{i+1}"
	docs.append(doc)
	# file.read() mutates the file object, which can affect caching
	# so we need to reset the file pointer to the beginning
	file.seek(0)
	return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=docs)


	class TxtFile(File):
	@classmethod
	def from_bytes(cls, file: BytesIO) -> "TxtFile":
	text = file.read().decode("utf-8", errors="replace")
	text = strip_consecutive_newlines(text)
	file.seek(0)
	doc = Document(page_content=text.strip())
	doc.metadata["source"] = "p-1"
	return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])


	def read_file(file: BytesIO) -> File:
	"""Reads an uploaded file and returns a File object"""
	if file.name.lower().endswith(".docx"):
	return DocxFile.from_bytes(file)
	elif file.name.lower().endswith(".pdf"):
	return PdfFile.from_bytes(file)
	elif file.name.lower().endswith(".txt"):
	return TxtFile.from_bytes(file)
	else:
	raise NotImplementedError(f"File type {file.name.split('.')[-1]} not supported")