Spaces:
Runtime error
Runtime error
fracapuano
committed on
Commit
·
a96d162
1
Parent(s):
11ce856
fix: now also storing the name of the file
Browse files- qa/utils.py +10 -6
qa/utils.py
CHANGED
@@ -95,7 +95,7 @@ def get_text_splitter(
|
|
95 |
return text_splitter
|
96 |
|
97 |
@st.cache_data
|
98 |
-
def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
|
99 |
"""
|
100 |
Converts a string or frozenset of strings to a list of Documents
|
101 |
with metadata.
|
@@ -112,6 +112,8 @@ def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
|
|
112 |
# Add page numbers as metadata
|
113 |
for i, doc in enumerate(page_docs):
|
114 |
doc.metadata["page"] = i + 1
|
|
|
|
|
115 |
# Split pages into chunks
|
116 |
doc_chunks = []
|
117 |
# Get the text splitter
|
@@ -122,15 +124,17 @@ def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
|
|
122 |
chunks = text_splitter.split_text(doc.page_content)
|
123 |
for i, chunk in enumerate(chunks):
|
124 |
# Create a new document for each individual chunk
|
125 |
-
|
126 |
-
page_content=chunk,
|
|
|
127 |
)
|
128 |
# Add sources to metadata for retrieval later on
|
129 |
-
|
130 |
-
|
|
|
131 |
|
132 |
return doc_chunks
|
133 |
-
|
134 |
|
135 |
@st.cache_data
|
136 |
def embed_docs(_docs: Tuple[Document]) -> VectorStore:
|
|
|
95 |
return text_splitter
|
96 |
|
97 |
@st.cache_data
|
98 |
+
def text_to_docs(text: Union[Text, Tuple[Text]], **kwargs) -> List[Document]:
|
99 |
"""
|
100 |
Converts a string or frozenset of strings to a list of Documents
|
101 |
with metadata.
|
|
|
112 |
# Add page numbers as metadata
|
113 |
for i, doc in enumerate(page_docs):
|
114 |
doc.metadata["page"] = i + 1
|
115 |
+
doc.metadata["file_name"] = kwargs.get("file_name", "")
|
116 |
+
|
117 |
# Split pages into chunks
|
118 |
doc_chunks = []
|
119 |
# Get the text splitter
|
|
|
124 |
chunks = text_splitter.split_text(doc.page_content)
|
125 |
for i, chunk in enumerate(chunks):
|
126 |
# Create a new document for each individual chunk
|
127 |
+
new_doc = HashDocument(
|
128 |
+
page_content=chunk,
|
129 |
+
metadata={"file_name": doc.metadata["file_name"], "page": doc.metadata["page"], "chunk": i}
|
130 |
)
|
131 |
# Add sources to metadata for retrieval later on
|
132 |
+
new_doc.metadata["source"] = \
|
133 |
+
f"{new_doc.metadata['file_name']}/Page-{new_doc.metadata['page']}/Chunk-{new_doc.metadata['chunk']}"
|
134 |
+
doc_chunks.append(new_doc)
|
135 |
|
136 |
return doc_chunks
|
137 |
+
|
138 |
|
139 |
@st.cache_data
|
140 |
def embed_docs(_docs: Tuple[Document]) -> VectorStore:
|