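"""Preprocess DOCX files into text chunks for downstream indexing.

The pipeline: replace inline tables in each DOCX with HTML and insert chunk
markers, reload the processed files with llama_index, split the text on the
chunk marker, and save the resulting chunks to processed_chunks.pickle.
"""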
import os
import shutil
import logging
from pathlib import Path
from types import SimpleNamespace

import pandas as pd
import yaml
from llama_index.core import SimpleDirectoryReader

from utils.process_tables import extract_and_replace_docx_tables
# Configure logging to write to both script.log and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("script.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
def load_config(file_path='config.yaml'):
    """Load the YAML config, exposing each top-level section as attributes."""
    logger.info('Loading config file ...')
    try:
        with open(file_path, 'r') as file:
            cfg = yaml.safe_load(file)
        # Wrap nested sections so values can be accessed as cfg.section.key
        for k, v in cfg.items():
            if isinstance(v, dict):
                cfg[k] = SimpleNamespace(**v)
        logger.info('Config file loaded successfully.')
        return SimpleNamespace(**cfg)
    except Exception as e:
        logger.error(f'Error loading config file: {e}')
        raise
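# Expected config.yaml shape, inferred from the attribute accesses below;
# the values here are illustrative placeholders, not the project's actual ones:
#
#   dataset:
#     data_dir: data                  # directory of input .docx files
#     processed_data_dir: processed   # output directory (recreated each run)
#     chunk_marker: '<!--chunk-->'    # marker inserted between chunks
#     required_exts: .docx            # extension filter for the reader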
cfg = load_config()
def process_docx_files(data_dir=Path(cfg.dataset.data_dir),
                       processed_data_dir=Path(cfg.dataset.processed_data_dir),
                       chunk_marker=cfg.dataset.chunk_marker):
    try:
        # Start from a clean output directory on every run
        if os.path.exists(processed_data_dir):
            shutil.rmtree(processed_data_dir)
        os.makedirs(processed_data_dir)
        docx_files = [file for file in os.listdir(data_dir) if file.endswith('.docx')]
        logger.info(f'Found {len(docx_files)} DOCX files to process.')
        for fname in docx_files:
            # Replace inline tables with HTML and insert chunk markers
            document, html_chunked_tables = extract_and_replace_docx_tables(
                docx_file=data_dir / fname,
                chunk_marker=chunk_marker
            )
            document.save(processed_data_dir / f'processed_{fname}')
            logger.info(f'Processed and saved {fname}')
    except Exception as e:
        logger.error(f'Error processing DOCX files: {e}')
        raise
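# The processed files written above (e.g. processed_report.docx for a
# hypothetical input report.docx) are what load_processed_data() reads back in.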
def load_processed_data(processed_data_dir=Path(cfg.dataset.processed_data_dir)):
    try:
        # SimpleDirectoryReader returns a list of Document objects for the
        # files matching required_exts
        documents = SimpleDirectoryReader(
            input_dir=processed_data_dir,
            required_exts=[cfg.dataset.required_exts],
        ).load_data()
        logger.info('Processed data loaded successfully.')
        return documents
    except Exception as e:
        logger.error(f'Error loading processed data: {e}')
        raise
def get_chunks(documents, chunk_marker=cfg.dataset.chunk_marker):
    try:
        # Split every document's text on the marker and drop empty fragments
        chunks = [chunk.strip() for doc in documents
                  for chunk in doc.text.split(chunk_marker) if chunk.strip()]
        logger.info(f'Extracted {len(chunks)} chunks from documents.')
        return chunks
    except Exception as e:
        logger.error(f'Error extracting chunks: {e}')
        raise
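# For example, with a hypothetical marker '<!--chunk-->', a document whose text
# is 'Intro <!--chunk--> Body' yields the chunks ['Intro', 'Body'].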
def main():
    logger.info('Starting document processing ...')
    try:
        process_docx_files()
        documents = load_processed_data()
        chunks = get_chunks(documents)
        num_chunks = len(chunks)
        logger.info(f'Total number of chunks: {num_chunks}')
        # Persist the chunks as a single-column DataFrame for downstream use
        df_chunks = pd.DataFrame({'chunk': chunks})
        df_chunks.to_pickle('processed_chunks.pickle')
        logger.info('All chunks saved to processed_chunks.pickle')
    except Exception as e:
        logger.error(f'Error in main processing: {e}')
        raise
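# The pickle can be reloaded later with, e.g.:
#   df = pd.read_pickle('processed_chunks.pickle')
#   chunks = df['chunk'].tolist()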
if __name__ == '__main__':
    main()