import xml.etree.ElementTree as ET
import glob
import os
import logging


def _empty_book():
    """Return a fresh placeholder entry for a book that could not be loaded."""
    return {"title": "No title", "text": []}


def _verse_text(verse):
    """Join the full text of every ``<w>`` child of *verse* with single spaces."""
    words = ("".join(word.itertext()) for word in verse.findall("./w"))
    return " ".join(words).strip()


def _parse_book(file_name):
    """Parse one book XML file into a ``{"title": ..., "text": ...}`` dict."""
    root = ET.parse(file_name).getroot()
    chapters = [
        [_verse_text(verse) for verse in chapter.findall("./v")]
        for chapter in root.findall(".//c")
    ]
    name_element = root.find(".//names/name")
    # An empty <name/> element has ``text is None``; fall back to the file
    # name in that case as well, so the title is always a string.
    if name_element is not None and name_element.text:
        title = name_element.text
    else:
        title = os.path.basename(file_name)
    return {"title": title, "text": chapters}


def process_json_files(start, end, *, base_path="texts/tanach"):
    """Load the XML books numbered ``start``..``end`` (inclusive).

    Despite the name, the files read are XML: for every number ``i`` in the
    range, files matching ``<base_path>/<i, zero-padded to 2 digits>*.xml``
    are parsed. Each ``<c>`` element becomes a list of verses, and each
    ``<v>`` child becomes one string made of its ``<w>`` children's text
    joined by spaces.

    Args:
        start: First book number to load.
        end: Last book number to load (inclusive).
        base_path: Directory that contains the XML files.

    Returns:
        A dict mapping each book number to ``{"title": str, "text": list}``,
        where ``text`` is a list of chapters, each a list of verse strings.
        The title is the text of the first ``names/name`` element, or the
        file's base name when that element is missing or empty. Numbers with
        no matching file, or whose file is missing or not well-formed XML,
        map to ``{"title": "No title", "text": []}``; a warning is logged.

    Raises:
        OSError: Errors other than ``FileNotFoundError`` raised while
            opening a matched file are not caught here.
    """
    results = {}
    for i in range(start, end + 1):
        # Escape the directory part so glob metacharacters in base_path are
        # taken literally; only the trailing "*" acts as a wildcard.
        file_pattern = f"{glob.escape(base_path)}/{i:02}*.xml"
        # glob returns matches in arbitrary order; sort so that, when several
        # files match one number, the winner (the last one) is deterministic.
        matching_files = sorted(glob.glob(file_pattern))
        if not matching_files:
            logging.warning(
                "No file matching pattern '%s' found.", file_pattern
            )
            results[i] = _empty_book()
            continue

        for file_name in matching_files:
            try:
                results[i] = _parse_book(file_name)
            except FileNotFoundError:
                # The file can vanish between the glob and the parse.
                logging.warning("File %s not found.", file_name)
                results[i] = _empty_book()
            except ET.ParseError as e:
                logging.warning(
                    "File %s could not be read as XML: %s", file_name, e
                )
                results[i] = _empty_book()
    return results