# Spaces:
# Sleeping
# Sleeping
import glob
import logging
import os
import xml.etree.ElementTree as ET
def _extract_chapters(root):
    """Return the book text as a list of chapters, each a list of verse strings.

    Every ``<c>`` element anywhere under *root* is a chapter; its direct
    ``<v>`` children are verses; the direct ``<w>`` children of a verse are
    words.  A word's text is the concatenation of all text nested inside it,
    and the words of a verse are joined with single spaces.
    """
    chapters = []
    for chapter in root.findall('.//c'):
        verses = []
        for verse in chapter.findall('./v'):
            words = ("".join(word.itertext()) for word in verse.findall('./w'))
            # strip() also trims whitespace carried by the first/last word.
            verses.append(" ".join(words).strip())
        chapters.append(verses)
    return chapters


def _book_title(root, file_name):
    """Return the text of the first ``names/name`` element, else the file's base name."""
    name_element = root.find('.//names/name')
    # An element that exists but is empty has ``text is None``; treat it like
    # a missing element instead of returning a ``None`` title.
    if name_element is not None and name_element.text and name_element.text.strip():
        return name_element.text
    return os.path.basename(file_name)


def process_json_files(start, end, base_path="texts/tanach"):
    """Load the books numbered *start* through *end* (inclusive) from XML files.

    Despite its name, this function reads XML: for each number ``i`` it looks
    for files matching ``<base_path>/<i zero-padded to 2 digits>*.xml``.

    Args:
        start: First book number to load.
        end: Last book number to load (inclusive).  If ``end < start`` the
            result is empty.
        base_path: Directory containing the XML files.

    Returns:
        A dict mapping each book number to ``{"title": ..., "text": ...}``
        where ``text`` is a list of chapters, each a list of verse strings.
        Numbers for which no file exists, or no file could be parsed, map to
        ``{"title": "No title", "text": []}``.  Problems are reported through
        ``logging.warning`` rather than raised.
    """
    results = {}
    for i in range(start, end + 1):
        # Escape the directory so glob metacharacters in it are taken literally.
        file_pattern = f"{glob.escape(base_path)}/{i:02}*.xml"
        # glob returns files in arbitrary order; sort so that the outcome is
        # deterministic when several files share a number (the last one that
        # parses successfully wins).
        matching_files = sorted(glob.glob(file_pattern))
        if not matching_files:
            logging.warning("No file matching pattern '%s' found.", file_pattern)
            results[i] = {"title": "No title", "text": []}
            continue

        for file_name in matching_files:
            try:
                root = ET.parse(file_name).getroot()
            except FileNotFoundError:
                # The file can disappear between the glob and the parse.
                logging.warning("File %s not found.", file_name)
            except OSError as e:
                # E.g. a directory or unreadable file that matches the pattern.
                logging.warning("File %s could not be opened: %s", file_name, e)
            except ET.ParseError as e:
                logging.warning("File %s could not be read as XML: %s", file_name, e)
            else:
                results[i] = {
                    "title": _book_title(root, file_name),
                    "text": _extract_chapters(root),
                }

        # Use the placeholder only when no file for this number could be
        # parsed, so a broken sibling file never discards a good one.
        results.setdefault(i, {"title": "No title", "text": []})
    return results