# neuralworm's picture
# update Tanach to 2.2, new repo
# 2f7895b
import xml.etree.ElementTree as ET
import glob
import os
import logging
def process_json_files(start, end, base_path="texts/tanach"):
    """Load a range of numbered Tanach books from XML files.

    Despite the historical name, the files parsed here are XML. For every
    index ``i`` in ``start..end`` (inclusive), all files matching
    ``{base_path}/{i:02}*.xml`` are parsed in sorted order. The text of each
    ``<w>`` child of a ``<v>`` element is joined with single spaces into one
    verse string, and the ``<v>`` children of each ``<c>`` element form one
    chapter list.

    Args:
        start: First book index (inclusive).
        end: Last book index (inclusive).
        base_path: Directory that holds the XML files. Defaults to the
            original hard-coded location ``"texts/tanach"``.

    Returns:
        A dict mapping each index to ``{"title": ..., "text": ...}``.
        ``"text"`` is a list of chapters, each a list of verse strings.
        ``"title"`` is the text of the first ``names/name`` element when one
        exists (it may be ``None`` if that element is empty), otherwise the
        file's base name. An index with no matching or no parseable file
        maps to ``{"title": "No title", "text": []}``.
    """
    results = {}

    for i in range(start, end + 1):
        file_pattern = f"{base_path}/{i:02}*.xml"
        # glob's ordering depends on the file system; sort so that the file
        # that ends up representing index i is the same on every run.
        matching_files = sorted(glob.glob(file_pattern))

        # Placeholder entry. It is only replaced by a successful parse, so a
        # broken file can never erase the data of a sibling that parsed fine.
        results[i] = {"title": "No title", "text": []}

        if not matching_files:
            logging.warning(f"No file matching pattern '{file_pattern}' found.")
            continue

        for file_name in matching_files:
            try:
                root = ET.parse(file_name).getroot()

                chapter_texts = [
                    [
                        " ".join(
                            "".join(word.itertext())
                            for word in verse.findall('./w')
                        ).strip()
                        for verse in chapter.findall('./v')
                    ]
                    for chapter in root.findall('.//c')
                ]

                # Look the title element up once instead of twice.
                name_element = root.find('.//names/name')
                if name_element is not None:
                    book_title = name_element.text
                else:
                    book_title = os.path.basename(file_name)

                # When several files match, the last one (in sorted order)
                # that parses successfully wins.
                results[i] = {
                    "title": book_title,
                    "text": chapter_texts
                }
            except FileNotFoundError:
                # The file vanished between glob() and parse().
                logging.warning(f"File {file_name} not found.")
            except ET.ParseError as e:
                logging.warning(f"File {file_name} could not be read as XML: {e}")
            except KeyError as e:
                # NOTE(review): this handler and its message look like a
                # leftover from a JSON-based loader; nothing in the XML path
                # obviously raises KeyError - confirm before removing.
                logging.warning(f"Expected key 'text' is missing in {file_name}: {e}")

    return results