import random
from pathlib import Path
from typing import Dict, Tuple


def load_hindi_dataset(base_path: str = "data", split: str = "train",
                       num_files: int = None) -> str:
    """
    Load Hindi text from a dataset with a train/validation split structure.

    Args:
        base_path: Base directory containing the train and valid folders
        split: Either 'train' or 'valid'
        num_files: Number of files to load (None for all files)
    """
    base_dir = Path(base_path)
    # The dataset layout nests each split twice, e.g. data/train/train/*.txt
    split_dir = base_dir / split / split

    if not split_dir.exists():
        raise FileNotFoundError(f"Directory not found: {split_dir}")

    print(f"\nLoading Hindi dataset from {split_dir}")

    # Get all txt files in the directory
    txt_files = list(split_dir.glob("*.txt"))
    if not txt_files:
        raise FileNotFoundError(f"No txt files found in {split_dir}")

    # Sort files by word count (filenames encode word counts, e.g. "1024.txt")
    txt_files.sort(key=lambda x: int(x.stem))

    # Randomly sample a subset if num_files is specified
    if num_files is not None and num_files < len(txt_files):
        txt_files = random.sample(txt_files, num_files)

    print(f"Found {len(txt_files)} files")

    # Load and combine text from the files
    texts = []
    total_chars = 0
    total_words = 0

    for idx, file_path in enumerate(txt_files, 1):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read().strip()
            word_count = int(file_path.stem)  # Filename is the word count
            texts.append(text)
            total_chars += len(text)
            total_words += word_count

            if idx % 10 == 0:
                print(f"Processed {idx}/{len(txt_files)} files. "
                      f"Total characters: {total_chars:,}, "
                      f"Total words: {total_words:,}")
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            continue

    # Guard against a division by zero below if every file failed to read
    if not texts:
        raise RuntimeError(f"No files could be read from {split_dir}")

    combined_text = "\n\n".join(texts)

    print("\nDataset loading completed:")
    print(f"Total files: {len(texts)}")
    print(f"Total characters: {len(combined_text):,}")
    print(f"Total words: {total_words:,}")
    print(f"Average words per file: {total_words / len(texts):,.1f}")

    return combined_text


def get_dataset_stats(base_path: str = "data") -> Dict:
    """Get per-split statistics about the dataset."""
    stats = {}
    for split in ['train', 'valid']:
        # Use the same nested layout as load_hindi_dataset
        split_dir = Path(base_path) / split / split
        if split_dir.exists():
            txt_files = list(split_dir.glob("*.txt"))
            word_counts = [int(f.stem) for f in txt_files]
            stats[split] = {
                'num_files': len(txt_files),
                'total_words': sum(word_counts),
                'min_words': min(word_counts) if word_counts else 0,
                'max_words': max(word_counts) if word_counts else 0,
                'avg_words': sum(word_counts) / len(word_counts) if word_counts else 0,
            }
    return stats


def load_train_valid_split(base_path: str = "data", train_files: int = None,
                           valid_files: int = None) -> Tuple[str, str]:
    """Load both the train and validation splits."""
    train_text = load_hindi_dataset(base_path, "train", train_files)
    valid_text = load_hindi_dataset(base_path, "valid", valid_files)
    return train_text, valid_text


if __name__ == "__main__":
    # Print dataset statistics
    stats = get_dataset_stats()
    print("\nDataset Statistics:")
    print("-" * 50)
    for split, split_stats in stats.items():
        print(f"\n{split.upper()} Split:")
        for key, value in split_stats.items():
            if isinstance(value, (int, float)):
                print(f"{key}: {value:,}")
            else:
                print(f"{key}: {value}")

    # Load a small sample from each split
    print("\nLoading sample data...")
    train_text, valid_text = load_train_valid_split(train_files=5, valid_files=2)

    print(f"\nSample train text (first 200 chars):\n{train_text[:200]}")
    print(f"\nSample valid text (first 200 chars):\n{valid_text[:200]}")