import os

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# nltk.download('punkt_tab')  # run once if the sentence tokenizer models are missing


def load_data(base_path, max_files=1000):
    """
    Load text files from category directories and split them into sentences.

    Each sentence inherits its file's category, wrapped in a one-element list
    so the labels are in multi-label format.
    """
    texts = []
    labels = []
    for category in os.listdir(base_path):
        category_path = os.path.join(base_path, category)
        if not os.path.isdir(category_path):
            continue
        file_count = 0
        for filename in os.listdir(category_path):
            if not filename.endswith('.txt'):
                continue
            if file_count >= max_files:
                break
            with open(os.path.join(category_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
            sentences = nltk.sent_tokenize(content)
            texts.extend(sentences)
            # Create multi-label format: one single-category label per sentence
            labels.extend([[category] for _ in sentences])
            file_count += 1
    return texts, labels


class IndependentMultiLabelClassifier:
    """One independent binary LogisticRegression per category over shared TF-IDF features."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.mlb = MultiLabelBinarizer()
        self.classifiers = {}

    def fit(self, X, y):
        # Transform text features
        X_transformed = self.vectorizer.fit_transform(X)

        # Transform labels and get all possible categories
        y_transformed = self.mlb.fit_transform(y)
        self.categories = self.mlb.classes_

        # Train a binary classifier for each category
        for i, category in enumerate(self.categories):
            print(f"\nTraining classifier for: {category}")
            clf = LogisticRegression(max_iter=1000, class_weight='balanced')
            y_binary = y_transformed[:, i]
            clf.fit(X_transformed, y_binary)
            self.classifiers[category] = clf

            # Print performance metrics (computed on the training data)
            y_pred = clf.predict(X_transformed)
            print(classification_report(y_binary, y_pred))

    def predict_proba(self, X):
        # Transform text
        X_transformed = self.vectorizer.transform(X)

        # Get independent probabilities for each category
        predictions = []
        for category, clf in self.classifiers.items():
            # Get raw probability of the positive class for the first sample
            prob = clf.predict_proba(X_transformed)[0][1]
            predictions.append((category, prob))

        return sorted(predictions, key=lambda x: x[1], reverse=True)


# Example usage
if __name__ == "__main__":
    base_path = "/content/extracted_files/Uzbek_News_Dataset"

    print("Loading data...")
    texts, labels = load_data(base_path)

    print("Training independent classifiers...")
    classifier = IndependentMultiLabelClassifier()
    classifier.fit(texts, labels)

    # Test prediction (Uzbek: "An earthquake occurred in America")
    test_text = "Amerikada zilzila sodir bo'ldi"
    predictions = classifier.predict_proba([test_text])

    print(f"\nIndependent prediction scores for: {test_text}")
    for category, prob in predictions:
        print(f"{category}: {prob*100:.1f}%")
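

# ---------------------------------------------------------------------------
# Note: the classification_report printed inside fit() is computed on the
# training data itself, so it overstates real performance. The sketch below
# shows one way to get a held-out evaluation instead; evaluate_holdout(), the
# 80/20 split and random_state=42 are illustrative assumptions, not part of
# the original pipeline, and the function is not called by the example usage
# above.
# ---------------------------------------------------------------------------
def evaluate_holdout(base_path, test_size=0.2, random_state=42):
    """Sketch: train on a random split of the sentences and report
    per-category metrics on the unseen portion (assumed split settings)."""
    texts, labels = load_data(base_path)
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, random_state=random_state)

    clf = IndependentMultiLabelClassifier()
    clf.fit(X_train, y_train)

    # Binarize the held-out labels with the binarizer fitted during training
    # so the column order matches clf.categories
    y_test_bin = clf.mlb.transform(y_test)
    X_test_tfidf = clf.vectorizer.transform(X_test)
    for i, category in enumerate(clf.categories):
        y_pred = clf.classifiers[category].predict(X_test_tfidf)
        print(f"\nHeld-out report for: {category}")
        print(classification_report(y_test_bin[:, i], y_pred))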