# (Scraped page header, not part of the program: "Spaces: Sleeping")
import os

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler

# One-time setup: uncomment to download the tokenizer data that
# nltk.sent_tokenize relies on (presumably required on a fresh environment).
# nltk.download('punkt_tab')
def load_data(base_path, max_files=1000):
    """
    Load text files from per-category directories and split them into sentences.

    Every immediate subdirectory of ``base_path`` is treated as one category.
    Up to ``max_files`` ``.txt`` files per category are read as UTF-8, split
    into sentences with ``nltk.sent_tokenize``, and each sentence is labelled
    with the name of the directory it came from.

    Args:
        base_path: Directory whose subdirectories are the categories.
        max_files: Maximum number of ``.txt`` files to read per category.
            ``None`` disables the cap; zero or a negative value reads nothing.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``texts`` is a flat
        list of sentences and ``labels[i]`` is the one-element list
        ``[category]`` belonging to ``texts[i]``.
    """
    texts = []
    labels = []

    # os.listdir() yields entries in arbitrary order. Sorting makes both the
    # subset of files kept under the max_files cap and the output order
    # reproducible from run to run.
    for category in sorted(os.listdir(base_path)):
        category_path = os.path.join(base_path, category)
        if not os.path.isdir(category_path):
            continue

        filenames = sorted(
            name for name in os.listdir(category_path) if name.endswith('.txt')
        )
        if max_files is not None:
            # Clamp at 0 so a negative cap reads nothing instead of slicing
            # from the end of the list.
            filenames = filenames[:max(max_files, 0)]

        for filename in filenames:
            file_path = os.path.join(category_path, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            sentences = nltk.sent_tokenize(content)
            texts.extend(sentences)
            # Multi-label format: one list of labels per sentence.
            labels.extend([category] for _ in sentences)

    return texts, labels
class IndependentMultiLabelClassifier:
    """
    Text classifier that scores every category with its own binary model.

    Documents are vectorised with TF-IDF (at most 5000 features) and one
    class-balanced ``LogisticRegression`` is trained per category. Because
    the models are independent, the per-category probabilities are not
    normalised against each other and need not sum to 1.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.mlb = MultiLabelBinarizer()
        # Maps category name -> fitted binary LogisticRegression.
        self.classifiers = {}
        # Category names learned by the most recent fit(); empty before that.
        self.categories = []

    def fit(self, X, y):
        """
        Fit the vectorizer, the label binarizer and one classifier per category.

        A classification report for each category is printed as it is trained.

        Args:
            X: Iterable of raw text documents.
            y: Iterable of label collections, one per document
                (e.g. ``[["sport"], ["politics"], ...]``).

        Returns:
            ``self``, so calls can be chained.
        """
        # Transform text features
        X_transformed = self.vectorizer.fit_transform(X)

        # Transform labels and get all possible categories
        y_transformed = self.mlb.fit_transform(y)
        self.categories = self.mlb.classes_

        # Drop classifiers from any earlier fit(): they were trained against
        # a different vocabulary / label set and must not leak into
        # predict_proba() after a refit.
        self.classifiers = {}

        # Train a binary classifier for each category
        for i, category in enumerate(self.categories):
            print(f"\nTraining classifier for: {category}")
            clf = LogisticRegression(max_iter=1000, class_weight='balanced')
            y_binary = y_transformed[:, i]
            clf.fit(X_transformed, y_binary)
            self.classifiers[category] = clf

            # NOTE: these metrics are computed on the training data itself,
            # so they are optimistic and not a held-out evaluation.
            y_pred = clf.predict(X_transformed)
            print(classification_report(y_binary, y_pred))

        return self

    def predict_proba(self, X):
        """
        Score a single document against every category.

        Args:
            X: Iterable of raw text documents. Only the FIRST document is
                scored; any additional documents are ignored.

        Returns:
            A list of ``(category, probability)`` tuples, one per trained
            category, sorted by probability in descending order. Each
            probability is that category's positive-class probability.
        """
        # Transform text
        X_transformed = self.vectorizer.transform(X)

        # Only the first document contributes to the result, so restrict the
        # matrix to that row instead of scoring rows that are thrown away.
        first_doc = X_transformed[:1]

        # Get independent probabilities for each category
        predictions = []
        for category, clf in self.classifiers.items():
            # Column 1 holds the probability of the positive class.
            prob = clf.predict_proba(first_doc)[0][1]
            predictions.append((category, prob))

        return sorted(predictions, key=lambda x: x[1], reverse=True)
# Example usage: train on the news dataset, then score one sample sentence.
if __name__ == "__main__":
    dataset_dir = "/content/extracted_files/Uzbek_News_Dataset"

    print("Loading data...")
    sentences, sentence_labels = load_data(dataset_dir)

    print("Training independent classifiers...")
    model = IndependentMultiLabelClassifier()
    model.fit(sentences, sentence_labels)

    # Score a single sample sentence against every category.
    test_text = "Amerikada zilzila sodir bo'ldi"
    ranked_scores = model.predict_proba([test_text])

    print(f"\nIndependent prediction scores for: {test_text}")
    for name, score in ranked_scores:
        print(f"{name}: {score*100:.1f}%")