text_class_ok / textclassification.py
OrifjonKenjayev's picture
Upload 2 files
1573b7e verified
raw
history blame
3.58 kB
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
import pandas as pd
import nltk
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#nltk.download('punkt_tab')
def load_data(base_path, max_files=1000):
    """
    Load text files from per-category directories and split them into sentences.

    Every immediate subdirectory of ``base_path`` is treated as one category;
    its name becomes the label of each sentence read from the ``.txt`` files
    it contains. Entries of ``base_path`` that are not directories, and files
    that do not end in ``.txt``, are skipped.

    Args:
        base_path: Directory whose subdirectories are the categories.
        max_files: Maximum number of ``.txt`` files read per category.
            Pass ``None`` to read every file.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``texts`` holds the
        sentences; ``labels`` holds a one-element list ``[category]`` for each
        sentence (the multi-label input format).
    """
    texts = []
    labels = []
    # os.listdir() gives entries in arbitrary order. Sorting makes both the
    # output order and the subset of files kept under the max_files cap
    # reproducible from run to run and machine to machine.
    for category in sorted(os.listdir(base_path)):
        category_path = os.path.join(base_path, category)
        if not os.path.isdir(category_path):
            continue
        txt_files = sorted(
            name for name in os.listdir(category_path) if name.endswith('.txt')
        )
        for file_count, filename in enumerate(txt_files):
            # file_count is the number of files already read for this category.
            if max_files is not None and file_count >= max_files:
                break
            with open(os.path.join(category_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
            sentences = nltk.sent_tokenize(content)
            texts.extend(sentences)
            # Create multi-label format: one single-label list per sentence.
            labels.extend([category] for _ in sentences)
    return texts, labels
class IndependentMultiLabelClassifier:
    """Score a text against every category with independent binary models.

    A shared TF-IDF vectorizer turns text into features, and one logistic
    regression per category is trained to separate samples that carry the
    label from samples that do not. Each score comes from its own model, so
    the per-category scores are not normalized against each other.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.mlb = MultiLabelBinarizer()
        # Maps category name -> fitted binary classifier for that category.
        self.classifiers = {}
        # Category names discovered by fit(); empty until the model is trained.
        self.categories = []

    def fit(self, X, y, *, verbose=True):
        """Fit the vectorizer and one binary classifier per category.

        Args:
            X: Iterable of text documents.
            y: Iterable of label collections, one per document.
            verbose: When true (the default), print a progress line and a
                classification report for each category. The report is
                computed on the training data itself, so it measures fit
                rather than generalization.

        Returns:
            This instance, so calls can be chained.
        """
        # Start from a clean slate: without this, refitting on data with a
        # different label set would leave stale models behind, and
        # predict_proba() would keep scoring categories that no longer exist.
        self.classifiers = {}

        # Transform text features
        X_transformed = self.vectorizer.fit_transform(X)
        # Transform labels and get all possible categories
        y_transformed = self.mlb.fit_transform(y)
        self.categories = self.mlb.classes_

        # Train a binary classifier for each category
        for i, category in enumerate(self.categories):
            if verbose:
                print(f"\nTraining classifier for: {category}")
            clf = LogisticRegression(max_iter=1000, class_weight='balanced')
            # Column i of the binarized labels is the 0/1 target for this category.
            y_binary = y_transformed[:, i]
            clf.fit(X_transformed, y_binary)
            self.classifiers[category] = clf
            if verbose:
                # Print performance metrics (on the training set)
                y_pred = clf.predict(X_transformed)
                print(classification_report(y_binary, y_pred))
        return self

    def predict_proba(self, X):
        """Return per-category scores for the first document in ``X``.

        Only the first element of ``X`` is scored; any further documents are
        vectorized but their results are discarded. Pass one document at a
        time (for example ``[text]``).

        Args:
            X: Iterable of text documents.

        Returns:
            A list of ``(category, probability)`` tuples sorted by probability
            in descending order, where ``probability`` is the positive-class
            output of that category's own classifier.
        """
        # Transform text
        X_transformed = self.vectorizer.transform(X)
        # Row 0 is the first document; column 1 is the positive class.
        predictions = [
            (category, clf.predict_proba(X_transformed)[0][1])
            for category, clf in self.classifiers.items()
        ]
        return sorted(predictions, key=lambda x: x[1], reverse=True)
# Demo: train on the Uzbek news dataset, then score a single headline.
if __name__ == "__main__":
    # Inputs for the demo run.
    base_path = "/content/extracted_files/Uzbek_News_Dataset"
    test_text = "Amerikada zilzila sodir bo'ldi"
    classifier = IndependentMultiLabelClassifier()

    # Read the sentence-level corpus and its per-sentence labels.
    print("Loading data...")
    texts, labels = load_data(base_path)

    # One binary model per category is trained on the loaded corpus.
    print("Training independent classifiers...")
    classifier.fit(texts, labels)

    # Score the sample headline and list categories from most to least likely.
    predictions = classifier.predict_proba([test_text])
    print(f"\nIndependent prediction scores for: {test_text}")
    for category, prob in predictions:
        print(f"{category}: {prob*100:.1f}%")