import os
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

# The Punkt sentence tokenizer model is required by nltk.sent_tokenize;
# download it once (no-op if it is already present).
nltk.download('punkt_tab', quiet=True)

def load_data(base_path, max_files=1000):
    """
    Load up to max_files .txt files per category directory, split each file
    into sentences, and return parallel lists of sentences and labels.
    """
    texts = []
    labels = []
    
    for category in os.listdir(base_path):
        category_path = os.path.join(base_path, category)
        if not os.path.isdir(category_path):
            continue
            
        file_count = 0
        for filename in os.listdir(category_path):
            if not filename.endswith('.txt'):
                continue
                
            if file_count >= max_files:
                break
                
            with open(os.path.join(category_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
                sentences = nltk.sent_tokenize(content)
                texts.extend(sentences)
                # Create multi-label format
                labels.extend([[category] for _ in sentences])
                
            file_count += 1
            
    return texts, labels

class IndependentMultiLabelClassifier:
    """
    One-vs-rest multi-label classifier: shared TF-IDF features with an
    independent binary LogisticRegression classifier per category.
    """
    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.mlb = MultiLabelBinarizer()
        self.classifiers = {}
        self.categories = []
        
    def fit(self, X, y):
        # Transform text features
        X_transformed = self.vectorizer.fit_transform(X)
        
        # Transform labels and get all possible categories
        y_transformed = self.mlb.fit_transform(y)
        self.categories = self.mlb.classes_
        
        # Train a binary classifier for each category
        for i, category in enumerate(self.categories):
            print(f"\nTraining classifier for: {category}")
            clf = LogisticRegression(max_iter=1000, class_weight='balanced')
            y_binary = y_transformed[:, i]
            clf.fit(X_transformed, y_binary)
            self.classifiers[category] = clf
            
            # Report metrics on the training data itself (optimistic;
            # use a held-out split for a realistic estimate)
            y_pred = clf.predict(X_transformed)
            print(classification_report(y_binary, y_pred))
    
    def predict_proba(self, X):
        """
        Score the first document in X against every category and return
        (category, probability) pairs sorted by descending probability.
        The classifiers are independent, so the scores do not sum to 1.
        """
        X_transformed = self.vectorizer.transform(X)

        predictions = []
        for category, clf in self.classifiers.items():
            # Probability of the positive class for the first document
            prob = clf.predict_proba(X_transformed)[0][1]
            predictions.append((category, prob))

        return sorted(predictions, key=lambda x: x[1], reverse=True)

# Example usage
if __name__ == "__main__":
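    # NOTE: this path assumes the dataset was extracted in a Google Colab
    # session under /content; point it at your local copy otherwise.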
    base_path = "/content/extracted_files/Uzbek_News_Dataset"
    
    print("Loading data...")
    texts, labels = load_data(base_path)
    
    print("Training independent classifiers...")
    classifier = IndependentMultiLabelClassifier()
    classifier.fit(texts, labels)
    
    # Test prediction on an Uzbek sentence ("An earthquake occurred in America")
    test_text = "Amerikada zilzila sodir bo'ldi"
    predictions = classifier.predict_proba([test_text])
    
    print(f"\nIndependent prediction scores for: {test_text}")
    for category, prob in predictions:
        print(f"{category}: {prob*100:.1f}%")