OrifjonKenjayev committed
Commit 1573b7e
1 Parent(s): c8c45e2

Upload 2 files

Files changed (2)
  1. classifier_model.joblib +3 -0
  2. textclassification.py +101 -0
classifier_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:778c70546cb8cb95a51afdd61eff7c111bf81308f221d260a182948696e64304
+ size 997588
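
The diff above is only a Git LFS pointer; the 997,588-byte serialized model itself lives in LFS storage. A minimal sketch of loading it, assuming `git lfs pull` has fetched the real file and that the artifact is the trained IndependentMultiLabelClassifier defined in textclassification.py below (this commit does not show the save call, so that is an assumption):

import joblib
from textclassification import IndependentMultiLabelClassifier  # class must be importable to unpickle

# Assumption: the repository was cloned with Git LFS so the real ~1 MB file is
# present locally rather than the pointer stub, and the pickle holds the fitted
# classifier object.
model = joblib.load("classifier_model.joblib")

for category, prob in model.predict_proba(["Amerikada zilzila sodir bo'ldi"]):
    print(f"{category}: {prob*100:.1f}%")
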
textclassification.py ADDED
@@ -0,0 +1,101 @@
+ import os
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.preprocessing import MultiLabelBinarizer
+ from sklearn.metrics import classification_report
+ import pandas as pd
+ import nltk
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import StandardScaler
+ # nltk.download('punkt_tab')  # uncomment on first run to fetch the sentence tokenizer data
+
+ def load_data(base_path, max_files=1000):
+     """
+     Load text files from directories and split them into sentences.
+     """
+     texts = []
+     labels = []
+
+     for category in os.listdir(base_path):
+         category_path = os.path.join(base_path, category)
+         if not os.path.isdir(category_path):
+             continue
+
+         file_count = 0
+         for filename in os.listdir(category_path):
+             if not filename.endswith('.txt'):
+                 continue
+
+             if file_count >= max_files:
+                 break
+
+             with open(os.path.join(category_path, filename), 'r', encoding='utf-8') as f:
+                 content = f.read()
+                 sentences = nltk.sent_tokenize(content)
+                 texts.extend(sentences)
+                 # Create multi-label format: each sentence carries its source category as a one-element label list
+                 labels.extend([[category] for _ in sentences])
+
+             file_count += 1
+
+     return texts, labels
+
+ class IndependentMultiLabelClassifier:
+     def __init__(self):
+         self.vectorizer = TfidfVectorizer(max_features=5000)
+         self.mlb = MultiLabelBinarizer()
+         self.classifiers = {}
+
+     def fit(self, X, y):
+         # Transform text features
+         X_transformed = self.vectorizer.fit_transform(X)
+
+         # Transform labels and get all possible categories
+         y_transformed = self.mlb.fit_transform(y)
+         self.categories = self.mlb.classes_
+
+         # Train an independent binary classifier for each category
+         for i, category in enumerate(self.categories):
+             print(f"\nTraining classifier for: {category}")
+             clf = LogisticRegression(max_iter=1000, class_weight='balanced')
+             y_binary = y_transformed[:, i]
+             clf.fit(X_transformed, y_binary)
+             self.classifiers[category] = clf
+
+             # Print performance metrics (computed on the training data itself)
+             y_pred = clf.predict(X_transformed)
+             print(classification_report(y_binary, y_pred))
+
+     def predict_proba(self, X):
+         # Transform text
+         X_transformed = self.vectorizer.transform(X)
+
+         # Get independent probabilities for each category
+         predictions = []
+         for category, clf in self.classifiers.items():
+             # Raw probability of the positive class for the first input text
+             prob = clf.predict_proba(X_transformed)[0][1]
+             predictions.append((category, prob))
+
+         return sorted(predictions, key=lambda x: x[1], reverse=True)
+
+ # Example usage
+ if __name__ == "__main__":
+     base_path = "/content/extracted_files/Uzbek_News_Dataset"
+
+     print("Loading data...")
+     texts, labels = load_data(base_path)
+
+     print("Training independent classifiers...")
+     classifier = IndependentMultiLabelClassifier()
+     classifier.fit(texts, labels)
+
+     # Test prediction
+     test_text = "Amerikada zilzila sodir bo'ldi"  # Uzbek: "An earthquake occurred in America"
+     predictions = classifier.predict_proba([test_text])
+
+     print(f"\nIndependent prediction scores for: {test_text}")
+     for category, prob in predictions:
+         print(f"{category}: {prob*100:.1f}%")
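
The script above trains the classifier but never writes it to disk, while the commit also uploads classifier_model.joblib. A minimal sketch of the save step that would produce such an artifact, assuming joblib was applied directly to the fitted object (the actual call is not part of this commit):

import joblib

# Hypothetical persistence step: the fitted TfidfVectorizer, MultiLabelBinarizer,
# and per-category LogisticRegression models are all attributes of the object,
# so a single dump captures the whole pipeline.
joblib.dump(classifier, "classifier_model.joblib")
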