# -*- coding: utf-8 -*-
# Spaces: Sleeping  (page-status text captured along with this file; not part of the program)
"""model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1lKXL4Cdum5DiSbczUsadXc0F8j46NM_m

# in the name of **allah**
"""
import os
import re

import pandas as pd
import torch
from datasets import Dataset
from hazm import Normalizer, Lemmatizer, word_tokenize, stopwords_list
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoTokenizer, BertForSequenceClassification, AutoConfig
# Hazm helper objects (normalizer, lemmatizer, stop-word list).
normalizer = Normalizer()
lemmatizer = Lemmatizer()
stopwords = stopwords_list()

# Empty dataset with a single "Comment" column.
dataset = Dataset.from_pandas(pd.DataFrame({"Comment": []}))

# Load the sentiment-classification model and its tokenizer.
model_name = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-multi"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
    ignore_mismatched_sizes=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pick the device (GPU when CUDA is available, otherwise CPU) and move the
# model onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def tokenize_function(examples):
    """Tokenize the ``"Comment"`` entry of *examples* for the sentiment model.

    The text is padded and truncated to 128 tokens and the tokenizer is asked
    for PyTorch tensors. The tokenizer's output is returned unchanged.
    """
    comments = examples["Comment"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
        "return_tensors": 'pt',
    }
    return tokenizer(comments, **encoding_options)
def predict_sentiment(batch):
    """Run the sentiment model on a tokenized batch.

    *batch* must provide ``'input_ids'`` and ``'attention_mask'`` entries that
    ``torch.tensor`` can convert. Returns ``{'sentiment': tensor}`` where the
    tensor holds the arg-max class index over the last logits axis, moved
    back to the CPU.
    """
    model_inputs = {
        key: torch.tensor(batch[key]).to(device)
        for key in ('input_ids', 'attention_mask')
    }
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**model_inputs).logits
    return {'sentiment': logits.argmax(dim=-1).cpu()}
# Persian display label for each class index (0-4) that
# predict_sentiment_labels looks up, running from "very angry" (0) through
# "neutral" (2) to "very positive" (4).
# Label 0 used to be misspelled ('عصیانی'), and labels 2 and 4 had a stray
# leading space that leaked into the CSV/UI output; both are corrected here.
sentiment_labels = {
    0: 'بسیار عصبانی',
    1: 'عصبانی',
    2: 'خنثی',
    3: 'مثبت',
    4: 'بسیار مثبت',
}
def predict_sentiment_labels(text):
    """Return the Persian sentiment label predicted for one comment.

    Args:
        text: The comment to classify.

    Returns:
        One of the values of ``sentiment_labels``, or ``'نامشخص'`` (unknown)
        when *text* is not a string, is blank, or the predicted class index
        has no entry in ``sentiment_labels``.
    """
    # Empty CSV cells arrive as NaN (a float) or None, which the tokenizer
    # rejects, and blank text carries no sentiment; report both as unknown.
    if not isinstance(text, str) or not text.strip():
        return 'نامشخص'

    # Wrap the single comment in a one-row dataset so the batched
    # tokenize/predict helpers can be reused as-is.
    single_comment = Dataset.from_dict({"Comment": [text]})
    tokenized = single_comment.map(tokenize_function, batched=True)
    predicted = tokenized.map(predict_sentiment, batched=True)
    sentiment = predicted[0]['sentiment']
    return sentiment_labels.get(sentiment, 'نامشخص')
# Words and phrases that classify_sentence treats as markers of an imperative
# sentence; they are joined into a single whole-word regex alternation there.
imperative_verbs = [
    'بیا', 'برو', 'بخواب', 'کن', 'باش',
    'بذار', 'فراموش کن', 'بخور', 'بپوش', 'ببخش',
    'بنویس', 'دقت کن', 'دست بردار', 'سکوت کن', 'اجازه بده',
    'نکن', 'پیش برو', 'خواب بمان', 'توجه کن', 'خوش آمدید',
    'حواسجمع باش', 'در نظر بگیر', 'بخشید', 'بکش', 'نگذار',
    'سعی کن', 'تلاش کن', 'ببین', 'نرو', 'بگیر',
    'بگو', 'شک نکن', 'فکر کن', 'عادت کن', 'بیانداز',
    'حرکت کن', 'شکایت نکن', 'عاشق شو', 'بخند', 'برگرد',
    'بزن', 'آشپزی کن', 'بپذیر', 'شیرینی بپز', 'درس بخوان',
    'کلاس بگذار', 'کمک کن', 'بمان', 'راهنمایی کن', 'لطفا',
]
# Interrogative cues used by classify_sentence.
#   * First group: stems that may carry a suffix (e.g. 'کجاست', 'چطوری'), so
#     only a leading word boundary is required.
#   * Second group: words that are also substrings or prefixes of ordinary
#     words ('چه' in 'بچه', 'کی' in 'کیفیت', 'چی' in 'چیز', 'چرا' in 'چراغ'),
#     so they must match as whole words. Suffixed forms of these have to be
#     listed explicitly.
#   * The Persian question mark counts anywhere in the sentence.
_QUESTION_PATTERN = re.compile(
    r'\b(?:کجا|چطور|کدام)'
    r'|\b(?:چرا|آیا|چه|چی|چند|کی|چندم|چیست|چیه|چندمین|چجوری|چگونه)\b'
    r'|؟'
)


def classify_sentence(sentence):
    """Classify a Persian sentence as a question, a command, or a statement.

    Args:
        sentence: The sentence text; surrounding whitespace is ignored.

    Returns:
        'پرسشی' (interrogative) if the sentence contains a question word or
        '؟', or ends with an ASCII '?'; otherwise 'امری' (imperative) if it
        contains any entry of ``imperative_verbs`` as a whole word or phrase;
        otherwise 'خبری' (declarative).
    """
    sentence = sentence.strip()
    if _QUESTION_PATTERN.search(sentence) or sentence.endswith('?'):
        return 'پرسشی'

    # Escape each entry so the list is always matched literally.
    imperative_pattern = (
        r'\b(?:' + '|'.join(re.escape(verb) for verb in imperative_verbs) + r')\b'
    )
    if re.search(imperative_pattern, sentence):
        return 'امری'
    return 'خبری'
def clean_text(text):
    """Strip URLs and disallowed characters from *text* and re-tokenize it.

    Steps:
      1. Remove ``http://``/``https://`` URLs and ``www.`` addresses.
      2. Replace every character that is not an Arabic-script letter in the
         allowed range, an ASCII digit, whitespace, or one of ``# @ _ ؟``
         with a space.
      3. Collapse runs of whitespace and trim the ends.
      4. Tokenize with hazm's ``word_tokenize`` and join the tokens with
         single spaces.

    Stop-word removal and lemmatization are not applied.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned, space-joined token string.
    """
    # Also match plain http:// links; otherwise fragments of the URL (digits,
    # '_', '#', '@') survive the character filter below.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # The range starts at hamza (U+0621) rather than alef (U+0627) so that
    # 'آ' and the hamza-carrying letters (أ ؤ إ ئ) are kept; with the narrower
    # range 'آیا' was reduced to 'یا'.
    text = re.sub(r'[^ء-ی0-9\s#@_؟]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = word_tokenize(text)
    return ' '.join(words)
def process_sentence(sentence):
    """Analyze one sentence and return a three-line text report.

    Args:
        sentence: The raw sentence text.

    Returns:
        A string of the form
        ``"Type: <type>\\nSentiment: <label>\\nCleaned Text: <cleaned>"``.
    """
    cleaned = clean_text(sentence)
    # Classify the raw sentence, as process_file does. clean_text replaces
    # every character outside its allow-list (ASCII '?' included) with a
    # space, so classifying the cleaned text could never trigger
    # classify_sentence's trailing-'?' rule.
    sentence_type = classify_sentence(sentence)
    sentiment = predict_sentiment_labels(sentence)
    return f"Type: {sentence_type}\nSentiment: {sentiment}\nCleaned Text: {cleaned}"
# Create the folder that processed files are written to, if it is missing.
# exist_ok=True avoids the check-then-create race of testing os.path.exists
# first, which fails if something creates the folder in between.
output_folder = "./outputs"
os.makedirs(output_folder, exist_ok=True)
def process_file(file):
    """Annotate every comment in a CSV file and write the result to disk.

    The CSV must contain a ``Comment`` column. Three columns are added:
    ``Cleaned_Comment`` (``clean_text``), ``Type`` (``classify_sentence``) and
    ``Sentiment`` (``predict_sentiment_labels``). The result is written as
    UTF-8 with BOM to ``processed_file.csv`` inside ``output_folder``.

    Args:
        file: An object exposing the CSV path as ``.name``, or anything
            ``pandas.read_csv`` accepts directly (such as a path string).

    Returns:
        The path of the written CSV on success; otherwise a human-readable
        error message. Failures are reported as text rather than raised.
    """
    if file is None:
        return "Error: No file provided."

    # Keep supporting objects that carry the path in `.name`, but also accept
    # a plain path.
    csv_source = getattr(file, 'name', file)
    try:
        df = pd.read_csv(csv_source)
        if 'Comment' not in df.columns:
            return "Error: No 'Comment' column found in the file."

        # Empty cells are read as NaN (float) and would make the regex-based
        # helpers raise TypeError, aborting the whole file. Feed the helpers
        # strings; the original column is left untouched.
        comments = df['Comment'].fillna('').astype(str)
        df['Cleaned_Comment'] = comments.apply(clean_text)
        df['Type'] = comments.apply(classify_sentence)
        df['Sentiment'] = comments.apply(predict_sentiment_labels)

        # The folder may have been removed since import time.
        os.makedirs(output_folder, exist_ok=True)
        processed_file_path = os.path.join(output_folder, "processed_file.csv")
        df.to_csv(processed_file_path, index=False, encoding='utf-8-sig')
        return processed_file_path
    except Exception as e:
        # Boundary handler: the caller receives the error text, not a traceback.
        return str(e)