import joblib
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import emoji

# Load the trained classifier and the vectorizer used to build its features.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- these files must only ever come from a trusted source.
model = joblib.load("hard_voting_classifier.pkl")
vectorizer = joblib.load("vectorizer.pkl")

# Load custom stopwords (one per line). The encoding is pinned so the result
# does not depend on the platform's locale default.
with open("Indonesia_stopwords.txt", "r", encoding="utf-8") as f:
    custom_stopwords = [line.strip() for line in f]

# Patterns applied, in this order, during sentence normalization.
_URL_RE = re.compile(r'http[s]?://\S+')
_DIGITS_RE = re.compile(r'\d+')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')

# Stemmer shared by every preprocess_data() call; built on first use.
_stemmer = None


def _get_stemmer():
    """Return the shared Sastrawi stemmer, creating it on first use."""
    global _stemmer
    if _stemmer is None:
        _stemmer = StemmerFactory().create_stemmer()
    return _stemmer


def preprocess_data(text):
    """Preprocess the input text.

    Steps, in order: lowercase the text, replace emojis with their textual
    names, strip URLs, strip digits, strip every remaining character that is
    not an ASCII letter, digit or whitespace, split on whitespace, drop
    tokens listed in ``custom_stopwords`` and stem the remaining tokens.

    Args:
        text: The raw input string.

    Returns:
        The stemmed tokens joined by single spaces (an empty string when no
        token survives).
    """
    # NOTE(review): the normalization steps are intentionally unchanged --
    # presumably the pickled vectorizer was fit on text cleaned exactly this
    # way; confirm against the training code before altering them.

    # Case Folding
    text = text.lower()

    # Sentence Normalization
    text = emoji.demojize(text)  # Translate emojis to their word representation
    text = _URL_RE.sub('', text)  # Remove URLs
    text = _DIGITS_RE.sub('', text)  # Remove numbers
    # Remove non-alphanumeric characters except for spaces; this also strips
    # any punctuation contained in the emoji names inserted above.
    text = _NON_ALNUM_RE.sub('', text)

    # Tokenization & Stopword Removal. A set gives constant-time membership
    # tests instead of scanning the stopword list once per token; it is built
    # per call so later changes to custom_stopwords are still honoured.
    stopwords = set(custom_stopwords)
    tokens = [word for word in text.split() if word not in stopwords]

    # Stemming with the shared stemmer: constructing a new one on every call
    # is wasted work and throws away whatever the stemmer has cached.
    stemmer = _get_stemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)

def predict_sentiment(text):
    """Classify the sentiment of *text*.

    The text is cleaned with ``preprocess_data``, converted to features by the
    module-level ``vectorizer`` and handed to the module-level ``model``.

    Returns:
        ``"Positive"`` when the first predicted label equals 1, otherwise
        ``"Negative"``.
    """
    cleaned = preprocess_data(text)
    features = vectorizer.transform([cleaned])  # single-document batch
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"