# model_utils.py
import os
import nltk
import ssl
import tempfile

# Create a temporary directory for NLTK data
nltk_data_dir = tempfile.mkdtemp()

# Set the NLTK data path
nltk.data.path.append(nltk_data_dir)

# Allow the NLTK download to proceed on machines without valid SSL certificates
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download the stopwords corpus into the temporary directory
nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)

from nltk.corpus import stopwords
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# Big Five personality trait labels: Agreeableness, Conscientiousness,
# Extraversion, Openness (to experience), and Neuroticism
traits = ['cAGR', 'cCON', 'cEXT', 'cOPN', 'cNEU']

def preprocess(docs):
    """Normalize a list of documents: lowercase them via a Keras tokenizer
    round-trip, expand common contractions, and remove English stopwords."""
    stopwrd = set(stopwords.words('english'))
    # The filter list deliberately omits the apostrophe so contractions survive
    # until abbreviation_handler expands them
    t = Tokenizer(num_words=20000, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
    t.fit_on_texts(docs)
    encoded_docs = t.texts_to_sequences(docs)
    # Reverse mapping from token id back to word, used to rebuild the text
    idx2word = {v: k for k, v in t.word_index.items()}

    def abbreviation_handler(text):
        # Expand common contractions; the "'t" -> " not" rule turns "don't" into
        # "don not", whose leftover "don" is later removed as a stopword
        ln = text.lower()
        ln = ln.replace(r"'t", " not")
        ln = ln.replace(r"'s", " is")
        ln = ln.replace(r"'ll", " will")
        ln = ln.replace(r"'ve", " have")
        ln = ln.replace(r"'re", " are")
        ln = ln.replace(r"'m", " am")
        ln = ln.replace(r"'", " ")
        return ln

    def stopwords_handler(text):
        words = text.split()
        new_words = [w for w in words if w not in stopwrd]
        return ' '.join(new_words)

    def sequence_to_text(listOfSequences):
        # Rebuild a cleaned text string from each sequence of token ids
        tokenized_list = []
        for text in listOfSequences:
            newText = ''
            for num in text:
                newText += idx2word[num] + ' '
            newText = abbreviation_handler(newText)
            newText = stopwords_handler(newText)
            tokenized_list.append(newText)
        return tokenized_list

    newLists = sequence_to_text(encoded_docs)
    return newLists
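
# Example (illustrative; assumed behavior, not part of the original module):
#   preprocess(["I don't like rainy days"])
# lowercases the text, expands the contraction, and strips stopwords, yielding
# something like ["like rainy days"].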

def tokenize_text(text, hugging_model='roberta-base'):
    """Preprocess a list of texts and encode them with the Hugging Face
    tokenizer, returning a dict of TF tensors ready for model.predict."""
    clean_text = preprocess(text)
    tokenizer = AutoTokenizer.from_pretrained(hugging_model)
    inputs = tokenizer(clean_text, padding=True, truncation=True, return_tensors='tf')
    x = dict(inputs)
    return x

def single_predict(model, text, traits=['cAGR', 'cCON', 'cEXT', 'cOPN', 'cNEU']):
    """Predict the five traits for a single text. Returns per-trait sigmoid
    probabilities and binary labels thresholded at 0.5."""
    traits_scores = dict()
    predicted_labels = dict()
    x = tokenize_text([text])
    logits = model.predict(x, verbose=0).logits
    # Multi-label setup: an independent sigmoid per trait, thresholded at 0.5
    probs = tf.math.sigmoid(logits).numpy()
    predictions = np.where(probs > 0.5, 1, 0)
    for t, s in zip(traits, probs[0]):
        traits_scores[t] = float(s)  # Convert numpy.float32 to Python float
    for t, l in zip(traits, predictions[0]):
        predicted_labels[t] = int(l)  # Convert numpy.int64 to Python int
    final_dic = {'probability': traits_scores, 'predicted_label': predicted_labels}
    return final_dic

def load_model_and_weights(hugging_model='roberta-base', output_folder='.'):
    """Build a multi-label sequence classification model from a Hugging Face
    checkpoint and, if available, load fine-tuned weights from output_folder."""
    print(f"Current working directory: {os.getcwd()}")
    print(f"Output folder: {output_folder}")
    print("Files in the output folder:")
    for file in os.listdir(output_folder):
        print(f"- {file}")

    model = TFAutoModelForSequenceClassification.from_pretrained(
        hugging_model, num_labels=len(traits), problem_type="multi_label_classification"
    )
    # Use only the model name (drop the organization prefix in ids like 'org/model')
    _hugging_model = hugging_model.split('/')[-1]

    weights_path = os.path.join(output_folder, f'weights-{_hugging_model}.h5')
    print(f"Looking for weights file at: {weights_path}")
    if os.path.exists(weights_path):
        try:
            model.load_weights(weights_path)
            print("Custom weights loaded successfully.")
        except Exception as e:
            print(f"Error loading weights: {str(e)}")
            print("Using default weights.")
            # Fall through and return the model with its default pretrained weights
    else:
        print(f"Warning: Custom weights file not found at {weights_path}")
        print("Using default weights.")
    return model
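
# Illustrative usage (a minimal sketch, not part of the original module). It
# assumes fine-tuned weights named 'weights-roberta-base.h5' are present in the
# current directory; without them the base pretrained weights are used and the
# scores will not be meaningful.
if __name__ == "__main__":
    clf_model = load_model_and_weights(hugging_model='roberta-base', output_folder='.')
    result = single_predict(clf_model, "I love meeting new people and trying new things.")
    print(result['probability'])       # per-trait sigmoid scores, e.g. {'cAGR': 0.61, ...}
    print(result['predicted_label'])   # binary labels thresholded at 0.5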