import nltk
from nltk.tokenize import word_tokenize

# Download the tokenizer models that word_tokenize depends on.
# Note: recent NLTK releases look up 'punkt_tab' instead; if the call
# below raises a LookupError, run nltk.download('punkt_tab') as well.
nltk.download('punkt')

def tokenize_text(text: str) -> list:
    # Convert text to lowercase before tokenization
    text = text.lower()
    return word_tokenize(text)

def tokenize_doc(doc: dict) -> list:
    """Flatten a document dict into one list of lowercase tokens."""
    tokenized_doc = []
    for key, value in doc.items():
        tokenized_key = key.lower().replace("_", " ")
        tokenized_doc.append(tokenized_key)
        tokenized_doc.append(':')
        if isinstance(value, str):
            tokenized_doc.extend(tokenize_text(value))
        else:
            tokenized_doc.extend(tokenize_text(str(value)))
    return tokenized_doc

def tokenize_doc_to_str(doc: dict) -> str:
    """Return the tokenized document as a single space-separated string."""
    # Reuse tokenize_doc so values are tokenized before joining; extending
    # the list with a raw string would spread it into single characters.
    return ' '.join(tokenize_doc(doc))

# Example usage
user_message = "Tell me a joke about computers."
tokenized_message = tokenize_text(user_message)
# print(tokenized_message)
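# expected: ['tell', 'me', 'a', 'joke', 'about', 'computers', '.']
# (with the standard punkt models)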

doc = {
    "title": "Funny Computer Joke",
    "content": "Why do programmers prefer dark mode? Because light attracts bugs!",
    "rating": 5
}
tokenized_doc = tokenize_doc(doc)
# print(tokenized_doc)
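
# tokenize_doc_to_str returns the same content as a single space-separated
# string, which is convenient when a downstream API expects plain text.
tokenized_doc_str = tokenize_doc_to_str(doc)
# print(tokenized_doc_str)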