from data_analysis import df
from nltk.tokenize import word_tokenize
import re
import pandas as pd
import nltk
# Removing duplicates
# df = df.drop_duplicates(subset='Text')
# df = df.reset_index(drop=True)
nltk.download('punkt')
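# Note: newer NLTK releases (3.9+) moved word_tokenize's data to the
# 'punkt_tab' resource; if tokenization fails after downloading 'punkt',
# this extra download may be needed:
# nltk.download('punkt_tab')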
# List of punctuation and other non-alphanumeric tokens to remove
nonalphanumeric = ['\'', '.', ',', '\"', ':', ';', '!', '@', '#', '$', '%', '^', '&',
                   '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\', '?',
                   '/', '>', '<', '|', ' ']
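
# A more compact way to build roughly the same list (a sketch using only the
# standard library; note that string.punctuation also includes '`' and '~',
# which the hand-written list above omits):
# import string
# nonalphanumeric = list(string.punctuation) + [' ']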

def clean_text(text):
    """
    Clean and preprocess text data: tokenize, lowercase,
    and strip punctuation tokens.
    """
    # Tokenize the text using NLTK's word_tokenize
    tokens = word_tokenize(text)
    # Lowercase the tokens and drop punctuation / non-alphanumeric tokens
    words = [word.lower() for word in tokens if word not in nonalphanumeric]
    # Join the cleaned tokens back into a single string
    cleaned_text = " ".join(words)
    return cleaned_text
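
# A quick illustration of clean_text on a hypothetical input (output assumes
# NLTK's standard English tokenizer):
#   clean_text("Hello, World!")  ->  "hello world"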

def remove_english(text):
    """
    Take text as input and return it with runs of ASCII letters
    (English words) removed.
    """
    pat = "[a-zA-Z]+"
    text = re.sub(pat, "", text)
    return text
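
# A quick illustration of remove_english on a hypothetical mixed-language
# string (whitespace around the removed words is preserved):
#   remove_english("你好 hello 世界 world")  ->  "你好  世界 "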
# Applying the clean_text function to every row in the 'Text' column
# df['clean_text'] = df['Text'].apply(clean_text)
# # Removing English words from the Chinese text
# df_Chinese = df[df['language'] == 'Chinese'].copy()  # Chinese rows in the dataset
# chinese_clean = df_Chinese['clean_text'].apply(remove_english)  # removing English words
# df_Chinese.loc[:, 'clean_text'] = chinese_clean
# # Drop the original Chinese rows first, then append the cleaned Chinese rows;
# # concatenating before dropping would discard the cleaned rows along with the originals
# df = df[~df['language'].isin(['Chinese'])]
# df = pd.concat([df, df_Chinese], axis=0, ignore_index=True)
# # Shuffling the DataFrame and resetting the index
# df = df.sample(frac=1).reset_index(drop=True)
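
# A simpler alternative to the split/concat steps above (a sketch, assuming the
# same 'language' and 'clean_text' columns): update the Chinese rows in place
# with .loc instead of rebuilding the DataFrame.
# mask = df['language'] == 'Chinese'
# df.loc[mask, 'clean_text'] = df.loc[mask, 'clean_text'].apply(remove_english)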