from data_analysis import df
from nltk.tokenize import word_tokenize
import re
import pandas as pd
import nltk

# Removing duplicates
# df = df.drop_duplicates(subset='Text')
# df = df.reset_index(drop=True)

# Download the Punkt tokenizer models used by word_tokenize
nltk.download('punkt')
# Punctuation and other non-alphanumeric tokens to drop after tokenization
nonalphanumeric = ['\'', '.', ',', '\"', ':', ';', '!', '@', '#', '$', '%', '^', '&',
                   '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\', '?',
                   '/', '>', '<', '|', ' ']
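# A minimal alternative sketch (not used below): string.punctuation covers most of the
# characters listed above, and a set gives constant-time membership checks. Whether it
# is an acceptable substitute for the hand-written list is an assumption.
# import string
# punct_tokens = set(string.punctuation) | {' '}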
def clean_text(text):
    """
    Clean and preprocess text data: tokenize, drop punctuation, and lowercase.
    """
    # Tokenize the text using NLTK's word_tokenize
    tokens = word_tokenize(text)
    # Drop punctuation tokens and lowercase the remaining words
    words = [word.lower() for word in tokens if word not in nonalphanumeric]
    # Join the cleaned words back into a single string
    cleaned_text = " ".join(words)
    return cleaned_text
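# Quick sanity check (a sketch; the exact tokens depend on the installed NLTK version):
# clean_text("Hello, World! It's a test.")  # -> "hello world it 's a test"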
def remove_english(text):
    """
    Remove runs of English (ASCII) letters from the given text.
    """
    pat = "[a-zA-Z]+"
    text = re.sub(pat, "", text)
    return text
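# Example (a sketch): English letters are stripped, but surrounding whitespace remains.
# remove_english("这是 English mixed 文本")  # -> "这是   文本"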
# Applying the clean_text function to every row in the 'Text' column
# df['clean_text'] = df['Text'].apply(clean_text)

# # Removing English words from the Chinese text
# df_Chinese = df[df['language'] == 'Chinese'].copy()  # Chinese rows in the dataset
# chinese_clean = df_Chinese['clean_text'].apply(remove_english)  # removing English words
# df_Chinese.loc[:, 'clean_text'] = chinese_clean
# # Drop the original Chinese rows first, then append the cleaned Chinese rows back
# df = df[~df['language'].isin(['Chinese'])].reset_index(drop=True)
# df = pd.concat([df, df_Chinese], axis=0, ignore_index=True)
# # Shuffling the dataframe and resetting the index
# df = df.sample(frac=1).reset_index(drop=True)
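# An equivalent in-place sketch (assumes df already has the 'clean_text' column):
# updating the Chinese rows with .loc avoids the concat-and-drop round trip and keeps
# the original row order, so the final shuffle is the only reordering step.
# chinese_mask = df['language'] == 'Chinese'
# df.loc[chinese_mask, 'clean_text'] = df.loc[chinese_mask, 'clean_text'].apply(remove_english)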