from data_analysis import df
from nltk.tokenize import word_tokenize
import re
import pandas as pd
import nltk
# Removing duplicates
# df = df.drop_duplicates(subset='Text')
# df = df.reset_index(drop=True)
nltk.download('punkt')
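# Note: newer NLTK releases (3.9+) moved word_tokenize's data to the
# 'punkt_tab' resource; if tokenization fails after downloading 'punkt',
# this extra download may be needed:
# nltk.download('punkt_tab')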
# List of punctuation and other non-alphanumeric tokens to remove
nonalphanumeric = ['\'', '.', ',', '\"', ':', ';', '!', '@', '#', '$', '%', '^', '&',
                   '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\', '?',
                   '/', '>', '<', '|', ' ']
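
# A more compact way to build roughly the same list (a sketch using only the
# standard library; note that string.punctuation also includes '`' and '~',
# which the hand-written list above omits):
# import string
# nonalphanumeric = list(string.punctuation) + [' ']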

def clean_text(text):
    """
    Clean and preprocess text data: tokenize, lowercase,
    and strip punctuation tokens.
    """
    # Tokenize the text using NLTK's word_tokenize
    tokens = word_tokenize(text)
    # Lowercase the tokens and drop punctuation / non-alphanumeric tokens
    words = [word.lower() for word in tokens if word not in nonalphanumeric]
    # Join the cleaned tokens back into a single string
    cleaned_text = " ".join(words)
    return cleaned_text
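
# A quick illustration of clean_text on a hypothetical input (output assumes
# NLTK's standard English tokenizer):
#   clean_text("Hello, World!")  ->  "hello world"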

def remove_english(text):
    """
    Take text as input and return it with runs of ASCII letters
    (English words) removed.
    """
    pat = "[a-zA-Z]+"
    text = re.sub(pat, "", text)
    return text
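
# A quick illustration of remove_english on a hypothetical mixed-language
# string (whitespace around the removed words is preserved):
#   remove_english("你好 hello 世界 world")  ->  "你好  世界 "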
# Applying the clean_text function to every row in the 'Text' column
# df['clean_text'] = df['Text'].apply(clean_text)
# # Removing English words from the Chinese text
# df_Chinese = df[df['language'] == 'Chinese'].copy()  # Chinese rows in the dataset
# chinese_clean = df_Chinese['clean_text'].apply(remove_english)  # removing English words
# df_Chinese.loc[:, 'clean_text'] = chinese_clean
# # Drop the original Chinese rows first, then append the cleaned Chinese rows;
# # concatenating before dropping would discard the cleaned rows along with the originals
# df = df[~df['language'].isin(['Chinese'])]
# df = pd.concat([df, df_Chinese], axis=0, ignore_index=True)
# # Shuffling the DataFrame and resetting the index
# df = df.sample(frac=1).reset_index(drop=True)
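
# A simpler alternative to the split/concat steps above (a sketch, assuming the
# same 'language' and 'clean_text' columns): update the Chinese rows in place
# with .loc instead of rebuilding the DataFrame.
# mask = df['language'] == 'Chinese'
# df.loc[mask, 'clean_text'] = df.loc[mask, 'clean_text'].apply(remove_english)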