import pandas as pd

# Preprocessing uses the helper functions defined in preprocesing_text
# (cleaning, tokenizing, stopword removal, slang normalization, and
# Sastrawi stemming).
from preprocesing_text import *


def select_unique(text):
    # Return the unique values of the input as an array.
    return pd.Series(text).unique()


def text_preprocesing(dataset, mode):
    # Build the DataFrame according to the input shape:
    # "satuan" (single/unit) mode expects a {username: tweet} dict,
    # any other mode expects (label, tweet) records.
    if mode == "satuan":
        Twit_data = pd.DataFrame(list(dataset.items()), columns=['username', 'tweets'])
    else:
        Twit_data = pd.DataFrame(dataset, columns=['label', 'tweets'])

    # Case folding and text cleaning.
    Twit_data['tweets'] = Twit_data['tweets'].astype(str).str.lower()
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_tweet_special)
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_number)
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_punctuation)
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_whitespace_LT)
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_whitespace_multiple)
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_singl_char)

    # Tokenizing.
    Twit_data['tweets_tokenizing'] = Twit_data['tweets'].apply(word_tokenize_wrapper)
    if mode != "satuan":
        print("please wait...")

    # Stopword removal and slang-word normalization.
    Twit_data['tweet_tokens_WSW'] = Twit_data['tweets_tokenizing'].apply(stopwords_removal)
    Twit_data['tweet_normalized'] = Twit_data['tweet_tokens_WSW'].apply(normalized_term)

    # Stemming with Sastrawi: build a term -> stem dictionary once over the
    # whole corpus, then map every token through it.
    term_dict = normalize_documents_sastrawi(Twit_data['tweet_normalized'])

    def get_stemmed_term(document):
        return [term_dict[term] for term in document]

    Twit_data['tweet_tokens_stemmed'] = Twit_data['tweet_normalized'].apply(get_stemmed_term)
    # Faster alternative if swifter is installed:
    # Twit_data['tweet_tokens_stemmed'] = Twit_data['tweet_normalized'].swifter.apply(get_stemmed_term)

    return Twit_data['tweet_tokens_stemmed']
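
# Minimal usage sketch (assumption: preprocesing_text provides the helper
# functions imported above, and the input shapes match what text_preprocesing
# expects; the sample tweets and the "batch" mode label are hypothetical).
if __name__ == "__main__":
    # "satuan" mode: a {username: tweet} mapping for individual tweets.
    single = {"user_a": "Contoh tweet pertama!!! 123 http://t.co/xyz"}
    print(text_preprocesing(single, "satuan"))

    # Any other mode: a list of (label, tweet) records.
    batch = [("positif", "Contoh tweet kedua :) @seseorang")]
    print(text_preprocesing(batch, "batch"))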