import pandas as pd

from preprocesing_text import *


def select_unique(text):
    # return the unique values of a text collection
    return pd.Series(text).unique()


# preprocessing using the functions provided by preprocesing_text
def text_preprocesing(dataset, mode):
    if mode == "satuan":
        # single-user mode: dataset is a {username: tweet} dict
        Twit_data = pd.DataFrame(list(dataset.items()), columns=['username', 'tweets'])

        # basic cleaning
        Twit_data['tweets'] = Twit_data['tweets'].astype(str)
        Twit_data['tweets'] = Twit_data['tweets'].str.lower()
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_tweet_special)
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_number)
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_punctuation)
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_whitespace_LT)
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_whitespace_multiple)
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_singl_char)

        # tokenizing, stopword removal, and normalization
        Twit_data['tweets_tokenizing'] = Twit_data['tweets'].apply(word_tokenize_wrapper)
        Twit_data['tweet_tokens_WSW'] = Twit_data['tweets_tokenizing'].apply(stopwords_removal)
        Twit_data['tweet_normalized'] = Twit_data['tweet_tokens_WSW'].apply(normalized_term)

        # stemming with Sastrawi: build the term dictionary once, then map every token
        term_dict = normalize_documents_sastrawi(Twit_data['tweet_normalized'])

        def get_stemmed_term(document):
            return [term_dict[term] for term in document]

        Twit_data['tweet_tokens_stemmed'] = Twit_data['tweet_normalized'].apply(get_stemmed_term)
        # Twit_data['tweet_tokens_stemmed'] = Twit_data['tweet_normalized'].swifter.apply(get_stemmed_term)
        # print(Twit_data['tweet_tokens_stemmed'])

    else:
        # labelled mode: dataset is an iterable of (label, tweet) pairs
        Twit_data = pd.DataFrame(dataset, columns=['label', 'tweets'])

        # basic cleaning
        Twit_data['tweets'] = Twit_data['tweets'].astype(str)
        Twit_data['tweets'] = Twit_data['tweets'].str.lower()
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_tweet_special)
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_number)
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_punctuation)
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_whitespace_LT)
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_whitespace_multiple)
        Twit_data['tweets'] = Twit_data['tweets'].apply(remove_singl_char)

        # tokenizing, stopword removal, and normalization
        Twit_data['tweets_tokenizing'] = Twit_data['tweets'].apply(word_tokenize_wrapper)

        print("please wait...")

        Twit_data['tweet_tokens_WSW'] = Twit_data['tweets_tokenizing'].apply(stopwords_removal)
        Twit_data['tweet_normalized'] = Twit_data['tweet_tokens_WSW'].apply(normalized_term)

        # stemming with Sastrawi: build the term dictionary once, then map every token
        term_dict = normalize_documents_sastrawi(Twit_data['tweet_normalized'])

        def get_stemmed_term(document):
            return [term_dict[term] for term in document]

        Twit_data['tweet_tokens_stemmed'] = Twit_data['tweet_normalized'].apply(get_stemmed_term)
        # Twit_data['tweet_tokens_stemmed'] = Twit_data['tweet_normalized'].swifter.apply(get_stemmed_term)
        # print(Twit_data['tweet_tokens_stemmed'])

    return Twit_data['tweet_tokens_stemmed']
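

# A minimal usage sketch, not part of the original module: the sample tweets and
# the "dataset" mode string below are hypothetical (any mode other than "satuan"
# selects the labelled branch), and preprocesing_text must be importable.
if __name__ == "__main__":
    # single-user mode: a {username: tweet} dict
    sample_single = {"user_a": "Contoh tweet pertama!!", "user_b": "Tweet kedua 123"}
    print(text_preprocesing(sample_single, "satuan"))

    # labelled mode: (label, tweet) pairs
    sample_labelled = [("positif", "contoh tweet berlabel"), ("negatif", "tweet kedua yang lain")]
    print(text_preprocesing(sample_labelled, "dataset"))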