TIF_E41202548/FlaskSentimen/testku.py

from preprocesing_text import *
import pandas as pd


def select_unique(text):
    return pd.Series(text).unique()
# Preprocessing using the functions defined in preprocesing_text
def text_preprocesing(dataset, mode):
    if mode == "satuan":
        # "satuan" (single) mode: dataset is a {username: tweet} dict
        Twit_data = pd.DataFrame(list(dataset.items()), columns=['username', 'tweets'])
    else:
        # batch mode: dataset is a sequence of (label, tweet) rows
        Twit_data = pd.DataFrame(dataset, columns=['label', 'tweets'])

    # cleaning: lowercase, then strip Twitter artefacts, numbers, punctuation,
    # leading/trailing and repeated whitespace, and stray single characters
    Twit_data['tweets'] = Twit_data['tweets'].astype(str)
    Twit_data['tweets'] = Twit_data['tweets'].str.lower()
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_tweet_special)
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_number)
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_punctuation)
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_whitespace_LT)
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_whitespace_multiple)
    Twit_data['tweets'] = Twit_data['tweets'].apply(remove_singl_char)

    # tokenization, stopword removal, and term normalization
    Twit_data['tweets_tokenizing'] = Twit_data['tweets'].apply(word_tokenize_wrapper)
    if mode != "satuan":
        print("please wait...")
    Twit_data['tweet_tokens_WSW'] = Twit_data['tweets_tokenizing'].apply(stopwords_removal)
    Twit_data['tweet_normalized'] = Twit_data['tweet_tokens_WSW'].apply(normalized_term)

    # stemming: normalize_documents_sastrawi builds a {term: stemmed_term}
    # lookup covering every term in the corpus
    term_dict = normalize_documents_sastrawi(Twit_data['tweet_normalized'])

    def get_stemmed_term(document):
        return [term_dict[term] for term in document]

    Twit_data['tweet_tokens_stemmed'] = Twit_data['tweet_normalized'].apply(get_stemmed_term)
    # swifter can parallelize the apply on large datasets:
    # Twit_data['tweet_tokens_stemmed'] = Twit_data['tweet_normalized'].swifter.apply(get_stemmed_term)

    return Twit_data['tweet_tokens_stemmed']
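

# ---------------------------------------------------------------------------
# For reference: the helpers used above live in preprocesing_text.py, which is
# not shown in this file. Below is a minimal sketch of what that module might
# contain, assuming NLTK for tokenization and Sastrawi for Indonesian stopword
# removal and stemming. Every implementation here is a hypothetical
# reconstruction, not the project's actual code.
import re
import string

from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def remove_tweet_special(text):
    # strip retweet markers, mentions, hashtags, and URLs
    text = re.sub(r"\brt\b", " ", text)
    text = re.sub(r"[@#][A-Za-z0-9_]+", " ", text)
    return re.sub(r"https?://\S+", " ", text)


def remove_number(text):
    return re.sub(r"\d+", "", text)


def remove_punctuation(text):
    return text.translate(str.maketrans("", "", string.punctuation))


def remove_whitespace_LT(text):
    # LT = leading/trailing
    return text.strip()


def remove_whitespace_multiple(text):
    return re.sub(r"\s+", " ", text)


def remove_singl_char(text):
    # drop stand-alone single characters left over from cleaning
    return re.sub(r"\b\w\b", "", text)


def word_tokenize_wrapper(text):
    return word_tokenize(text)


_stop_words = set(StopWordRemoverFactory().get_stop_words())


def stopwords_removal(tokens):
    return [token for token in tokens if token not in _stop_words]


def normalized_term(tokens):
    # slang/abbreviation normalization would normally consult a lookup table;
    # the identity mapping here is only a placeholder
    return tokens


_stemmer = StemmerFactory().create_stemmer()


def normalize_documents_sastrawi(documents):
    # build a {term: stemmed_term} lookup covering every token in the corpus,
    # so each unique term is stemmed exactly once
    term_dict = {}
    for document in documents:
        for term in document:
            if term not in term_dict:
                term_dict[term] = _stemmer.stem(term)
    return term_dict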
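

# ---------------------------------------------------------------------------
# Hypothetical usage sketch for text_preprocesing in both modes; the sample
# inputs below are made up for illustration.
if __name__ == "__main__":
    # "satuan" mode: a {username: tweet} dict
    single = {"user1": "RT @someone Aplikasinya bagus sekali, sangat membantu!"}
    print(text_preprocesing(single, mode="satuan"))

    # batch mode: (label, tweet) rows
    batch = [
        ("positif", "pelayanan cepat dan memuaskan"),
        ("negatif", "aplikasinya sering error dan lambat 123"),
    ]
    print(text_preprocesing(batch, mode="batch"))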