import re

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def clean_twitter_data(text):
    # Cleansing: strip mentions, hashtags, RT markers, URLs, and non-alphanumeric characters
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#\w+', '', text)
    # preprocess_text lowercases before cleansing, so match "rt" case-insensitively
    text = re.sub(r'\brt\s+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

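# Illustrative check (hypothetical tweet, not from any dataset):
#   clean_twitter_data("rt @bank_indonesia suku bunga naik https://t.co/x #BI!")
#   -> "suku bunga naik"
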
def normalize_text(text):
    # Normalization dictionary: map slang and abbreviations to standard Indonesian
    kamus_normalisasi = {
        'bi': 'bank',
        'ri': 'indonesia',
        'akn': 'akan',
        'gmn': 'bagaimana',
        'ga': 'tidak',
        'gak': 'tidak',
        'nggak': 'tidak',
        'yg': 'yang',
        'kalo': 'kalau',
        'aja': 'saja',
        'nih': 'ini',
        'dong': '',  # filler particle, dropped entirely
        'banget': 'sangat',
        'bro': 'teman',
        'sis': 'teman',
        'dgn': 'dengan',
        'bgt': 'sangat',
        'blm': 'belum',
        'jgn': 'jangan',
        'tdk': 'tidak',
    }
    words = text.split()
    normalized_words = [kamus_normalisasi.get(word, word) for word in words]
    # Drop empty strings produced by mappings such as 'dong' -> ''
    return ' '.join(w for w in normalized_words if w)

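# Illustrative check (hypothetical input):
#   normalize_text("bi ga akn naik bgt dong") -> "bank tidak akan naik sangat"
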
def convert_negation(text):
    # Merge each negation word with the token that follows it, e.g. "tidak naik" ->
    # "tidak-naik", so the negated phrase survives as a single token downstream
    negation_words = ['tidak', 'bukan', 'tak', 'jangan', 'belum']
    tokens = text.split()
    new_tokens = []
    negate = False
    negation_word = ''
    for token in tokens:
        if token in negation_words:
            negate = True
            negation_word = token
        elif negate:
            new_tokens.append(f"{negation_word}-{token}")
            negate = False
        else:
            new_tokens.append(token)
    # Keep a trailing negation word instead of silently dropping it
    if negate:
        new_tokens.append(negation_word)
    return ' '.join(new_tokens)

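# Illustrative check (hypothetical input):
#   convert_negation("rupiah tidak stabil") -> "rupiah tidak-stabil"
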
def remove_stopwords(text):
    stopwords = set([
        'yang', 'di', 'ke', 'dari', 'dan', 'atau', 'itu', 'ini', 'dengan',
        'pada', 'untuk', 'ada', 'sangat', 'dalam', 'oleh', 'karena'
    ])
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(filtered_words)

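# Illustrative check (hypothetical input):
#   remove_stopwords("suku bunga yang sangat tinggi") -> "suku bunga tinggi"
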
def stemming(tokenized_text):
    # Sastrawi stemmer for Indonesian; building the factory on every call is
    # slow, so for large corpora create the stemmer once and reuse it
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stemmed_words = [stemmer.stem(w) for w in tokenized_text]
    return " ".join(stemmed_words)

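# Illustrative check (exact output depends on the installed Sastrawi dictionary):
#   stemming(["pertumbuhan", "perekonomian"]) -> "tumbuh ekonomi"
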
def preprocess_text(text):
    steps = {}
    steps["original_text"] = text

    # 1. Case folding: convert to lowercase
    case_folding = text.lower()
    steps["case_folding"] = case_folding

    # 2. Cleansing: remove mentions, hashtags, URLs, etc.
    cleansing = clean_twitter_data(case_folding)
    steps["cleansing"] = cleansing

    # 3. Normalization: map slang/abbreviations via the dictionary
    normalization = normalize_text(cleansing)
    steps["normalization"] = normalization

    # 4. Negation conversion: merge negation words with the following token
    negation_converted = convert_negation(normalization)
    steps["convert_negation"] = negation_converted

    # 5. Stopword removal
    no_stopwords = remove_stopwords(negation_converted)
    steps["stopwords"] = no_stopwords

    # 6. Tokenizing: split the text into tokens
    tokenizing = no_stopwords.split()
    steps["tokenizing"] = tokenizing

    # 7. Stemming: stem each token
    stemming_result = stemming(tokenizing)
    steps["stemming"] = stemming_result

    return steps
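
# Minimal usage sketch; the sample tweet is hypothetical, not from any dataset.
if __name__ == "__main__":
    sample = "RT @bank_indonesia Rupiah ga akn melemah bgt https://t.co/xyz #rupiah"
    for step, result in preprocess_text(sample).items():
        print(f"{step}: {result}")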