import re

# --- Patterns for clean_twitter_data, compiled once at import time -----------
_MENTION_RE = re.compile(r'@[A-Za-z0-9_]+')
_HASHTAG_RE = re.compile(r'#\w+')
_RT_RE = re.compile(r'RT[\s]+')
_URL_RE = re.compile(r'https?://\S+')
_NON_ALNUM_RE = re.compile(r'[^A-Za-z0-9 ]')
_WS_RE = re.compile(r'\s+')

# Slang / abbreviation dictionary for Indonesian tweet normalization.
# 'dong' maps to '' — it is a filler particle with no content-word equivalent,
# so it is simply removed.
_KAMUS_NORMALISASI = {
    'bi': 'bank', 'ri': 'indonesia', 'akn': 'akan', 'gmn': 'bagaimana',
    'ga': 'tidak', 'gak': 'tidak', 'nggak': 'tidak', 'yg': 'yang',
    'kalo': 'kalau', 'aja': 'saja', 'nih': 'ini', 'dong': '',
    'banget': 'sangat', 'bro': 'teman', 'sis': 'teman', 'dgn': 'dengan',
    'bgt': 'sangat', 'blm': 'belum', 'jgn': 'jangan', 'tdk': 'tidak',
}

# Indonesian negation words recognized by convert_negation.
_NEGATION_WORDS = frozenset(['tidak', 'bukan', 'tak', 'jangan', 'belum'])

# Indonesian stopwords removed by remove_stopwords (matched case-insensitively).
_STOPWORDS = frozenset([
    'yang', 'di', 'ke', 'dari', 'dan', 'atau', 'itu', 'ini', 'dengan',
    'pada', 'untuk', 'ada', 'sangat', 'dalam', 'oleh', 'karena',
])

# Cached Sastrawi stemmer instance (built lazily on first use).
_STEMMER = None


def clean_twitter_data(text):
    """Cleanse raw tweet text.

    Removes mentions, hashtags, the leading 'RT ' retweet marker, URLs and
    all non-alphanumeric characters, then collapses runs of whitespace into
    a single space and strips the ends.
    """
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)
    text = _RT_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _NON_ALNUM_RE.sub('', text)
    return _WS_RE.sub(' ', text).strip()


def normalize_text(text):
    """Replace slang/abbreviations using the normalization dictionary.

    Words that normalize to the empty string (e.g. 'dong') are dropped
    entirely, so the result never contains empty tokens or double spaces.
    """
    normalized = (_KAMUS_NORMALISASI.get(word, word) for word in text.split())
    return ' '.join(word for word in normalized if word)


def convert_negation(text):
    """Join each negation word with the word that follows it.

    E.g. 'tidak suka' -> 'tidak-suka', so the negated pair survives stopword
    removal and is stemmed as a single sentiment-bearing token.

    A sentence-final negation word with nothing after it is kept as-is
    (the previous implementation silently dropped it). When several negation
    words occur in a row, only the last one is attached to the next token,
    matching the original behavior.
    """
    new_tokens = []
    pending = None  # negation word waiting to be attached to the next token
    for token in text.split():
        if token in _NEGATION_WORDS:
            pending = token
        elif pending:
            new_tokens.append(f"{pending}-{token}")
            pending = None
        else:
            new_tokens.append(token)
    if pending:
        # Bug fix: don't lose a trailing negation word.
        new_tokens.append(pending)
    return ' '.join(new_tokens)


def remove_stopwords(text):
    """Drop common Indonesian stopwords; matching is case-insensitive."""
    return ' '.join(w for w in text.split() if w.lower() not in _STOPWORDS)


def _get_stemmer():
    """Lazily build and cache a single Sastrawi stemmer.

    The import is deferred so this module can be used for cleaning and
    normalization even when Sastrawi is not installed, and the factory
    (which loads its dictionary) runs only once instead of on every
    stemming() call.
    """
    global _STEMMER
    if _STEMMER is None:
        from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
        _STEMMER = StemmerFactory().create_stemmer()
    return _STEMMER


def stemming(tokenized_text):
    """Stem each token with the Sastrawi Indonesian stemmer.

    Takes an iterable of tokens and returns the stemmed tokens joined by
    single spaces.
    """
    stemmer = _get_stemmer()
    return " ".join(stemmer.stem(w) for w in tokenized_text)


def preprocess_text(text):
    """Run the full preprocessing pipeline, recording every intermediate step.

    Returns a dict with keys: original_text, case_folding, cleansing,
    normalization, convert_negation, stopwords, tokenizing (a list of
    tokens) and stemming (a space-joined string).

    Note: the original inline comments numbered the steps out of order
    ("4. Normalization" before "3. Convert Negation"); the numbering below
    reflects the actual — and unchanged — execution order.
    """
    steps = {"original_text": text}

    # 1. Case folding: lowercase everything.
    case_folding = text.lower()
    steps["case_folding"] = case_folding

    # 2. Cleansing: remove mentions, hashtags, URLs, etc.
    cleansing = clean_twitter_data(case_folding)
    steps["cleansing"] = cleansing

    # 3. Normalization: map slang/abbreviations via the dictionary.
    normalization = normalize_text(cleansing)
    steps["normalization"] = normalization

    # 4. Negation conversion: fuse negation words with the following word.
    negation_converted = convert_negation(normalization)
    steps["convert_negation"] = negation_converted

    # 5. Stopword removal.
    no_stopwords = remove_stopwords(negation_converted)
    steps["stopwords"] = no_stopwords

    # 6. Tokenizing: split into a token list.
    tokenizing = no_stopwords.split()
    steps["tokenizing"] = tokenizing

    # 7. Stemming each token (requires the Sastrawi package).
    steps["stemming"] = stemming(tokenizing)

    return steps