# TIF_E41211364/preprocessing.py
# Indonesian tweet preprocessing pipeline: cleansing, normalization,
# negation conversion, stopword removal, tokenizing, and stemming.
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
def clean_twitter_data(text):
    """Strip Twitter artifacts from *text*.

    Removes @mentions, #hashtags, retweet markers ("RT"), URLs, and all
    non-alphanumeric characters, then collapses runs of whitespace.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text with single spaces and no leading/trailing blanks.
    """
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)      # @mentions
    text = re.sub(r'#\w+', '', text)                # #hashtags
    # BUGFIX: the pipeline lower-cases before cleansing, so an
    # uppercase-only 'RT[\s]+' pattern never matched. Match the RT
    # marker case-insensitively as a whole word instead.
    text = re.sub(r'\bRT\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'https?://\S+', '', text)        # URLs
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)       # punctuation/symbols
    text = re.sub(r'\s+', ' ', text).strip()        # collapse whitespace
    return text
def normalize_text(text):
    """Normalize Indonesian slang/abbreviations using a fixed dictionary.

    Each whitespace-separated word is replaced by its dictionary entry when
    one exists; unknown words pass through unchanged. Entries that map to
    the empty string (filler words such as 'dong') are removed entirely.

    Args:
        text: Input text (expected to be already lower-cased).

    Returns:
        The normalized text joined by single spaces.
    """
    kamus_normalisasi = {
        'bi': 'bank',
        'ri': 'indonesia',
        'akn': 'akan',
        'gmn': 'bagaimana',
        'ga': 'tidak',
        'gak': 'tidak',
        'nggak': 'tidak',
        'yg': 'yang',
        'kalo': 'kalau',
        'aja': 'saja',
        'nih': 'ini',
        'dong': '',  # filler word: dropped from the output
        'banget': 'sangat',
        'bro': 'teman',
        'sis': 'teman',
        'dgn': 'dengan',
        'bgt': 'sangat',
        'blm': 'belum',
        'jgn': 'jangan',
        'tdk': 'tidak',
    }
    # BUGFIX: a mapping to '' used to leave an empty token behind,
    # producing a double space after ' '.join. Filter empties out.
    normalized_words = [
        replacement
        for word in text.split()
        if (replacement := kamus_normalisasi.get(word, word))
    ]
    return ' '.join(normalized_words)
def convert_negation(text):
    """Fuse each Indonesian negation word with the word that follows it.

    'tidak suka' becomes 'tidak-suka' so the negated phrase survives later
    stopword removal / stemming as a single token.

    Args:
        text: Whitespace-separated text (expected lower-cased).

    Returns:
        Text with each negation word hyphen-joined to its successor.
    """
    negation_words = {'tidak', 'bukan', 'tak', 'jangan', 'belum'}
    new_tokens = []
    negate = False
    negation_word = ''
    for token in text.split():
        if token in negation_words:
            negate = True
            negation_word = token
        elif negate:
            new_tokens.append(f"{negation_word}-{token}")
            negate = False
        else:
            new_tokens.append(token)
    # BUGFIX: a negation word at the end of the text used to be dropped
    # entirely; keep it as a plain token instead.
    if negate:
        new_tokens.append(negation_word)
    return ' '.join(new_tokens)
def remove_stopwords(text):
    """Drop common Indonesian stopwords from *text*.

    Matching is case-insensitive; surviving words keep their original
    casing and are re-joined with single spaces.

    Args:
        text: Whitespace-separated text.

    Returns:
        The text with stopwords removed.
    """
    stop_set = frozenset({
        'yang', 'di', 'ke', 'dari', 'dan', 'atau', 'itu', 'ini', 'dengan',
        'pada', 'untuk', 'ada', 'sangat', 'dalam', 'oleh', 'karena',
    })
    kept = [token for token in text.split() if token.lower() not in stop_set]
    return ' '.join(kept)
def stemming(tokenized_text):
    """Stem each token with the Sastrawi Indonesian stemmer.

    Args:
        tokenized_text: Iterable of word tokens.

    Returns:
        A single space-joined string of stemmed tokens.
    """
    # PERF: building the Sastrawi stemmer loads its root-word dictionary,
    # which is expensive. Build it once and cache it on the function
    # object so repeated calls reuse the same stemmer.
    stemmer = getattr(stemming, '_stemmer', None)
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        stemming._stemmer = stemmer
    return " ".join(stemmer.stem(w) for w in tokenized_text)
def preprocess_text(text):
    """Run the full preprocessing pipeline on one tweet.

    Stages, in order: (1) case folding, (2) cleansing of Twitter artifacts,
    (3) slang normalization, (4) negation conversion, (5) stopword removal,
    (6) tokenizing, (7) stemming.

    Args:
        text: Raw tweet text.

    Returns:
        Dict mapping each stage name to its intermediate result, so every
        step of the pipeline can be inspected.
    """
    lowered = text.lower()                       # 1. case folding
    cleaned = clean_twitter_data(lowered)        # 2. cleansing
    normalized = normalize_text(cleaned)         # 3. normalization
    negated = convert_negation(normalized)       # 4. negation conversion
    without_stop = remove_stopwords(negated)     # 5. stopword removal
    tokens = without_stop.split()                # 6. tokenizing
    return {
        "original_text": text,
        "case_folding": lowered,
        "cleansing": cleaned,
        "normalization": normalized,
        "convert_negation": negated,
        "stopwords": without_stop,
        "tokenizing": tokens,
        "stemming": stemming(tokens),            # 7. stemming
    }