# Indonesian text preprocessing utilities: case folding, emoticon/URL/mention
# cleanup, slang normalization, tokenization, stopword removal, and stemming.
import re
from functools import lru_cache

import pandas as pd
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
stp = stopwords.words('indonesian')
|
|
factory = StemmerFactory()
|
|
stemmer = factory.create_stemmer()
|
|
|
|
# Preprocessing
|
|
def lower(text):
|
|
# Case Folding
|
|
return text.lower()
|
|
|
|
|
|
def remove_punctuation(text):
|
|
# Happy Emoticons
|
|
emoticons_happy = set([
|
|
':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
|
|
':^)', ':-D', ':D', ':d', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
|
|
'=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
|
|
'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
|
|
'<3'
|
|
])
|
|
|
|
# Sad Emoticons
|
|
emoticons_sad = set([
|
|
':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
|
|
':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
|
|
':c', ':{', '>:\\', ';('
|
|
])
|
|
|
|
# All emoticons (happy + sad)
|
|
emoticons = emoticons_happy.union(emoticons_sad)
|
|
|
|
text = ' '.join([word for word in text.split() if word not in emoticons])
|
|
|
|
text = re.sub(r'@[\w]*', ' ', text)
|
|
|
|
text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', text)
|
|
|
|
text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text)
|
|
|
|
text = re.sub(r'^RT[\s]+', ' ', text)
|
|
|
|
text = text.lower()
|
|
|
|
text = re.sub(r'[^\w\s]+', ' ', text)
|
|
|
|
text = re.sub(r'[0-9]+', ' ', text)
|
|
|
|
text = re.sub(r'_', ' ', text)
|
|
|
|
text = re.sub(r'\$\w*', ' ', text)
|
|
|
|
return text
|
|
# tambahan
|
|
def normalize_text(text):
|
|
# Load slang-formal mapping data
|
|
slang_formal_data = pd.read_csv('slang_formal_mapping.csv')
|
|
|
|
# Create a dictionary from slang to formal words
|
|
slang_formal_dict = dict(zip(slang_formal_data['slang'], slang_formal_data['formal']))
|
|
|
|
# Convert text to string if it's not already a string
|
|
text = str(text)
|
|
|
|
# Split text into words
|
|
words = text.split()
|
|
|
|
# Normalize each word using the slang-formal dictionary
|
|
normalized_words = [slang_formal_dict.get(word, word) for word in words]
|
|
|
|
# Join normalized words back into text
|
|
normalized_text = ' '.join(normalized_words)
|
|
|
|
return normalized_text
|
|
|
|
def tokenize(text):
|
|
tokens = text.split()
|
|
return tokens
|
|
|
|
def remove_stopwords(text):
|
|
return ([word for word in text.split() if word not in stp])
|
|
# return text
|
|
|
|
|
|
def stem_text(text):
|
|
text = ' '.join([stemmer.stem(word) for word in text])
|
|
return text
|
|
|
|
|
|
# Kalimat Testing
|
|
def preprocess_data(text):
|
|
text = remove_punctuation(text)
|
|
# text = tokenize(text)
|
|
text = normalize_text(text)
|
|
text = remove_stopwords(text)
|
|
text = stem_text(text)
|
|
return text
|
|
|