# TIF_E41200378/sentimen.py

import re

import pandas as pd
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Indonesian stopword list; needs a one-time nltk.download('stopwords').
stp = stopwords.words('indonesian')

# Sastrawi stemmer for Indonesian.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
# Preprocessing
def lower(text):
    # Case folding: map everything to lowercase.
    return text.lower()
def remove_punctuation(text):
    # Happy emoticons
    emoticons_happy = set([
        ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
        ':^)', ':-D', ':D', ':d', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
        '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
        'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
        '<3'
    ])
    # Sad emoticons
    emoticons_sad = set([
        ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
        ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
        ':c', ':{', '>:\\', ';('
    ])
    # All emoticons (happy + sad)
    emoticons = emoticons_happy.union(emoticons_sad)
    # Drop tokens that are exactly an emoticon.
    text = ' '.join([word for word in text.split() if word not in emoticons])
    # Strip @mentions, URLs, and a leading retweet marker.
    text = re.sub(r'@[\w]*', ' ', text)
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text)
    text = re.sub(r'^RT[\s]+', ' ', text)
    text = text.lower()
    # Remove cashtags ($xxx) before the generic punctuation pass below;
    # otherwise the '$' is already stripped and this pattern never matches.
    text = re.sub(r'\$\w*', ' ', text)
    # Remove remaining punctuation, digits, and underscores.
    text = re.sub(r'[^\w\s]+', ' ', text)
    text = re.sub(r'[0-9]+', ' ', text)
    text = re.sub(r'_', ' ', text)
    return text
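# As a rough illustration (a made-up tweet, not project data):
#   remove_punctuation('RT @user: mantap :) cek https://t.co/abc')
# leaves something like '  mantap cek  ' -- the mention, emoticon, URL,
# and retweet marker are gone, but runs of whitespace are not collapsed.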
# Additional step: slang normalization.
# Load the slang -> formal mapping once at import time; re-reading the
# CSV on every call would be needlessly slow when preprocessing a dataset.
slang_formal_data = pd.read_csv('slang_formal_mapping.csv')
slang_formal_dict = dict(zip(slang_formal_data['slang'], slang_formal_data['formal']))

def normalize_text(text):
    # Make sure we are working with a string (e.g. NaN cells from pandas).
    text = str(text)
    # Replace each slang token with its formal form; unknown words pass through.
    normalized_words = [slang_formal_dict.get(word, word) for word in text.split()]
    return ' '.join(normalized_words)
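# For reference, slang_formal_mapping.csv is expected to have two columns
# named 'slang' and 'formal' -- that is all the code above relies on.
# The rows below are hypothetical examples, not the project's actual data:
#
#   slang,formal
#   gak,tidak
#   bgt,banget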
def tokenize(text):
    # Simple whitespace tokenization.
    tokens = text.split()
    return tokens
def remove_stopwords(text):
    # Returns a list of tokens, which is what stem_text() below expects.
    return [word for word in text.split() if word not in stp]
def stem_text(text):
    # Stem each token; note the input is a token list, not a string --
    # iterating over a plain string here would stem single characters.
    text = ' '.join([stemmer.stem(word) for word in text])
    return text
# Testing sentences: run the full preprocessing pipeline.
def preprocess_data(text):
    text = remove_punctuation(text)
    # text = tokenize(text)
    text = normalize_text(text)
    text = remove_stopwords(text)  # -> list of tokens
    text = stem_text(text)         # -> stemmed string
    return text
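
# Minimal smoke test -- a sketch, assuming slang_formal_mapping.csv exists
# next to this file and the NLTK 'stopwords' corpus has been downloaded.
# The sample tweet below is made up for illustration only.
if __name__ == '__main__':
    sample = 'RT @teman: filmnya bgt keren :) https://t.co/xyz'
    print(preprocess_data(sample))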