# Indonesian text preprocessing utilities: case folding, emoticon/URL/mention
# cleanup, slang normalization, tokenization, stopword removal, and stemming.
import re
from functools import lru_cache

import pandas as pd
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
stp = stopwords.words('indonesian')
|
|
factory = StemmerFactory()
|
|
stemmer = factory.create_stemmer()
|
|
|
|
# Preprocessing
|
|
def lower(text):
|
|
# Case Folding
|
|
return text.lower()
|
|
|
|
|
|
def remove_punctuation(text):
|
|
# Happy Emoticons
|
|
emoticons_happy = set([
|
|
':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
|
|
':^)', ':-D', ':D', ':d', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
|
|
'=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
|
|
'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
|
|
'<3'
|
|
])
|
|
|
|
# Sad Emoticons
|
|
emoticons_sad = set([
|
|
':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
|
|
':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
|
|
':c', ':{', '>:\\', ';('
|
|
])
|
|
|
|
# All emoticons (happy + sad)
|
|
emoticons = emoticons_happy.union(emoticons_sad)
|
|
|
|
text = ' '.join([word for word in text.split() if word not in emoticons])
|
|
|
|
text = re.sub(r'@[\w]*', ' ', text)
|
|
|
|
text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', text)
|
|
|
|
text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text)
|
|
|
|
text = re.sub(r'^RT[\s]+', ' ', text)
|
|
|
|
text = text.lower()
|
|
|
|
text = re.sub(r'[^\w\s]+', ' ', text)
|
|
|
|
text = re.sub(r'[0-9]+', ' ', text)
|
|
|
|
text = re.sub(r'_', ' ', text)
|
|
|
|
text = re.sub(r'\$\w*', ' ', text)
|
|
|
|
return text
|
|
# tambahan
|
|
def normalize_text(text):
|
|
# Load slang-formal mapping data
|
|
slang_formal_data = pd.read_csv('slang_formal_mapping.csv')
|
|
|
|
# Create a dictionary from slang to formal words
|
|
slang_formal_dict = dict(zip(slang_formal_data['slang'], slang_formal_data['formal']))
|
|
|
|
# Convert text to string if it's not already a string
|
|
text = str(text)
|
|
|
|
# Split text into words
|
|
words = text.split()
|
|
|
|
# Normalize each word using the slang-formal dictionary
|
|
normalized_words = [slang_formal_dict.get(word, word) for word in words]
|
|
|
|
# Join normalized words back into text
|
|
normalized_text = ' '.join(normalized_words)
|
|
|
|
return normalized_text
|
|
|
|
def tokenize(text):
|
|
tokens = text.split()
|
|
return tokens
|
|
|
|
def remove_stopwords(text):
|
|
return ([word for word in text.split() if word not in stp])
|
|
# return text
|
|
|
|
|
|
def stem_text(text):
|
|
text = ' '.join([stemmer.stem(word) for word in text])
|
|
return text
|
|
|
|
|
|
# Kalimat Testing
|
|
def preprocess_data(text):
|
|
text = remove_punctuation(text)
|
|
# text = tokenize(text)
|
|
text = normalize_text(text)
|
|
text = remove_stopwords(text)
|
|
text = stem_text(text)
|
|
return text
|
|
|