import re

import pandas as pd
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Indonesian stopword list, stored as a set for O(1) membership checks
stp = set(stopwords.words('indonesian'))
factory = StemmerFactory()
stemmer = factory.create_stemmer()


# Preprocessing
def lower(text):
    # Case folding
    return text.lower()


def remove_punctuation(text):
    # Happy emoticons
    emoticons_happy = set([
        ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)',
        '=)', ':}', ':^)', ':-D', ':D', ':d', '8-D', '8D', 'x-D', 'xD',
        'X-D', 'XD', '=-D', '=D', '=-3', '=3', ':-))', ":'-)", ":')",
        ':*', ':^*', '>:P', ':-P', ':P', 'X-P', 'x-p', 'xp', 'XP',
        ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)', '<3'
    ])

    # Sad emoticons
    emoticons_sad = set([
        ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L',
        ':<', ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(",
        ':\\', ':-c', ':c', ':{', '>:\\', ';('
    ])

    # All emoticons (happy + sad)
    emoticons = emoticons_happy.union(emoticons_sad)

    # Drop emoticon tokens first, before the punctuation pass shreds them
    text = ' '.join([word for word in text.split() if word not in emoticons])
    # Remove @mentions
    text = re.sub(r'@[\w]*', ' ', text)
    # Remove URLs (scheme://host/path anywhere in the text)
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text)
    # Remove the retweet marker
    text = re.sub(r'^RT[\s]+', ' ', text)
    text = text.lower()
    # Remove cashtags ($TICKER) before the punctuation pass strips the '$'
    text = re.sub(r'\$\w*', ' ', text)
    # Remove punctuation, digits, and underscores
    text = re.sub(r'[^\w\s]+', ' ', text)
    text = re.sub(r'[0-9]+', ' ', text)
    text = re.sub(r'_', ' ', text)
    return text


# Addition: slang normalization
def normalize_text(text):
    # Load the slang-to-formal word mapping
    # NOTE: re-read on every call; hoist to module level if this becomes a bottleneck
    slang_formal_data = pd.read_csv('slang_formal_mapping.csv')

    # Build a slang -> formal dictionary
    slang_formal_dict = dict(zip(slang_formal_data['slang'], slang_formal_data['formal']))

    # Ensure the input is a string
    text = str(text)

    # Replace each word with its formal form where a mapping exists
    words = text.split()
    normalized_words = [slang_formal_dict.get(word, word) for word in words]

    # Join the normalized words back into text
    return ' '.join(normalized_words)


def tokenize(text):
    tokens = text.split()
    return tokens


def remove_stopwords(text):
    # Returns a list of tokens with Indonesian stopwords removed
    return [word for word in text.split() if word not in stp]


def stem_text(text):
    # Expects an iterable of tokens; stems each token and rejoins into a string
    text = ' '.join([stemmer.stem(word) for word in text])
    return text


# Sentence testing
def preprocess_data(text):
    text = remove_punctuation(text)
    # text = tokenize(text)
    text = normalize_text(text)
    text = remove_stopwords(text)  # returns a list of tokens
    text = stem_text(text)         # consumes the token list, returns a string
    return text
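

# Usage sketch (assumptions: the NLTK stopwords corpus was fetched with
# nltk.download('stopwords'), and a slang_formal_mapping.csv with 'slang'
# and 'formal' columns sits in the working directory; the sample tweet
# below is illustrative, and its output depends on the mapping file)
if __name__ == '__main__':
    sample = "RT @user: Mantap bgt filmnya!! :) cek https://example.com"
    print(preprocess_data(sample))
    # e.g. 'mantap banget film cek' if the mapping contains bgt -> banget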