MIF_E31230988/test.py

30 lines
965 B
Python

import nltk
nltk.download('punkt') # Pastikan ini diunduh
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
# Preprocessing Function
def preprocess_text(text):
# Tokenisasi
tokens = word_tokenize(text.lower()) # Gunakan nltk.download('punkt')
# Menghapus tanda baca dan stopwords
stop_words = set(stopwords.words('indonesian')) # Gunakan nltk.download('stopwords')
tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
# Stemming
factory = StemmerFactory()
stemmer = factory.create_stemmer()
tokens = [stemmer.stem(word) for word in tokens]
return ' '.join(tokens)
# Contoh penggunaan
sample_text = "Ini adalah contoh review produk yang sangat bagus!"
processed_text = preprocess_text(sample_text)
print(processed_text)