30 lines
965 B
Python
30 lines
965 B
Python
import nltk
|
|
# Fetch the NLTK resources used below. Both tokenizer packages are requested
# because newer NLTK releases look up 'punkt_tab' for word_tokenize, while
# older releases look up the original 'punkt' package.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stopword corpus; the 'indonesian' list is read from it during preprocessing.
nltk.download('stopwords')
|
|
|
|
from nltk.tokenize import word_tokenize
|
|
from nltk.corpus import stopwords
|
|
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
# Preprocessing Function
|
|
# Lazily populated caches: the stopword set and the stemmer do not depend on
# the input text, so they are built on first use and reused afterwards.
_STOP_WORDS = None
_STEMMER = None


def _get_stop_words():
    """Return the Indonesian stopword set, reading the NLTK corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Needs the corpus fetched by nltk.download('stopwords').
        _STOP_WORDS = frozenset(stopwords.words('indonesian'))
    return _STOP_WORDS


def _get_stemmer():
    """Return a shared Sastrawi stemmer, creating it on first use."""
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = StemmerFactory().create_stemmer()
    return _STEMMER


def preprocess_text(text):
    """Normalize a text into a space-separated string of stemmed tokens.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``;
    tokens that are not purely alphanumeric (e.g. punctuation) or that appear
    in NLTK's 'indonesian' stopword list are dropped; each remaining token is
    stemmed with the Sastrawi stemmer.

    Args:
        text: The input string to preprocess.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    # Tokenization (needs the NLTK punkt tokenizer data to be downloaded).
    tokens = word_tokenize(text.lower())

    # Drop punctuation / non-alphanumeric tokens and stopwords.
    stop_words = _get_stop_words()
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

    # Stemming with the shared stemmer instead of building a new one per call.
    stemmer = _get_stemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)
|
|
|
|
# Example usage: run the preprocessing pipeline on one sample sentence and
# print the cleaned result.
sample_text = "Ini adalah contoh review produk yang sangat bagus!"
processed_text = preprocess_text(sample_text)
print(processed_text)
|