import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    LSTM,
    Embedding,
    Dense,
    SpatialDropout1D,
    TimeDistributed,
)
from tensorflow.keras.optimizers import Adam

# -----------------------
# 1. Load dataset
# -----------------------
df = pd.read_csv("quiz_questions.csv")

# Make sure the 'paragraph' and 'question' columns exist and are not empty
df.dropna(subset=["paragraph", "question"], inplace=True)

# -----------------------
# 2. Preprocess text
# -----------------------
def preprocess_text(text):
    # Simple example preprocessing: lowercase everything
    text = text.lower()
    return text


df["paragraph"] = df["paragraph"].astype(str).apply(preprocess_text)
df["question"] = df["question"].astype(str).apply(preprocess_text)

# -----------------------
# 3. Tokenization
# -----------------------
# Fit on all text (paragraph + question) so the vocabulary covers words from both
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["paragraph"].tolist() + df["question"].tolist())
vocab_size = len(tokenizer.word_index) + 1  # +1 because word indices start at 1

# Convert texts to integer sequences
X_sequences = tokenizer.texts_to_sequences(df["paragraph"])
y_sequences = tokenizer.texts_to_sequences(df["question"])

# Find the maximum sequence length (so padding is uniform)
max_len_paragraph = max(len(seq) for seq in X_sequences)
max_len_question = max(len(seq) for seq in y_sequences)
max_length = max(max_len_paragraph, max_len_question)

# Pad all sequences to the same length => max_length
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding="post")
y_padded = pad_sequences(y_sequences, maxlen=max_length, padding="post")
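
# Illustration (addition, not in the original script): padding="post"
# appends zeros after the tokens, e.g.
#   pad_sequences([[3, 7]], maxlen=4, padding="post") -> [[3, 7, 0, 0]]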

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("Tokenizer saved to tokenizer.pkl")

# -----------------------
# 4. Prepare X, y
# -----------------------
# For sequence-to-sequence training with "sparse_categorical_crossentropy",
# y should ideally have shape (num_samples, max_length, 1)
X = np.array(X_padded)
y = np.expand_dims(np.array(y_padded), axis=-1)

print("Shape X:", X.shape)
print("Shape y:", y.shape)  # (num_samples, max_length, 1)

# -----------------------
# 5. Split data
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train size:", X_train.shape, y_train.shape)
print("Test size: ", X_test.shape, y_test.shape)

# -----------------------
# 6. Build LSTM model
# -----------------------
# Use two stacked LSTMs, each with return_sequences=True,
# so the final output is still a sequence: (batch_size, max_length, hidden_dim)
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128))
model.add(SpatialDropout1D(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128, return_sequences=True))

# TimeDistributed so the Dense layer is applied at each timestep
model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))

# -----------------------
# 7. Compile
# -----------------------
# sparse_categorical_crossentropy expects integer token ids as targets,
# which matches the (num_samples, max_length, 1) shape of y
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

model.summary()

# -----------------------
# 8. Train model
# -----------------------
epochs = 10
history = model.fit(
    X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), batch_size=32
)

# -----------------------
# 9. Save model
# -----------------------
model.save("lstm_question_generator.keras")
print("Training finished and the model has been saved.")