import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    LSTM,
    Dense,
    Embedding,
    Input,
    SpatialDropout1D,
    TimeDistributed,
)
from tensorflow.keras.optimizers import Adam

# 1. Load dataset
df = pd.read_csv("quiz_questions.csv")

# Make sure the 'paragraph' and 'question' columns exist and are not empty
df.dropna(subset=["paragraph", "question"], inplace=True)

# 2. Text preprocessing
def preprocess_text(text):
    # Simple example preprocessing: lowercase only
    text = text.lower()
    return text

df["paragraph"] = df["paragraph"].astype(str).apply(preprocess_text)
df["question"] = df["question"].astype(str).apply(preprocess_text)

# 3. Tokenization
# Fit on all text (paragraph + question) so the vocabulary covers words in both
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["paragraph"].tolist() + df["question"].tolist())
vocab_size = len(tokenizer.word_index) + 1  # +1 because word indices start at 1

# Convert text to integer sequences
X_sequences = tokenizer.texts_to_sequences(df["paragraph"])
y_sequences = tokenizer.texts_to_sequences(df["question"])

# Find the maximum sequence length (so padding is uniform)
max_len_paragraph = max(len(seq) for seq in X_sequences)
max_len_question = max(len(seq) for seq in y_sequences)
max_length = max(max_len_paragraph, max_len_question)

# Pad all sequences to the same length (max_length)
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding="post")
y_padded = pad_sequences(y_sequences, maxlen=max_length, padding="post")

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("Tokenizer saved to tokenizer.pkl")

# 4. Prepare X, y
# For sequence-to-sequence training with "sparse_categorical_crossentropy",
# y should have shape (num_samples, max_length, 1)
X = np.array(X_padded)
y = np.expand_dims(np.array(y_padded), axis=-1)

print("Shape X:", X.shape)
print("Shape y:", y.shape)  # (num_samples, max_length, 1)

# 5. Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Train size:", X_train.shape, y_train.shape)
print("Test size: ", X_test.shape, y_test.shape)

# 6. Build the LSTM model
# Two stacked LSTMs, each with return_sequences=True, so the final output
# stays a sequence: (batch_size, max_length, hidden_dim)
model = Sequential()
model.add(Input(shape=(max_length,)))  # builds the model so summary() works
model.add(Embedding(input_dim=vocab_size, output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
# TimeDistributed applies the Dense layer independently at every timestep
model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))

# 7. Compile
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"],
)
model.summary()

# 8. Train the model
epochs = 10
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    validation_data=(X_test, y_test),
    batch_size=32,
)

# 9. Save the model
model.save("lstm_question_generator.keras")
print("Training finished and the model has been saved.")
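
# ------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the pipeline above):
# reload the saved model and tokenizer, then greedily decode a question
# from a new paragraph. The sample text and the per-timestep argmax
# decoding strategy below are assumptions for demonstration only.
# ------------------------------------------------------------------
from tensorflow.keras.models import load_model

loaded_model = load_model("lstm_question_generator.keras")
with open("tokenizer.pkl", "rb") as f:
    loaded_tokenizer = pickle.load(f)

# Hypothetical input paragraph (an assumption, not from the dataset)
sample_paragraph = "the water cycle describes how water moves through the environment"
sample_seq = loaded_tokenizer.texts_to_sequences([preprocess_text(sample_paragraph)])
sample_padded = pad_sequences(sample_seq, maxlen=max_length, padding="post")

# The model outputs a distribution over the vocabulary at every timestep;
# take the argmax token id per timestep (greedy decoding)
pred = loaded_model.predict(sample_padded)  # shape: (1, max_length, vocab_size)
token_ids = np.argmax(pred[0], axis=-1)     # shape: (max_length,)

# Map ids back to words; id 0 is the padding index, so skip it
generated = " ".join(
    loaded_tokenizer.index_word[int(i)] for i in token_ids if i != 0
)
print("Generated question:", generated)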