In [22]:
# import library

# Standard library
import json
import pickle
import re
import string

# Data manipulation and visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Natural language processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Data splitting and metrics for model evaluation
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split

# Deep learning
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed


In [23]:
# Download the NLTK resources used by this notebook: the stop-word corpus,
# the punkt tokenizer data (both package names) and WordNet for the lemmatizer.
PREREQUISITE_PACKAGES = ("stopwords", "punkt", "punkt_tab")
for package_name in PREREQUISITE_PACKAGES:
    nltk.download(package_name)

# Kept as the final bare expression so the cell still echoes the download status.
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /home/akeon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/akeon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/akeon/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/akeon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
# Load the dataset in two forms: a DataFrame for a quick preview, and the raw
# parsed JSON (`dataset`) that the later processing cells iterate over.
dataset_path = "dataset/training_dataset.json"

df = pd.read_json(dataset_path)
print(df.head())

with open(dataset_path, "r", encoding="utf-8") as file:
    dataset = json.load(file)

# Count the contexts (one per top-level entry).
total_context = len(dataset)

# Count the QA pairs, summed over every entry's "qa_pairs" list.
total_qa_pairs = 0
for entry in dataset:
    total_qa_pairs += len(entry["qa_pairs"])

# Show the results.
print(f"\nTotal Context: {total_context}")
print(f"Total QA Pairs: {total_qa_pairs}")

                                             context  \
0  Albert Einstein adalah fisikawan teoretis kela...   
1  Samudra Pasifik adalah yang terbesar dan terda...   
2  Proklamasi Kemerdekaan Indonesia dibacakan pad...   
3  Hukum Newton adalah tiga hukum fisika yang men...   
4  Budi Utomo adalah organisasi pemuda yang didir...   

                                            qa_pairs  
0  [{'type': 'fill_in_the_blank', 'question': 'Si...  
1  [{'type': 'fill_in_the_blank', 'question': 'Sa...  
2  [{'type': 'fill_in_the_blank', 'question': 'Pr...  
3  [{'type': 'fill_in_the_blank', 'question': 'Hu...  
4  [{'type': 'fill_in_the_blank', 'question': 'Bu...  

Total Context: 49
Total QA Pairs: 95


In [25]:
# Text Preprocessing
# Indonesian stop-word list, held as a set for fast membership checks.
indonesian_stopwords = stopwords.words("indonesian")
stop_words = set(indonesian_stopwords)

# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

# Maps informal Indonesian abbreviations / slang to their normalized form.
# Every key appears exactly once: a dict literal silently keeps only the last
# value for a repeated key, so duplicates hide which mapping is in effect.
# "btw" keeps "by the way", the value that was effective when the key was
# listed twice with different replacements.
normalization_dict = {
    "yg": "yang",
    "gokil": "kocak",
    "kalo": "kalau",
    "gue": "saya",
    "elo": "kamu",
    "nih": "ini",
    "trs": "terus",
    "tdk": "tidak",
    "gmna": "bagaimana",
    "tp": "tapi",
    "jd": "jadi",
    "aja": "saja",
    "krn": "karena",
    "blm": "belum",
    "dgn": "dengan",
    "skrg": "sekarang",
    "msh": "masih",
    "lg": "lagi",
    "sy": "saya",
    "sm": "sama",
    "bgt": "banget",
    "dr": "dari",
    "kpn": "kapan",
    "hrs": "harus",
    "cm": "cuma",
    "sbnrnya": "sebenarnya",
    "tdr": "tidur",
    "kl": "kalau",
    "org": "orang",
    "pke": "pakai",
    "prnh": "pernah",
    "brgkt": "berangkat",
    "pdhl": "padahal",
    "btw": "by the way",
    "dmn": "di mana",
    "bsk": "besok",
    "td": "tadi",
    "dlm": "dalam",
    "utk": "untuk",
    "spt": "seperti",
    "gpp": "tidak apa-apa",
    "bs": "bisa",
    "jg": "juga",
    "dg": "dengan",
    "klw": "kalau",
    "wkwk": "haha",
    "cpt": "cepat",
    "knp": "kenapa",
    "jgk": "juga",
    "plg": "pulang",
    "brp": "berapa",
    "bkn": "bukan",
    "mnt": "minta",
    "udh": "sudah",
    "sdh": "sudah",
    "brkt": "berangkat",
    "sprt": "seperti",
    "jgn": "jangan",
    "mlm": "malam",
    "sblm": "sebelum",
    "stlh": "setelah",
    "mlh": "malah",
    "tmn": "teman",
}


# Translation table that deletes every character in string.punctuation.
# Built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def text_preprocessing(text):
    """Turn a raw string into a list of cleaned tokens.

    Steps, in order: lowercase, delete ASCII punctuation, collapse whitespace,
    tokenize with NLTK, expand abbreviations via ``normalization_dict``,
    lemmatize, and drop Indonesian stop words.

    Args:
        text: Input string.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    # Lowercase so dictionary and stop-word lookups are case-insensitive.
    text = text.lower()

    # Remove symbols and punctuation marks.
    # NOTE(review): string.punctuation includes "_", so a blank placeholder
    # such as "___" would be deleted here. Confirm this is intended if the
    # fill-in-the-blank questions use one.
    text = text.translate(_PUNCTUATION_TABLE)

    # Collapse runs of whitespace and trim both ends.
    text = re.sub(r"\s+", " ", text).strip()

    # Word tokenization.
    tokens = word_tokenize(text)

    # Word normalization. Some replacements are multi-word ("di mana",
    # "tidak apa-apa"); split them so each word becomes its own token.
    # Otherwise a token containing a space can never match a stop word and
    # ends up as a single vocabulary entry.
    normalized_tokens = []
    for word in tokens:
        normalized_tokens.extend(normalization_dict.get(word, word).split())

    # Lemmatization.
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it
    # presumably leaves most Indonesian words unchanged. Confirm whether an
    # Indonesian stemmer is wanted here.
    lemmas = [lemmatizer.lemmatize(word) for word in normalized_tokens]

    # Stop-word removal.
    return [word for word in lemmas if word not in stop_words]


In [26]:
# `dataset` is the parsed JSON loaded from dataset/training_dataset.json in the
# data-loading cell; run that cell first.

# === Extract Data so that each QA pair has its own context === #
# One record per QA pair: (context tokens, question tokens, answer tokens, type).
# The context is preprocessed once per entry and reused for all of its QA pairs.
qa_records = []
for entry in dataset:
    processed_context = text_preprocessing(entry["context"])
    for qa in entry["qa_pairs"]:
        qa_records.append((
            processed_context,
            text_preprocessing(qa["question"]),
            text_preprocessing(qa["answer"]),
            qa["type"],
        ))

contexts = [record[0] for record in qa_records]
questions = [record[1] for record in qa_records]
answers = [record[2] for record in qa_records]
question_types = [record[3] for record in qa_records]

# === Initialize Tokenizer and fit on all text === #
# A single vocabulary is shared by contexts, questions and answers.
tokenizer = Tokenizer(oov_token="<OOV>")
all_token_lists = [*contexts, *questions, *answers]
tokenizer.fit_on_texts(all_token_lists)

# === Convert Text to Sequences === #
context_sequences, question_sequences, answer_sequences = (
    tokenizer.texts_to_sequences(token_lists)
    for token_lists in (contexts, questions, answers)
)

# === Define Max Length for Padding === #
MAX_LENGTH = 100


def _pad_to_max_length(sequences):
    """Pad or truncate at the end so every sequence has MAX_LENGTH ids."""
    return pad_sequences(sequences, maxlen=MAX_LENGTH, padding="post", truncating="post")


context_padded = _pad_to_max_length(context_sequences)
question_padded = _pad_to_max_length(question_sequences)
answer_padded = _pad_to_max_length(answer_sequences)

# Encode Question Types
# An unknown type raises KeyError rather than being mapped silently.
question_type_dict = {"fill_in_the_blank": 0, "true_false": 1, "multiple_choice": 2}
question_type_labels = np.array(list(map(question_type_dict.__getitem__, question_types)))

# Save the processed data (optional)
arrays_to_save = (
    ("context_padded.npy", context_padded),
    ("question_padded.npy", question_padded),
    ("answer_padded.npy", answer_padded),
    ("question_type_labels.npy", question_type_labels),
)
for file_name, array in arrays_to_save:
    np.save(file_name, array)

with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("✅ Data processing complete!")
print("Samples:", context_padded.shape[0])  # This should now match the number of QA pairs


✅ Data processing complete!
Samples: 95


In [27]:
# === Split Data into Training and Testing Sets === #
# The four arrays are split in one call so their rows stay aligned across
# context / question / answer / question type.
aligned_arrays = (context_padded, question_padded, answer_padded, question_type_labels)
split_result = train_test_split(*aligned_arrays, test_size=0.2, random_state=42)

# train_test_split returns a (train, test) pair per input, in input order.
(context_train, context_test,
 question_train, question_test,
 answer_train, answer_test,
 qtype_train, qtype_test) = split_result

print("Training samples:", context_train.shape[0])
print("Testing samples:", context_test.shape[0])


Training samples: 76
Testing samples: 19


In [28]:
# === Reproducibility === #
# Seed the Python, NumPy and TensorFlow RNGs before any layer is created, so
# weight initialization and batch shuffling repeat from run to run.
# 42 matches the random_state used for the train/test split.
SEED = 42
set_random_seed(SEED)

# === Model Hyperparameters === #
VOCAB_SIZE = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
EMBEDDING_DIM = 300
LSTM_UNITS = 256
BATCH_SIZE = 32
EPOCHS = 10

# === Build Model === #
# Encoder for Context
# The encoder's final hidden/cell states initialize both decoders below; its
# output vector feeds the question-type classifier.
context_input = Input(shape=(MAX_LENGTH,), name="context_input")
context_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True, name="context_embedding")(context_input)
encoder_lstm = LSTM(LSTM_UNITS, return_state=True, name="encoder_lstm")
encoder_output, state_h, state_c = encoder_lstm(context_embedding)

# Decoder for Question (Teacher Forcing)
# NOTE(review): model.fit below passes question_train as both the decoder
# input and the target, with no one-step shift, so at each timestep the
# decoder sees the token it is asked to predict. Confirm whether a shifted
# input (start token + sequence[:-1]) was intended. Changing it also requires
# updating how the evaluation cell calls model.predict.
question_decoder_input = Input(shape=(MAX_LENGTH,), name="question_decoder_input")
question_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True, name="question_embedding")(question_decoder_input)
question_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True, name="question_lstm")
question_output, _, _ = question_lstm(question_embedding, initial_state=[state_h, state_c])
question_dense = Dense(VOCAB_SIZE, activation="softmax", name="question_output")(question_output)

# Decoder for Answer
# Reads the context embedding again (not an answer sequence) and emits one
# vocabulary distribution per context position.
# NOTE(review): the padding mask here therefore follows the context, not the
# answer, so padded answer positions presumably count toward answer
# loss/accuracy. Verify before trusting the answer accuracy figure.
answer_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True, name="answer_lstm")
answer_output, _, _ = answer_lstm(context_embedding, initial_state=[state_h, state_c])
answer_dense = Dense(VOCAB_SIZE, activation="softmax", name="answer_output")(answer_output)

# Classification Output for Question Type
# Three classes, matching the three entries of question_type_dict.
type_dense = Dense(128, activation="relu")(encoder_output)
question_type_output = Dense(3, activation="softmax", name="question_type_output")(type_dense)

# Construct the Model
model = Model(
    inputs=[context_input, question_decoder_input],
    outputs=[question_dense, answer_dense, question_type_output],
)

# === Compile the Model === #
# Sparse cross-entropy throughout: every target is an array of integer ids.
model.compile(
    optimizer="adam",
    loss={
        "question_output": "sparse_categorical_crossentropy",
        "answer_output": "sparse_categorical_crossentropy",
        "question_type_output": "sparse_categorical_crossentropy",
    },
    metrics={
        "question_output": ["accuracy"],
        "answer_output": ["accuracy"],
        "question_type_output": ["accuracy"],
    },
)

# === Train the Model === #
# validation_split holds out 20% of the training arrays for validation.
model.fit(
    [context_train, question_train],
    {
        "question_output": question_train,
        "answer_output": answer_train,
        "question_type_output": qtype_train,
    },
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)

# Save the Model
model.save("lstm_multi_output_model.keras")
print("✅ Model training complete and saved!")

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 681ms/step - answer_output_accuracy: 0.0297 - answer_output_loss: 6.2965 - loss: 13.6985 - question_output_accuracy: 0.0000e+00 - question_output_loss: 6.3022 - question_type_output_accuracy: 0.2625 - question_type_output_loss: 1.0992 - val_answer_output_accuracy: 0.2044 - val_answer_output_loss: 6.2629 - val_loss: 13.6638 - val_question_output_accuracy: 0.0069 - val_question_output_loss: 6.2961 - val_question_type_output_accuracy: 0.2500 - val_question_type_output_loss: 1.1048
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 305ms/step - answer_output_accuracy: 0.2145 - answer_output_loss: 6.2378 - loss: 13.6041 - question_output_accuracy: 0.0127 - question_output_loss: 6.2865 - question_type_output_accuracy: 0.6076 - question_type_output_loss: 1.0785 - val_answer_output_accuracy: 0.9844 - val_answer_output_loss: 6.1644 - val_loss: 13.5630 - val_question_output_accuracy: 0.0100 - val_quest

In [29]:
# === Evaluate on Test Set === #
# predict returns one array per model output: question tokens, answer tokens,
# question type.
pred_question, pred_answer, pred_qtype = model.predict([context_test, question_test])

# Most probable question-type class for each test sample.
pred_qtype_labels = np.argmax(pred_qtype, axis=1)

# zero_division=0 makes explicit the value scikit-learn already substitutes
# when a class is never predicted (precision undefined). The reported numbers
# are the same; only the UndefinedMetricWarning noise disappears.
print("=== Evaluation on Test Data ===")
print("Classification Report for Question Type:")
print(classification_report(qtype_test, pred_qtype_labels, zero_division=0))
print("Accuracy:", accuracy_score(qtype_test, pred_qtype_labels))
print("Precision:", precision_score(qtype_test, pred_qtype_labels, average='weighted', zero_division=0))
print("Recall:", recall_score(qtype_test, pred_qtype_labels, average='weighted', zero_division=0))


def sequence_to_text(sequence, tokenizer):
    """Convert a sequence of token ids back into words, skipping padding.

    Args:
        sequence: Iterable of integer token ids; ids equal to 0 are dropped.
        tokenizer: Object exposing an ``index_word`` mapping from id to word.

    Returns:
        list[str]: One word per non-zero id, with "<OOV>" for any id that is
        missing from ``tokenizer.index_word``.
    """
    words = []
    for token_id in sequence:
        if token_id == 0:
            # Padding position: contributes no word.
            continue
        words.append(tokenizer.index_word.get(token_id, "<OOV>"))
    return words

# BLEU for the first test question: the reference is the ground-truth token
# list, the candidate is the arg-max token at each decoder position.
reference = [sequence_to_text(question_test[0], tokenizer)]
candidate = sequence_to_text(np.argmax(pred_question[0], axis=-1), tokenizer)

# Sentence-level BLEU uses up to 4-grams by default. On short questions a
# single missing n-gram order makes the unsmoothed score degenerate, so apply
# NLTK's smoothing (method1 adds a small epsilon to zero n-gram counts).
bleu_smoothing = nltk.translate.bleu_score.SmoothingFunction().method1
bleu_score = nltk.translate.bleu_score.sentence_bleu(
    reference, candidate, smoothing_function=bleu_smoothing
)
print("BLEU score for first test sample (question generation):", bleu_score)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 264ms/step
=== Evaluation on Test Data ===
Classification Report for Question Type:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.50      0.83      0.62         6
           2       0.44      0.80      0.57         5

    accuracy                           0.47        19
   macro avg       0.31      0.54      0.40        19
weighted avg       0.27      0.47      0.35        19

Accuracy: 0.47368421052631576
Precision: 0.27485380116959063
Recall: 0.47368421052631576
BLEU score for first test sample (question generation): 0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
