In [47]:
# import library

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score


In [48]:
# download assets
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /home/akeon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/akeon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/akeon/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/akeon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [49]:
# load dataset
df = pd.read_json("independent_dataset.json")
df.head()

Unnamed: 0,context,qa_pairs
0,Albert Einstein adalah fisikawan teoretis kela...,"[{'type': 'fill_in_the_blank', 'question': '__..."
1,Samudra Pasifik adalah yang terbesar dan terda...,"[{'type': 'fill_in_the_blank', 'question': 'Sa..."
2,Proklamasi Kemerdekaan Indonesia dibacakan pad...,"[{'type': 'fill_in_the_blank', 'question': 'Pr..."
3,Hukum Newton adalah tiga hukum fisika yang men...,"[{'type': 'fill_in_the_blank', 'question': 'Hu..."
4,Budi Utomo adalah organisasi pemuda yang didir...,"[{'type': 'fill_in_the_blank', 'question': 'Bu..."


In [50]:
# Text Preprocessing
stop_words = set(stopwords.words("indonesian"))                                                                                                                                                                                                                                                                             
lemmatizer = WordNetLemmatizer()

normalization_dict = {
    "yg": "yang",
    "gokil": "kocak",
    "kalo": "kalau",
    "gue": "saya",
    "elo": "kamu",
    "nih": "ini",
    "trs": "terus",
    "tdk": "tidak",
    "gmna": "bagaimana",
    "tp": "tapi",
    "jd": "jadi",
    "aja": "saja",
    "krn": "karena",
    "blm": "belum",
    "dgn": "dengan",
    "skrg": "sekarang",
    "msh": "masih",
    "lg": "lagi",
    "sy": "saya",
    "sm": "sama",
    "bgt": "banget",
    "dr": "dari",
    "kpn": "kapan",
    "hrs": "harus",
    "cm": "cuma",
    "sbnrnya": "sebenarnya",
    "tdr": "tidur",
    "tdk": "tidak",
    "kl": "kalau",
    "org": "orang",
    "pke": "pakai",
    "prnh": "pernah",
    "brgkt": "berangkat",
    "pdhl": "padahal",
    "btw": "ngomong-ngomong",
    "dmn": "di mana",
    "bsk": "besok",
    "td": "tadi",
    "dlm": "dalam",
    "utk": "untuk",
    "spt": "seperti",
    "gpp": "tidak apa-apa",
    "bs": "bisa",
    "jg": "juga",
    "tp": "tapi",
    "dg": "dengan",
    "klw": "kalau",
    "wkwk": "haha",
    "cpt": "cepat",
    "knp": "kenapa",
    "jgk": "juga",
    "plg": "pulang",
    "brp": "berapa",
    "bkn": "bukan",
    "mnt": "minta",
    "udh": "sudah",
    "sdh": "sudah",
    "brkt": "berangkat",
    "btw": "by the way",
    "tdk": "tidak",
    "sprt": "seperti",
    "jgn": "jangan",
    "mlm": "malam",
    "sblm": "sebelum",
    "stlh": "setelah",
    "tdr": "tidur",
    "mlh": "malah",
    "tmn": "teman",
}


def text_preprocessing(text):
    #doing lower case 
    text = text.lower()
    
    # remove symbol and read mark
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # remove blank space
    text = re.sub(r"\s+", " ", text).strip()
    
    # word tokenize 
    tokens = word_tokenize(text)
    
    # normalassi kata
    tokens = [normalization_dict[word] if word in normalization_dict else word for word in tokens] 
    
    
    # lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  
    
    # stopword removal
    tokens = [word for word in tokens if word not in stop_words]  
    
    return tokens


In [51]:
with open("independent_dataset.json", "r", encoding="utf-8") as file:
    dataset = json.load(file)

for entry in dataset:
    entry["context"] = text_preprocessing(entry["context"])
    for qa in entry["qa_pairs"]:
        qa["question"] = text_preprocessing(qa["question"])
        qa["answer"] = text_preprocessing(qa["answer"])

# === Extract Contexts, Questions, Answers, and Question Types === #
contexts = [entry["context"] for entry in dataset]
questions = [qa["question"] for entry in dataset for qa in entry["qa_pairs"]]
answers = [qa["answer"] for entry in dataset for qa in entry["qa_pairs"]]
question_types = [qa["type"] for entry in dataset for qa in entry["qa_pairs"]]  # Extract Question Types

# === Initialize Tokenizer === #
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + answers)

# === Convert Text to Sequences === #
context_sequences = tokenizer.texts_to_sequences(contexts)
question_sequences = tokenizer.texts_to_sequences(questions)
answer_sequences = tokenizer.texts_to_sequences(answers)

# === Define Max Length for Padding === #
MAX_LENGTH = 100  # Adjust based on dataset analysis
context_padded = pad_sequences(context_sequences, maxlen=MAX_LENGTH, padding="post", truncating="post")
question_padded = pad_sequences(question_sequences, maxlen=MAX_LENGTH, padding="post", truncating="post")
answer_padded = pad_sequences(answer_sequences, maxlen=MAX_LENGTH, padding="post", truncating="post")

# === Encode Question Types (Convert Categorical Labels to Numeric) === #
question_type_dict = {"fill_in_the_blank": 0, "true_false": 1, "multiple_choice": 2}
question_type_labels = np.array([question_type_dict[q_type] for q_type in question_types])

# === Save Processed Data as .npy Files === #
np.save("context_padded.npy", context_padded)
np.save("question_padded.npy", question_padded)
np.save("answer_padded.npy", answer_padded)
np.save("question_type_labels.npy", question_type_labels)

# Save Tokenizer for Future Use
with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# === Check Results === #
print(f"✅ Vocabulary Size: {len(tokenizer.word_index) + 1}")
print(f"✅ Sample Tokenized Context: {context_padded[0]}")
print(f"✅ Sample Tokenized Question: {question_padded[0]}")
print(f"✅ Sample Tokenized Answer: {answer_padded[0]}")
print(f"✅ Sample Question Type Label: {question_type_labels[0]}")

✅ Vocabulary Size: 182
✅ Sample Tokenized Context: [  9  10  91  38  92  93  39   5  19  94  95  11  96  97  40  98  99 100
 101  20  21  22  11  41 102  11  38   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]
✅ Sample Tokenized Question: [39  5 19  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0]
✅ Sample Tokenized Answer: [ 9 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0 

In [52]:
# Implementation of lstm with semantic analyz
# === Load Tokenizer === #
with open("tokenizer.pkl", "rb") as handle:
    tokenizer = pickle.load(handle)

# === Load Data yang Sudah Diproses === #
MAX_LENGTH = 100
VOCAB_SIZE = len(tokenizer.word_index) + 1

context_padded = np.load("context_padded.npy")
question_padded = np.load("question_padded.npy")
answer_padded = np.load("answer_padded.npy")
question_type_labels = np.load(
    "question_type_labels.npy"
)  # Label tipe soal (0 = Fill, 1 = True/False, 2 = Multiple Choice)

# === Hyperparameter === #
EMBEDDING_DIM = 300
LSTM_UNITS = 256
BATCH_SIZE = 32
EPOCHS = 10


# === Input Encoder (Konteks) === #
context_input = Input(shape=(MAX_LENGTH,), name="context_input")
context_embedding = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    mask_zero=True,
    name="context_embedding",
)(context_input)
encoder_lstm = LSTM(LSTM_UNITS, return_state=True, name="encoder_lstm")
encoder_output, state_h, state_c = encoder_lstm(context_embedding)

# === Decoder untuk Pertanyaan === #
question_decoder_input = Input(shape=(MAX_LENGTH,), name="question_decoder_input")
question_embedding = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    mask_zero=True,
    name="question_embedding",
)(question_decoder_input)
question_lstm = LSTM(
    LSTM_UNITS, return_sequences=True, return_state=True, name="question_lstm"
)
question_output, _, _ = question_lstm(
    question_embedding, initial_state=[state_h, state_c]
)
question_dense = Dense(VOCAB_SIZE, activation="softmax", name="question_output")(
    question_output
)

# === Decoder untuk Jawaban === #
answer_lstm = LSTM(
    LSTM_UNITS, return_sequences=True, return_state=True, name="answer_lstm"
)
answer_output, _, _ = answer_lstm(context_embedding, initial_state=[state_h, state_c])
answer_dense = Dense(VOCAB_SIZE, activation="softmax", name="answer_output")(
    answer_output
)

# === Prediksi Tipe Soal (Fill, True/False, Multiple Choice) === #
type_dense = Dense(128, activation="relu")(encoder_output)
question_type_output = Dense(3, activation="softmax", name="question_type_output")(
    type_dense
)  # 3 Kategori soal

# === Membangun Model Multi-Output === #
model = Model(
    inputs=[context_input, question_decoder_input],
    outputs=[question_dense, answer_dense, question_type_output],
)

# === Compile Model === #
# Compile Model (Fix for multiple outputs)
model.compile(
    optimizer="adam",
    loss={
        "question_output": "sparse_categorical_crossentropy",
        "answer_output": "sparse_categorical_crossentropy",
        "question_type_output": "sparse_categorical_crossentropy",
    },
    metrics={
        "question_output": ["accuracy"],
        "answer_output": ["accuracy"],
        "question_type_output": ["accuracy"],
    },
)

# === Training Model === #
model.fit(
    [context_padded, question_padded],
    {
        "question_output": question_padded,
        "answer_output": answer_padded,
        "question_type_output": question_type_labels,
    },
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)

# === Simpan Model === #
model.save("lstm_multi_output_model.keras")

print("✅ Model LSTM Multi-Output berhasil dilatih dan disimpan!")

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - answer_output_accuracy: 0.0000e+00 - answer_output_loss: 5.2070 - loss: 11.5152 - question_output_accuracy: 0.0000e+00 - question_output_loss: 5.2081 - question_type_output_accuracy: 0.3333 - question_type_output_loss: 1.1002 - val_answer_output_accuracy: 0.1250 - val_answer_output_loss: 5.1854 - val_loss: 11.4804 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 5.2043 - val_question_type_output_accuracy: 1.0000 - val_question_type_output_loss: 1.0907
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step - answer_output_accuracy: 0.2100 - answer_output_loss: 5.1680 - loss: 11.4156 - question_output_accuracy: 0.0167 - question_output_loss: 5.1820 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 1.0656 - val_answer_output_accuracy: 0.2450 - val_answer_output_loss: 5.1625 - val_loss: 11.4545 - val_question_output_accuracy: 0.0000e+00 - 

In [53]:
predictions = model.predict([context_padded_test, question_padded_test])

# predictions[0] corresponds to question_output (shape: [batch_size, MAX_LENGTH, VOCAB_SIZE])
# predictions[1] corresponds to answer_output   (shape: [batch_size, MAX_LENGTH, VOCAB_SIZE])
# predictions[2] corresponds to question_type_output (shape: [batch_size, 3])

# Convert probabilities to predicted class indices
question_output_pred = np.argmax(predictions[0], axis=-1)  # shape: (batch_size, MAX_LENGTH)
answer_output_pred   = np.argmax(predictions[1], axis=-1)  # shape: (batch_size, MAX_LENGTH)
question_type_pred   = np.argmax(predictions[2], axis=-1)  # shape: (batch_size,)

# === 3. Evaluate QUESTION TYPE (single-label classification) === #
print("=== Evaluation for Question Type ===")
print(classification_report(
    question_type_test,          # True labels
    question_type_pred,          # Predicted labels
    target_names=["Fill", "True/False", "Multiple Choice"],  # Optionally label your classes
    zero_division=0             # Avoids warning if a class is absent
))

# If you just want separate metrics (macro-average for multi-class):
acc_qtype = accuracy_score(question_type_test, question_type_pred)
prec_qtype = precision_score(question_type_test, question_type_pred, average='macro', zero_division=0)
rec_qtype  = recall_score(question_type_test, question_type_pred, average='macro', zero_division=0)

print(f"Question Type -> Accuracy: {acc_qtype:.4f}, Precision(macro): {prec_qtype:.4f}, Recall(macro): {rec_qtype:.4f}")
print("")

# === 4. Evaluate QUESTION OUTPUT & ANSWER OUTPUT (sequence predictions) === #
# We do a token-level comparison. We must exclude padded positions to get a fair score.

# A helper function to flatten predictions & true labels while ignoring padding (zeros).
def flatten_and_mask(true_seq, pred_seq, pad_token=0):
    """
    true_seq, pred_seq = [batch_size, MAX_LENGTH]
    Returns flattened arrays of true & predicted labels, ignoring where true_seq == pad_token.
    """
    mask = (true_seq != pad_token)
    true_flat = true_seq[mask].flatten()
    pred_flat = pred_seq[mask].flatten()
    return true_flat, pred_flat

# --- 4a. Question Output ---
q_true_flat, q_pred_flat = flatten_and_mask(question_padded_test, question_output_pred, pad_token=0)

print("=== Evaluation for Question Tokens ===")
print(classification_report(
    q_true_flat, 
    q_pred_flat,
    zero_division=0  # Avoid warnings if a class is absent
))

acc_q = accuracy_score(q_true_flat, q_pred_flat)
prec_q = precision_score(q_true_flat, q_pred_flat, average='macro', zero_division=0)
rec_q  = recall_score(q_true_flat, q_pred_flat, average='macro', zero_division=0)
print(f"Question Tokens -> Accuracy: {acc_q:.4f}, Precision(macro): {prec_q:.4f}, Recall(macro): {rec_q:.4f}")
print("")

# --- 4b. Answer Output ---
a_true_flat, a_pred_flat = flatten_and_mask(answer_padded_test, answer_output_pred, pad_token=0)

print("=== Evaluation for Answer Tokens ===")
print(classification_report(
    a_true_flat,
    a_pred_flat,
    zero_division=0
))

acc_a = accuracy_score(a_true_flat, a_pred_flat)
prec_a = precision_score(a_true_flat, a_pred_flat, average='macro', zero_division=0)
rec_a  = recall_score(a_true_flat, a_pred_flat, average='macro', zero_division=0)
print(f"Answer Tokens -> Accuracy: {acc_a:.4f}, Precision(macro): {prec_a:.4f}, Recall(macro): {rec_a:.4f}")


NameError: name 'context_padded_test' is not defined