In [34]:
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input,
    LSTM,
    Dense,
    Embedding,
    Bidirectional,
    Concatenate,
    Attention,
    Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re

In [35]:
# Load the raw QA dataset. JSON is UTF-8 by specification, so the encoding is
# pinned explicitly instead of relying on the platform's locale default (which
# is not UTF-8 on every system and would garble or reject non-ASCII text).
with open(
    "../dataset/stable_qg_qa_train_dataset.json", "r", encoding="utf-8"
) as f:
    data = json.load(f)


# Basic text normalisation shared by contexts and questions
def preprocess_text(text):
    """Lowercase ``text`` and squeeze each whitespace run into a single space.

    Leading and trailing whitespace is removed from the result.
    """
    squeezed = re.sub(r"\s+", " ", text.lower())
    return squeezed.strip()


# Flatten the dataset into parallel lists for the model
def prepare_data(data):
    """Flatten dataset records into seven index-aligned lists.

    Every entry of ``item["qas"]`` yields one row, so the context, tokens,
    NER tags and SRL tags of an item are repeated once per question.
    Context and question text go through ``preprocess_text``; tokens, tags,
    answers and question types are passed through untouched.

    Returns:
        Tuple ``(contexts, tokens_list, ner_list, srl_list, questions,
        answers, q_types)``, each a list with one element per QA pair.
    """
    rows = [
        (
            preprocess_text(item["context"]),
            item["tokens"],
            item["ner"],
            item["srl"],
            preprocess_text(qa["question"]),
            qa["answer"],
            qa["type"],
        )
        for item in data
        for qa in item["qas"]
    ]

    # Transpose the rows into one list per field (seven fields per row).
    return tuple([row[field] for row in rows] for field in range(7))


# Parallel lists produced by prepare_data; the same position in every list
# belongs to the same QA pair.
(contexts, tokens_list, ner_list, srl_list, questions, answers, q_types) = (
    prepare_data(data)
)

In [36]:
# Word-level tokenizer shared by contexts, questions and the pre-tokenized
# token lists (each token list is joined with spaces before fitting).
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list])
# Sized from the full word_index (not capped by max_vocab_size); +1 leaves
# room for index 0, which word_index never assigns.
vocab_size = len(tokenizer.word_index) + 1

# Tokenizer for NER tags (tag sequences joined with spaces)
# NOTE(review): the default Tokenizer lowercases and strips punctuation, so
# tags containing "-" or "_" would be split into several tokens — confirm
# against the tag set in the dataset.
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1

# Tokenizer for SRL tags (same treatment as the NER tags)
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1

# Tokenizer for question types (no OOV token is configured here)
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1


# Convert tokens, NER tags and SRL tags to integer sequences
def tokens_to_sequences(tokens, ner, srl):
    """Encode token, NER and SRL lists as integer id sequences.

    Each inner list is joined with spaces and encoded on its own by the
    matching module-level tokenizer (``tokenizer``, ``ner_tokenizer``,
    ``srl_tokenizer``).

    NOTE(review): the three tokenizers may drop or split items differently
    (e.g. punctuation-only tokens), so the resulting sequences are not
    guaranteed to stay position-aligned — confirm against the data.

    Returns:
        Tuple ``(token_seqs, ner_seqs, srl_seqs)`` of lists of id lists.
    """

    def _encode(fitted, groups):
        # One texts_to_sequences call per sample, keeping its single result.
        encoded = []
        for group in groups:
            encoded.append(fitted.texts_to_sequences([" ".join(group)])[0])
        return encoded

    return (
        _encode(tokenizer, tokens),
        _encode(ner_tokenizer, ner),
        _encode(srl_tokenizer, srl),
    )


# Encode every text field as integer id sequences
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)

# Longest sequence per group; these become the fixed widths used for padding
max_context_len = max(map(len, context_seqs))
max_question_len = max(map(len, question_seqs))
max_token_len = max(map(len, token_seqs))


# Pad sequences so every input of a given kind has the same length
def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs):
    """Post-pad all five sequence groups to their fixed widths.

    Contexts use ``max_context_len``, questions use ``max_question_len``,
    and tokens, NER and SRL all share ``max_token_len`` (module-level values).

    Returns:
        Tuple ``(context_padded, question_padded, token_padded, ner_padded,
        srl_padded)``.
    """

    def _post_pad(seqs, width):
        return pad_sequences(seqs, maxlen=width, padding="post")

    return (
        _post_pad(context_seqs, max_context_len),
        _post_pad(question_seqs, max_question_len),
        _post_pad(token_seqs, max_token_len),
        _post_pad(ner_seqs, max_token_len),
        _post_pad(srl_seqs, max_token_len),
    )


# Tokenizer for the answers (its word_index defines the model's output classes)
answer_tokenizer = Tokenizer(oov_token="<OOV>")
answer_tokenizer.fit_on_texts(answers)
answer_vocab_size = len(answer_tokenizer.word_index) + 1

# Encode question types as a single index each (direct lookup, not a sequence)
q_type_indices = []
for q_type in q_types:
    # Look the raw label up in word_index (whose indices start at 1); labels
    # that are not present fall back to 0. No offset is subtracted.
    # NOTE(review): word_index keys are produced by the Tokenizer's own
    # normalisation, so a raw label with uppercase letters or punctuation may
    # never match and silently become 0 — confirm against the label set.
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_indices.append(q_type_idx)

# Convert to a numpy array
q_type_indices = np.array(q_type_indices)

# One-hot encode the question types (width = q_type_vocab_size)
q_type_categorical = tf.keras.utils.to_categorical(
    q_type_indices, num_classes=q_type_vocab_size
)

# Pad all input sequences to their fixed widths
context_padded, question_padded, token_padded, ner_padded, srl_padded = (
    pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs)
)

# Encode the answers and post-pad them to the longest answer
answer_seqs = answer_tokenizer.texts_to_sequences(answers)
max_answer_len = max([len(seq) for seq in answer_seqs])
answer_padded = pad_sequences(answer_seqs, maxlen=max_answer_len, padding="post")

# Split sample positions 80/20 into train and test sets (fixed seed); the
# index lists are reused later to pull the matching rows from every array.
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)

In [37]:
# Helper that selects rows of an array-like by position
def get_subset(data, indices):
    """Return ``data[i]`` for every ``i`` in ``indices`` as a new numpy array.

    The order of ``indices`` is preserved in the result.
    """
    picked = []
    for position in indices:
        picked.append(data[position])
    return np.array(picked)


# Train data: the same row selection applied to every model input and target
(
    train_context,
    train_question,
    train_token,
    train_ner,
    train_srl,
    train_q_type,
    train_answer,
) = [
    get_subset(full_array, train_indices)
    for full_array in (
        context_padded,
        question_padded,
        token_padded,
        ner_padded,
        srl_padded,
        q_type_categorical,
        answer_padded,
    )
]

# Test data: identical layout, selected with the held-out indices
(
    test_context,
    test_question,
    test_token,
    test_ner,
    test_srl,
    test_q_type,
    test_answer,
) = [
    get_subset(full_array, test_indices)
    for full_array in (
        context_padded,
        question_padded,
        token_padded,
        ner_padded,
        srl_padded,
        q_type_categorical,
        answer_padded,
    )
]

# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3

In [38]:
# Function that builds the QA model
def create_qa_model():
    """Build and compile the multi-input answer classifier.

    The model takes six inputs: context, question and token id sequences,
    NER and SRL id sequences (both sized like the token input), and a
    question-type vector of width ``q_type_vocab_size``. Context, question
    and tokens share one text embedding. Each of the three sequence streams
    is encoded by its own bidirectional LSTM; the token stream is first
    concatenated with the NER and SRL embeddings. An attention layer is
    applied to ``[context_lstm, question_lstm]``, the three streams are
    max-pooled over time, concatenated with the question-type vector, and
    passed through two Dense+Dropout layers to a softmax over
    ``answer_vocab_size`` classes.

    All sizes (max lengths, vocabulary sizes, embedding dims, LSTM units,
    dropout rate) are read from module-level variables.

    Returns:
        The model, compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    # Input layers
    context_input = Input(shape=(max_context_len,), name="context_input")
    question_input = Input(shape=(max_question_len,), name="question_input")
    token_input = Input(shape=(max_token_len,), name="token_input")
    ner_input = Input(shape=(max_token_len,), name="ner_input")
    srl_input = Input(shape=(max_token_len,), name="srl_input")
    q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")

    # Shared embedding layer for text
    text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")

    # Separate embeddings for the NER and SRL tag sequences
    ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
        ner_input
    )
    srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
        srl_input
    )

    # Apply the shared text embedding to all three text inputs
    context_embed = text_embedding(context_input)
    question_embed = text_embedding(question_input)
    token_embed = text_embedding(token_input)

    # Bidirectional LSTMs over the context and question embeddings
    context_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="context_lstm")
    )(context_embed)
    question_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="question_lstm")
    )(question_embed)

    # Concatenate per-position token features (token, NER, SRL embeddings)
    token_features = Concatenate(name="token_features")(
        [token_embed, ner_embedding, srl_embedding]
    )
    token_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="token_lstm")
    )(token_features)

    # Attention over the question, queried by the context
    # (the layer receives [context_lstm, question_lstm] in that order)
    context_attention = tf.keras.layers.Attention(name="context_attention")(
        [context_lstm, question_lstm]
    )

    # Max-pool each sequence stream over time into a fixed-size vector
    context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
        context_attention
    )
    question_pool = tf.keras.layers.GlobalMaxPooling1D(name="question_pool")(
        question_lstm
    )
    token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)

    # Concatenate all pooled features with the question-type vector
    all_features = Concatenate(name="all_features")(
        [context_att_pool, question_pool, token_pool, q_type_input]
    )

    # Dense layers
    x = Dense(256, activation="relu", name="dense_1")(all_features)
    x = Dropout(dropout_rate)(x)
    x = Dense(128, activation="relu", name="dense_2")(x)
    x = Dropout(dropout_rate)(x)

    # Output layer: one softmax over the answer vocabulary (a single class
    # per sample, not a sequence)
    answer_output = Dense(
        answer_vocab_size, activation="softmax", name="answer_output"
    )(x)

    # Create model
    model = Model(
        inputs=[
            context_input,
            question_input,
            token_input,
            ner_input,
            srl_input,
            q_type_input,
        ],
        outputs=answer_output,
    )

    # Compile model (sparse loss: targets are integer class ids)
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    return model


# Build the model and print its layer summary
model = create_qa_model()
model.summary()

In [39]:
# Callback that would save the best model by validation accuracy.
# It is defined here but NOT passed to model.fit below (commented out there).
checkpoint = ModelCheckpoint(
    "qa_lstm_model.h5", monitor="val_accuracy", save_best_only=True, verbose=1
)

# Stop when val_accuracy has not improved for 5 epochs.
# NOTE(review): restore_best_weights is not set, so the model saved after
# training holds the weights from the last epoch run — confirm this is intended.
early_stop = EarlyStopping(monitor="val_accuracy", patience=5, verbose=1)

# Training configuration
batch_size = 8
epochs = 50

# Targets for sparse categorical cross-entropy: only column 0 of the padded
# answers is used, i.e. the first token id of each answer.
train_answer_labels = train_answer[:, 0]  # first token id of each answer
test_answer_labels = test_answer[:, 0]

# Train the model. The held-out test arrays are passed as validation data,
# so early stopping is driven by the same split that is evaluated later.
history = model.fit(
    [train_context, train_question, train_token, train_ner, train_srl, train_q_type],
    train_answer_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [test_context, test_question, test_token, test_ner, test_srl, test_q_type],
        test_answer_labels,
    ),
    callbacks=[
        # checkpoint,
        early_stop,
    ],
)

# Persist the model as it stands when training ends
model.save("qa_lstm_model_final.keras")

# Save every tokenizer (as JSON strings) plus the padding widths needed to
# encode inputs for this model
tokenizer_data = {
    "word_tokenizer": tokenizer.to_json(),
    "ner_tokenizer": ner_tokenizer.to_json(),
    "srl_tokenizer": srl_tokenizer.to_json(),
    "answer_tokenizer": answer_tokenizer.to_json(),
    "q_type_tokenizer": q_type_tokenizer.to_json(),
    "max_context_len": max_context_len,
    "max_question_len": max_question_len,
    "max_token_len": max_token_len,
}

with open("qa_tokenizers.json", "w") as f:
    json.dump(tokenizer_data, f)

Epoch 1/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 47ms/step - accuracy: 0.0914 - loss: 6.2711 - val_accuracy: 0.1368 - val_loss: 5.4982
Epoch 2/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 42ms/step - accuracy: 0.1016 - loss: 5.0131 - val_accuracy: 0.0596 - val_loss: 5.4249
Epoch 3/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 0.0930 - loss: 4.8057 - val_accuracy: 0.0877 - val_loss: 5.5286
Epoch 4/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 44ms/step - accuracy: 0.1550 - loss: 4.4755 - val_accuracy: 0.1509 - val_loss: 5.2432
Epoch 5/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - accuracy: 0.2023 - loss: 4.2949 - val_accuracy: 0.1895 - val_loss: 5.3838
Epoch 6/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 44ms/step - accuracy: 0.2297 - loss: 4.1641 - val_accuracy: 0.2070 - val_loss: 5.4800
Epoch 7/50
[1m143/14

In [40]:
# Reload the model saved at the end of training (rebinding the global `model`)
# and score it on the test arrays — the same arrays used as validation data
# during training.
model = load_model("qa_lstm_model_final.keras")
results = model.evaluate(
    [test_context, test_question, test_token, test_ner, test_srl, test_q_type],
    test_answer_labels,
    batch_size=batch_size,
)

# results follows the compile order: loss first, then the accuracy metric
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.3886 - loss: 10.5154
Test Loss: 10.776042938232422
Test Accuracy: 0.4070175290107727


In [41]:
def predict_answer(context, question, tokens, ner, srl, q_type):
    """Predict the answer word for a single context/question pair.

    Inputs are encoded exactly as the training data was: ``context`` and
    ``question`` go through ``preprocess_text`` and the word tokenizer;
    ``tokens``, ``ner`` and ``srl`` (lists of strings) are joined with spaces
    and encoded by their respective tokenizers; ``q_type`` is looked up in the
    question-type tokenizer (0 when absent) and one-hot encoded. All sequences
    are post-padded to the module-level maximum lengths.

    Args:
        context: Raw context text.
        question: Raw question text.
        tokens: List of context tokens.
        ner: List of NER tags.
        srl: List of SRL tags.
        q_type: Question-type label.

    Returns:
        The answer-vocabulary word with the highest predicted probability, or
        ``"Unknown"`` when the winning index has no word (e.g. index 0).
    """
    # Free-text inputs
    context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
    question_seq = tokenizer.texts_to_sequences([preprocess_text(question)])

    # Pre-tokenized inputs, joined the same way as during training
    token_seq = tokenizer.texts_to_sequences([" ".join(tokens)])
    ner_seq = ner_tokenizer.texts_to_sequences([" ".join(ner)])
    srl_seq = srl_tokenizer.texts_to_sequences([" ".join(srl)])

    # Question type as a one-hot vector (index 0 for unseen labels)
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_cat = tf.keras.utils.to_categorical(
        [q_type_idx], num_classes=q_type_vocab_size
    )

    # Pad everything to the widths the model was built with
    context_pad = pad_sequences(context_seq, maxlen=max_context_len, padding="post")
    question_pad = pad_sequences(question_seq, maxlen=max_question_len, padding="post")
    token_pad = pad_sequences(token_seq, maxlen=max_token_len, padding="post")
    ner_pad = pad_sequences(ner_seq, maxlen=max_token_len, padding="post")
    srl_pad = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")

    # verbose=0: this runs once per sample inside evaluation loops, where the
    # default per-call progress bar floods the notebook output.
    prediction = model.predict(
        [context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat],
        verbose=0,
    )

    # index_word is the tokenizer's ready-made inverse of word_index, so the
    # winning class maps back to its word without scanning the vocabulary.
    answer_idx = int(np.argmax(prediction[0]))
    return answer_tokenizer.index_word.get(answer_idx, "Unknown")

In [45]:
from collections import Counter

# def evaluate_model_performance(test_data):
#     output_path = "bleu_answer_calculation.xlsx"
    
#     bleu_scores = []
#     rows = []

#     for i in range(len(test_data)):
#         idx = test_data[i]
        
#         sample_context = contexts[idx]
#         sample_question = questions[idx]  # Get the actual question
#         sample_tokens = tokens_list[idx]
#         sample_ner = ner_list[idx]
#         sample_srl = srl_list[idx]
#         sample_q_type = q_types[idx]
#         actual_answer = answers[idx]
        
#         print(f"Processing sample {i+1}/{len(test_data)} (index {idx})")
        
#         # Call predict_answer with parameters in the correct order
#         pred_answer = predict_answer(
#             sample_context,      # context
#             sample_question,     # question (string)
#             sample_tokens,       # tokens (list)
#             sample_ner,          # ner
#             sample_srl,          # srl
#             sample_q_type        # q_type
#         )

#         actual_tokens = actual_answer.split()
#         pred_tokens = pred_answer.split()

#         max_n = 4
#         weights = [1 / max_n] * max_n
#         clipped_counts = []
#         total_counts = []
#         precisions = []

#         log_text = f"Sample {i+1}:\n"
#         log_text += f"Context: {sample_context}\n"
#         log_text += f"Question: {sample_question}\n"
#         log_text += f"Actual Answer: {actual_answer}\n"
#         log_text += f"Predicted Answer: {pred_answer}\n"
#         log_text += f"Actual Tokens: {actual_tokens}\n"
#         log_text += f"Predicted Tokens: {pred_tokens}\n"
        
#         print(log_text)

#         for n in range(1, max_n + 1):
#             # Skip if not enough tokens for n-gram
#             if len(actual_tokens) < n or len(pred_tokens) < n:
#                 clipped_counts.append(0)
#                 total_counts.append(0 if len(pred_tokens) < n else sum(1 for _ in range(len(pred_tokens) - n + 1)))
#                 precisions.append(0)
#                 log_text += f"{n}-gram: clipped count = 0, total candidate = {total_counts[-1]}, precision = 0.0000\n"
#                 continue
                
#             ref_ngrams = Counter(tuple(actual_tokens[j:j + n]) for j in range(len(actual_tokens) - n + 1))
#             cand_ngrams = Counter(tuple(pred_tokens[j:j + n]) for j in range(len(pred_tokens) - n + 1))

#             clip_sum = sum(min(cnt, ref_ngrams.get(ng, 0)) for ng, cnt in cand_ngrams.items())
#             total = sum(cand_ngrams.values())
#             p_n = clip_sum / total if total > 0 else 0

#             clipped_counts.append(clip_sum)
#             total_counts.append(total)
#             precisions.append(p_n)

#             log_text += f"{n}-gram: clipped count = {clip_sum}, total candidate = {total}, precision = {p_n:.4f}\n"

#         c = len(pred_tokens)
#         r = len(actual_tokens)

#         if c == 0:
#             bp = 0
#             log_text += f"Brevity Penalty: BP = {bp:.4f} (c={c}, r={r}) - No predicted tokens.\n"
#         else:
#             bp = 1 if c > r else np.exp(1 - r / c)
#             log_text += f"Brevity Penalty: BP = {bp:.4f} (c={c}, r={r})\n"

#         # Avoid math domain error with log(0)
#         filtered_precisions = [max(p, 1e-10) for p in precisions]  # Replace 0 with small value
        
#         if all(p > 0 for p in precisions):
#             bleu = bp * np.exp(sum(w * np.log(p) for w, p in zip(weights, filtered_precisions)))
#         else:
#             bleu = 0.0

#         log_text += f"BLEU score = {bleu:.4f}\n"
#         print(f"BLEU score = {bleu:.4f}")

#         bleu_scores.append(bleu)
#         rows.append({"Result": log_text})

#     try:
#         df = pd.DataFrame(rows)
#         df.to_excel(output_path, index=False)
#         print(f"Results saved to: {output_path}")
#     except Exception as e:
#         print(f"Error saving to Excel: {e}")

#     # Handle empty bleu_scores
#     avg_bleu = np.mean(bleu_scores) if bleu_scores else 0.0
    
#     results = {
#         "avg_bleu_score": avg_bleu,
#     }

#     return results

def _ngram_counts(tokens, n):
    """Count every contiguous n-gram (as a tuple) of order ``n`` in ``tokens``."""
    return Counter(tuple(tokens[j:j + n]) for j in range(len(tokens) - n + 1))


def _clipped_precision(actual_tokens, pred_tokens, n):
    """Return ``(clipped, total, precision)`` for n-grams of order ``n``.

    ``total`` is the number of candidate n-grams; ``clipped`` counts the ones
    also found in the reference, capped by the reference count. Precision is
    0 when either side is too short to contain an n-gram of this order.
    """
    total = max(len(pred_tokens) - n + 1, 0)
    if len(actual_tokens) < n or total == 0:
        return 0, total, 0.0

    ref_ngrams = _ngram_counts(actual_tokens, n)
    cand_ngrams = _ngram_counts(pred_tokens, n)
    clipped = sum(min(cnt, ref_ngrams.get(ng, 0)) for ng, cnt in cand_ngrams.items())
    return clipped, total, clipped / total


def evaluate_model_performance(test_data, use_unigram_only=True, include_bigram=False):
    """
    Evaluate model performance using BLEU score.

    For every sample index the model's answer is predicted with
    ``predict_answer`` and compared to the reference answer. Per-sample logs
    are written to ``bleu_answer_calculation.xlsx`` and a score table to
    ``bleu_scores_summary.xlsx``; a failure to write either file is reported
    on stdout and does not abort the evaluation.

    The reference answer is lowercased before tokenising because predicted
    words come from the answer tokenizer's vocabulary, which is lowercase; a
    case-sensitive comparison could never match a capitalised reference.

    Args:
        test_data: List of indices for test samples
        use_unigram_only: If True, calculate BLEU using only unigrams
        include_bigram: If True and use_unigram_only is False, include up to
            bigrams and report a separate bigram BLEU. Ignored when
            use_unigram_only is True.

    Returns:
        Dictionary with evaluation metrics: ``avg_unigram_bleu``,
        ``avg_bigram_bleu`` (None when no bigram score was computed),
        ``avg_full_bleu``, ``perfect_matches``, ``zero_scores`` and
        ``total_samples``.
    """
    output_path = "bleu_answer_calculation.xlsx"

    # The n-gram order depends only on the arguments, so fix it once up front.
    if use_unigram_only:
        max_n = 1
    elif include_bigram:
        max_n = 2
    else:
        max_n = 4  # Original implementation with up to 4-grams
    weights = [1 / max_n] * max_n

    # A bigram score exists only when bigram precisions are actually computed.
    report_bigram = include_bigram and max_n >= 2

    unigram_scores = []
    bigram_scores = []
    full_bleu_scores = []
    rows = []

    # Counters to track stats
    total_samples = len(test_data)
    perfect_matches = 0
    zero_scores = 0

    for i, idx in enumerate(test_data):
        sample_context = contexts[idx]
        sample_question = questions[idx]
        sample_tokens = tokens_list[idx]
        sample_ner = ner_list[idx]
        sample_srl = srl_list[idx]
        sample_q_type = q_types[idx]
        actual_answer = answers[idx]

        print(f"Processing sample {i+1}/{total_samples} (index {idx})")

        pred_answer = predict_answer(
            sample_context,
            sample_question,
            sample_tokens,
            sample_ner,
            sample_srl,
            sample_q_type
        )

        actual_tokens = actual_answer.lower().split()
        pred_tokens = pred_answer.split()

        log_text = f"Sample {i+1}:\n"
        log_text += f"Context: {sample_context}\n"
        log_text += f"Question: {sample_question}\n"
        log_text += f"Actual Answer: {actual_answer}\n"
        log_text += f"Predicted Answer: {pred_answer}\n"
        log_text += f"Actual Tokens: {actual_tokens}\n"
        log_text += f"Predicted Tokens: {pred_tokens}\n"

        # Precisions are rebuilt from scratch for every sample so that no
        # value can carry over from the previous iteration.
        precisions = []
        for n in range(1, max_n + 1):
            clip_sum, total, p_n = _clipped_precision(actual_tokens, pred_tokens, n)
            precisions.append(p_n)
            log_text += f"{n}-gram: clipped count = {clip_sum}, total candidate = {total}, precision = {p_n:.4f}\n"

        unigram_precision = precisions[0]

        c = len(pred_tokens)
        r = len(actual_tokens)

        if c == 0:
            bp = 0
            log_text += f"Brevity Penalty: BP = {bp:.4f} (c={c}, r={r}) - No predicted tokens.\n"
        else:
            bp = 1 if c > r else np.exp(1 - r / c)
            log_text += f"Brevity Penalty: BP = {bp:.4f} (c={c}, r={r})\n"

        # Unigram BLEU (just precision with brevity penalty)
        unigram_bleu = bp * unigram_precision
        unigram_scores.append(unigram_bleu)
        log_text += f"Unigram BLEU score = {unigram_bleu:.4f}\n"

        # Track perfect matches and zero scores
        if unigram_bleu >= 0.9999:  # Close enough to 1.0
            perfect_matches += 1
        if unigram_bleu <= 0.0001:  # Close enough to 0.0
            zero_scores += 1

        # Bigram BLEU: geometric mean of unigram and bigram precision. One
        # score is recorded per sample (0 when the sample has no bigrams) so
        # bigram_scores stays aligned with the other score lists.
        if report_bigram:
            bigram_precision = precisions[1]
            if unigram_precision > 0 and bigram_precision > 0:
                bigram_bleu = bp * np.sqrt(unigram_precision * bigram_precision)
            else:
                bigram_bleu = 0.0
            bigram_scores.append(bigram_bleu)
            log_text += f"Bigram BLEU score = {bigram_bleu:.4f}\n"

        # Full BLEU over all n-gram orders; any zero precision gives 0
        # (which also keeps log() away from 0).
        if all(p > 0 for p in precisions):
            full_bleu = bp * np.exp(sum(w * np.log(p) for w, p in zip(weights, precisions)))
        else:
            full_bleu = 0.0

        full_bleu_scores.append(full_bleu)
        log_text += f"Full BLEU score ({max_n}-gram) = {full_bleu:.4f}\n"

        print(f"Unigram BLEU: {unigram_bleu:.4f}")
        if report_bigram:
            print(f"Bigram BLEU: {bigram_scores[-1]:.4f}")
        print(f"Full BLEU ({max_n}-gram): {full_bleu:.4f}")

        rows.append({"Result": log_text})

    # Debug information
    print("\nDEBUG INFORMATION:")
    print(f"Total samples processed: {total_samples}")
    print(f"Samples with perfect score (1.0): {perfect_matches}")
    print(f"Samples with zero score (0.0): {zero_scores}")
    print(f"Number of scores collected: {len(unigram_scores)}")

    # Print all scores for verification
    if len(unigram_scores) <= 20:  # Only if reasonably short
        print("All unigram scores:", unigram_scores)
    else:
        print("First 10 unigram scores:", unigram_scores[:10])
        print("Last 10 unigram scores:", unigram_scores[-10:])

    # Calculate average scores with extra checks
    if unigram_scores:
        avg_unigram = sum(unigram_scores) / len(unigram_scores)
        print(f"Sum of all unigram scores: {sum(unigram_scores):.4f}")
        print(f"Mean of all unigram scores: {avg_unigram:.4f}")
    else:
        avg_unigram = 0.0
        print("Warning: No unigram scores to average!")

    avg_bigram = sum(bigram_scores) / len(bigram_scores) if bigram_scores else 0.0
    avg_full = sum(full_bleu_scores) / len(full_bleu_scores) if full_bleu_scores else 0.0

    # Save results to Excel (best effort: a failure is reported, not raised)
    try:
        df = pd.DataFrame(rows)
        df.to_excel(output_path, index=False)
        print(f"Results saved to: {output_path}")

        # Also save a summary of scores
        summary_data = []
        for i in range(len(unigram_scores)):
            summary_row = {
                "Sample": i+1,
                "Index": test_data[i],
                "Unigram BLEU": unigram_scores[i]
            }
            if report_bigram:
                summary_row["Bigram BLEU"] = bigram_scores[i]
            summary_row["Full BLEU"] = full_bleu_scores[i]
            summary_data.append(summary_row)

        summary_df = pd.DataFrame(summary_data)
        summary_df.to_excel("bleu_scores_summary.xlsx", index=False)
        print("Summary of scores saved to: bleu_scores_summary.xlsx")

    except Exception as e:
        print(f"Error saving to Excel: {e}")

    results = {
        "avg_unigram_bleu": avg_unigram,
        "avg_bigram_bleu": avg_bigram if report_bigram else None,
        "avg_full_bleu": avg_full,
        "perfect_matches": perfect_matches,
        "zero_scores": zero_scores,
        "total_samples": total_samples
    }

    return results

# Run the BLEU evaluation over the held-out test indices (unigram-only mode).
# performance_metrics = evaluate_model_performance(test_indices)
performance_metrics = evaluate_model_performance(test_indices, use_unigram_only=True)

# Report the averages as percentages; the two ratios divide by total_samples,
# so this cell assumes the test set is non-empty.
print("\nModel Performance Metrics:")
print(f"Average Unigram BLEU Score: {performance_metrics['avg_unigram_bleu'] * 100:.2f}%")
print(f"Perfect matches: {performance_metrics['perfect_matches']} out of {performance_metrics['total_samples']} ({performance_metrics['perfect_matches']/performance_metrics['total_samples']*100:.2f}%)")
print(f"Zero scores: {performance_metrics['zero_scores']} out of {performance_metrics['total_samples']} ({performance_metrics['zero_scores']/performance_metrics['total_samples']*100:.2f}%)")

Processing sample 1/285 (index 628)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Unigram BLEU: 0.0000
Full BLEU (1-gram): 0.0000
Processing sample 2/285 (index 680)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Unigram BLEU: 0.0000
Full BLEU (1-gram): 0.0000
Processing sample 3/285 (index 1085)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Unigram BLEU: 0.0000
Full BLEU (1-gram): 0.0000
Processing sample 4/285 (index 578)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Unigram BLEU: 1.0000
Full BLEU (1-gram): 1.0000
Processing sample 5/285 (index 1010)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Unigram BLEU: 1.0000
Full BLEU (1-gram): 1.0000
Processing sample 6/285 (index 759)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Unigram BLEU: 0.0000
Full BLEU (1-gram): 0.0000
Processing sample 7/285 (index 931)
[1m1/1[0m [32m━━━━━━━━━