In [6]:
# ==========================
# 1) Install/Import Dependencies
# ==========================
# If you are in a brand new environment, uncomment the following line:
# %pip install tensorflow pandas

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

# ==========================
# 2) Load Dataset (CSV)
# ==========================
# Adjust the file path to your CSV file
df = pd.read_csv("quiz_questions.csv")

# Keep only rows that have both a paragraph and a question. Without this,
# `.astype(str)` below would turn a missing cell (NaN) into the literal text
# "nan", which would then be tokenized and trained on as if it were real data.
# `df` itself is left untouched; the filtered view is only used here.
qa_rows = df.dropna(subset=['paragraph', 'question'])
if qa_rows.empty:
    raise ValueError(
        "quiz_questions.csv has no rows with both a 'paragraph' and a 'question' value"
    )

# Extract the paragraphs and questions (parallel lists: item i of each belongs together)
paragraphs = qa_rows['paragraph'].astype(str).tolist()
questions  = qa_rows['question'].astype(str).tolist()

# (Optional) For demonstration, let's ignore question_type, answer, distractors in this example
# but you can incorporate them as extra signals if you wish.

# ==========================
# 3) Tokenize Text
# ==========================
# Create two tokenizers: one for paragraphs, one for questions
num_words = 10000  # Maximum vocabulary size

tokenizer_paragraph = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer_paragraph.fit_on_texts(paragraphs)
paragraph_sequences = tokenizer_paragraph.texts_to_sequences(paragraphs)

tokenizer_question = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer_question.fit_on_texts(questions)
question_sequences = tokenizer_question.texts_to_sequences(questions)

# Get max lengths (for padding)
max_paragraph_len = max(len(seq) for seq in paragraph_sequences)
max_question_len  = max(len(seq) for seq in question_sequences)

# Pad sequences (zeros are appended after the real tokens: padding='post')
encoder_input_data = pad_sequences(paragraph_sequences, maxlen=max_paragraph_len, padding='post')
# For decoder data, we usually do teacher forcing:
# We'll keep one version as input, one version shifted as the target
decoder_input_data_full = pad_sequences(question_sequences, maxlen=max_question_len, padding='post')

# Teacher forcing: the decoder input at step t is token t, the target is token t+1.
# Both arrays therefore have max_question_len - 1 columns.
# NOTE(review): the questions carry no explicit start/end markers, so the first
# question token is only ever fed as input and never appears as a target.
decoder_target_data = np.copy(decoder_input_data_full[:, 1:])
decoder_input_data  = np.copy(decoder_input_data_full[:, :-1])

# Expand target dimension for sparse_categorical_crossentropy
decoder_target_data = np.expand_dims(decoder_target_data, -1)

# Calculate vocab sizes: +1 accounts for index 0 (padding), capped at num_words
# because the tokenizers were created with that vocabulary limit.
vocab_size_paragraph = min(len(tokenizer_paragraph.word_index) + 1, num_words)
vocab_size_question  = min(len(tokenizer_question.word_index)  + 1, num_words)

# ==========================
# 4) Build Seq2Seq Model
# ==========================
# Encoder-decoder LSTM: the encoder reads the paragraph and hands its final
# hidden/cell state to the decoder, which predicts the question token by token.
embedding_dim = 128
latent_dim    = 256  # LSTM hidden dimension

# ----- Encoder -----
encoder_inputs = Input(shape=(None,), name="encoder_inputs")
# mask_zero=True: index 0 is the padding value produced by pad_sequences
encoder_embedding = Embedding(input_dim=vocab_size_paragraph,
                              output_dim=embedding_dim,
                              mask_zero=True)(encoder_inputs)

# Only the final states are needed; the per-step encoder output is discarded.
encoder_lstm = LSTM(latent_dim, return_state=True, name="encoder_lstm")
_, state_h, state_c = encoder_lstm(encoder_embedding)

encoder_states = [state_h, state_c]

# ----- Decoder -----
decoder_inputs = Input(shape=(None,), name="decoder_inputs")
# NOTE: `decoder_embedding` is the embedded *tensor*, not the layer object.
# The layer itself can be retrieved by its name, "decoder_embedding".
decoder_embedding = Embedding(input_dim=vocab_size_question,
                              output_dim=embedding_dim,
                              mask_zero=True,
                              name="decoder_embedding")(decoder_inputs)

# return_sequences=True: one output per time step, as needed for per-token prediction
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(decoder_embedding,
                                     initial_state=encoder_states)
# Softmax over the question vocabulary at every time step
decoder_dense = Dense(vocab_size_question, activation='softmax', name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)

# Combine into a training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# ==========================
# 5) Train/Test Split (Optional)
# ==========================
# For simplicity, let's do a quick train/validation split
# Adjust split size or do a separate test set for production usage.
# The split is positional (first 80% train, last 20% validation); rows are not shuffled.
num_samples = len(encoder_input_data)
if num_samples < 2:
    # int(0.8 * n) is 0 for n < 2, which would leave the training set empty and
    # make fit() fail with an opaque error instead of a clear message.
    raise ValueError(
        f"Need at least 2 samples for an 80/20 train/validation split, got {num_samples}"
    )

split_index = int(0.8 * num_samples)
encoder_train = encoder_input_data[:split_index]
decoder_train = decoder_input_data[:split_index]
target_train  = decoder_target_data[:split_index]

encoder_val = encoder_input_data[split_index:]
decoder_val = decoder_input_data[split_index:]
target_val  = decoder_target_data[split_index:]

# ==========================
# 6) Fit the Model
# ==========================
history = model.fit(
    [encoder_train, decoder_train],
    target_train,
    batch_size=32,
    epochs=10,
    validation_data=([encoder_val, decoder_val], target_val)
)

# The accuracy reported is "sparse_categorical_accuracy" at the token level.

# ==========================
# 7) Evaluate the Model
# ==========================
# If you want a quick evaluation on the validation set:
# evaluate() returns the loss followed by the single compiled metric.
val_loss, val_accuracy = model.evaluate([encoder_val, decoder_val], target_val)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy (token-level): {val_accuracy:.4f}")

# ==========================
# 8) Build Inference Models
# ==========================
# Encoder model for inference: paragraph token ids -> [state_h, state_c]
encoder_model_inf = Model(encoder_inputs, encoder_states)

# Decoder model for inference: runs the trained decoder from externally
# supplied states, so decoding can proceed one step at a time.
decoder_state_input_h = Input(shape=(latent_dim,), name="inference_state_h")
decoder_state_input_c = Input(shape=(latent_dim,), name="inference_state_c")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# `decoder_embedding` holds the embedded tensor from the training graph, not
# the layer, so it cannot be called. Fetch the trained layer by name to reuse
# its weights on the inference path.
decoder_embedding_layer = model.get_layer("decoder_embedding")
dec_emb_inf = decoder_embedding_layer(decoder_inputs)
decoder_inf_outputs, state_h_inf, state_c_inf = decoder_lstm(
    dec_emb_inf, initial_state=decoder_states_inputs
)
decoder_inf_states = [state_h_inf, state_c_inf]
decoder_inf_outputs = decoder_dense(decoder_inf_outputs)

# Inputs: [token ids, h, c]  ->  Outputs: [token probabilities, new h, new c]
decoder_model_inf = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_inf_outputs] + decoder_inf_states
)

# Create index-to-word mapping for the question tokenizer
# (the inverse of word_index, used to turn predicted ids back into words)
index_to_word_question = {idx: word for word, idx in tokenizer_question.word_index.items()}
# If you used an OOV token, might want to handle that as well.

def generate_question(paragraph_text, max_length=50, start_token=None, end_token=None):
    """
    Generate a question from a paragraph using the trained seq2seq model.
    Token-level decoding with greedy search.

    Args:
        paragraph_text: Raw paragraph text; tokenized with `tokenizer_paragraph`
            and padded to `max_paragraph_len`.
        max_length: Maximum number of decoding steps (and thus output words).
        start_token: Optional word used to seed the decoder. It must be a key
            of `tokenizer_question.word_index`. When omitted, decoding is
            seeded with index 0 (the padding value).
        end_token: Optional word that terminates decoding when predicted; it
            is not included in the result.

    Returns:
        The generated words joined by single spaces (may be empty).

    Raises:
        ValueError: If `start_token` is given but is not in the question
            tokenizer's vocabulary.
    """
    # 1) Encode the paragraph into the initial decoder states [h, c]
    seq = tokenizer_paragraph.texts_to_sequences([paragraph_text])
    seq = pad_sequences(seq, maxlen=max_paragraph_len, padding='post')
    states_value = encoder_model_inf.predict(seq, verbose=0)

    # 2) Seed the decoder input (a batch of one sequence holding one token)
    target_seq = np.zeros((1, 1), dtype='int32')
    if start_token:
        start_index = tokenizer_question.word_index.get(start_token)
        if start_index is None:
            raise ValueError(
                f"start_token {start_token!r} is not in the question tokenizer vocabulary"
            )
        target_seq[0, 0] = start_index

    decoded_words = []

    for _ in range(max_length):
        # verbose=0: otherwise predict() prints a progress bar on every step
        output_tokens, h, c = decoder_model_inf.predict(
            [target_seq] + states_value, verbose=0
        )

        # Greedy choice; cast to a plain int so it is a clean dict key
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Index 0 is padding, not a word. Treat it as end of sequence rather
        # than emitting '<UNK>' and feeding the padding value back in.
        if sampled_token_index == 0:
            break

        sampled_word = index_to_word_question.get(sampled_token_index, '<UNK>')

        # Stop if we encounter an <end> token or a special index
        if end_token and (sampled_word == end_token):
            break

        decoded_words.append(sampled_word)

        # Next step: feed the sampled token back in along with the updated states
        target_seq = np.zeros((1, 1), dtype='int32')
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return ' '.join(decoded_words)

# ==========================
# 9) Test Inference on a Paragraph
# ==========================
# Smoke test: run greedy decoding on one hand-written paragraph and show the result.
test_paragraph = "Albert Einstein was a theoretical physicist born in Germany..."
generated = generate_question(paragraph_text=test_paragraph)
print("Generated question:", generated)


2025-02-05 01:57:25.675154: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


None
Epoch 1/10


2025-02-05 01:57:27.530017: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.
2025-02-05 01:57:27.593630: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


ValueError: math domain error

yups 0
yups 1
yups 2
yups 3
yups 4
yups 5
yups 6
yups 7
yups 8
yups 9
yups 10
yups 11
yups 12
yups 13
yups 14
yups 15
yups 16
yups 17
yups 18
yups 19
