fix: remove the comment
This commit is contained in: parent fb6ff8512b, commit ba8b5f7f8c
@@ -1 +0,0 @@
- ,akeon,fedora,05.02.2025 02:37,file:///home/akeon/.config/libreoffice/4;
36 main.py
@@ -15,18 +15,18 @@ from tensorflow.keras.layers import (
)
from tensorflow.keras.optimizers import Adam

# -----------------------
# 1. Load dataset
# -----------------------
df = pd.read_csv("quiz_questions.csv")

# Make sure the 'paragraph' and 'question' columns exist and are not empty
df.dropna(subset=["paragraph", "question"], inplace=True)

# -----------------------
# 2. Preprocessing text
# -----------------------

def preprocess_text(text):
    # Simple example preprocessing
    text = text.lower()
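
The code above only requires that quiz_questions.csv contain 'paragraph' and 'question' columns. A minimal illustrative row, with hypothetical content (any further columns, such as those echoed by test.py's JSON output, are not confirmed by this diff):

paragraph,question
"Albert Einstein mengembangkan teori relativitas.","Siapa yang mengembangkan teori relativitas?"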
@@ -36,9 +36,9 @@ def preprocess_text(text):
df["paragraph"] = df["paragraph"].astype(str).apply(preprocess_text)
df["question"] = df["question"].astype(str).apply(preprocess_text)

# -----------------------
# 3. Tokenization
# -----------------------
# Combine all text (paragraph + question) so the vocabulary covers words from both
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["paragraph"].tolist() + df["question"].tolist())
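
The hunks that follow refer to X_padded and y_padded, but the lines that build them fall outside the diff context. Assuming pad_sequences from tensorflow.keras.preprocessing.sequence is imported, the elided step presumably resembles this sketch (max_length = 50 is taken from test.py; the other names reuse what is visible in the diff):

X_seq = tokenizer.texts_to_sequences(df["paragraph"].tolist())
y_seq = tokenizer.texts_to_sequences(df["question"].tolist())
max_length = 50  # assumed; must match the value used at inference time in test.py
X_padded = pad_sequences(X_seq, maxlen=max_length, padding="post")
y_padded = pad_sequences(y_seq, maxlen=max_length, padding="post")
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding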
@@ -61,9 +61,9 @@ with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("Tokenizer saved to tokenizer.pkl")

# -----------------------
# 4. Prepare X, y
# -----------------------
# For sequence-to-sequence with "sparse_categorical_crossentropy",
# y should ideally have shape: (num_samples, max_length, 1)
X = np.array(X_padded)
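
The two comment lines above motivate the trailing axis added in the next hunk: with sparse_categorical_crossentropy the targets stay integer word ids, one per timestep, rather than one-hot vectors of width vocab_size. A quick shape check (dimensions illustrative):

import numpy as np
y_padded_demo = np.zeros((8, 50), dtype="int32")   # (num_samples, max_length)
y_demo = np.expand_dims(y_padded_demo, axis=-1)    # (num_samples, max_length, 1)
print(y_demo.shape)  # (8, 50, 1) -- one integer class id per timestep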
@@ -72,9 +72,9 @@ y = np.expand_dims(np.array(y_padded), axis=-1)
print("Shape X:", X.shape)
print("Shape y:", y.shape)  # (batch_size, max_length, 1)

# -----------------------
# 5. Split data
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
@@ -82,9 +82,9 @@ X_train, X_test, y_train, y_test = train_test_split(
print("Train size:", X_train.shape, y_train.shape)
print("Test size: ", X_test.shape, y_test.shape)

# -----------------------
# 6. Build LSTM Model
# -----------------------
# We stack two LSTMs, each with return_sequences=True,
# so the final output is still a sequence: (batch_size, max_length, hidden_dim)
model = Sequential()
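
The layer definitions between this hunk and the next fall outside the diff context; only model.add(LSTM(128, return_sequences=True)) is visible as context below. Based on that line and the comments above, the elided definition presumably resembles this sketch (the Embedding layer and its size are assumptions):

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128))  # assumed embedding layer
model.add(LSTM(128, return_sequences=True))  # visible as context in the next hunk
model.add(LSTM(128, return_sequences=True))  # second stacked LSTM per the comment above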
@@ -97,9 +97,9 @@ model.add(LSTM(128, return_sequences=True))
# TimeDistributed(Dense) applies the Dense layer to each timestep
model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))

# -----------------------
# 7. Compile
# -----------------------
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001),
@@ -108,16 +108,16 @@ model.compile(

model.summary()

# -----------------------
# 8. Train Model
# -----------------------
epochs = 10
history = model.fit(
    X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), batch_size=32
)

# -----------------------
# 9. Save Model
# -----------------------
model.save("lstm_question_generator.keras")
print("Training finished and the model has been saved.")
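
Training leaves behind the two artifacts that test.py consumes: lstm_question_generator.keras and tokenizer.pkl. A minimal smoke test for the pair, using only calls already present in the code:

import pickle
from tensorflow.keras.models import load_model

model = load_model("lstm_question_generator.keras")
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)
print(model.output_shape)  # expect (None, max_length, vocab_size)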
94 test.py
@@ -1,68 +1,102 @@
import numpy as np
import re
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import pickle


def preprocess_text(text):
    """ Simple text preprocessing (should match training preprocessing) """
    text = text.lower()
    return text


def generate_question(paragraph, tokenizer, model, max_length):
    """ Generate a question based on the input paragraph """
    # 1) Preprocess the paragraph
    paragraph = preprocess_text(paragraph)

    # 2) Tokenize and pad
    seq = tokenizer.texts_to_sequences([paragraph])
    padded = pad_sequences(seq, maxlen=max_length, padding="post")

    # 3) Get model predictions => shape: (1, max_length, vocab_size)
    prediction = model.predict(padded)

    # 4) Get the most likely word index at each timestep
    predicted_indices = np.argmax(prediction, axis=-1)[0]

    # 5) Convert indices to words
    predicted_words = []
    for idx in predicted_indices:
        if idx == 0:  # Assuming 0 is padding or unknown token
            break
        word = tokenizer.index_word.get(idx, "")
        predicted_words.append(word)

    # 6) Form the final question
    predicted_question = " ".join(predicted_words)
    if not predicted_question.endswith("?"):
        predicted_question += "?"

    return predicted_question


def determine_question_type(paragraph):
    """ Simple rule-based method to determine the question type """
    if "benar atau salah" in paragraph.lower() or "adalah" in paragraph.lower():
        return "true_false"
    elif "berapa" in paragraph.lower() or "mana" in paragraph.lower():
        return "multiple_choice"
    else:
        return "fill_in_the_blank"


def extract_possible_answer(paragraph):
    """ Basic heuristic to extract an answer (first proper noun or keyword) """
    words = paragraph.split()
    if len(words) > 2:
        return words[0] + " " + words[1]  # Return the first two words as a basic approach
    return words[0]


def generate_options(question_type, answer):
    """ Generate dummy options for multiple-choice questions """
    if question_type == "multiple_choice":
        return [answer, "Option B", "Option C", "Option D"]
    elif question_type == "true_false":
        return ["True", "False"]
    else:
        return ["-"]  # Placeholder for fill-in-the-blank
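
A quick check of the three rule-based helpers against the sample paragraph used below (outputs derived by tracing the code above, not from a run):

p = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum"
print(determine_question_type(p))  # 'fill_in_the_blank' -- no rule keyword matches
print(extract_possible_answer(p))  # 'Albert Einstein' -- the first two words
print(generate_options("fill_in_the_blank", "Albert Einstein"))  # ['-']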

# Load the trained model and tokenizer
model = load_model("lstm_question_generator.keras")

with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

max_length = 50  # Ensure this is the same as in training

# Sample paragraph input
paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum"

# Generate question
generated_question = generate_question(paragraph_input, tokenizer, model, max_length)

# Determine question type and answer
question_type = determine_question_type(paragraph_input)
answer = extract_possible_answer(paragraph_input)
options = generate_options(question_type, answer)

# Construct JSON output
output = {
    "paragraph": paragraph_input,
    "question_type": question_type,
    "question": generated_question,
    "answer": answer,
    "options": "|".join(options)  # Match the dataset format
}

# Print JSON result
print(json.dumps(output, indent=4, ensure_ascii=False))
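
For the sample paragraph above, tracing the rules gives question_type "fill_in_the_blank", answer "Albert Einstein", and options "-", so the printed JSON has this shape (the question text itself depends on the trained model and is shown as a placeholder):

{
    "paragraph": "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum",
    "question_type": "fill_in_the_blank",
    "question": "<model-generated question>?",
    "answer": "Albert Einstein",
    "options": "-"
}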