diff --git a/.~lock.quiz_questions.csv# b/.~lock.quiz_questions.csv#
deleted file mode 100644
index 3a57694..0000000
--- a/.~lock.quiz_questions.csv#
+++ /dev/null
@@ -1 +0,0 @@
-,akeon,fedora,05.02.2025 02:37,file:///home/akeon/.config/libreoffice/4;
\ No newline at end of file
diff --git a/main.py b/main.py
index f5d2376..32a764b 100644
--- a/main.py
+++ b/main.py
@@ -15,18 +15,18 @@ from tensorflow.keras.layers import (
 )
 from tensorflow.keras.optimizers import Adam
 
-# -----------------------
+
 # 1. Load dataset
-# -----------------------
+
 df = pd.read_csv("quiz_questions.csv")
 
 # Make sure the 'paragraph' and 'question' columns exist and are not empty
 df.dropna(subset=["paragraph", "question"], inplace=True)
 
-# -----------------------
 # 2. Preprocessing text
-# -----------------------
+
+
 def preprocess_text(text):
     # Simple example of preprocessing
     text = text.lower()
@@ -36,9 +36,9 @@ def preprocess_text(text):
 df["paragraph"] = df["paragraph"].astype(str).apply(preprocess_text)
 df["question"] = df["question"].astype(str).apply(preprocess_text)
 
-# -----------------------
+
 # 3. Tokenization
-# -----------------------
+
 # Combine all text (paragraph + question) so the vocabulary covers words from both
 tokenizer = Tokenizer()
 tokenizer.fit_on_texts(df["paragraph"].tolist() + df["question"].tolist())
@@ -61,9 +61,9 @@ with open("tokenizer.pkl", "wb") as f:
     pickle.dump(tokenizer, f)
 print("Tokenizer saved to tokenizer.pkl")
 
-# -----------------------
+
 # 4. Prepare X, y
-# -----------------------
+
 # For sequence-to-sequence with "sparse_categorical_crossentropy",
 # y should ideally have shape: (num_samples, max_length, 1)
 X = np.array(X_padded)
@@ -72,9 +72,9 @@ y = np.expand_dims(np.array(y_padded), axis=-1)
 print("Shape X:", X.shape)
 print("Shape y:", y.shape)  # (batch_size, max_length, 1)
 
-# -----------------------
+
 # 5. Split data
-# -----------------------
+
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, random_state=42
 )
@@ -82,9 +82,9 @@ X_train, X_test, y_train, y_test = train_test_split(
 print("Train size:", X_train.shape, y_train.shape)
 print("Test size: ", X_test.shape, y_test.shape)
 
-# -----------------------
+
 # 6. Build the LSTM model
-# -----------------------
+
 # We use 2 stacked LSTMs, each with return_sequences=True,
 # so the final output stays a sequence: (batch_size, max_length, hidden_dim)
 model = Sequential()
@@ -97,9 +97,9 @@ model.add(LSTM(128, return_sequences=True))
 # TimeDistributed Dense so the Dense layer is applied per timestep
 model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))
 
-# -----------------------
+
 # 7. Compile
-# -----------------------
+
 model.compile(
     loss="sparse_categorical_crossentropy",
     optimizer=Adam(learning_rate=0.001),
@@ -108,16 +108,16 @@ model.compile(
 
 model.summary()
 
-# -----------------------
+
 # 8. Train Model
-# -----------------------
+
 epochs = 10
 history = model.fit(
     X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), batch_size=32
 )
 
-# -----------------------
+
 # 9. Save Model
-# -----------------------
+
 model.save("lstm_question_generator.keras")
 print("Training finished and the model has been saved.")
diff --git a/test.py b/test.py
index 16f4573..50e3479 100644
--- a/test.py
+++ b/test.py
@@ -1,68 +1,102 @@
 import numpy as np
-import re
-from tensorflow.keras.models import load_model
-
-from tensorflow.keras.preprocessing.sequence import pad_sequences
+import json
 import pickle
-
-# Suppose we have a tokenizer, model, and max_length:
-# tokenizer, model, max_length = ...
-# Make sure you load the model & tokenizer to match your environment.
+from tensorflow.keras.models import load_model
+from tensorflow.keras.preprocessing.sequence import pad_sequences
 
 
 def preprocess_text(text):
-    # Simple preprocessing function (must match, or be close to, training)
+    """ Simple text preprocessing (should match training preprocessing) """
    text = text.lower()
-    # Make other adjustments here if needed
    return text
 
 
 def generate_question(paragraph, tokenizer, model, max_length):
+    """ Generate a question based on the input paragraph """
    # 1) Preprocess paragraph
    paragraph = preprocess_text(paragraph)
 
    # 2) Tokenize
-    seq = tokenizer.texts_to_sequences([paragraph])  # result is a list of lists
-    # 3) Pad sequence
+    seq = tokenizer.texts_to_sequences([paragraph])
    padded = pad_sequences(seq, maxlen=max_length, padding="post")
 
-    # 4) Get predictions from the model => shape: (1, max_length, vocab_size)
-    prediction = model.predict(padded)  # (1, max_length, vocab_size)
+    # 3) Get model predictions
+    prediction = model.predict(padded)
 
-    # 5) Take the argmax at each time step => (1, max_length)
+    # 4) Get the most likely word indices
    predicted_indices = np.argmax(prediction, axis=-1)[0]
 
-    # 6) Convert to words
+    # 5) Convert indices to words
    predicted_words = []
    for idx in predicted_indices:
-        # If idx = 0, it usually means an 'unknown' or 'pad' token, depending on the tokenizer settings
-        if idx == 0:
-            # Safe to break here, since the rest is most likely padding
+        if idx == 0:  # Assuming 0 is padding or unknown token
            break
        word = tokenizer.index_word.get(idx, "")
        predicted_words.append(word)
 
-    # 7) Join into a single sentence
+    # 6) Form the final question
    predicted_question = " ".join(predicted_words)
-
-    # We may as well append a question mark
    if not predicted_question.endswith("?"):
-        predicted_question = predicted_question + "?"
+        predicted_question += "?"
 
    return predicted_question
 
 
+def determine_question_type(paragraph):
+    """ Simple rule-based method to determine question type """
+    if "benar atau salah" in paragraph.lower() or "adalah" in paragraph.lower():
+        return "true_false"
+    elif "berapa" in paragraph.lower() or "mana" in paragraph.lower():
+        return "multiple_choice"
+    else:
+        return "fill_in_the_blank"
+
+
+def extract_possible_answer(paragraph):
+    """ Basic heuristic to extract an answer (first two words as a stand-in for a keyword) """
+    words = paragraph.split()
+    if len(words) >= 2:
+        return words[0] + " " + words[1]  # Return the first two words as a basic approach
+    return words[0] if words else ""  # Guard against empty input
+
+
+def generate_options(question_type, answer):
+    """ Generate dummy options for multiple-choice questions """
+    if question_type == "multiple_choice":
+        return [answer, "Option B", "Option C", "Option D"]
+    elif question_type == "true_false":
+        return ["True", "False"]
+    else:
+        return ["-"]  # Placeholder for fill-in-the-blank
+
+
+# Load the trained model and tokenizer
 model = load_model("lstm_question_generator.keras")
-
-
 with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)
 
-# Make sure max_length is the same as during training
-max_length = 50  # Or whatever value you set
+max_length = 50  # Ensure this is the same as in training
 
-paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum."
+# Sample paragraph input +paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum" -generated_q = generate_question(paragraph_input, tokenizer, model, max_length) -print("Generated question:", generated_q) +# Generate question +generated_question = generate_question(paragraph_input, tokenizer, model, max_length) + +# Determine question type and answer +question_type = determine_question_type(paragraph_input) +answer = extract_possible_answer(paragraph_input) +options = generate_options(question_type, answer) + +# Construct JSON output +output = { + "paragraph": paragraph_input, + "question_type": question_type, + "question": generated_question, + "answer": answer, + "options": "|".join(options) # Match dataset format +} + +# Print JSON result +print(json.dumps(output, indent=4, ensure_ascii=False))
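
One note on this diff: test.py hardcodes max_length = 50 and relies only on the comment "Ensure this is the same as in training" to stay in sync with main.py. A minimal sketch of one way to close that gap is to persist the value next to tokenizer.pkl; the snippet below is illustrative only, and the meta.json filename is an assumption, not part of this diff.

import json

# main.py, right after pickling the tokenizer (meta.json is a hypothetical sidecar file):
with open("meta.json", "w") as f:
    json.dump({"max_length": max_length}, f)

# test.py, replacing the hardcoded max_length = 50:
with open("meta.json") as f:
    max_length = json.load(f)["max_length"]

With this, retraining with a different padding length cannot silently break inference.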