fix: remove the comment

akhdanre 2025-02-07 13:44:10 +07:00
parent fb6ff8512b
commit ba8b5f7f8c
3 changed files with 82 additions and 49 deletions


@@ -1 +0,0 @@
-,akeon,fedora,05.02.2025 02:37,file:///home/akeon/.config/libreoffice/4;

main.py

@@ -15,18 +15,18 @@ from tensorflow.keras.layers import (
 )
 from tensorflow.keras.optimizers import Adam

-# -----------------------
+
 # 1. Load dataset
-# -----------------------
+
 df = pd.read_csv("quiz_questions.csv")

 # Make sure the 'paragraph' and 'question' columns exist and are not empty
 df.dropna(subset=["paragraph", "question"], inplace=True)

-# -----------------------
+
 # 2. Preprocess the text
-# -----------------------
+
 def preprocess_text(text):
     # Simple preprocessing example
     text = text.lower()
@@ -36,9 +36,9 @@ def preprocess_text(text):
 df["paragraph"] = df["paragraph"].astype(str).apply(preprocess_text)
 df["question"] = df["question"].astype(str).apply(preprocess_text)

-# -----------------------
+
 # 3. Tokenization
-# -----------------------
+
 # Combine all text (paragraph + question) so the vocabulary covers words from both
 tokenizer = Tokenizer()
 tokenizer.fit_on_texts(df["paragraph"].tolist() + df["question"].tolist())
@@ -61,9 +61,9 @@ with open("tokenizer.pkl", "wb") as f:
     pickle.dump(tokenizer, f)
 print("Tokenizer saved to tokenizer.pkl")

-# -----------------------
+
 # 4. Prepare X, y
-# -----------------------
+
 # For sequence-to-sequence with "sparse_categorical_crossentropy",
 # y should ideally have shape: (num_samples, max_length, 1)
 X = np.array(X_padded)
@@ -72,9 +72,9 @@ y = np.expand_dims(np.array(y_padded), axis=-1)
 print("Shape X:", X.shape)
 print("Shape y:", y.shape)  # (batch_size, max_length, 1)

-# -----------------------
+
 # 5. Split data
-# -----------------------
+
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, random_state=42
 )
@@ -82,9 +82,9 @@ X_train, X_test, y_train, y_test = train_test_split(
 print("Train size:", X_train.shape, y_train.shape)
 print("Test size: ", X_test.shape, y_test.shape)

-# -----------------------
+
 # 6. Build the LSTM model
-# -----------------------
+
 # We stack two LSTMs, each with return_sequences=True,
 # so the final output is still a "sequence" (batch_size, max_length, hidden_dim)
 model = Sequential()
@@ -97,9 +97,9 @@ model.add(LSTM(128, return_sequences=True))
 # TimeDistributed Dense so the Dense layer is applied per timestep
 model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))

-# -----------------------
+
 # 7. Compile
-# -----------------------
+
 model.compile(
     loss="sparse_categorical_crossentropy",
     optimizer=Adam(learning_rate=0.001),
@@ -108,16 +108,16 @@ model.compile(
 model.summary()

-# -----------------------
+
 # 8. Train the model
-# -----------------------
+
 epochs = 10
 history = model.fit(
     X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), batch_size=32
 )

-# -----------------------
+
 # 9. Save the model
-# -----------------------
+
 model.save("lstm_question_generator.keras")
 print("Training finished and the model has been saved.")

test.py

@@ -1,68 +1,102 @@
 import numpy as np
-import re
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing.sequence import pad_sequences
+import json
 import pickle
+from tensorflow.keras.models import load_model
+from tensorflow.keras.preprocessing.sequence import pad_sequences

-# Suppose we have a tokenizer, model, and max_length:
-# tokenizer, model, max_length = ...
-# Make sure you load the model & tokenizer for your environment.

 def preprocess_text(text):
-    # Simple preprocessing function (must match or resemble the training one)
+    """ Simple text preprocessing (should match training preprocessing) """
     text = text.lower()
-    # Make further adjustments here if needed
     return text

 def generate_question(paragraph, tokenizer, model, max_length):
+    """ Generate a question based on the input paragraph """
     # 1) Preprocess paragraph
     paragraph = preprocess_text(paragraph)

     # 2) Tokenize
-    seq = tokenizer.texts_to_sequences([paragraph])  # the result is a list of lists
+    seq = tokenizer.texts_to_sequences([paragraph])

-    # 3) Pad sequence
     padded = pad_sequences(seq, maxlen=max_length, padding="post")

-    # 4) Get predictions from the model => shape: (1, max_length, vocab_size)
-    prediction = model.predict(padded)  # (1, max_length, vocab_size)
+    # 3) Get model predictions
+    prediction = model.predict(padded)

-    # 5) Take the argmax at each time step => (1, max_length)
+    # 4) Get the most likely word indices
     predicted_indices = np.argmax(prediction, axis=-1)[0]

-    # 6) Convert to words
+    # 5) Convert indices to words
     predicted_words = []
     for idx in predicted_indices:
-        # If idx = 0, it usually means an 'unknown' or 'pad' token, depending on the tokenizer settings
-        if idx == 0:
-            # Safe to break immediately, since the rest is probably padding
+        if idx == 0:  # Assuming 0 is padding or unknown token
             break
         word = tokenizer.index_word.get(idx, "")
         predicted_words.append(word)

-    # 7) Join into a single sentence
+    # 6) Form the final question
     predicted_question = " ".join(predicted_words)
-    # We could also append a question mark
     if not predicted_question.endswith("?"):
-        predicted_question = predicted_question + "?"
+        predicted_question += "?"
     return predicted_question

+def determine_question_type(paragraph):
+    """ Simple rule-based method to determine question type """
+    if "benar atau salah" in paragraph.lower() or "adalah" in paragraph.lower():
+        return "true_false"
+    elif "berapa" in paragraph.lower() or "mana" in paragraph.lower():
+        return "multiple_choice"
+    else:
+        return "fill_in_the_blank"
+
+def extract_possible_answer(paragraph):
+    """ Basic heuristic to extract an answer (first proper noun or keyword) """
+    words = paragraph.split()
+    if len(words) > 2:
+        return words[0] + " " + words[1]  # Return first two words as a basic approach
+    return words[0]
+
+def generate_options(question_type, answer):
+    """ Generate dummy options for multiple-choice questions """
+    if question_type == "multiple_choice":
+        return [answer, "Option B", "Option C", "Option D"]
+    elif question_type == "true_false":
+        return ["True", "False"]
+    else:
+        return ["-"]  # Placeholder for fill-in-the-blank
+
+# Load the trained model and tokenizer
 model = load_model("lstm_question_generator.keras")
 with open("tokenizer.pkl", "rb") as f:
     tokenizer = pickle.load(f)

-# Make sure max_length is the same as during training
-max_length = 50  # Or whatever value you set
+max_length = 50  # Ensure this is the same as in training

-paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum."
+# Sample paragraph input
+paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum"

-generated_q = generate_question(paragraph_input, tokenizer, model, max_length)
-print("Generated question:", generated_q)
+# Generate question
+generated_question = generate_question(paragraph_input, tokenizer, model, max_length)
+
+# Determine question type and answer
+question_type = determine_question_type(paragraph_input)
+answer = extract_possible_answer(paragraph_input)
+options = generate_options(question_type, answer)
+
+# Construct JSON output
+output = {
+    "paragraph": paragraph_input,
+    "question_type": question_type,
+    "question": generated_question,
+    "answer": answer,
+    "options": "|".join(options)  # Match dataset format
+}
+
+# Print JSON result
+print(json.dumps(output, indent=4, ensure_ascii=False))
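For reference, the rewritten test.py emits a single JSON object. With the sample paragraph above, determine_question_type matches none of its keywords ("benar atau salah", "adalah", "berapa", "mana") and falls through to "fill_in_the_blank", and extract_possible_answer returns the first two words, so the output has roughly this shape (the "question" value is illustrative only; it depends entirely on the trained model):

{
    "paragraph": "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum",
    "question_type": "fill_in_the_blank",
    "question": "<model-generated question>?",
    "answer": "Albert Einstein",
    "options": "-"
}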