fix: remove the comment
This commit is contained in: parent fb6ff8512b, commit ba8b5f7f8c
@@ -1 +0,0 @@
- ,akeon,fedora,05.02.2025 02:37,file:///home/akeon/.config/libreoffice/4;
36 main.py
@@ -15,18 +15,18 @@ from tensorflow.keras.layers import (
)
from tensorflow.keras.optimizers import Adam

# -----------------------
# 1. Load dataset
# -----------------------
df = pd.read_csv("quiz_questions.csv")

# Make sure the 'paragraph' and 'question' columns exist and are not empty
df.dropna(subset=["paragraph", "question"], inplace=True)

# -----------------------
# 2. Preprocessing text
# -----------------------

def preprocess_text(text):
    # Simple example preprocessing
    text = text.lower()
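
The code above only requires that quiz_questions.csv contain 'paragraph' and 'question' columns. A minimal illustrative row, with hypothetical content (any further columns, such as those echoed by test.py's JSON output, are not confirmed by this diff):

paragraph,question
"Albert Einstein mengembangkan teori relativitas.","Siapa yang mengembangkan teori relativitas?"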
@@ -36,9 +36,9 @@ def preprocess_text(text):
df["paragraph"] = df["paragraph"].astype(str).apply(preprocess_text)
df["question"] = df["question"].astype(str).apply(preprocess_text)

# -----------------------
# 3. Tokenization
# -----------------------
# Combine all text (paragraph + question) so the vocabulary covers words from both
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["paragraph"].tolist() + df["question"].tolist())
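
The hunks that follow refer to X_padded and y_padded, but the lines that build them fall outside the diff context. Assuming pad_sequences from tensorflow.keras.preprocessing.sequence is imported, the elided step presumably resembles this sketch (max_length = 50 is taken from test.py; the other names reuse what is visible in the diff):

X_seq = tokenizer.texts_to_sequences(df["paragraph"].tolist())
y_seq = tokenizer.texts_to_sequences(df["question"].tolist())
max_length = 50  # assumed; must match the value used at inference time in test.py
X_padded = pad_sequences(X_seq, maxlen=max_length, padding="post")
y_padded = pad_sequences(y_seq, maxlen=max_length, padding="post")
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding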
@@ -61,9 +61,9 @@ with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("Tokenizer saved to tokenizer.pkl")

# -----------------------
# 4. Prepare X, y
# -----------------------
# For sequence-to-sequence with "sparse_categorical_crossentropy",
# y should ideally have shape: (num_samples, max_length, 1)
X = np.array(X_padded)
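
The two comment lines above motivate the trailing axis added in the next hunk: with sparse_categorical_crossentropy the targets stay integer word ids, one per timestep, rather than one-hot vectors of width vocab_size. A quick shape check (dimensions illustrative):

import numpy as np
y_padded_demo = np.zeros((8, 50), dtype="int32")   # (num_samples, max_length)
y_demo = np.expand_dims(y_padded_demo, axis=-1)    # (num_samples, max_length, 1)
print(y_demo.shape)  # (8, 50, 1) -- one integer class id per timestep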
@@ -72,9 +72,9 @@ y = np.expand_dims(np.array(y_padded), axis=-1)
print("Shape X:", X.shape)
print("Shape y:", y.shape)  # (batch_size, max_length, 1)

# -----------------------
# 5. Split data
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
@@ -82,9 +82,9 @@ X_train, X_test, y_train, y_test = train_test_split(
print("Train size:", X_train.shape, y_train.shape)
print("Test size: ", X_test.shape, y_test.shape)

# -----------------------
# 6. Build LSTM Model
# -----------------------
# We stack two LSTMs, each with return_sequences=True,
# so the final output is still a sequence: (batch_size, max_length, hidden_dim)
model = Sequential()
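
The layer definitions between this hunk and the next fall outside the diff context; only model.add(LSTM(128, return_sequences=True)) is visible as context below. Based on that line and the comments above, the elided definition presumably resembles this sketch (the Embedding layer and its size are assumptions):

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128))  # assumed embedding layer
model.add(LSTM(128, return_sequences=True))  # visible as context in the next hunk
model.add(LSTM(128, return_sequences=True))  # second stacked LSTM per the comment above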
@@ -97,9 +97,9 @@ model.add(LSTM(128, return_sequences=True))
# TimeDistributed(Dense) applies the Dense layer to each timestep
model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))

# -----------------------
# 7. Compile
# -----------------------
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001),
@@ -108,16 +108,16 @@ model.compile(

model.summary()

# -----------------------
# 8. Train Model
# -----------------------
epochs = 10
history = model.fit(
    X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), batch_size=32
)

# -----------------------
# 9. Save Model
# -----------------------
model.save("lstm_question_generator.keras")
print("Training finished and the model has been saved.")
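
Training leaves behind the two artifacts that test.py consumes: lstm_question_generator.keras and tokenizer.pkl. A minimal smoke test for the pair, using only calls already present in the code:

import pickle
from tensorflow.keras.models import load_model

model = load_model("lstm_question_generator.keras")
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)
print(model.output_shape)  # expect (None, max_length, vocab_size)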
94 test.py
@@ -1,68 +1,102 @@
import numpy as np
import re
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import pickle


def preprocess_text(text):
    """ Simple text preprocessing (should match training preprocessing) """
    text = text.lower()
    return text


def generate_question(paragraph, tokenizer, model, max_length):
    """ Generate a question based on the input paragraph """
    # 1) Preprocess the paragraph
    paragraph = preprocess_text(paragraph)

    # 2) Tokenize and pad
    seq = tokenizer.texts_to_sequences([paragraph])
    padded = pad_sequences(seq, maxlen=max_length, padding="post")

    # 3) Get model predictions => shape: (1, max_length, vocab_size)
    prediction = model.predict(padded)

    # 4) Get the most likely word index at each timestep
    predicted_indices = np.argmax(prediction, axis=-1)[0]

    # 5) Convert indices to words
    predicted_words = []
    for idx in predicted_indices:
        if idx == 0:  # Assuming 0 is padding or unknown token
            break
        word = tokenizer.index_word.get(idx, "")
        predicted_words.append(word)

    # 6) Form the final question
    predicted_question = " ".join(predicted_words)
    if not predicted_question.endswith("?"):
        predicted_question += "?"

    return predicted_question


def determine_question_type(paragraph):
    """ Simple rule-based method to determine the question type """
    if "benar atau salah" in paragraph.lower() or "adalah" in paragraph.lower():
        return "true_false"
    elif "berapa" in paragraph.lower() or "mana" in paragraph.lower():
        return "multiple_choice"
    else:
        return "fill_in_the_blank"


def extract_possible_answer(paragraph):
    """ Basic heuristic to extract an answer (first proper noun or keyword) """
    words = paragraph.split()
    if len(words) > 2:
        return words[0] + " " + words[1]  # Return the first two words as a basic approach
    return words[0]


def generate_options(question_type, answer):
    """ Generate dummy options for multiple-choice questions """
    if question_type == "multiple_choice":
        return [answer, "Option B", "Option C", "Option D"]
    elif question_type == "true_false":
        return ["True", "False"]
    else:
        return ["-"]  # Placeholder for fill-in-the-blank
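
A quick check of the three rule-based helpers against the sample paragraph used below (outputs derived by tracing the code above, not from a run):

p = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum"
print(determine_question_type(p))  # 'fill_in_the_blank' -- no rule keyword matches
print(extract_possible_answer(p))  # 'Albert Einstein' -- the first two words
print(generate_options("fill_in_the_blank", "Albert Einstein"))  # ['-']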

# Load the trained model and tokenizer
model = load_model("lstm_question_generator.keras")

with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

max_length = 50  # Ensure this is the same as in training

# Sample paragraph input
paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum"

# Generate question
generated_question = generate_question(paragraph_input, tokenizer, model, max_length)

# Determine question type and answer
question_type = determine_question_type(paragraph_input)
answer = extract_possible_answer(paragraph_input)
options = generate_options(question_type, answer)

# Construct JSON output
output = {
    "paragraph": paragraph_input,
    "question_type": question_type,
    "question": generated_question,
    "answer": answer,
    "options": "|".join(options)  # Match the dataset format
}

# Print JSON result
print(json.dumps(output, indent=4, ensure_ascii=False))
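
For the sample paragraph above, tracing the rules gives question_type "fill_in_the_blank", answer "Albert Einstein", and options "-", so the printed JSON has this shape (the question text itself depends on the trained model and is shown as a placeholder):

{
    "paragraph": "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum",
    "question_type": "fill_in_the_blank",
    "question": "<model-generated question>?",
    "answer": "Albert Einstein",
    "options": "-"
}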