fix: remove the comment
This commit is contained in:
parent fb6ff8512b
commit ba8b5f7f8c
@@ -1 +0,0 @@
-,akeon,fedora,05.02.2025 02:37,file:///home/akeon/.config/libreoffice/4;
main.py (36 changed lines)
@@ -15,18 +15,18 @@ from tensorflow.keras.layers import (
 )
 from tensorflow.keras.optimizers import Adam
 
 # -----------------------
 # 1. Load dataset
 # -----------------------
 df = pd.read_csv("quiz_questions.csv")
 
 # Make sure the 'paragraph' and 'question' columns exist and are not empty
 df.dropna(subset=["paragraph", "question"], inplace=True)
 
 # -----------------------
 # 2. Preprocessing text
 # -----------------------
 
 def preprocess_text(text):
     # Simple preprocessing example
     text = text.lower()
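Note on the hunk above: training assumes quiz_questions.csv ships with at least the paragraph and question columns. A quick pre-flight check, as a sketch (not part of this commit):

    import pandas as pd

    df = pd.read_csv("quiz_questions.csv")
    # The training script depends on these two columns being present
    assert {"paragraph", "question"} <= set(df.columns), "missing required columns"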
@@ -36,9 +36,9 @@ def preprocess_text(text):
 df["paragraph"] = df["paragraph"].astype(str).apply(preprocess_text)
 df["question"] = df["question"].astype(str).apply(preprocess_text)
 
 # -----------------------
 # 3. Tokenization
 # -----------------------
 # Combine all text (paragraph+question) so the vocabulary covers words from both
 tokenizer = Tokenizer()
 tokenizer.fit_on_texts(df["paragraph"].tolist() + df["question"].tolist())
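The shared Tokenizer is fit on paragraphs and questions together so both sides encode against a single vocabulary. A minimal round-trip sketch with made-up strings:

    from tensorflow.keras.preprocessing.text import Tokenizer

    tok = Tokenizer()
    tok.fit_on_texts(["contoh paragraf", "contoh pertanyaan"])
    print(tok.word_index)                               # e.g. {'contoh': 1, 'paragraf': 2, 'pertanyaan': 3}
    print(tok.texts_to_sequences(["contoh paragraf"]))  # [[1, 2]]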
@@ -61,9 +61,9 @@ with open("tokenizer.pkl", "wb") as f:
     pickle.dump(tokenizer, f)
 print("Tokenizer saved to tokenizer.pkl")
 
 # -----------------------
 # 4. Prepare X, y
 # -----------------------
 # For sequence-to-sequence with "sparse_categorical_crossentropy",
 # ideally y has shape: (num_samples, max_length, 1)
 X = np.array(X_padded)
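test.py later unpickles this exact tokenizer.pkl; a sketch of verifying the round trip, assuming the paths used above:

    import pickle

    with open("tokenizer.pkl", "rb") as f:
        reloaded = pickle.load(f)
    # The reloaded tokenizer must reproduce the training-time encoding
    assert reloaded.word_index == tokenizer.word_index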
@@ -72,9 +72,9 @@ y = np.expand_dims(np.array(y_padded), axis=-1)
 print("Shape X:", X.shape)
 print("Shape y:", y.shape)  # (batch_size, max_length, 1)
 
 # -----------------------
 # 5. Split data
 # -----------------------
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, random_state=42
 )
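The expand_dims call visible in the hunk header adds the trailing dimension that sparse_categorical_crossentropy expects for per-timestep integer targets. A shape sketch with made-up sizes:

    import numpy as np

    y_padded = np.zeros((4, 50), dtype=int)  # (num_samples, max_length), illustrative sizes
    y = np.expand_dims(y_padded, axis=-1)
    print(y.shape)  # (4, 50, 1): one class id per timestep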
@@ -82,9 +82,9 @@ X_train, X_test, y_train, y_test = train_test_split(
 print("Train size:", X_train.shape, y_train.shape)
 print("Test size: ", X_test.shape, y_test.shape)
 
 # -----------------------
 # 6. Build Model LSTM
 # -----------------------
 # We stack two LSTMs, each with return_sequences=True,
 # so the final output is still a "sequence" (batch_size, max_length, hidden_dim)
 model = Sequential()
@@ -97,9 +97,9 @@ model.add(LSTM(128, return_sequences=True))
 # TimeDistributed Dense so the Dense layer is applied per timestep
 model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))
 
 # -----------------------
 # 7. Compile
 # -----------------------
 model.compile(
     loss="sparse_categorical_crossentropy",
     optimizer=Adam(learning_rate=0.001),
@@ -108,16 +108,16 @@ model.compile(
 
 model.summary()
 
 # -----------------------
 # 8. Train Model
 # -----------------------
 epochs = 10
 history = model.fit(
     X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), batch_size=32
 )
 
 # -----------------------
 # 9. Save Model
 # -----------------------
 model.save("lstm_question_generator.keras")
 print("Training finished and the model has been saved.")
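The hunks only show fragments of the model; pieced together it is presumably close to the sketch below. The Embedding layer and its sizes never appear in this diff, so they are assumptions:

    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128))  # assumed layer and sizes
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128, return_sequences=True))
    model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))
    # Output: (batch_size, max_length, vocab_size), matching y of shape (batch_size, max_length, 1)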
test.py (94 changed lines)
@@ -1,68 +1,102 @@
 import numpy as np
-import re
+import json
-from tensorflow.keras.models import load_model
 
-from tensorflow.keras.preprocessing.sequence import pad_sequences
 import pickle
+from tensorflow.keras.models import load_model
+from tensorflow.keras.preprocessing.sequence import pad_sequences
-# Suppose we have a tokenizer, model, and max_length:
-# tokenizer, model, max_length = ...
-# Make sure you load the model & tokenizer to match your environment.
 
 
 def preprocess_text(text):
-    # Simple preprocess function (must be the same as, or similar to, training)
+    """ Simple text preprocessing (should match training preprocessing) """
     text = text.lower()
-    # Make other adjustments here if needed
     return text
 
 
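Keeping the two copies of preprocess_text identical by hand is fragile. One way to share it, sketched with a hypothetical preprocessing.py module that is not part of this repo:

    # preprocessing.py (hypothetical shared module)
    def preprocess_text(text):
        # Lowercase only, mirroring what main.py does at training time
        return text.lower()

    # then in both main.py and test.py:
    # from preprocessing import preprocess_text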
 def generate_question(paragraph, tokenizer, model, max_length):
+    """ Generate a question based on the input paragraph """
     # 1) Preprocess paragraph
     paragraph = preprocess_text(paragraph)
 
     # 2) Tokenize
-    seq = tokenizer.texts_to_sequences([paragraph])  # the result is a list of lists
-    # 3) Pad sequence
+    seq = tokenizer.texts_to_sequences([paragraph])
     padded = pad_sequences(seq, maxlen=max_length, padding="post")
 
-    # 4) Get predictions from the model => shape: (1, max_length, vocab_size)
-    prediction = model.predict(padded)  # (1, max_length, vocab_size)
+    # 3) Get model predictions
+    prediction = model.predict(padded)
 
-    # 5) Take the argmax at every time step => (1, max_length)
+    # 4) Get the most likely word indices
     predicted_indices = np.argmax(prediction, axis=-1)[0]
 
-    # 6) Convert to words
+    # 5) Convert indices to words
     predicted_words = []
     for idx in predicted_indices:
-        # If idx = 0, it usually means the 'unknown' or 'pad' token, depending on the tokenizer settings
-        if idx == 0:
-            # Safe to break right away, since the rest is most likely padding
+        if idx == 0:  # Assuming 0 is padding or unknown token
             break
         word = tokenizer.index_word.get(idx, "")
         predicted_words.append(word)
 
-    # 7) Join everything into one sentence
+    # 6) Form the final question
     predicted_question = " ".join(predicted_words)
 
-    # We could also append a question mark
     if not predicted_question.endswith("?"):
-        predicted_question = predicted_question + "?"
+        predicted_question += "?"
 
     return predicted_question
 
 
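generate_question decodes greedily: np.argmax keeps only the single most likely word at each timestep. If more varied questions are wanted, temperature sampling is a common alternative; a sketch (not in this commit, names illustrative):

    def sample_indices(prediction, temperature=0.8):
        # prediction: (1, max_length, vocab_size) softmax output
        logits = np.log(prediction[0] + 1e-9) / temperature
        probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
        # Draw one word id per timestep instead of taking the argmax
        return [np.random.choice(len(p), p=p) for p in probs]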
+def determine_question_type(paragraph):
+    """ Simple rule-based method to determine question type """
+    if "benar atau salah" in paragraph.lower() or "adalah" in paragraph.lower():
+        return "true_false"
+    elif "berapa" in paragraph.lower() or "mana" in paragraph.lower():
+        return "multiple_choice"
+    else:
+        return "fill_in_the_blank"
+
+
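The keyword rules are Indonesian: "benar atau salah" is "true or false", "berapa" is "how many", "mana" is "which". Illustrative calls (the sentences are made up):

    print(determine_question_type("Benar atau salah: air mendidih pada 100 derajat"))  # true_false
    print(determine_question_type("Berapa jumlah planet di tata surya"))               # multiple_choice
    print(determine_question_type("Ibukota Jepang Tokyo"))                             # fill_in_the_blank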
+def extract_possible_answer(paragraph):
+    """ Basic heuristic to extract an answer (first proper noun or keyword) """
+    words = paragraph.split()
+    if len(words) > 2:
+        return words[0] + " " + words[1]  # Return first two words as a basic approach
+    return words[0]
+
+
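As written, extract_possible_answer raises an IndexError on an empty or whitespace-only paragraph (words[0] on an empty list), and returns only one word for two-word inputs. A defensive sketch, if that ever matters:

    def extract_possible_answer_safe(paragraph):
        words = paragraph.split()
        if not words:
            return ""  # Nothing to extract from an empty paragraph
        return " ".join(words[:2])  # At most the first two words, close to the original heuristic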
+def generate_options(question_type, answer):
+    """ Generate dummy options for multiple-choice questions """
+    if question_type == "multiple_choice":
+        return [answer, "Option B", "Option C", "Option D"]
+    elif question_type == "true_false":
+        return ["True", "False"]
+    else:
+        return ["-"]  # Placeholder for fill-in-the-blank
+
+
+# Load the trained model and tokenizer
 model = load_model("lstm_question_generator.keras")
 
 
 with open("tokenizer.pkl", "rb") as f:
     tokenizer = pickle.load(f)
 
-# Make sure max_length is the same as during training
-max_length = 50  # Or whatever value you set
+max_length = 50  # Ensure this is the same as in training
 
-paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum."
+# Sample paragraph input
+paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum"
 
-generated_q = generate_question(paragraph_input, tokenizer, model, max_length)
-print("Generated question:", generated_q)
+# Generate question
+generated_question = generate_question(paragraph_input, tokenizer, model, max_length)
 
+# Determine question type and answer
+question_type = determine_question_type(paragraph_input)
+answer = extract_possible_answer(paragraph_input)
+options = generate_options(question_type, answer)
+
+# Construct JSON output
+output = {
+    "paragraph": paragraph_input,
+    "question_type": question_type,
+    "question": generated_question,
+    "answer": answer,
+    "options": "|".join(options)  # Match dataset format
+}
+
+# Print JSON result
+print(json.dumps(output, indent=4, ensure_ascii=False))
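For the sample paragraph, the script would print JSON shaped roughly like this; the question value depends entirely on the trained model, so the one below is a placeholder, not real output:

    {
        "paragraph": "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum",
        "question_type": "fill_in_the_blank",
        "question": "<model-generated question>?",
        "answer": "Albert Einstein",
        "options": "-"
    }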