feat: clean up dataset and add new dataset

akhdanre 2025-06-04 21:26:04 +07:00
parent fc640c9017
commit 74b7dd177b
19 changed files with 1948 additions and 886 deletions

View File

@@ -1 +0,0 @@
,akeon,fedora,15.05.2025 10:47,file:///home/akeon/.config/libreoffice/4;

View File

@@ -23,101 +23,9 @@ import re
import string
from collections import Counter
# Sample data provided
# data = [
# {
# "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
# "tokens": [
# "raden", "ajeng", "kartini", "lahir", "pada", "21", "april", "1879", "di", "jepara"
# ],
# "ner": [
# "PER", "PER", "PER", "O", "O", "DATE", "DATE", "DATE", "O", "LOC"
# ],
# "srl": [
# "ARG0", "ARG0", "ARG0", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O", "ARGM-LOC"
# ],
# "qas": [
# {
# "type": "isian",
# "question": "Dimana kartini lahir ___",
# "answer": "jepara",
# "id": "qa_0_q1"
# },
# {
# "type": "true_false",
# "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
# "options": ["true", "false"],
# "answer": "false",
# "id": "qa_0_q2"
# }
# ]
# },
# {
# "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
# "tokens": [
# "kerajaan", "majapahit", "berdiri", "pada", "tahun", "1293", "di", "trowulan"
# ],
# "ner": [
# "O", "ORG", "O", "O", "O", "DATE", "O", "LOC"
# ],
# "srl": [
# "ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"
# ],
# "qas": [
# {
# "type": "opsi",
# "question": "Dimana kerajaan majapahit berdiri ___",
# "options": ["trowulan", "singasari", "kuta", "banten"],
# "answer": "trowulan",
# "id": "qa_1_q1"
# },
# {
# "type": "true_false",
# "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
# "options": ["true", "false"],
# "answer": "false",
# "id": "qa_1_q2"
# }
# ]
# },
# {
# "context": "soekarno dan mohammad hatta memproklamasikan kemerdekaan indonesia pada 17 agustus 1945",
# "tokens": [
# "soekarno", "dan", "mohammad", "hatta", "memproklamasikan", "kemerdekaan", "indonesia", "pada", "17", "agustus", "1945"
# ],
# "ner": [
# "PER", "O", "PER", "PER", "O", "O", "LOC", "O", "DATE", "DATE", "DATE"
# ],
# "srl": [
# "ARG0", "O", "ARG0", "ARG0", "V", "ARG1", "ARGM-LOC", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP"
# ],
# "qas": [
# {
# "type": "isian",
# "question": "Pada tanggal berapa kemerdekaan indonesia diproklamasikan ___",
# "answer": "17 agustus 1945",
# "id": "qa_2_q1"
# },
# {
# "type": "opsi",
# "question": "Siapa yang memproklamasikan kemerdekaan indonesia ___",
# "options": ["soekarno", "mohammad hatta", "sudirman", "ahmad yani"],
# "answer": "soekarno mohammad hatta",
# "id": "qa_2_q2"
# }
# ]
# }
# ]
with open("data_converted.json", "r") as f:
with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
data = json.load(f)
# # Save to a JSON file for future use
# with open('qa_dataset.json', 'w', encoding='utf-8') as f:
# json.dump(data, f, ensure_ascii=False, indent=2)
# Preprocessing function
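The commented-out examples above double as documentation of the per-item schema (context, tokens, ner, srl, qas) that the new stable_qg_qa_train_dataset.json is expected to follow. As a minimal sanity-check sketch, assuming only the path and field names visible in this commit (the check itself is illustrative, not part of the committed code):

import json

with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
    data = json.load(f)

required = {"context", "tokens", "ner", "srl", "qas"}
for i, item in enumerate(data):
    # every item must carry all five fields the pipeline reads
    missing = required - item.keys()
    assert not missing, f"item {i} missing fields: {missing}"
    # tokens, ner, and srl are parallel per-token annotations
    assert len(item["tokens"]) == len(item["ner"]) == len(item["srl"])
print(f"{len(data)} items passed the schema check")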

View File

@@ -23,22 +23,26 @@ max_token_len = tokenizer_data["max_token_len"]
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Load trained model
model = load_model("qa_lstm_model_final.h5")
model = load_model("qa_lstm_model_final.keras")
def preprocess_text(text):
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
def predict_answer(context, question, tokens, ner, srl, q_type):
context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
question_seq = tokenizer.texts_to_sequences([preprocess_text(question)])
token_seq = [tokenizer.texts_to_sequences([" ".join(tokens)])[0]]
ner_seq = [ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]]
srl_seq = [srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]]
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_cat = tf.keras.utils.to_categorical([q_type_idx], num_classes=q_type_vocab_size)
q_type_cat = tf.keras.utils.to_categorical(
[q_type_idx], num_classes=q_type_vocab_size
)
# Pad sequences
context_pad = pad_sequences(context_seq, maxlen=max_context_len, padding="post")
@@ -48,7 +52,9 @@ def predict_answer(context, question, tokens, ner, srl, q_type):
srl_pad = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")
# Predict
prediction = model.predict([context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat], verbose=0)
prediction = model.predict(
[context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat], verbose=0
)
answer_idx = np.argmax(prediction[0])
# Retrieve predicted answer word
@@ -58,6 +64,7 @@ def predict_answer(context, question, tokens, ner, srl, q_type):
return "Unknown"
def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
entities = {}
predicate = ""
@@ -84,8 +91,10 @@ def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
if "DATE" in entities:
original_date = " ".join(entities["DATE"])
try:
modified_year = str(int(entities['DATE'][-1]) + random.randint(1, 5))
modified_date = f"{entities['DATE'][0]} {entities['DATE'][1]} {modified_year}"
modified_year = str(int(entities["DATE"][-1]) + random.randint(1, 5))
modified_date = (
f"{entities['DATE'][0]} {entities['DATE'][1]} {modified_year}"
)
except:
modified_date = original_date # Fallback if parsing fails
return f"{subject} {predicate} pada {modified_date} ___", "false"
@@ -101,9 +110,10 @@ def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
return "Apa yang terjadi dalam teks ini ___", context
# ✅ Example Usage with Random Sampling
if __name__ == "__main__":
with open("data_converted.json", "r") as f:
with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
data = json.load(f)
# Randomly select an example for testing
@@ -116,7 +126,7 @@ if __name__ == "__main__":
test_item["tokens"],
test_item["ner"],
test_item["srl"],
test_qa["type"]
test_qa["type"],
)
print(f"Context: {test_item['context']}")

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -7,7 +7,7 @@ def normalize_question(text):
return text.capitalize()
# Load data
with open('../dataset/dev_dataset_qg.json', 'r', encoding='utf-8') as file:
with open('../dataset/stable_qg_qa_train_dataset.json', 'r', encoding='utf-8') as file:
data = json.load(file)
processed_data = []
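Only the tail of normalize_question is visible in this hunk (it ends in text.capitalize()). As a quick illustration of that built-in's behavior, independent of whatever the rest of the function body does:

# str.capitalize() upper-cases the first character and lower-cases the rest
assert "dimana kartini lahir ___".capitalize() == "Dimana kartini lahir ___"
assert "SIAPA yang memproklamasikan".capitalize() == "Siapa yang memproklamasikan"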

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@@ -146,7 +146,7 @@ class QuestionPredictionModel:
# Example usage
if __name__ == "__main__":
# Load test data
with open("data_converted.json", "r") as f:
with open("../dataset/conteks_question.json", "r") as f:
test_data = json.load(f)
# Initialize model
@@ -156,7 +156,7 @@ if __name__ == "__main__":
)
# Example single prediction
sample = test_data[520]
sample = test_data[1]
context = sample["context"]
tokens = sample["tokens"]
ner = sample["ner"]
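The hunk ends before the actual prediction call, so for orientation only, a hypothetical continuation; the variable name model and the method predict are guesses, not confirmed by this diff:

# Hypothetical continuation -- the real method name and signature are not
# visible in this hunk; adjust to match QuestionPredictionModel's API.
srl = sample["srl"]
question = model.predict(context, tokens, ner, srl)
print(f"Context:  {context}")
print(f"Generated question: {question}")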

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Before: Size 49 KiB

After: Size 52 KiB

Binary file not shown.

After: Size 51 KiB