feat: clean up dataset and add new dataset

akhdanre 2025-06-04 21:26:04 +07:00
parent fc640c9017
commit 74b7dd177b
19 changed files with 1948 additions and 886 deletions

View File

@@ -1 +0,0 @@
,akeon,fedora,15.05.2025 10:47,file:///home/akeon/.config/libreoffice/4;

View File

@@ -23,101 +23,9 @@ import re
import string
from collections import Counter
# Sample data provided
# data = [
# {
# "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
# "tokens": [
# "raden", "ajeng", "kartini", "lahir", "pada", "21", "april", "1879", "di", "jepara"
# ],
# "ner": [
# "PER", "PER", "PER", "O", "O", "DATE", "DATE", "DATE", "O", "LOC"
# ],
# "srl": [
# "ARG0", "ARG0", "ARG0", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O", "ARGM-LOC"
# ],
# "qas": [
# {
# "type": "isian",
# "question": "Dimana kartini lahir ___",
# "answer": "jepara",
# "id": "qa_0_q1"
# },
# {
# "type": "true_false",
# "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
# "options": ["true", "false"],
# "answer": "false",
# "id": "qa_0_q2"
# }
# ]
# },
# {
# "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
# "tokens": [
# "kerajaan", "majapahit", "berdiri", "pada", "tahun", "1293", "di", "trowulan"
# ],
# "ner": [
# "O", "ORG", "O", "O", "O", "DATE", "O", "LOC"
# ],
# "srl": [
# "ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"
# ],
# "qas": [
# {
# "type": "opsi",
# "question": "Dimana kerajaan majapahit berdiri ___",
# "options": ["trowulan", "singasari", "kuta", "banten"],
# "answer": "trowulan",
# "id": "qa_1_q1"
# },
# {
# "type": "true_false",
# "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
# "options": ["true", "false"],
# "answer": "false",
# "id": "qa_1_q2"
# }
# ]
# },
# {
# "context": "soekarno dan mohammad hatta memproklamasikan kemerdekaan indonesia pada 17 agustus 1945",
# "tokens": [
# "soekarno", "dan", "mohammad", "hatta", "memproklamasikan", "kemerdekaan", "indonesia", "pada", "17", "agustus", "1945"
# ],
# "ner": [
# "PER", "O", "PER", "PER", "O", "O", "LOC", "O", "DATE", "DATE", "DATE"
# ],
# "srl": [
# "ARG0", "O", "ARG0", "ARG0", "V", "ARG1", "ARGM-LOC", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP"
# ],
# "qas": [
# {
# "type": "isian",
# "question": "Pada tanggal berapa kemerdekaan indonesia diproklamasikan ___",
# "answer": "17 agustus 1945",
# "id": "qa_2_q1"
# },
# {
# "type": "opsi",
# "question": "Siapa yang memproklamasikan kemerdekaan indonesia ___",
# "options": ["soekarno", "mohammad hatta", "sudirman", "ahmad yani"],
# "answer": "soekarno mohammad hatta",
# "id": "qa_2_q2"
# }
# ]
# }
# ]
with open("data_converted.json", "r") as f:
with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
data = json.load(f)
# # Save to a JSON file for future use
# with open('qa_dataset.json', 'w', encoding='utf-8') as f:
# json.dump(data, f, ensure_ascii=False, indent=2)
# Preprocessing function
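The commented-out examples above double as documentation of the per-item schema (context, tokens, ner, srl, qas) that the new stable_qg_qa_train_dataset.json is expected to follow. As a minimal sanity-check sketch, assuming only the path and field names visible in this commit (the check itself is illustrative, not part of the committed code):

import json

with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
    data = json.load(f)

required = {"context", "tokens", "ner", "srl", "qas"}
for i, item in enumerate(data):
    # every item must carry all five fields the pipeline reads
    missing = required - item.keys()
    assert not missing, f"item {i} missing fields: {missing}"
    # tokens, ner, and srl are parallel per-token annotations
    assert len(item["tokens"]) == len(item["ner"]) == len(item["srl"])
print(f"{len(data)} items passed the schema check")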

View File

@@ -23,22 +23,26 @@ max_token_len = tokenizer_data["max_token_len"]
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Load trained model
model = load_model("qa_lstm_model_final.h5")
model = load_model("qa_lstm_model_final.keras")
def preprocess_text(text):
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
def predict_answer(context, question, tokens, ner, srl, q_type):
context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
question_seq = tokenizer.texts_to_sequences([preprocess_text(question)])
token_seq = [tokenizer.texts_to_sequences([" ".join(tokens)])[0]]
ner_seq = [ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]]
srl_seq = [srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]]
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_cat = tf.keras.utils.to_categorical([q_type_idx], num_classes=q_type_vocab_size)
q_type_cat = tf.keras.utils.to_categorical(
[q_type_idx], num_classes=q_type_vocab_size
)
# Pad sequences
context_pad = pad_sequences(context_seq, maxlen=max_context_len, padding="post")
@@ -48,7 +52,9 @@ def predict_answer(context, question, tokens, ner, srl, q_type):
srl_pad = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")
# Predict
prediction = model.predict([context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat], verbose=0)
prediction = model.predict(
[context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat], verbose=0
)
answer_idx = np.argmax(prediction[0])
# Retrieve predicted answer word
@@ -58,6 +64,7 @@ def predict_answer(context, question, tokens, ner, srl, q_type):
return "Unknown"
def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
entities = {}
predicate = ""
@@ -84,8 +91,10 @@ def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
if "DATE" in entities:
original_date = " ".join(entities["DATE"])
try:
modified_year = str(int(entities['DATE'][-1]) + random.randint(1, 5))
modified_date = f"{entities['DATE'][0]} {entities['DATE'][1]} {modified_year}"
modified_year = str(int(entities["DATE"][-1]) + random.randint(1, 5))
modified_date = (
f"{entities['DATE'][0]} {entities['DATE'][1]} {modified_year}"
)
except:
modified_date = original_date # Fallback if parsing fails
return f"{subject} {predicate} pada {modified_date} ___", "false"
@@ -101,9 +110,10 @@ def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
return "Apa yang terjadi dalam teks ini ___", context
# ✅ Example Usage with Random Sampling
if __name__ == "__main__":
with open("data_converted.json", "r") as f:
with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
data = json.load(f)
# Randomly select an example for testing
@@ -116,7 +126,7 @@ if __name__ == "__main__":
test_item["tokens"],
test_item["ner"],
test_item["srl"],
test_qa["type"]
test_qa["type"],
)
print(f"Context: {test_item['context']}")

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -7,7 +7,7 @@ def normalize_question(text):
return text.capitalize()
# Load data
with open('../dataset/dev_dataset_qg.json', 'r', encoding='utf-8') as file:
with open('../dataset/stable_qg_qa_train_dataset.json', 'r', encoding='utf-8') as file:
data = json.load(file)
processed_data = []
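Only the tail of normalize_question is visible in this hunk (it ends in text.capitalize()). As a quick illustration of that built-in's behavior, independent of whatever the rest of the function body does:

# str.capitalize() upper-cases the first character and lower-cases the rest
assert "dimana kartini lahir ___".capitalize() == "Dimana kartini lahir ___"
assert "SIAPA yang memproklamasikan".capitalize() == "Siapa yang memproklamasikan"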

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@@ -146,7 +146,7 @@ class QuestionPredictionModel:
# Example usage
if __name__ == "__main__":
# Load test data
with open("data_converted.json", "r") as f:
with open("../dataset/conteks_question.json", "r") as f:
test_data = json.load(f)
# Initialize model
@@ -156,7 +156,7 @@ if __name__ == "__main__":
)
# Example single prediction
sample = test_data[520]
sample = test_data[1]
context = sample["context"]
tokens = sample["tokens"]
ner = sample["ner"]
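The hunk ends before the actual prediction call, so for orientation only, a hypothetical continuation; the variable name model and the method predict are guesses, not confirmed by this diff:

# Hypothetical continuation -- the real method name and signature are not
# visible in this hunk; adjust to match QuestionPredictionModel's API.
srl = sample["srl"]
question = model.predict(context, tokens, ner, srl)
print(f"Context:  {context}")
print(f"Generated question: {question}")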

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Before: Size 49 KiB

After: Size 52 KiB

Binary file not shown.

After: Size 51 KiB