feat: clean up dataset and add new dataset
This commit is contained in:
parent
fc640c9017
commit
74b7dd177b
|
@ -1 +0,0 @@
|
|||
,akeon,fedora,15.05.2025 10:47,file:///home/akeon/.config/libreoffice/4;
|
|
@ -23,101 +23,9 @@ import re
|
|||
import string
|
||||
from collections import Counter
|
||||
|
||||
# Data contoh yang diberikan
|
||||
# data = [
|
||||
# {
|
||||
# "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
|
||||
# "tokens": [
|
||||
# "raden", "ajeng", "kartini", "lahir", "pada", "21", "april", "1879", "di", "jepara"
|
||||
# ],
|
||||
# "ner": [
|
||||
# "PER", "PER", "PER", "O", "O", "DATE", "DATE", "DATE", "O", "LOC"
|
||||
# ],
|
||||
# "srl": [
|
||||
# "ARG0", "ARG0", "ARG0", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O", "ARGM-LOC"
|
||||
# ],
|
||||
# "qas": [
|
||||
# {
|
||||
# "type": "isian",
|
||||
# "question": "Dimana kartini lahir ___",
|
||||
# "answer": "jepara",
|
||||
# "id": "qa_0_q1"
|
||||
# },
|
||||
# {
|
||||
# "type": "true_false",
|
||||
# "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
|
||||
# "options": ["true", "false"],
|
||||
# "answer": "false",
|
||||
# "id": "qa_0_q2"
|
||||
# }
|
||||
# ]
|
||||
# },
|
||||
# {
|
||||
# "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
|
||||
# "tokens": [
|
||||
# "kerajaan", "majapahit", "berdiri", "pada", "tahun", "1293", "di", "trowulan"
|
||||
# ],
|
||||
# "ner": [
|
||||
# "O", "ORG", "O", "O", "O", "DATE", "O", "LOC"
|
||||
# ],
|
||||
# "srl": [
|
||||
# "ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"
|
||||
# ],
|
||||
# "qas": [
|
||||
# {
|
||||
# "type": "opsi",
|
||||
# "question": "Dimana kerajaan majapahit berdiri ___",
|
||||
# "options": ["trowulan", "singasari", "kuta", "banten"],
|
||||
# "answer": "trowulan",
|
||||
# "id": "qa_1_q1"
|
||||
# },
|
||||
# {
|
||||
# "type": "true_false",
|
||||
# "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
|
||||
# "options": ["true", "false"],
|
||||
# "answer": "false",
|
||||
# "id": "qa_1_q2"
|
||||
# }
|
||||
# ]
|
||||
# },
|
||||
# {
|
||||
# "context": "soekarno dan mohammad hatta memproklamasikan kemerdekaan indonesia pada 17 agustus 1945",
|
||||
# "tokens": [
|
||||
# "soekarno", "dan", "mohammad", "hatta", "memproklamasikan", "kemerdekaan", "indonesia", "pada", "17", "agustus", "1945"
|
||||
# ],
|
||||
# "ner": [
|
||||
# "PER", "O", "PER", "PER", "O", "O", "LOC", "O", "DATE", "DATE", "DATE"
|
||||
# ],
|
||||
# "srl": [
|
||||
# "ARG0", "O", "ARG0", "ARG0", "V", "ARG1", "ARGM-LOC", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP"
|
||||
# ],
|
||||
# "qas": [
|
||||
# {
|
||||
# "type": "isian",
|
||||
# "question": "Pada tanggal berapa kemerdekaan indonesia diproklamasikan ___",
|
||||
# "answer": "17 agustus 1945",
|
||||
# "id": "qa_2_q1"
|
||||
# },
|
||||
# {
|
||||
# "type": "opsi",
|
||||
# "question": "Siapa yang memproklamasikan kemerdekaan indonesia ___",
|
||||
# "options": ["soekarno", "mohammad hatta", "sudirman", "ahmad yani"],
|
||||
# "answer": "soekarno mohammad hatta",
|
||||
# "id": "qa_2_q2"
|
||||
# }
|
||||
# ]
|
||||
# }
|
||||
# ]
|
||||
|
||||
with open("data_converted.json", "r") as f:
|
||||
with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
|
||||
data = json.load(f)
|
||||
|
||||
|
||||
|
||||
|
||||
# # Simpan ke file JSON untuk kebutuhan di masa depan
|
||||
# with read('qa_dataset.json', 'w', encoding='utf-8') as f:
|
||||
# json.dump(data, f, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
# Preprocessing function
|
||||
|
|
|
@ -23,22 +23,26 @@ max_token_len = tokenizer_data["max_token_len"]
|
|||
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
|
||||
|
||||
# Load trained model
|
||||
model = load_model("qa_lstm_model_final.h5")
|
||||
model = load_model("qa_lstm_model_final.keras")
|
||||
|
||||
|
||||
def preprocess_text(text):
|
||||
text = text.lower()
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text
|
||||
|
||||
|
||||
def predict_answer(context, question, tokens, ner, srl, q_type):
|
||||
context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
|
||||
question_seq = tokenizer.texts_to_sequences([preprocess_text(question)])
|
||||
token_seq = [tokenizer.texts_to_sequences([" ".join(tokens)])[0]]
|
||||
ner_seq = [ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]]
|
||||
srl_seq = [srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]]
|
||||
|
||||
|
||||
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
|
||||
q_type_cat = tf.keras.utils.to_categorical([q_type_idx], num_classes=q_type_vocab_size)
|
||||
q_type_cat = tf.keras.utils.to_categorical(
|
||||
[q_type_idx], num_classes=q_type_vocab_size
|
||||
)
|
||||
|
||||
# Pad sequences
|
||||
context_pad = pad_sequences(context_seq, maxlen=max_context_len, padding="post")
|
||||
|
@ -48,7 +52,9 @@ def predict_answer(context, question, tokens, ner, srl, q_type):
|
|||
srl_pad = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")
|
||||
|
||||
# Predict
|
||||
prediction = model.predict([context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat], verbose=0)
|
||||
prediction = model.predict(
|
||||
[context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat], verbose=0
|
||||
)
|
||||
answer_idx = np.argmax(prediction[0])
|
||||
|
||||
# Retrieve predicted answer word
|
||||
|
@ -58,6 +64,7 @@ def predict_answer(context, question, tokens, ner, srl, q_type):
|
|||
|
||||
return "Unknown"
|
||||
|
||||
|
||||
def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
|
||||
entities = {}
|
||||
predicate = ""
|
||||
|
@ -84,8 +91,10 @@ def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
|
|||
if "DATE" in entities:
|
||||
original_date = " ".join(entities["DATE"])
|
||||
try:
|
||||
modified_year = str(int(entities['DATE'][-1]) + random.randint(1, 5))
|
||||
modified_date = f"{entities['DATE'][0]} {entities['DATE'][1]} {modified_year}"
|
||||
modified_year = str(int(entities["DATE"][-1]) + random.randint(1, 5))
|
||||
modified_date = (
|
||||
f"{entities['DATE'][0]} {entities['DATE'][1]} {modified_year}"
|
||||
)
|
||||
except:
|
||||
modified_date = original_date # Fallback if parsing fails
|
||||
return f"{subject} {predicate} pada {modified_date} ___", "false"
|
||||
|
@ -101,9 +110,10 @@ def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
|
|||
|
||||
return "Apa yang terjadi dalam teks ini ___", context
|
||||
|
||||
|
||||
# ✅ Example Usage with Random Sampling
|
||||
if __name__ == "__main__":
|
||||
with open("data_converted.json", "r") as f:
|
||||
with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Randomly select an example for testing
|
||||
|
@ -116,7 +126,7 @@ if __name__ == "__main__":
|
|||
test_item["tokens"],
|
||||
test_item["ner"],
|
||||
test_item["srl"],
|
||||
test_qa["type"]
|
||||
test_qa["type"],
|
||||
)
|
||||
|
||||
print(f"Context: {test_item['context']}")
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -7,7 +7,7 @@ def normalize_question(text):
|
|||
return text.capitalize()
|
||||
|
||||
# Load data
|
||||
with open('../dataset/dev_dataset_qg.json', 'r', encoding='utf-8') as file:
|
||||
with open('../dataset/stable_qg_qa_train_dataset.json.json', 'r', encoding='utf-8') as file:
|
||||
data = json.load(file)
|
||||
|
||||
processed_data = []
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
@ -146,7 +146,7 @@ class QuestionPredictionModel:
|
|||
# Example usage
|
||||
if __name__ == "__main__":
|
||||
# Load test data
|
||||
with open("data_converted.json", "r") as f:
|
||||
with open("../dataset/conteks_question.json", "r") as f:
|
||||
test_data = json.load(f)
|
||||
|
||||
# Initialize model
|
||||
|
@ -156,7 +156,7 @@ if __name__ == "__main__":
|
|||
)
|
||||
|
||||
# Example single prediction
|
||||
sample = test_data[520]
|
||||
sample = test_data[1]
|
||||
context = sample["context"]
|
||||
tokens = sample["tokens"]
|
||||
ner = sample["ner"]
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
Before Width: | Height: | Size: 49 KiB After Width: | Height: | Size: 52 KiB |
Binary file not shown.
After Width: | Height: | Size: 51 KiB |
Loading…
Reference in New Issue