diff --git a/NER/lstm_ner_qc.py b/NER/lstm_ner_qc.py
new file mode 100644
index 0000000..50be877
--- /dev/null
+++ b/NER/lstm_ner_qc.py
@@ -0,0 +1,101 @@
+"""Train and evaluate a BiLSTM token classifier for named-entity recognition.
+
+Reads ``dataset/lstm_ner_dataset.json`` (records with ``tokens`` and ``labels``
+lists, assumed to be of equal length per record), prints label statistics,
+trains on the full dataset and writes the model and lookup tables to ``NER/``.
+"""
+
+import json
+from collections import Counter
+
+import numpy as np
+from keras.models import Sequential
+from keras.layers import (
+    Embedding,
+    LSTM,
+    Dense,
+    TimeDistributed,
+    Bidirectional,
+    InputLayer,
+)
+from keras.preprocessing.sequence import pad_sequences
+from keras.utils import to_categorical
+from seqeval.metrics import classification_report
+import pickle
+
+
+with open("dataset/lstm_ner_dataset.json", "r", encoding="utf-8") as f:
+    data = json.load(f)
+
+
+# Label distribution, printed as a quick sanity check of the dataset.
+label_counts = Counter(label for block in data for label in block["labels"])
+total_bLoc = label_counts["B-LOC"]
+total_o = label_counts["O"]
+total_b_per = label_counts["B-PER"]
+total_i_per = label_counts["I-PER"]
+
+print("Total B-LOC:", total_bLoc)
+print("Total O:", total_o)
+print("Total B-PER:", total_b_per)
+print("Total I-PER:", total_i_per)
+print("Total B-PER + I-PER:", total_b_per + total_i_per)
+
+sentences = [[token.lower() for token in item["tokens"]] for item in data]
+labels = [item["labels"] for item in data]
+
+
+# Sorted so that the index tables are identical on every run; the iteration
+# order of a set of strings changes between interpreter sessions.
+words = sorted({word for sentence in sentences for word in sentence})
+tags = sorted({tag for label_seq in labels for tag in label_seq})
+
+
+# Indices 0 and 1 are reserved for padding and out-of-vocabulary words.
+word2idx = {word: idx + 2 for idx, word in enumerate(words)}
+word2idx["PAD"] = 0
+word2idx["UNK"] = 1
+
+tag2idx = {tag: idx for idx, tag in enumerate(tags)}
+idx2tag = {i: t for t, i in tag2idx.items()}
+
+X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
+y = [[tag2idx[t] for t in ts] for ts in labels]
+
+# Pad to the longest sentence; padded label positions are filled with "O".
+maxlen = max(len(x) for x in X)
+X = pad_sequences(X, maxlen=maxlen, padding="post", value=word2idx["PAD"])
+y = pad_sequences(y, maxlen=maxlen, padding="post", value=tag2idx["O"])
+y = [to_categorical(seq, num_classes=len(tag2idx)) for seq in y]
+
+model = Sequential()
+model.add(InputLayer(input_shape=(maxlen,)))
+model.add(Embedding(input_dim=len(word2idx), output_dim=64))
+model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
+model.add(TimeDistributed(Dense(len(tag2idx), activation="softmax")))
+
+model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
+model.summary()
+
+model.fit(X, np.array(y), batch_size=2, epochs=10)
+
+model.save("NER/ner_bilstm_model.keras")
+
+
+with open("NER/word2idx.pkl", "wb") as f:
+    pickle.dump(word2idx, f)
+
+with open("NER/tag2idx.pkl", "wb") as f:
+    pickle.dump(tag2idx, f)
+
+
+# NOTE: the report below is computed on the training data itself, so it is a
+# sanity check, not a generalisation estimate. Padding positions are cut off
+# so that only real tokens are scored.
+y_true = labels
+y_pred = [
+    [idx2tag[int(np.argmax(token))] for token in seq[: len(gold)]]
+    for seq, gold in zip(model.predict(X), labels)
+]
+
+print(classification_report(y_true, y_pred))
diff --git a/NER/ner_bilstm_model.keras b/NER/ner_bilstm_model.keras
new file mode 100644
index 0000000..5932c29
Binary files /dev/null and b/NER/ner_bilstm_model.keras differ
diff --git a/NER/tag2idx.pkl b/NER/tag2idx.pkl
new file mode 100644
index 0000000..978ff9d
Binary files /dev/null and b/NER/tag2idx.pkl differ
diff --git a/NER/test_ner.py b/NER/test_ner.py
new file mode 100644
index 0000000..52c850b
--- /dev/null
+++ b/NER/test_ner.py
@@ -0,0 +1,63 @@
+"""Load the trained BiLSTM NER model and tag a sample sentence."""
+
+import json
+import numpy as np
+import pickle
+
+from keras.models import load_model
+from keras.preprocessing.sequence import pad_sequences
+
+model = load_model("NER/ner_bilstm_model.keras")
+
+# NOTE: pickle.load must only be used on trusted files; these two lookup
+# tables are expected to be the ones written by NER/lstm_ner_qc.py.
+with open("NER/word2idx.pkl", "rb") as f:
+    word2idx = pickle.load(f)
+
+with open("NER/tag2idx.pkl", "rb") as f:
+    tag2idx = pickle.load(f)
+
+idx2tag = {i: t for t, i in tag2idx.items()}
+
+# The training script sizes the input layer to its longest training sentence,
+# so the length is read back from the model rather than hard-coded. 100 (the
+# previous constant) is only a fallback for a model without a fixed length.
+maxlen = model.input_shape[1] or 100
+
+
+def predict_sentence(sentence):
+    """Print one NER tag per whitespace token and return the tagged pairs.
+
+    Args:
+        sentence: Raw text. It is split on whitespace and lower-cased for the
+            vocabulary lookup; unknown words map to ``UNK``.
+
+    Returns:
+        A list of ``(token, tag)`` tuples. Tokens beyond ``maxlen`` are dropped
+        because the model input cannot hold them.
+    """
+    # Truncate up front: pad_sequences drops leading items by default, which
+    # would misalign the remaining tokens with their predictions.
+    tokens = sentence.strip().split()[:maxlen]
+    x = [word2idx.get(w.lower(), word2idx["UNK"]) for w in tokens]
+    x = pad_sequences([x], maxlen=maxlen, padding="post", value=word2idx["PAD"])
+
+    preds = model.predict(x)
+    pred_labels = np.argmax(preds[0], axis=-1)
+    tagged = [
+        (token, idx2tag[int(label_idx)])
+        for token, label_idx in zip(tokens, pred_labels)
+    ]
+
+    print("Hasil prediksi NER:")
+    for token, tag in tagged:
+        print(f"{token}\t{tag}")
+    return tagged
+
+
+if __name__ == "__main__":
+    try:
+        sentence = "dani datang ke indonesia"
+        predict_sentence(sentence)
+    except KeyboardInterrupt:
+        print("\n\nSelesai.")
diff --git a/NER/word2idx.pkl b/NER/word2idx.pkl
new file mode 100644
index 0000000..fc7e43e
Binary files /dev/null and b/NER/word2idx.pkl differ
diff --git a/combine_nlp_lstm.py b/combine_nlp_lstm.py
new file mode 100644
index 0000000..44acbca
--- /dev/null
+++ b/combine_nlp_lstm.py
@@ -0,0 +1,171 @@
+"""Question-generation demo: spaCy NER, an LSTM seq2seq model and T5.
+
+Importing this module loads the spaCy pipeline and the T5 weights and builds an
+LSTM encoder-decoder that is never trained in this file.
+"""
+
+from types import SimpleNamespace
+
+import numpy as np
+import tensorflow as tf
+import spacy
+import nltk
+from nltk.translate.bleu_score import sentence_bleu
+from tensorflow.keras.layers import LSTM, Embedding, Dense, Input
+from tensorflow.keras.models import Model
+from transformers import TFT5ForConditionalGeneration, T5Tokenizer
+
+# === LOAD NLP MODEL ===
+# NOTE(review): en_core_web_sm is an English pipeline while the sample text in
+# the main block is Indonesian - confirm this is the intended model.
+nlp = spacy.load("en_core_web_sm")
+
+
+# === PREPROCESSING FUNCTION ===
+def preprocess_text(text):
+    """Run spaCy named-entity recognition on *text* and print the entities.
+
+    Returns:
+        A dict mapping each entity's text to its spaCy label. An entity text
+        that occurs more than once keeps only the label of its last mention.
+    """
+    doc = nlp(text)
+    entities = {ent.text: ent.label_ for ent in doc.ents}
+
+    # Print the named-entity recognition result
+    print("\nNamed Entities Detected:")
+    for ent, label in entities.items():
+        print(f"{ent}: {label}")
+
+    return entities
+
+
+# === LSTM MODEL (SEQUENCE-TO-SEQUENCE) ===
+embedding_dim = 128
+lstm_units = 256
+vocab_size = 5000  # Adjust to the dataset
+
+# Encoder
+encoder_inputs = Input(shape=(None,))
+encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
+encoder_lstm = LSTM(lstm_units, return_state=True)
+encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
+
+# Decoder
+decoder_inputs = Input(shape=(None,))
+decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
+decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
+decoder_outputs, _, _ = decoder_lstm(
+    decoder_embedding, initial_state=[state_h, state_c]
+)
+decoder_dense = Dense(vocab_size, activation="softmax")
+output = decoder_dense(decoder_outputs)
+
+# Model
+lstm_model = Model([encoder_inputs, decoder_inputs], output)
+lstm_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
+
+
+# === FUNCTION TO GENERATE QUESTION USING LSTM ===
+def generate_question_lstm(
+    text, model, tokenizer, max_len=20, start_word="<start>", end_word="<end>"
+):
+    """Greedily decode a question for *text* with the LSTM seq2seq model.
+
+    Args:
+        text: Source passage to encode.
+        model: Keras model taking ``[encoder_ids, decoder_ids]`` and returning
+            per-step vocabulary scores.
+        tokenizer: Object with ``texts_to_sequences``, ``word_index`` and
+            ``index_word`` attributes (the Keras ``Tokenizer`` interface).
+        max_len: Encoder padding length and maximum number of decoding steps.
+        start_word: ``word_index`` key of the token that starts decoding.
+        end_word: ``word_index`` key of the token that stops decoding.
+
+    Returns:
+        The generated words joined by single spaces; ids without an
+        ``index_word`` entry are skipped.
+    """
+    input_seq = tokenizer.texts_to_sequences([text])
+    input_seq = tf.keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len)
+
+    # NOTE(review): the original looked up an empty-string key for both
+    # markers, presumably "<start>"/"<end>" lost in transit - confirm. Ids 1
+    # and 2 remain the fallbacks when the words are not in the vocabulary.
+    start_token = tokenizer.word_index.get(start_word, 1)
+    end_token = tokenizer.word_index.get(end_word, 2)
+
+    # The model keeps no state between predict() calls, so the whole decoded
+    # prefix is fed back at every step as a (1, steps) batch.
+    decoded_ids = [start_token]
+    generated_question = []
+    for _ in range(max_len):
+        output = model.predict([input_seq, np.array([decoded_ids])])
+        next_word = int(np.argmax(output[0, -1, :]))
+        if next_word == end_token:
+            break
+        decoded_ids.append(next_word)
+        word = tokenizer.index_word.get(next_word, "")
+        if word:
+            generated_question.append(word)
+
+    return " ".join(generated_question)
+
+
+# === T5 TRANSFORMER MODEL (TENSORFLOW VERSION) ===
+t5_model_name = "t5-small"
+t5_model = TFT5ForConditionalGeneration.from_pretrained(t5_model_name)
+t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
+
+
+def generate_question_t5(text):
+    """Generate a question for *text* with the TensorFlow T5 model."""
+    input_text = "generate question: " + text
+    input_ids = t5_tokenizer.encode(
+        input_text, return_tensors="tf"
+    )  # Use TensorFlow tensors
+    output = t5_model.generate(input_ids, max_length=50)
+    return t5_tokenizer.decode(output[0], skip_special_tokens=True)
+
+
+# === BLEU SCORE EVALUATION ===
+def evaluate_bleu(reference, candidate):
+    """Print and return the sentence-level BLEU score of *candidate*.
+
+    Both strings are tokenised by whitespace; *reference* is the single
+    reference sentence.
+    """
+    score = sentence_bleu([reference.split()], candidate.split())
+    print(f"BLEU Score: {score:.4f}")
+    return score
+
+
+# === MAIN EXECUTION ===
+if __name__ == "__main__":
+    paragraph = "Albert Einstein mengembangkan teori relativitas pada tahun 1905."
+
+    # Preprocessing
+    print("\n🛠️ Preprocessing text...")
+    entities = preprocess_text(paragraph)
+
+    # Generate a question with the LSTM
+    print("\n🔵 Generating Question using LSTM (Dummy Model)...")
+    # generate_question_lstm reads attributes, which a plain dict does not
+    # provide, so the stand-in tokenizer is a namespace object.
+    dummy_tokenizer = SimpleNamespace(
+        texts_to_sequences=lambda x: [[1, 2, 3, 4]],
+        word_index={},
+        index_word={1: "apa", 2: "siapa", 3: "di", 4: "tahun"},
+    )
+    question_lstm = generate_question_lstm(paragraph, lstm_model, dummy_tokenizer)
+    print(f"LSTM Generated Question: {question_lstm}")
+
+    # Generate a question with T5
+    print("\n🟢 Generating Question using T5 Transformer...")
+    question_t5 = generate_question_t5(paragraph)
+    print(f"T5 Generated Question: {question_t5}")
+
+    # Evaluate the BLEU score
+    reference_question = "Kapan teori relativitas dikembangkan?"
+    print("\n📊 Evaluating BLEU Score...")
+    evaluate_bleu(reference_question, question_t5)
diff --git a/dataset/README.md b/dataset/README.md
new file mode 100644
index 0000000..540cb3a
--- /dev/null
+++ b/dataset/README.md
@@ -0,0 +1,34 @@
+NER
+
+B-PER -> person kata awal
+I-PER -> person kata tengah dan akhir
+B-LOC -> awal dari entitas lokasi
+I-LOC -> tengah dan akhir dari entitas lokasi
+B-ORG -> awal dari entitas organisasi
+I-ORG -> tengah dan akhir dari entitas organisasi
+B-MISC -> awal dari entitas lain lain Miscellaneous
+I-MISC -> Lanjutan dari entitas lain-lain
+B-DATE -> tanggal
+B-TIME -> waktu
+O -> token luar entitas
+
+Semantic Role Labeling (SRL)
+ARG0 -> Agen (pelaku) – biasanya subjek
+ARG1 -> Pasien atau tema – objek/yang dikenai aksi
+ARG2 -> Arah, tujuan, hasil
+ARG3 -> Lokasi awal (sumber)
+ARG4 -> Penerima, tujuan akhir
+ARG5 -> Alat atau instrumen
+
+ARGM-TMP -> Waktu (Temporal)
+ARGM-LOC -> Lokasi (Spatial)
+ARGM-MNR -> Cara (Manner)
+ARGM-CAU -> Penyebab (Cause)
+ARGM-EXT -> Derajat atau perbandingan (Extent)
+ARGM-DIS -> Diskursus (Discourse) seperti “tetapi”
+ARGM-NEG -> Negasi (Negation), misal "tidak"
+ARGM-MOD -> Modality: bisa, harus, mungkin
+ARGM-PRP -> Tujuan (Purpose)
+ARGM-REC ->
Penerima (Recipient, kadang mirip ARG4) +ARGM-COM -> Komitatif (dengan siapa) +ARGM-ADV -> Modifikasi umum diff --git a/dataset/dataset_combination.json b/dataset/dataset_combination.json new file mode 100644 index 0000000..7b6ea13 --- /dev/null +++ b/dataset/dataset_combination.json @@ -0,0 +1,56 @@ +[ + { + "tokens": [ + "Barack", + "Obama", + "melihat", + "bank", + "di", + "tepi", + "sungai", + "." + ], + "ner_labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O"], + "srl_labels": [ + "B-ARG0", + "I-ARG0", + "B-V", + "B-ARG1", + "B-ARGM-LOC", + "I-ARGM-LOC", + "I-ARGM-LOC", + "O" + ], + "predicate": "melihat", + "wsd_targets": [ + { + "index": 3, + "word": "bank", + "sense": "river_bank", + "sense_id": "bank%1:17:00::" + } + ] + }, + { + "tokens": ["Dia", "pergi", "ke", "bank", "untuk", "menabung", "."], + "ner_labels": ["O", "O", "O", "O", "O", "O", "O"], + "srl_labels": [ + "B-ARG0", + "B-V", + "B-ARGM-DIR", + "I-ARGM-DIR", + "B-ARGM-PRP", + "I-ARGM-PRP", + "O" + ], + "predicate": "pergi", + "wsd_targets": [ + { + "index": 3, + "word": "bank", + "sense": "financial_institution", + "sense_id": "bank%1:14:00::" + } + ] + } +] diff --git a/dataset/dataset_ner_srl.json b/dataset/dataset_ner_srl.json new file mode 100644 index 0000000..9c5ee96 --- /dev/null +++ b/dataset/dataset_ner_srl.json @@ -0,0 +1,2018 @@ +[ + { + "tokens": ["Barack", "Obama", "adalah", "kanselir", "asal", "Hawaii"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"], + "labels_srl": [] + }, + { + "tokens": ["Greta", "Thunberg", "adalah", "pemain bola", "asal", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "datang", "dari", "Amerika"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Indonesia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "datang", "dari", "Indonesia"], + "labels_ner": ["B-PER", 
"I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Cristiano", + "Ronaldo", + "adalah", + "pemain bola", + "asal", + "Inggris" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "lahir", "di", "Kanada"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "adalah", "kanselir", "asal", "Jerman"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "pernah", "tinggal", "di", "Italia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "datang", "dari", "Brazil"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "lahir", "di", "Indonesia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Cristiano", "Ronaldo", "adalah", "presiden", "asal", "Jerman"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "pernah", "tinggal", "di", "Amerika"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Joko", + "Widodo", + "bekerja", + "sebagai", + "presiden", + "di", + "Kanada" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Angela", + "Merkel", + "bekerja", + "sebagai", + "ilmuwan", + "di", + "Indonesia" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "lahir", "di", "Kanada"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "datang", "dari", "Perancis"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "lahir", "di", "Jerman"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Cristiano", + "Ronaldo", + "adalah", + "ilmuwan", + "asal", + "Indonesia" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] 
+ }, + { + "tokens": ["Angela", "Merkel", "datang", "dari", "Amerika"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "pernah", "tinggal", "di", "Jerman"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "adalah", "ilmuwan", "asal", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "adalah", "aktivis", "asal", "Perancis"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "pernah", "tinggal", "di", "Italia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "datang", "dari", "Hawaii"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "pernah", "tinggal", "di", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Angela", + "Merkel", + "bekerja", + "sebagai", + "penyanyi", + "di", + "Brazil" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "adalah", "aktivis", "asal", "Spanyol"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Emma", + "Watson", + "bekerja", + "sebagai", + "ilmuwan", + "di", + "Italia" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Italia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "adalah", "presiden", "asal", "Jerman"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Spanyol"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "pernah", "tinggal", "di", "Italia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "datang", 
"dari", "Indonesia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "datang", "dari", "Kanada"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "datang", "dari", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Cristiano", "Ronaldo", "lahir", "di", "Brazil"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Elon", + "Musk", + "bekerja", + "sebagai", + "aktivis", + "di", + "Spanyol" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "pernah", "tinggal", "di", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "datang", "dari", "Italia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "datang", "dari", "Kanada"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "pernah", "tinggal", "di", "Amerika"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "lahir", "di", "Kanada"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "lahir", "di", "Jerman"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "pernah", "tinggal", "di", "Hawaii"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Barack", + "Obama", + "bekerja", + "sebagai", + "kanselir", + "di", + "Kanada" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "datang", "dari", "Hawaii"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Indonesia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "lahir", 
"di", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "datang", "dari", "Perancis"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "adalah", "penyanyi", "asal", "Brazil"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "adalah", "aktivis", "asal", "Amerika"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "datang", "dari", "Kanada"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "adalah", "penyanyi", "asal", "Indonesia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "lahir", "di", "Indonesia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "pernah", "tinggal", "di", "Hawaii"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "adalah", "pemain bola", "asal", "Italia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "adalah", "pemain bola", "asal", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Taylor", + "Swift", + "bekerja", + "sebagai", + "aktivis", + "di", + "Brazil" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "datang", "dari", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Joe", + "Biden", + "bekerja", + "sebagai", + "pemain bola", + "di", + "Perancis" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "adalah", "aktor", "asal", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "adalah", "ilmuwan", "asal", 
"Indonesia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "adalah", "kanselir", "asal", "Hawaii"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "adalah", "penyanyi", "asal", "Perancis"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Elon", + "Musk", + "bekerja", + "sebagai", + "penyanyi", + "di", + "Perancis" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "adalah", "aktivis", "asal", "Brazil"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "datang", "dari", "Amerika"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "datang", "dari", "Brazil"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "adalah", "presiden", "asal", "Amerika"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "datang", "dari", "Brazil"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "adalah", "ilmuwan", "asal", "Indonesia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "lahir", "di", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Joko", + "Widodo", + "bekerja", + "sebagai", + "ilmuwan", + "di", + "Italia" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "adalah", "aktivis", "asal", "Perancis"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "pernah", "tinggal", "di", "Jerman"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "pernah", "tinggal", "di", "Spanyol"], + "labels_ner": 
["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "adalah", "kanselir", "asal", "Spanyol"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "lahir", "di", "Kanada"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "datang", "dari", "Italia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Cristiano", + "Ronaldo", + "bekerja", + "sebagai", + "aktivis", + "di", + "Spanyol" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "datang", "dari", "Amerika"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "adalah", "kanselir", "asal", "Spanyol"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "pernah", "tinggal", "di", "Jerman"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "lahir", "di", "Indonesia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "adalah", "penyanyi", "asal", "Kanada"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Cristiano", "Ronaldo", "datang", "dari", "Hawaii"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "adalah", "pemain bola", "asal", "Italia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "lahir", "di", "Perancis"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Elon", + "Musk", + "bekerja", + "sebagai", + "kanselir", + "di", + "Jerman" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "pernah", "tinggal", "di", "Brazil"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": 
["Lionel", "Messi", "pernah", "tinggal", "di", "Spanyol"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "datang", "dari", "Indonesia"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "adalah", "aktivis", "asal", "Perancis"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "datang", "dari", "Spanyol"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "pernah", "tinggal", "di", "Jerman"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "pernah", "tinggal", "di", "Jerman"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "lahir", "di", "Inggris"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "pernah", "tinggal", "di", "Spanyol"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Cristiano", "Ronaldo", "lahir", "di", "Brazil"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + + { + "tokens": ["Nadiem", "Makarim", "lahir", "di", "Bandung"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Bandung"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "datang", "dari", "Yogyakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "pernah", "tinggal", "di", "Makassar"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Budi", + "Gunadi", + "bekerja", + "sebagai", + "artis", + "di", + "Bandung" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "bekerja", "sebagai", "artis", "di", "Padang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", 
"O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Makassar"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "datang", "dari", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Ridwan", + "Kamil", + "bekerja", + "sebagai", + "menteri", + "di", + "Semarang" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "adalah", "penulis", "asal", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "datang", "dari", "Palembang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Semarang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Prabowo", + "Subianto", + "bekerja", + "sebagai", + "aktivis", + "di", + "Semarang" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "adalah", "penulis", "asal", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "pernah", "tinggal", "di", "Semarang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Susi", + "Pudjiastuti", + "adalah", + "walikota", + "asal", + "Yogyakarta" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "pernah", "tinggal", "di", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "datang", "dari", "Palembang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "adalah", "penulis", "asal", "Bali"], + "labels_ner": 
["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Susi", + "Pudjiastuti", + "bekerja", + "sebagai", + "dosen", + "di", + "Medan" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "adalah", "walikota", "asal", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Prabowo", + "Subianto", + "bekerja", + "sebagai", + "menteri", + "di", + "Palembang" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "adalah", "artis", "asal", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "adalah", "walikota", "asal", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Palembang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "pernah", "tinggal", "di", "Palembang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "datang", "dari", "Makassar"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "lahir", "di", "Padang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "adalah", "dosen", "asal", "Semarang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "datang", "dari", "Medan"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Palembang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "adalah", "menteri", "asal", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Nadiem", + "Makarim", + "bekerja", + "sebagai", + "menteri", + "di", + "Palembang" + ], + 
"labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "adalah", "walikota", "asal", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "pernah", "tinggal", "di", "Medan"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "lahir", "di", "Padang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "adalah", "dosen", "asal", "Medan"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "lahir", "di", "Surabaya"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Sri", + "Mulyani", + "bekerja", + "sebagai", + "walikota", + "di", + "Surabaya" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "adalah", "menteri", "asal", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "pernah", "tinggal", "di", "Medan"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "datang", "dari", "Bandung"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "datang", "dari", "Semarang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "lahir", "di", "Surabaya"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "lahir", "di", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "datang", "dari", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "adalah", "penulis", "asal", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", 
"datang", "dari", "Semarang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Prabowo", + "Subianto", + "adalah", + "penulis", + "asal", + "Yogyakarta" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "pernah", "tinggal", "di", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "datang", "dari", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Semarang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Maudy", + "Ayunda", + "bekerja", + "sebagai", + "penulis", + "di", + "Semarang" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "adalah", "walikota", "asal", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "adalah", "walikota", "asal", "Padang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Semarang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Maudy", + "Ayunda", + "bekerja", + "sebagai", + "dosen", + "di", + "Yogyakarta" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "lahir", "di", "Yogyakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "pernah", "tinggal", "di", "Makassar"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Joko", + "Widodo", + "bekerja", + "sebagai", + "gubernur", + "di", + "Yogyakarta" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "adalah", "aktivis", "asal", "Yogyakarta"], + "labels_ner": ["B-PER", "I-PER", 
"O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "datang", "dari", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "datang", "dari", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "datang", "dari", "Makassar"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Surabaya"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "lahir", "di", "Padang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "datang", "dari", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "datang", "dari", "Semarang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "datang", "dari", "Semarang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "adalah", "dosen", "asal", "Bandung"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "datang", "dari", "Padang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "lahir", "di", "Makassar"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "datang", "dari", "Palembang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Maudy", + "Ayunda", + "bekerja", + "sebagai", + "artis", + "di", + "Bandung" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "lahir", "di", "Medan"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "pernah", "tinggal", "di", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": 
["Joko", "Widodo", "pernah", "tinggal", "di", "Bali"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "pernah", "tinggal", "di", "Bandung"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "datang", "dari", "Bandung"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "pernah", "tinggal", "di", "Bandung"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Ganjar", + "Pranowo", + "bekerja", + "sebagai", + "pengusaha", + "di", + "Makassar" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "adalah", "dosen", "asal", "Makassar"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "lahir", "di", "Medan"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "lahir", "di", "Jakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Budi", + "Gunadi", + "bekerja", + "sebagai", + "penulis", + "di", + "Makassar" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "lahir", "di", "Yogyakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Maudy", + "Ayunda", + "bekerja", + "sebagai", + "penulis", + "di", + "Yogyakarta" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "datang", "dari", "Surabaya"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "bekerja", "sebagai", "artis", "di", "Medan"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "lahir", "di", "Bandung"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", 
"Shihab", "adalah", "aktivis", "asal", "Semarang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "pernah", "tinggal", "di", "Bandung"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "pernah", "tinggal", "di", "Yogyakarta"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Budi", + "Gunadi", + "bekerja", + "sebagai", + "gubernur", + "di", + "Semarang" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "pernah", "tinggal", "di", "Padang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Budi", + "Gunadi", + "bekerja", + "sebagai", + "pengusaha", + "di", + "Bandung" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "datang", "dari", "Surabaya"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Nadiem", + "Makarim", + "bekerja", + "sebagai", + "aktivis", + "di", + "Bali" + ], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "adalah", "aktivis", "asal", "Palembang"], + "labels_ner": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["jakarta", "adalah", "ibu", "kota", "Indonesia"], + "labels_ner": ["B-LOC", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "presiden", + "indonesia", + "saat", + "ini", + "adalah", + "prabowo", + "subianto" + ], + "labels_ner": ["O", "B-LOC", "O", "O", "O", "B-PER", "I-PER"] + }, + { + "tokens": ["dani", "datang", "dari", "jakarta"], + "labels_ner": ["B-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["dani", "pergi", "ke", "surabaya"], + "labels_ner": ["B-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Arti", + "penting", + "dari", + "pembelajaran", + "tentang", + "sejarah", + "kehidupan", + "zaman", + "praaksara" + ], + "labels_ner": 
["O", "O", "O", "O", "O", "O", "O", "B-TIME", "I-TIME"] + }, + { + "tokens": [ + "pertama-tama", + "adalah", + "kesadaran", + "akan", + "asal", + "usul", + "manusia" + ], + "labels_ner": ["O", "O", "O", "O", "O", "O", "O"] + }, + { + "tokens": ["Tumbuhan", "memiliki", "akar"], + "labels_ner": ["O", "O", "O"] + }, + { + "tokens": [ + "Semakin", + "tinggi", + "tumbuhan", + "itu", + "semakin", + "dalam", + "pula", + "akarnya", + "menghunjam", + "ke", + "bumi" + ], + "labels_ner": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "hingga", + "tidak", + "mudah", + "tumbang", + "dari", + "terpaan", + "angin", + "badai", + "atau", + "bencana", + "alam", + "lainnya" + ], + "labels_ner": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"] + }, + { + "tokens": ["Demikian", "pula", "halnya", "dengan", "manusia"], + "labels_ner": ["O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Semakin", + "berbudaya", + "seseorang", + "atau", + "kelompok", + "masyarakat" + ], + "labels_ner": ["O", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "semakin", + "dalam", + "pula", + "kesadaran", + "kolektifnya", + "tentang", + "asal", + "usul", + "dan", + "penghargaan", + "terhadap", + "tradisi" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-MISC" + ] + }, + { + "tokens": [ + "Jika", + "tidak", + "demikian", + "manusia", + "yang", + "melupakan", + "budaya", + "bangsanya" + ], + "labels_ner": ["O", "O", "O", "O", "O", "O", "B-MISC", "B-MISC"] + }, + { + "tokens": [ + "akan", + "mudah", + "terombang-ambing", + "oleh", + "terpaan", + "budaya", + "asing", + "yang", + "lebih", + "kuat" + ], + "labels_ner": ["O", "O", "O", "O", "O", "B-MISC", "B-MISC", "O", "O", "O"] + }, + { + "tokens": [ + "sehingga", + "dengan", + "sendirinya", + "kehilangan", + "identitas", + "diri" + ], + "labels_ner": ["O", "O", "O", "O", "B-MISC", "I-MISC"] + }, + { + "tokens": [ + "Jadi", + "bangsa", + "yang", + "gampang", 
+ "meninggalkan", + "tradisi", + "nenek", + "moyangnya" + ], + "labels_ner": ["O", "O", "O", "O", "O", "B-MISC", "O", "O"] + }, + { + "tokens": [ + "akan", + "mudah", + "didikte", + "oleh", + "budaya", + "dominan", + "dari", + "luar", + "yang", + "bukan", + "miliknya" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "B-MISC", + "I-MISC", + "O", + "B-LOC", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Kita", + "bisa", + "belajar", + "banyak", + "dari", + "keberhasilan", + "dan", + "capaian", + "prestasi", + "terbaik", + "dari", + "pendahulu", + "kita" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Sebaliknya", + "kita", + "juga", + "belajar", + "dari", + "kegagalan", + "mereka", + "yang", + "telah", + "menimbulkan", + "malapetaka", + "bagi", + "dirinya", + "atau", + "bagi", + "banyak", + "orang" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": ["Untuk", "memetik", "pelajaran", "dari", "uraian", "ini"], + "labels_ner": ["O", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "dapat", + "kita", + "katakan", + "bahwa", + "nilai", + "terpenting", + "dalam", + "pembelajaran", + "sejarah", + "tentang", + "zaman", + "praaksara" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-TIME", + "I-TIME" + ] + }, + { + "tokens": [ + "dan", + "sesudahnya", + "ada", + "dua", + "yaitu", + "sebagai", + "inspirasi", + "untuk", + "pengembangan", + "nalar", + "kehidupan", + "dan", + "sebagai", + "peringatan" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Selebihnya", + "kecerdasan", + "dan", + "pikiran-pikiran", + "kritis", + "lah", + "yang", + "akan", + "menerangi", + "kehidupan", + "masa", + "kini", + "dan", + "masa", + 
"depan" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-TIME", + "I-TIME", + "O", + "B-TIME", + "I-TIME" + ] + }, + { + "tokens": [ + "Sekarang", + "muncul", + "pertanyaan", + "sejak", + "kapan", + "zaman", + "praaksara", + "berakhir" + ], + "labels_ner": ["O", "O", "O", "O", "O", "B-TIME", "I-TIME", "O"] + }, + { + "tokens": [ + "Sudah", + "barang", + "tentu", + "zaman", + "praaksara", + "itu", + "berakhir", + "setelah", + "kehidupan", + "manusia", + "mulai", + "mengenal", + "tulisan" + ], + "labels_ner": [ + "O", + "O", + "O", + "B-TIME", + "I-TIME", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Terkait", + "dengan", + "masa", + "berakhirnya", + "zaman", + "praaksara", + "masing-masing", + "tempat", + "akan", + "berbeda" + ], + "labels_ner": ["O", "O", "O", "O", "B-TIME", "I-TIME", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Penduduk", + "di", + "Kepulauan", + "Indonesia", + "baru", + "memasuki", + "masa", + "aksara", + "sekitar", + "abad", + "ke-5", + "M" + ], + "labels_ner": [ + "O", + "O", + "B-LOC", + "I-LOC", + "O", + "O", + "B-TIME", + "I-TIME", + "O", + "B-TIME", + "I-TIME", + "I-TIME" + ] + }, + { + "tokens": [ + "Hal", + "ini", + "jauh", + "lebih", + "terlambat", + "bila", + "dibandingkan", + "di", + "tempat", + "lain", + "misalnya", + "Mesir", + "dan", + "Mesopotamia" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-LOC", + "O", + "B-LOC" + ] + }, + { + "tokens": [ + "yang", + "sudah", + "mengenal", + "tulisan", + "sejak", + "sekitar", + "tahun", + "3000", + "SM" + ], + "labels_ner": ["O", "O", "O", "O", "O", "O", "B-TIME", "I-TIME", "I-TIME"] + }, + { + "tokens": [ + "Fakta-fakta", + "masa", + "aksara", + "di", + "Kepulauan", + "Indonesia", + "dihubungkan", + "dengan", + "temuan", + "prasasti", + "peninggalan", + "kerajaan", + "tua" + ], + "labels_ner": [ + "O", + "B-TIME", + "I-TIME", + "O", + "B-LOC", + 
"I-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "seperti", + "Kerajaan", + "Kutai", + "di", + "Muara", + "Kaman", + "Kalimantan", + "Timur" + ], + "labels_ner": ["O", "O", "B-ORG", "O", "B-LOC", "I-LOC", "I-LOC", "I-LOC"] + }, + + { + "tokens": [ + "Bumi", + "kita", + "yang", + "terhampar", + "luas", + "ini", + "diciptakan", + "Tuhan", + "Yang", + "Maha", + "Pencipta", + "untuk", + "kehidupan", + "dan", + "kepentingan", + "hidup", + "manusia" + ], + "labels_ner": [ + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "B-PER", + "I-PER", + "I-PER", + "I-PER", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Di", + "bumi", + "ini", + "hidup", + "berbagai", + "flora", + "dan", + "fauna", + "serta", + "tempat", + "bersemainya", + "manusia", + "dengan", + "keturunannya" + ], + "labels_ner": [ + "O", + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Di", + "bumi", + "ini", + "kita", + "bisa", + "menyaksikan", + "keindahan", + "alam", + "kita", + "bisa", + "beraktivitas", + "dan", + "berikhtiar", + "memenuhi", + "kebutuhan", + "hidup", + "kita" + ], + "labels_ner": [ + "O", + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Namun", + "harus", + "dipahami", + "bahwa", + "bumi", + "kita", + "juga", + "sering", + "menimbulkan", + "bencana" + ], + "labels_ner": ["O", "O", "O", "O", "B-LOC", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Sebagai", + "contoh", + "munculnya", + "aktivitas", + "lempeng", + "bumi", + "yang", + "kemudian", + "melahirkan", + "gempa", + "baik", + "tektonis", + "maupun", + "vulkanis", + "bahkan", + "sampai", + "menimbulkan", + "tsunami" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + 
"Sebagai", + "contoh", + "tentu", + "kamu", + "masih", + "ingat", + "gempa", + "dan", + "tsunami", + "yang", + "terjadi", + "di", + "Aceh" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-LOC" + ] + }, + { + "tokens": [ + "gempa", + "di", + "Yogyakarta", + "di", + "Papua", + "dan", + "beberapa", + "daerah", + "lain", + "termasuk", + "beberapa", + "gunung", + "api", + "meletus" + ], + "labels_ner": [ + "O", + "O", + "B-LOC", + "O", + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Bencana", + "tersebut", + "telah", + "mengakibatkan", + "ribuan", + "nyawa", + "hilang", + "dan", + "harta", + "benda", + "melayang" + ], + "labels_ner": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Fenomena", + "alam", + "yang", + "terjadi", + "itu", + "merupakan", + "bagian", + "tak", + "terpisahkan", + "dari", + "aktivitas", + "panjang", + "bumi", + "kita", + "sejak", + "proses", + "terjadinya", + "alam", + "semesta", + "ratusan", + "ribuan", + "bahkan", + "juta", + "tahun", + "yang", + "lalu" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "B-TIME", + "I-TIME", + "I-TIME", + "I-TIME", + "I-TIME", + "I-TIME", + "O", + "O" + ] + }, + { + "tokens": [ + "Proses", + "tersebut", + "secara", + "geologis", + "mengalami", + "beberapa", + "tahapan", + "atau", + "pembabakan", + "waktu" + ], + "labels_ner": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Berikut", + "ini", + "kita", + "mencoba", + "menelaah", + "tentang", + "pembabakan", + "waktu", + "alam", + "secara", + "geologis", + "dan", + "terbentuknya", + "Kepulauan", + "Indonesia", + "terbentuk" + ], + "labels_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-LOC", + "I-LOC", + "O" + ] + }, + { + 
"tokens": ["dani", "pergi", "ke", "surabaya", "sore", "ini"], + "labels_ner": ["B-PER", "O", "O", "B-LOC", "B-TIME", "O"] + }, + { + "tokens": [ + "malam", + "nanti", + "jun", + "sedang", + "menonton", + "film", + "dengan", + "pacarnya" + ], + "labels_ner": ["B-TIME", "O", "B-PER", "O", "O", "O", "O", "B-PER"] + } +] diff --git a/dataset/lstm_ner_dataset.json b/dataset/lstm_ner_dataset.json new file mode 100644 index 0000000..7ff244b --- /dev/null +++ b/dataset/lstm_ner_dataset.json @@ -0,0 +1,1990 @@ +[ + { + "tokens": ["Barack", "Obama", "adalah", "kanselir", "asal", "Hawaii"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "adalah", "pemain bola", "asal", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "datang", "dari", "Amerika"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "datang", "dari", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Cristiano", + "Ronaldo", + "adalah", + "pemain bola", + "asal", + "Inggris" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "lahir", "di", "Kanada"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "adalah", "kanselir", "asal", "Jerman"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "pernah", "tinggal", "di", "Italia"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "datang", "dari", "Brazil"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "lahir", "di", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Cristiano", "Ronaldo", "adalah", "presiden", 
"asal", "Jerman"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "pernah", "tinggal", "di", "Amerika"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Joko", + "Widodo", + "bekerja", + "sebagai", + "presiden", + "di", + "Kanada" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Angela", + "Merkel", + "bekerja", + "sebagai", + "ilmuwan", + "di", + "Indonesia" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "lahir", "di", "Kanada"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "datang", "dari", "Perancis"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "lahir", "di", "Jerman"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Cristiano", + "Ronaldo", + "adalah", + "ilmuwan", + "asal", + "Indonesia" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "datang", "dari", "Amerika"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "pernah", "tinggal", "di", "Jerman"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "adalah", "ilmuwan", "asal", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "adalah", "aktivis", "asal", "Perancis"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "pernah", "tinggal", "di", "Italia"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "datang", "dari", "Hawaii"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "pernah", "tinggal", "di", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": 
[ + "Angela", + "Merkel", + "bekerja", + "sebagai", + "penyanyi", + "di", + "Brazil" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "adalah", "aktivis", "asal", "Spanyol"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Emma", + "Watson", + "bekerja", + "sebagai", + "ilmuwan", + "di", + "Italia" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Italia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "adalah", "presiden", "asal", "Jerman"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Spanyol"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "pernah", "tinggal", "di", "Italia"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "datang", "dari", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "datang", "dari", "Kanada"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "datang", "dari", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Cristiano", "Ronaldo", "lahir", "di", "Brazil"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Elon", + "Musk", + "bekerja", + "sebagai", + "aktivis", + "di", + "Spanyol" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "pernah", "tinggal", "di", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "datang", "dari", "Italia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "datang", "dari", "Kanada"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + 
"tokens": ["Greta", "Thunberg", "pernah", "tinggal", "di", "Amerika"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "lahir", "di", "Kanada"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "lahir", "di", "Jerman"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "pernah", "tinggal", "di", "Hawaii"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Barack", + "Obama", + "bekerja", + "sebagai", + "kanselir", + "di", + "Kanada" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "datang", "dari", "Hawaii"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "lahir", "di", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "datang", "dari", "Perancis"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "adalah", "penyanyi", "asal", "Brazil"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "adalah", "aktivis", "asal", "Amerika"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "datang", "dari", "Kanada"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "adalah", "penyanyi", "asal", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "lahir", "di", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "pernah", "tinggal", "di", "Hawaii"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "adalah", "pemain bola", 
"asal", "Italia"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "adalah", "pemain bola", "asal", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Taylor", + "Swift", + "bekerja", + "sebagai", + "aktivis", + "di", + "Brazil" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "datang", "dari", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Joe", + "Biden", + "bekerja", + "sebagai", + "pemain bola", + "di", + "Perancis" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "adalah", "aktor", "asal", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "adalah", "ilmuwan", "asal", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "adalah", "kanselir", "asal", "Hawaii"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "adalah", "penyanyi", "asal", "Perancis"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Elon", + "Musk", + "bekerja", + "sebagai", + "penyanyi", + "di", + "Perancis" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "adalah", "aktivis", "asal", "Brazil"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "datang", "dari", "Amerika"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "datang", "dari", "Brazil"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "adalah", "presiden", "asal", "Amerika"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "datang", "dari", "Brazil"], + "labels": 
["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "adalah", "ilmuwan", "asal", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "lahir", "di", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Joko", + "Widodo", + "bekerja", + "sebagai", + "ilmuwan", + "di", + "Italia" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "adalah", "aktivis", "asal", "Perancis"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "pernah", "tinggal", "di", "Jerman"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "pernah", "tinggal", "di", "Spanyol"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "adalah", "kanselir", "asal", "Spanyol"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "lahir", "di", "Kanada"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Greta", "Thunberg", "datang", "dari", "Italia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Cristiano", + "Ronaldo", + "bekerja", + "sebagai", + "aktivis", + "di", + "Spanyol" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "datang", "dari", "Amerika"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "adalah", "kanselir", "asal", "Spanyol"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "pernah", "tinggal", "di", "Jerman"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Taylor", "Swift", "lahir", "di", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "adalah", "penyanyi", "asal", 
"Kanada"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Cristiano", "Ronaldo", "datang", "dari", "Hawaii"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "adalah", "pemain bola", "asal", "Italia"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "lahir", "di", "Perancis"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Elon", + "Musk", + "bekerja", + "sebagai", + "kanselir", + "di", + "Jerman" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Elon", "Musk", "pernah", "tinggal", "di", "Brazil"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "pernah", "tinggal", "di", "Spanyol"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Lionel", "Messi", "datang", "dari", "Indonesia"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "adalah", "aktivis", "asal", "Perancis"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Barack", "Obama", "datang", "dari", "Spanyol"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Emma", "Watson", "pernah", "tinggal", "di", "Jerman"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joe", "Biden", "pernah", "tinggal", "di", "Jerman"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "lahir", "di", "Inggris"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Angela", "Merkel", "pernah", "tinggal", "di", "Spanyol"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Cristiano", "Ronaldo", "lahir", "di", "Brazil"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + + { + "tokens": ["Nadiem", "Makarim", "lahir", "di", "Bandung"], + "labels": ["B-PER", 
"I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Bandung"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "datang", "dari", "Yogyakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "pernah", "tinggal", "di", "Makassar"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Budi", + "Gunadi", + "bekerja", + "sebagai", + "artis", + "di", + "Bandung" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "bekerja", "sebagai", "artis", "di", "Padang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Makassar"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "datang", "dari", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Ridwan", + "Kamil", + "bekerja", + "sebagai", + "menteri", + "di", + "Semarang" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "adalah", "penulis", "asal", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "datang", "dari", "Palembang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Semarang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Prabowo", + "Subianto", + "bekerja", + "sebagai", + "aktivis", + "di", + "Semarang" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "adalah", "penulis", "asal", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "O", 
"B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "pernah", "tinggal", "di", "Semarang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Susi", + "Pudjiastuti", + "adalah", + "walikota", + "asal", + "Yogyakarta" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "pernah", "tinggal", "di", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "datang", "dari", "Palembang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "adalah", "penulis", "asal", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Susi", + "Pudjiastuti", + "bekerja", + "sebagai", + "dosen", + "di", + "Medan" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "adalah", "walikota", "asal", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Prabowo", + "Subianto", + "bekerja", + "sebagai", + "menteri", + "di", + "Palembang" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "adalah", "artis", "asal", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "adalah", "walikota", "asal", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Palembang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "pernah", "tinggal", "di", "Palembang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "datang", "dari", "Makassar"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "lahir", "di", "Padang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": 
["Budi", "Gunadi", "adalah", "dosen", "asal", "Semarang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "datang", "dari", "Medan"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Palembang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "adalah", "menteri", "asal", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Nadiem", + "Makarim", + "bekerja", + "sebagai", + "menteri", + "di", + "Palembang" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "adalah", "walikota", "asal", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "pernah", "tinggal", "di", "Medan"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "lahir", "di", "Padang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "adalah", "dosen", "asal", "Medan"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "lahir", "di", "Surabaya"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Sri", + "Mulyani", + "bekerja", + "sebagai", + "walikota", + "di", + "Surabaya" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "adalah", "menteri", "asal", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "pernah", "tinggal", "di", "Medan"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "datang", "dari", "Bandung"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "datang", "dari", "Semarang"], + "labels": ["B-PER", "I-PER", "O", "O", 
"B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "lahir", "di", "Surabaya"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "lahir", "di", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "datang", "dari", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "adalah", "penulis", "asal", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "datang", "dari", "Semarang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Prabowo", + "Subianto", + "adalah", + "penulis", + "asal", + "Yogyakarta" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "pernah", "tinggal", "di", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "datang", "dari", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Semarang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Maudy", + "Ayunda", + "bekerja", + "sebagai", + "penulis", + "di", + "Semarang" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "adalah", "walikota", "asal", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "adalah", "walikota", "asal", "Padang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "pernah", "tinggal", "di", "Semarang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Maudy", + "Ayunda", + "bekerja", + "sebagai", + "dosen", + "di", + "Yogyakarta" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "lahir", "di", 
"Yogyakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "pernah", "tinggal", "di", "Makassar"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Joko", + "Widodo", + "bekerja", + "sebagai", + "gubernur", + "di", + "Yogyakarta" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "adalah", "aktivis", "asal", "Yogyakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "datang", "dari", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "datang", "dari", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "datang", "dari", "Makassar"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "lahir", "di", "Surabaya"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "lahir", "di", "Padang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "datang", "dari", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "datang", "dari", "Semarang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "datang", "dari", "Semarang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "adalah", "dosen", "asal", "Bandung"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "datang", "dari", "Padang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ridwan", "Kamil", "lahir", "di", "Makassar"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Nadiem", "Makarim", "datang", "dari", "Palembang"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + 
"tokens": [ + "Maudy", + "Ayunda", + "bekerja", + "sebagai", + "artis", + "di", + "Bandung" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "lahir", "di", "Medan"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "pernah", "tinggal", "di", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "pernah", "tinggal", "di", "Bali"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "pernah", "tinggal", "di", "Bandung"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Susi", "Pudjiastuti", "datang", "dari", "Bandung"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "pernah", "tinggal", "di", "Bandung"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Ganjar", + "Pranowo", + "bekerja", + "sebagai", + "pengusaha", + "di", + "Makassar" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "adalah", "dosen", "asal", "Makassar"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "lahir", "di", "Medan"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Budi", "Gunadi", "lahir", "di", "Jakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Budi", + "Gunadi", + "bekerja", + "sebagai", + "penulis", + "di", + "Makassar" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "lahir", "di", "Yogyakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Maudy", + "Ayunda", + "bekerja", + "sebagai", + "penulis", + "di", + "Yogyakarta" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", 
"datang", "dari", "Surabaya"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "bekerja", "sebagai", "artis", "di", "Medan"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Sri", "Mulyani", "lahir", "di", "Bandung"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["Najwa", "Shihab", "adalah", "aktivis", "asal", "Semarang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "pernah", "tinggal", "di", "Bandung"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Joko", "Widodo", "pernah", "tinggal", "di", "Yogyakarta"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Budi", + "Gunadi", + "bekerja", + "sebagai", + "gubernur", + "di", + "Semarang" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Maudy", "Ayunda", "pernah", "tinggal", "di", "Padang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Budi", + "Gunadi", + "bekerja", + "sebagai", + "pengusaha", + "di", + "Bandung" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Ganjar", "Pranowo", "datang", "dari", "Surabaya"], + "labels": ["B-PER", "I-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Nadiem", + "Makarim", + "bekerja", + "sebagai", + "aktivis", + "di", + "Bali" + ], + "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["Prabowo", "Subianto", "adalah", "aktivis", "asal", "Palembang"], + "labels": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"] + }, + { + "tokens": ["jakarta", "adalah", "ibu", "kota", "Indonesia"], + "labels": ["B-LOC", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "presiden", + "indonesia", + "saat", + "ini", + "adalah", + "prabowo", + "subianto" + ], + "labels": ["O", "B-LOC", "O", "O", "O", "B-PER", "I-PER"] + }, + { + "tokens": ["dani", 
"datang", "dari", "jakarta"], + "labels": ["B-PER", "O", "O", "B-LOC"] + }, + { + "tokens": ["dani", "pergi", "ke", "surabaya"], + "labels": ["B-PER", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "Arti", + "penting", + "dari", + "pembelajaran", + "tentang", + "sejarah", + "kehidupan", + "zaman", + "praaksara" + ], + "labels": ["O", "O", "O", "O", "O", "O", "O", "B-TIME", "I-TIME"] + }, + { + "tokens": [ + "pertama-tama", + "adalah", + "kesadaran", + "akan", + "asal", + "usul", + "manusia" + ], + "labels": ["O", "O", "O", "O", "O", "O", "O"] + }, + { + "tokens": ["Tumbuhan", "memiliki", "akar"], + "labels": ["O", "O", "O"] + }, + { + "tokens": [ + "Semakin", + "tinggi", + "tumbuhan", + "itu", + "semakin", + "dalam", + "pula", + "akarnya", + "menghunjam", + "ke", + "bumi" + ], + "labels": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-LOC"] + }, + { + "tokens": [ + "hingga", + "tidak", + "mudah", + "tumbang", + "dari", + "terpaan", + "angin", + "badai", + "atau", + "bencana", + "alam", + "lainnya" + ], + "labels": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"] + }, + { + "tokens": ["Demikian", "pula", "halnya", "dengan", "manusia"], + "labels": ["O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Semakin", + "berbudaya", + "seseorang", + "atau", + "kelompok", + "masyarakat" + ], + "labels": ["O", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "semakin", + "dalam", + "pula", + "kesadaran", + "kolektifnya", + "tentang", + "asal", + "usul", + "dan", + "penghargaan", + "terhadap", + "tradisi" + ], + "labels": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-MISC"] + }, + { + "tokens": [ + "Jika", + "tidak", + "demikian", + "manusia", + "yang", + "melupakan", + "budaya", + "bangsanya" + ], + "labels": ["O", "O", "O", "O", "O", "O", "B-MISC", "B-MISC"] + }, + { + "tokens": [ + "akan", + "mudah", + "terombang-ambing", + "oleh", + "terpaan", + "budaya", + "asing", + "yang", + "lebih", + "kuat" + ], + "labels": ["O", "O", "O", "O", "O", 
"B-MISC", "B-MISC", "O", "O", "O"] + }, + { + "tokens": [ + "sehingga", + "dengan", + "sendirinya", + "kehilangan", + "identitas", + "diri" + ], + "labels": ["O", "O", "O", "O", "B-MISC", "I-MISC"] + }, + { + "tokens": [ + "Jadi", + "bangsa", + "yang", + "gampang", + "meninggalkan", + "tradisi", + "nenek", + "moyangnya" + ], + "labels": ["O", "O", "O", "O", "O", "B-MISC", "O", "O"] + }, + { + "tokens": [ + "akan", + "mudah", + "didikte", + "oleh", + "budaya", + "dominan", + "dari", + "luar", + "yang", + "bukan", + "miliknya" + ], + "labels": [ + "O", + "O", + "O", + "O", + "B-MISC", + "I-MISC", + "O", + "B-LOC", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Kita", + "bisa", + "belajar", + "banyak", + "dari", + "keberhasilan", + "dan", + "capaian", + "prestasi", + "terbaik", + "dari", + "pendahulu", + "kita" + ], + "labels": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Sebaliknya", + "kita", + "juga", + "belajar", + "dari", + "kegagalan", + "mereka", + "yang", + "telah", + "menimbulkan", + "malapetaka", + "bagi", + "dirinya", + "atau", + "bagi", + "banyak", + "orang" + ], + "labels": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": ["Untuk", "memetik", "pelajaran", "dari", "uraian", "ini"], + "labels": ["O", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "dapat", + "kita", + "katakan", + "bahwa", + "nilai", + "terpenting", + "dalam", + "pembelajaran", + "sejarah", + "tentang", + "zaman", + "praaksara" + ], + "labels": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-TIME", + "I-TIME" + ] + }, + { + "tokens": [ + "dan", + "sesudahnya", + "ada", + "dua", + "yaitu", + "sebagai", + "inspirasi", + "untuk", + "pengembangan", + "nalar", + "kehidupan", + "dan", + "sebagai", + "peringatan" + ], + "labels": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + 
"O" + ] + }, + { + "tokens": [ + "Selebihnya", + "kecerdasan", + "dan", + "pikiran-pikiran", + "kritis", + "lah", + "yang", + "akan", + "menerangi", + "kehidupan", + "masa", + "kini", + "dan", + "masa", + "depan" + ], + "labels": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-TIME", + "I-TIME", + "O", + "B-TIME", + "I-TIME" + ] + }, + { + "tokens": [ + "Sekarang", + "muncul", + "pertanyaan", + "sejak", + "kapan", + "zaman", + "praaksara", + "berakhir" + ], + "labels": ["O", "O", "O", "O", "O", "B-TIME", "I-TIME", "O"] + }, + { + "tokens": [ + "Sudah", + "barang", + "tentu", + "zaman", + "praaksara", + "itu", + "berakhir", + "setelah", + "kehidupan", + "manusia", + "mulai", + "mengenal", + "tulisan" + ], + "labels": [ + "O", + "O", + "O", + "B-TIME", + "I-TIME", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Terkait", + "dengan", + "masa", + "berakhirnya", + "zaman", + "praaksara", + "masing-masing", + "tempat", + "akan", + "berbeda" + ], + "labels": ["O", "O", "O", "O", "B-TIME", "I-TIME", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Penduduk", + "di", + "Kepulauan", + "Indonesia", + "baru", + "memasuki", + "masa", + "aksara", + "sekitar", + "abad", + "ke-5", + "M" + ], + "labels": [ + "O", + "O", + "B-LOC", + "I-LOC", + "O", + "O", + "B-TIME", + "I-TIME", + "O", + "B-TIME", + "I-TIME", + "I-TIME" + ] + }, + { + "tokens": [ + "Hal", + "ini", + "jauh", + "lebih", + "terlambat", + "bila", + "dibandingkan", + "di", + "tempat", + "lain", + "misalnya", + "Mesir", + "dan", + "Mesopotamia" + ], + "labels": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-LOC", + "O", + "B-LOC" + ] + }, + { + "tokens": [ + "yang", + "sudah", + "mengenal", + "tulisan", + "sejak", + "sekitar", + "tahun", + "3000", + "SM" + ], + "labels": ["O", "O", "O", "O", "O", "O", "B-TIME", "I-TIME", "I-TIME"] + }, + { + "tokens": [ + "Fakta-fakta", + "masa", + "aksara", + "di", + "Kepulauan", + 
"Indonesia", + "dihubungkan", + "dengan", + "temuan", + "prasasti", + "peninggalan", + "kerajaan", + "tua" + ], + "labels": [ + "O", + "B-TIME", + "I-TIME", + "O", + "B-LOC", + "I-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "seperti", + "Kerajaan", + "Kutai", + "di", + "Muara", + "Kaman", + "Kalimantan", + "Timur" + ], + "labels": ["O", "O", "B-ORG", "O", "B-LOC", "I-LOC", "I-LOC", "I-LOC"] + }, + + { + "tokens": [ + "Bumi", + "kita", + "yang", + "terhampar", + "luas", + "ini", + "diciptakan", + "Tuhan", + "Yang", + "Maha", + "Pencipta", + "untuk", + "kehidupan", + "dan", + "kepentingan", + "hidup", + "manusia" + ], + "labels": [ + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "B-PER", + "I-PER", + "I-PER", + "I-PER", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Di", + "bumi", + "ini", + "hidup", + "berbagai", + "flora", + "dan", + "fauna", + "serta", + "tempat", + "bersemainya", + "manusia", + "dengan", + "keturunannya" + ], + "labels": [ + "O", + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Di", + "bumi", + "ini", + "kita", + "bisa", + "menyaksikan", + "keindahan", + "alam", + "kita", + "bisa", + "beraktivitas", + "dan", + "berikhtiar", + "memenuhi", + "kebutuhan", + "hidup", + "kita" + ], + "labels": [ + "O", + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Namun", + "harus", + "dipahami", + "bahwa", + "bumi", + "kita", + "juga", + "sering", + "menimbulkan", + "bencana" + ], + "labels": ["O", "O", "O", "O", "B-LOC", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Sebagai", + "contoh", + "munculnya", + "aktivitas", + "lempeng", + "bumi", + "yang", + "kemudian", + "melahirkan", + "gempa", + "baik", + "tektonis", + "maupun", + "vulkanis", + "bahkan", + "sampai", + "menimbulkan", + "tsunami" + ], + "labels": [ + "O", + 
"O", + "O", + "O", + "O", + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Sebagai", + "contoh", + "tentu", + "kamu", + "masih", + "ingat", + "gempa", + "dan", + "tsunami", + "yang", + "terjadi", + "di", + "Aceh" + ], + "labels": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-LOC" + ] + }, + { + "tokens": [ + "gempa", + "di", + "Yogyakarta", + "di", + "Papua", + "dan", + "beberapa", + "daerah", + "lain", + "termasuk", + "beberapa", + "gunung", + "api", + "meletus" + ], + "labels": [ + "O", + "O", + "B-LOC", + "O", + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ] + }, + { + "tokens": [ + "Bencana", + "tersebut", + "telah", + "mengakibatkan", + "ribuan", + "nyawa", + "hilang", + "dan", + "harta", + "benda", + "melayang" + ], + "labels": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Fenomena", + "alam", + "yang", + "terjadi", + "itu", + "merupakan", + "bagian", + "tak", + "terpisahkan", + "dari", + "aktivitas", + "panjang", + "bumi", + "kita", + "sejak", + "proses", + "terjadinya", + "alam", + "semesta", + "ratusan", + "ribuan", + "bahkan", + "juta", + "tahun", + "yang", + "lalu" + ], + "labels": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-LOC", + "O", + "O", + "O", + "O", + "O", + "O", + "B-TIME", + "I-TIME", + "I-TIME", + "I-TIME", + "I-TIME", + "I-TIME", + "O", + "O" + ] + }, + { + "tokens": [ + "Proses", + "tersebut", + "secara", + "geologis", + "mengalami", + "beberapa", + "tahapan", + "atau", + "pembabakan", + "waktu" + ], + "labels": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O"] + }, + { + "tokens": [ + "Berikut", + "ini", + "kita", + "mencoba", + "menelaah", + "tentang", + "pembabakan", + "waktu", + "alam", + "secara", + "geologis", + "dan", + "terbentuknya", + "Kepulauan", + "Indonesia", + "terbentuk" + ], + "labels": [ + "O", + 
"O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-LOC", + "I-LOC", + "O" + ] + }, + { + "tokens": ["dani", "pergi", "ke", "surabaya", "sore", "ini"], + "labels": ["B-PER", "O", "O", "B-LOC", "B-TIME", "O"] + }, + { + "tokens": [ + "malam", + "nanti", + "jun", + "sedang", + "menonton", + "film", + "dengan", + "pacarnya" + ], + "labels": ["B-TIME", "O", "B-PER", "O", "O", "O", "O", "B-PER"] + } +] diff --git a/dataset/lstm_sentiment_analys.json b/dataset/lstm_sentiment_analys.json new file mode 100644 index 0000000..e69de29 diff --git a/lstm_multi_output_model.keras b/lstm_multi_output_model.keras index 00a6876..f9f13fa 100644 Binary files a/lstm_multi_output_model.keras and b/lstm_multi_output_model.keras differ diff --git a/lstm_multi_output_ner_model.keras b/lstm_multi_output_ner_model.keras new file mode 100644 index 0000000..5d1ef4c Binary files /dev/null and b/lstm_multi_output_ner_model.keras differ diff --git a/ner_lstm.ipynb b/ner_lstm.ipynb new file mode 100644 index 0000000..9c43f2b --- /dev/null +++ b/ner_lstm.ipynb @@ -0,0 +1,640 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/akeon/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package averaged_perceptron_tagger to\n", + "[nltk_data] /home/akeon/nltk_data...\n", + "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", + "[nltk_data] date!\n", + "[nltk_data] Downloading package maxent_ne_chunker to\n", + "[nltk_data] /home/akeon/nltk_data...\n", + "[nltk_data] Package maxent_ne_chunker is already up-to-date!\n", + "[nltk_data] Downloading package words to /home/akeon/nltk_data...\n", + "[nltk_data] Package words is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 18, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "import string, re, pickle\n", + "import numpy as np\n", + "from nltk.tokenize import word_tokenize\n", + "from tensorflow.keras.preprocessing.text import Tokenizer\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "from sklearn.model_selection import train_test_split\n", + "from tensorflow.keras.models import Model\n", + "from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense\n", + "import spacy\n", + "import nltk\n", + "\n", + "\n", + "nltk.download ('punkt')\n", + "nltk.download ('averaged_perceptron_tagger')\n", + "nltk.download ('maxent_ne_chunker')\n", + "nltk.download ('words')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kerajaan Aceh PER\n", + "Iskandar Muda PER\n", + "ke-17 MISC\n", + "Islam MISC\n", + "Nusantara LOC\n" + ] + } + ], + "source": [ + "import spacy\n", + "\n", + "# Load model multibahasa yang mendukung Indonesia\n", + "nlp = spacy.load(\"xx_ent_wiki_sm\")\n", + "\n", + "# Contoh teks\n", + "text = \"Kerajaan Aceh mencapai puncak kejayaannya di bawah pemerintahan Sultan Iskandar Muda pada abad ke-17. 
Aceh menjadi pusat perdagangan dan kebudayaan Islam di wilayah Nusantara.\"\n", + "\n", + "# Proses teks dengan model\n", + "doc = nlp(text)\n", + "\n", + "# Cetak entitas yang dikenali\n", + "for ent in doc.ents:\n", + " print(ent.text, ent.label_)\n", + " \n", + "\n", + "# def generate_ner_context(text):\n", + "# # Load the pretrained spaCy model (small Indo model or use multilingual model if needed)\n", + "# nlp = spacy.load(\"xx_ent_wiki_sm\") # Load multilingual model\n", + " \n", + "# # Process the text\n", + "# doc = nlp(text)\n", + " \n", + "# # Tokenization and Named Entity Recognition (NER)\n", + "# tokens = [token.text for token in doc]\n", + "# ner_tags = []\n", + "# for token in doc:\n", + "# if token.ent_type_:\n", + "# ner_tags.append(f\"B-{token.ent_type_}\")\n", + "# else:\n", + "# ner_tags.append(\"O\")\n", + " \n", + "# return tokens, ner_tags\n", + "\n", + "# # Example input context\n", + "# context = \"Perang Diponegoro berlangsung dari tahun 1825 hingga 1830. Perang ini dipimpin oleh Pangeran Diponegoro melawan pemerintah kolonial Belanda di Jawa Tengah.\"\n", + "\n", + "# # Generate NER and tokens\n", + "# tokens, ner_tags = generate_ner_context(context)\n", + "\n", + "# # Construct the JSON result\n", + "# result = {\n", + "# \"context\": context,\n", + "# \"context_tokens\": tokens,\n", + "# \"context_ner\": ner_tags,\n", + "# \"question_posibility\": [\n", + "# {\n", + "# \"type\": \"true_false\",\n", + "# \"question\": \"Perang Diponegoro berlangsung selama lima tahun.\",\n", + "# \"answer\": \"True\"\n", + "# },\n", + "# {\n", + "# \"type\": \"true_false\",\n", + "# \"question\": \"Perang Diponegoro berakhir pada tahun 1850.\",\n", + "# \"answer\": \"False\"\n", + "# }\n", + "# ]\n", + "# }\n", + "\n", + "# # Output the result\n", + "# import json\n", + "# print(json.dumps(result, indent=4, ensure_ascii=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = [\n", 
+ " {\n", + " \"context\": \"Pertempuran Surabaya terjadi pada 10 November 1945 antara pasukan Indonesia melawan pasukan sekutu Inggris yang berusaha mengambil alih kota setelah Jepang menyerah dalam Perang Dunia II. Pertempuran ini dikenang sebagai Hari Pahlawan di Indonesia.\",\n", + " \"context_tokens\": [\n", + " \"Pertempuran\", \"Surabaya\", \"terjadi\", \"pada\", \"10\", \"November\", \"1945\",\n", + " \"antara\", \"pasukan\", \"Indonesia\", \"melawan\", \"pasukan\", \"sekutu\", \"Inggris\",\n", + " \"yang\", \"berusaha\", \"mengambil\", \"alih\", \"kota\", \"setelah\", \"Jepang\", \"menyerah\",\n", + " \"dalam\", \"Perang\", \"Dunia\", \"II\", \".\", \"Pertempuran\", \"ini\", \"dikenang\", \"sebagai\",\n", + " \"Hari\", \"Pahlawan\", \"di\", \"Indonesia\", \".\"\n", + " ],\n", + " \"context_ner\": [\n", + " \"O\", \"B-LOC\", \"O\", \"O\", \"B-DATE\", \"I-DATE\", \"I-DATE\",\n", + " \"O\", \"O\", \"B-LOC\", \"O\", \"O\", \"O\", \"B-LOC\",\n", + " \"O\", \"O\", \"O\", \"O\", \"O\", \"O\", \"B-LOC\", \"O\",\n", + " \"O\", \"B-MISC\", \"I-MISC\", \"I-MISC\", \"O\", \"O\", \"O\", \"O\", \"O\",\n", + " \"O\", \"O\", \"O\", \"B-LOC\", \"O\"\n", + " ],\n", + " \"question_posibility\": [\n", + " {\n", + " \"type\": \"fill_in_the_blank\",\n", + " \"question\": \"Pertempuran Surabaya terjadi pada tanggal _______.\",\n", + " \"answer\": \"10 November 1945\"\n", + " },\n", + " {\n", + " \"type\": \"multiple_choice\",\n", + " \"question\": \"Pasukan yang dihadapi Indonesia dalam Pertempuran Surabaya berasal dari negara apa?\",\n", + " \"options\": [\"Jepang\", \"Belanda\", \"Inggris\", \"Australia\"],\n", + " \"answer\": \"Inggris\"\n", + " },\n", + " {\n", + " \"type\": \"true_false\",\n", + " \"question\": \"Pertempuran Surabaya diperingati sebagai Hari Pahlawan di Indonesia.\",\n", + " \"answer\": \"True\"\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"context\": \"Perang Diponegoro berlangsung dari tahun 1825 hingga 1830. 
Perang ini dipimpin oleh Pangeran Diponegoro melawan pemerintah kolonial Belanda di Jawa Tengah.\",\n", + " \"context_tokens\": [\n", + " \"Perang\", \"Diponegoro\", \"berlangsung\", \"dari\", \"tahun\", \"1825\", \"hingga\", \"1830\", \".\",\n", + " \"Perang\", \"ini\", \"dipimpin\", \"oleh\", \"Pangeran\", \"Diponegoro\", \"melawan\",\n", + " \"pemerintah\", \"kolonial\", \"Belanda\", \"di\", \"Jawa\", \"Tengah\", \".\"\n", + " ],\n", + " \"context_ner\": [\n", + " \"O\", \"B-PER\", \"O\", \"O\", \"O\", \"B-DATE\", \"O\", \"B-DATE\", \"O\",\n", + " \"O\", \"O\", \"O\", \"O\", \"B-PER\", \"I-PER\", \"O\",\n", + " \"O\", \"O\", \"B-LOC\", \"O\", \"O\", \"B-LOC\", \"O\"\n", + " ],\n", + " \"question_posibility\": [\n", + " {\n", + " \"type\": \"true_false\",\n", + " \"question\": \"Perang Diponegoro berlangsung selama lima tahun.\",\n", + " \"answer\": \"True\"\n", + " },\n", + " {\n", + " \"type\": \"true_false\",\n", + " \"question\": \"Perang Diponegoro berakhir pada tahun 1850.\",\n", + " \"answer\": \"False\"\n", + " }\n", + " ]\n", + " }\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "contexts_tokens = []\n", + "contexts_ner = []\n", + "questions = []\n", + "answers = []\n", + "qtypes = []\n", + "\n", + "for entry in dataset:\n", + " contexts_tokens.append(entry[\"context_tokens\"])\n", + " contexts_ner.append(entry[\"context_ner\"])\n", + " qa = entry[\"question_posibility\"][0] # pilih soal pertama\n", + " questions.append(qa[\"question\"])\n", + " answers.append(qa[\"answer\"])\n", + " qtypes.append(qa[\"type\"]) # misalnya \"fill_in_the_blank\"\n", + "\n", + "# ----------------------------\n", + "# Tokenisasi untuk Kata\n", + "# ----------------------------\n", + "# Kita gabungkan semua teks dari context (dari tokens), pertanyaan, dan jawaban\n", + "all_texts = []\n", + "for tokens in contexts_tokens:\n", + " all_texts.append(\" \".join(tokens))\n", + "all_texts += questions\n", + 
"all_texts += answers\n", + "\n", + "tokenizer = Tokenizer(oov_token=\"\")\n", + "tokenizer.fit_on_texts(all_texts)\n", + "\n", + "# Ubah context_tokens menjadi sequence angka\n", + "context_sequences = [tokenizer.texts_to_sequences([\" \".join(tokens)])[0] for tokens in contexts_tokens]\n", + "question_sequences = tokenizer.texts_to_sequences(questions)\n", + "answer_sequences = tokenizer.texts_to_sequences(answers)\n", + "\n", + "# Padding sequence ke panjang tetap\n", + "MAX_LENGTH = 50 # sesuaikan dengan panjang teks maksimum yang diinginkan\n", + "context_padded = pad_sequences(context_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\")\n", + "question_padded = pad_sequences(question_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\")\n", + "answer_padded = pad_sequences(answer_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# ----------------------------\n", + "# Tokenisasi untuk Label NER\n", + "# ----------------------------\n", + "# Kumpulkan semua label NER untuk membangun mapping label ke indeks\n", + "all_ner_labels = []\n", + "for ner_seq in contexts_ner:\n", + " all_ner_labels += ner_seq\n", + "\n", + "ner_labels_set = sorted(list(set(all_ner_labels)))\n", + "# Contoh: ['B-DATE', 'B-LOC', 'B-MISC', 'B-PER', 'I-DATE', 'I-MISC', 'I-PER', 'O']\n", + "ner2idx = {label: idx for idx, label in enumerate(ner_labels_set)}\n", + "idx2ner = {idx: label for label, idx in ner2idx.items()}\n", + "\n", + "# Ubah label NER ke dalam bentuk sequence angka\n", + "ner_sequences = []\n", + "for ner_seq in contexts_ner:\n", + " seq = [ner2idx[label] for label in ner_seq]\n", + " ner_sequences.append(seq)\n", + "\n", + "# Padding sequence label NER (gunakan nilai default misal label \"O\")\n", + "ner_padded = pad_sequences(ner_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\", 
value=ner2idx[\"O\"])\n", + "\n", + "# ----------------------------\n", + "# Label Tipe Soal\n", + "# ----------------------------\n", + "qtype_dict = {\"fill_in_the_blank\": 0, \"true_false\": 1, \"multiple_choice\": 2}\n", + "qtype_labels = np.array([qtype_dict[q] for q in qtypes])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# ----------------------------\n", + "# Split Data Training dan Validation\n", + "# ----------------------------\n", + "(context_train, context_val, \n", + " question_train, question_val, \n", + " answer_train, answer_val, \n", + " qtype_train, qtype_val,\n", + " ner_train, ner_val) = train_test_split(\n", + " context_padded, question_padded, answer_padded, qtype_labels, ner_padded,\n", + " test_size=0.2, random_state=42\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-03-23 15:06:59.338033: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n" + ] + }, + { + "data": { + "text/html": [ + "
Model: \"functional\"\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1mModel: \"functional\"\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃ Layer (type)        ┃ Output Shape      ┃    Param # ┃ Connected to      ┃\n",
+       "┑━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
+       "β”‚ context_input       β”‚ (None, 50)        β”‚          0 β”‚ -                 β”‚\n",
+       "β”‚ (InputLayer)        β”‚                   β”‚            β”‚                   β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ question_decoder_i… β”‚ (None, 50)        β”‚          0 β”‚ -                 β”‚\n",
+       "β”‚ (InputLayer)        β”‚                   β”‚            β”‚                   β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ context_embedding   β”‚ (None, 50, 300)   β”‚     15,600 β”‚ context_input[0]… β”‚\n",
+       "β”‚ (Embedding)         β”‚                   β”‚            β”‚                   β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ not_equal           β”‚ (None, 50)        β”‚          0 β”‚ context_input[0]… β”‚\n",
+       "β”‚ (NotEqual)          β”‚                   β”‚            β”‚                   β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ question_embedding  β”‚ (None, 50, 300)   β”‚     15,600 β”‚ question_decoder… β”‚\n",
+       "β”‚ (Embedding)         β”‚                   β”‚            β”‚                   β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ encoder_lstm (LSTM) β”‚ [(None, 256),     β”‚    570,368 β”‚ context_embeddin… β”‚\n",
+       "β”‚                     β”‚ (None, 256),      β”‚            β”‚ not_equal[0][0]   β”‚\n",
+       "β”‚                     β”‚ (None, 256)]      β”‚            β”‚                   β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ question_lstm       β”‚ [(None, 50, 256), β”‚    570,368 β”‚ question_embeddi… β”‚\n",
+       "β”‚ (LSTM)              β”‚ (None, 256),      β”‚            β”‚ encoder_lstm[0][… β”‚\n",
+       "β”‚                     β”‚ (None, 256)]      β”‚            β”‚ encoder_lstm[0][… β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ answer_lstm (LSTM)  β”‚ [(None, 50, 256), β”‚    570,368 β”‚ context_embeddin… β”‚\n",
+       "β”‚                     β”‚ (None, 256),      β”‚            β”‚ encoder_lstm[0][… β”‚\n",
+       "β”‚                     β”‚ (None, 256)]      β”‚            β”‚ encoder_lstm[0][… β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ dense (Dense)       β”‚ (None, 128)       β”‚     32,896 β”‚ encoder_lstm[0][… β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ ner_lstm            β”‚ (None, 50, 512)   β”‚  1,140,736 β”‚ context_embeddin… β”‚\n",
+       "β”‚ (Bidirectional)     β”‚                   β”‚            β”‚ not_equal[0][0]   β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ question_output     β”‚ (None, 50, 52)    β”‚     13,364 β”‚ question_lstm[0]… β”‚\n",
+       "β”‚ (Dense)             β”‚                   β”‚            β”‚                   β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ answer_output       β”‚ (None, 50, 52)    β”‚     13,364 β”‚ answer_lstm[0][0] β”‚\n",
+       "β”‚ (Dense)             β”‚                   β”‚            β”‚                   β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ question_type_outp… β”‚ (None, 3)         β”‚        387 β”‚ dense[0][0]       β”‚\n",
+       "β”‚ (Dense)             β”‚                   β”‚            β”‚                   β”‚\n",
+       "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+       "β”‚ ner_output          β”‚ (None, 50, 8)     β”‚      4,104 β”‚ ner_lstm[0][0],   β”‚\n",
+       "β”‚ (TimeDistributed)   β”‚                   β”‚            β”‚ not_equal[0][0]   β”‚\n",
+       "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n",
+       "
\n" + ], + "text/plain": [ + "┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n", + "┑━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n", + "β”‚ context_input β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) β”‚ \u001b[38;5;34m0\u001b[0m β”‚ - β”‚\n", + "β”‚ (\u001b[38;5;33mInputLayer\u001b[0m) β”‚ β”‚ β”‚ β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ question_decoder_i… β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) β”‚ \u001b[38;5;34m0\u001b[0m β”‚ - β”‚\n", + "β”‚ (\u001b[38;5;33mInputLayer\u001b[0m) β”‚ β”‚ β”‚ β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ context_embedding β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m300\u001b[0m) β”‚ \u001b[38;5;34m15,600\u001b[0m β”‚ context_input[\u001b[38;5;34m0\u001b[0m]… β”‚\n", + "β”‚ (\u001b[38;5;33mEmbedding\u001b[0m) β”‚ β”‚ β”‚ β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ not_equal β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) β”‚ \u001b[38;5;34m0\u001b[0m β”‚ 
context_input[\u001b[38;5;34m0\u001b[0m]… β”‚\n", + "β”‚ (\u001b[38;5;33mNotEqual\u001b[0m) β”‚ β”‚ β”‚ β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ question_embedding β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m300\u001b[0m) β”‚ \u001b[38;5;34m15,600\u001b[0m β”‚ question_decoder… β”‚\n", + "β”‚ (\u001b[38;5;33mEmbedding\u001b[0m) β”‚ β”‚ β”‚ β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ encoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) β”‚ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), β”‚ \u001b[38;5;34m570,368\u001b[0m β”‚ context_embeddin… β”‚\n", + "β”‚ β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), β”‚ β”‚ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] β”‚\n", + "β”‚ β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] β”‚ β”‚ β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ question_lstm β”‚ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m256\u001b[0m), β”‚ \u001b[38;5;34m570,368\u001b[0m β”‚ question_embeddi… β”‚\n", + "β”‚ (\u001b[38;5;33mLSTM\u001b[0m) β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), β”‚ β”‚ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m β”‚\n", + "β”‚ β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] β”‚ β”‚ 
encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ answer_lstm (\u001b[38;5;33mLSTM\u001b[0m) β”‚ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m256\u001b[0m), β”‚ \u001b[38;5;34m570,368\u001b[0m β”‚ context_embeddin… β”‚\n", + "β”‚ β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), β”‚ β”‚ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m β”‚\n", + "β”‚ β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] β”‚ β”‚ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ dense (\u001b[38;5;33mDense\u001b[0m) β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) β”‚ \u001b[38;5;34m32,896\u001b[0m β”‚ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ ner_lstm β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m512\u001b[0m) β”‚ \u001b[38;5;34m1,140,736\u001b[0m β”‚ context_embeddin… β”‚\n", + "β”‚ (\u001b[38;5;33mBidirectional\u001b[0m) β”‚ β”‚ β”‚ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] β”‚\n", + 
"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ question_output β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m52\u001b[0m) β”‚ \u001b[38;5;34m13,364\u001b[0m β”‚ question_lstm[\u001b[38;5;34m0\u001b[0m]… β”‚\n", + "β”‚ (\u001b[38;5;33mDense\u001b[0m) β”‚ β”‚ β”‚ β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ answer_output β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m52\u001b[0m) β”‚ \u001b[38;5;34m13,364\u001b[0m β”‚ answer_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] β”‚\n", + "β”‚ (\u001b[38;5;33mDense\u001b[0m) β”‚ β”‚ β”‚ β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ question_type_outp… β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m3\u001b[0m) β”‚ \u001b[38;5;34m387\u001b[0m β”‚ dense[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] β”‚\n", + "β”‚ (\u001b[38;5;33mDense\u001b[0m) β”‚ β”‚ β”‚ β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ ner_output β”‚ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m8\u001b[0m) β”‚ \u001b[38;5;34m4,104\u001b[0m β”‚ ner_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], β”‚\n", + "β”‚ 
(\u001b[38;5;33mTimeDistributed\u001b[0m) β”‚ β”‚ β”‚ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Total params: 2,947,155 (11.24 MB)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m2,947,155\u001b[0m (11.24 MB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Trainable params: 2,947,155 (11.24 MB)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m2,947,155\u001b[0m (11.24 MB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Non-trainable params: 0 (0.00 B)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-03-23 15:07:06.004502: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 8s/step - answer_output_accuracy: 0.0000e+00 - answer_output_loss: 3.9473 - loss: 11.0828 - ner_output_accuracy: 0.0800 - ner_output_loss: 2.0766 - question_output_accuracy: 0.0400 - question_output_loss: 3.9452 - question_type_output_accuracy: 0.0000e+00 - question_type_output_loss: 1.1138 - val_answer_output_accuracy: 0.3200 - val_answer_output_loss: 3.9260 - val_loss: 11.0343 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 2.0489 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9441 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.1153\n", + "Epoch 2/10\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 300ms/step - answer_output_accuracy: 0.6800 - answer_output_loss: 3.8844 - loss: 10.8637 - ner_output_accuracy: 0.7800 - ner_output_loss: 2.0194 - question_output_accuracy: 0.0800 - question_output_loss: 3.9047 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 1.0550 - val_answer_output_accuracy: 0.5800 - val_answer_output_loss: 3.8962 - val_loss: 10.9915 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 2.0227 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9453 - val_question_type_output_accuracy: 
0.0000e+00 - val_question_type_output_loss: 1.1273\n", + "Epoch 3/10\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 300ms/step - answer_output_accuracy: 0.9000 - answer_output_loss: 3.8076 - loss: 10.6189 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.9522 - question_output_accuracy: 0.0800 - question_output_loss: 3.8585 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 1.0005 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 3.8543 - val_loss: 10.9334 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.9867 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9469 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.1455\n", + "Epoch 4/10\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 340ms/step - answer_output_accuracy: 0.9400 - answer_output_loss: 3.6877 - loss: 10.2657 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.8489 - question_output_accuracy: 0.0600 - question_output_loss: 3.8010 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.9281 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 3.7881 - val_loss: 10.8457 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.9324 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9492 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.1760\n", + "Epoch 5/10\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 330ms/step - answer_output_accuracy: 0.9400 - answer_output_loss: 3.4683 - loss: 9.6920 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.6792 - question_output_accuracy: 0.0600 - question_output_loss: 3.7188 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.8258 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 
3.6742 - val_loss: 10.7083 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.8475 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9535 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.2331\n", + "Epoch 6/10\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 344ms/step - answer_output_accuracy: 0.9400 - answer_output_loss: 2.9986 - loss: 8.6406 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.3997 - question_output_accuracy: 0.0400 - question_output_loss: 3.5829 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.6593 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 3.4580 - val_loss: 10.4731 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.7102 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9628 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.3420\n", + "Epoch 7/10\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 319ms/step - answer_output_accuracy: 0.9400 - answer_output_loss: 1.9364 - loss: 6.7078 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.0844 - question_output_accuracy: 0.0400 - question_output_loss: 3.3048 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.3822 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 2.9410 - val_loss: 10.0188 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.5038 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9871 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.5870\n", + "Epoch 8/10\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 318ms/step - answer_output_accuracy: 0.9600 - answer_output_loss: 0.9184 - loss: 4.9883 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.3771 - 
question_output_accuracy: 0.0400 - question_output_loss: 2.6239 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.0690 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 1.7714 - val_loss: 9.6522 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.4667 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 4.0805 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 2.3336\n", + "Epoch 9/10\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 286ms/step - answer_output_accuracy: 0.9600 - answer_output_loss: 0.4511 - loss: 3.5983 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.3641 - question_output_accuracy: 0.0400 - question_output_loss: 1.7815 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.0015 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 0.8089 - val_loss: 11.4588 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.5131 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 4.6062 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 4.5306\n", + "Epoch 10/10\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 304ms/step - answer_output_accuracy: 0.9400 - answer_output_loss: 0.3244 - loss: 2.9690 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.1538 - question_output_accuracy: 0.0600 - question_output_loss: 1.4906 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 1.7498e-04 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 0.3998 - val_loss: 16.2049 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.5880 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 6.0197 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 8.1974\n" + ] + } + ], + "source": [ + "# ----------------------------\n", + 
"# Parameter Model\n", + "# ----------------------------\n", + "VOCAB_SIZE = len(tokenizer.word_index) + 1\n", + "EMBEDDING_DIM = 300\n", + "LSTM_UNITS = 256\n", + "BATCH_SIZE = 16\n", + "EPOCHS = 10\n", + "NUM_NER_TAGS = len(ner2idx)\n", + "\n", + "# ----------------------------\n", + "# Arsitektur Model Multi-Output\n", + "# ----------------------------\n", + "\n", + "# Encoder: Input context\n", + "context_input = Input(shape=(MAX_LENGTH,), name=\"context_input\")\n", + "context_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True, name=\"context_embedding\")(context_input)\n", + "encoder_lstm = LSTM(LSTM_UNITS, return_state=True, name=\"encoder_lstm\")\n", + "encoder_output, state_h, state_c = encoder_lstm(context_embedding)\n", + "\n", + "# Branch untuk pembuatan soal (Question Decoder)\n", + "question_decoder_input = Input(shape=(MAX_LENGTH,), name=\"question_decoder_input\")\n", + "question_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True, name=\"question_embedding\")(question_decoder_input)\n", + "question_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True, name=\"question_lstm\")\n", + "question_output, _, _ = question_lstm(question_embedding, initial_state=[state_h, state_c])\n", + "question_dense = Dense(VOCAB_SIZE, activation=\"softmax\", name=\"question_output\")(question_output)\n", + "\n", + "# Branch untuk pembuatan jawaban (Answer Decoder)\n", + "answer_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True, name=\"answer_lstm\")\n", + "answer_output, _, _ = answer_lstm(context_embedding, initial_state=[state_h, state_c])\n", + "answer_dense = Dense(VOCAB_SIZE, activation=\"softmax\", name=\"answer_output\")(answer_output)\n", + "\n", + "# Branch untuk klasifikasi tipe soal\n", + "type_dense = Dense(128, activation=\"relu\")(encoder_output)\n", + "question_type_output = Dense(3, activation=\"softmax\", name=\"question_type_output\")(type_dense)\n", + "\n", + 
"# Branch untuk NER: Menggunakan context_embedding untuk melakukan sequence tagging\n", + "ner_lstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, recurrent_dropout=0.1), name=\"ner_lstm\")(context_embedding)\n", + "ner_output = TimeDistributed(Dense(NUM_NER_TAGS, activation=\"softmax\"), name=\"ner_output\")(ner_lstm)\n", + "\n", + "# Gabungkan semua branch dalam satu model multi-output\n", + "model = Model(\n", + " inputs=[context_input, question_decoder_input],\n", + " outputs=[question_dense, answer_dense, question_type_output, ner_output]\n", + ")\n", + "\n", + "model.compile(\n", + " optimizer=\"adam\",\n", + " loss={\n", + " \"question_output\": \"sparse_categorical_crossentropy\",\n", + " \"answer_output\": \"sparse_categorical_crossentropy\",\n", + " \"question_type_output\": \"sparse_categorical_crossentropy\",\n", + " \"ner_output\": \"sparse_categorical_crossentropy\"\n", + " },\n", + " metrics={\n", + " \"question_output\": [\"accuracy\"],\n", + " \"answer_output\": [\"accuracy\"],\n", + " \"question_type_output\": [\"accuracy\"],\n", + " \"ner_output\": [\"accuracy\"]\n", + " }\n", + ")\n", + "\n", + "model.summary()\n", + "\n", + "# ----------------------------\n", + "# Training Model\n", + "# ----------------------------\n", + "model.fit(\n", + " [context_train, question_train],\n", + " {\n", + " \"question_output\": question_train,\n", + " \"answer_output\": answer_train,\n", + " \"question_type_output\": qtype_train,\n", + " \"ner_output\": ner_train\n", + " },\n", + " batch_size=BATCH_SIZE,\n", + " epochs=EPOCHS,\n", + " validation_data=(\n", + " [context_val, question_val],\n", + " {\n", + " \"question_output\": question_val,\n", + " \"answer_output\": answer_val,\n", + " \"question_type_output\": qtype_val,\n", + " \"ner_output\": ner_val\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Simpan model dan tokenizer bila diperlukan\n", + "model.save(\"lstm_multi_output_ner_model.keras\")\n", + "with open(\"tokenizer.pkl\", \"wb\") as 
handle:\n", + " pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "myenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tokenizer.pkl b/tokenizer.pkl index 8e5c4a3..a0e3be5 100644 Binary files a/tokenizer.pkl and b/tokenizer.pkl differ diff --git a/training_model.ipynb b/training_model.ipynb index 23827d8..f01174a 100644 --- a/training_model.ipynb +++ b/training_model.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 112, + "execution_count": 162, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 163, "metadata": {}, "outputs": [ { @@ -53,7 +53,7 @@ "True" ] }, - "execution_count": 113, + "execution_count": 163, "metadata": {}, "output_type": "execute_result" } @@ -68,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 164, "metadata": {}, "outputs": [ { @@ -111,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -187,7 +187,6 @@ "tokenizer = Tokenizer(oov_token=\"\")\n", "tokenizer.fit_on_texts(contexts + questions + correct_answers)\n", "\n", - "\n", "context_sequences = tokenizer.texts_to_sequences(contexts)\n", "question_sequences = tokenizer.texts_to_sequences(questions)\n", "answer_sequences = tokenizer.texts_to_sequences(correct_answers)\n", @@ -206,7 +205,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 166, 
"metadata": {}, "outputs": [ { @@ -237,33 +236,53 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 167, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 1s/step - answer_output_accuracy: 0.0344 - answer_output_loss: 6.2090 - loss: 13.5239 - question_output_accuracy: 0.0000e+00 - question_output_loss: 6.2154 - question_type_output_accuracy: 0.3004 - question_type_output_loss: 1.0991 - val_answer_output_accuracy: 0.2287 - val_answer_output_loss: 6.1669 - val_loss: 13.4815 - val_question_output_accuracy: 0.0050 - val_question_output_loss: 6.2101 - val_question_type_output_accuracy: 0.3125 - val_question_type_output_loss: 1.1046\n", - "Epoch 2/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 526ms/step - answer_output_accuracy: 0.2277 - answer_output_loss: 6.1421 - loss: 13.4196 - question_output_accuracy: 0.0113 - question_output_loss: 6.1984 - question_type_output_accuracy: 0.6445 - question_type_output_loss: 1.0780 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 6.0462 - val_loss: 13.3570 - val_question_output_accuracy: 0.0081 - val_question_output_loss: 6.2031 - val_question_type_output_accuracy: 0.3125 - val_question_type_output_loss: 1.1076\n", - "Epoch 3/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 528ms/step - answer_output_accuracy: 0.9837 - answer_output_loss: 5.9539 - loss: 13.1879 - question_output_accuracy: 0.0171 - question_output_loss: 6.1802 - question_type_output_accuracy: 0.5799 - question_type_output_loss: 1.0503 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 5.5439 - val_loss: 12.8565 - val_question_output_accuracy: 0.0087 - val_question_output_loss: 6.1941 - val_question_type_output_accuracy: 0.5000 - 
val_question_type_output_loss: 1.1185\n", - "Epoch 4/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 533ms/step - answer_output_accuracy: 0.9839 - answer_output_loss: 5.1228 - loss: 12.2985 - question_output_accuracy: 0.0137 - question_output_loss: 6.1532 - question_type_output_accuracy: 0.5164 - question_type_output_loss: 1.0060 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 3.2875 - val_loss: 10.6708 - val_question_output_accuracy: 0.0050 - val_question_output_loss: 6.1772 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.2060\n", - "Epoch 5/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 520ms/step - answer_output_accuracy: 0.9835 - answer_output_loss: 2.7939 - loss: 9.9397 - question_output_accuracy: 0.0056 - question_output_loss: 6.0862 - question_type_output_accuracy: 0.5263 - question_type_output_loss: 1.0473 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 1.1028 - val_loss: 9.0601 - val_question_output_accuracy: 0.0012 - val_question_output_loss: 6.1277 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.8296\n", - "Epoch 6/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 541ms/step - answer_output_accuracy: 0.9828 - answer_output_loss: 1.2315 - loss: 8.3718 - question_output_accuracy: 0.0016 - question_output_loss: 5.8773 - question_type_output_accuracy: 0.5055 - question_type_output_loss: 1.2478 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.6227 - val_loss: 8.6339 - val_question_output_accuracy: 0.0012 - val_question_output_loss: 6.0831 - val_question_type_output_accuracy: 0.1250 - val_question_type_output_loss: 1.9281\n", - "Epoch 7/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 492ms/step - 
answer_output_accuracy: 0.9842 - answer_output_loss: 0.7375 - loss: 7.4714 - question_output_accuracy: 9.6824e-04 - question_output_loss: 5.5770 - question_type_output_accuracy: 0.4612 - question_type_output_loss: 1.1578 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5788 - val_loss: 7.9850 - val_question_output_accuracy: 0.0012 - val_question_output_loss: 6.1148 - val_question_type_output_accuracy: 0.1250 - val_question_type_output_loss: 1.2913\n", - "Epoch 8/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 455ms/step - answer_output_accuracy: 0.9847 - answer_output_loss: 0.6731 - loss: 6.9870 - question_output_accuracy: 0.0011 - question_output_loss: 5.3263 - question_type_output_accuracy: 0.5596 - question_type_output_loss: 0.9895 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.6030 - val_loss: 7.8753 - val_question_output_accuracy: 0.0012 - val_question_output_loss: 6.2693 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.0031\n", - "Epoch 9/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 458ms/step - answer_output_accuracy: 0.9836 - answer_output_loss: 0.7391 - loss: 6.9393 - question_output_accuracy: 0.0017 - question_output_loss: 5.0887 - question_type_output_accuracy: 0.4841 - question_type_output_loss: 1.1123 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.6056 - val_loss: 8.1353 - val_question_output_accuracy: 0.0019 - val_question_output_loss: 6.4616 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.0682\n", - "Epoch 10/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 454ms/step - answer_output_accuracy: 0.9847 - answer_output_loss: 0.6727 - loss: 6.6312 - question_output_accuracy: 0.0018 - question_output_loss: 4.9620 - question_type_output_accuracy: 0.5258 - 
question_type_output_loss: 1.0078 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5869 - val_loss: 8.5074 - val_question_output_accuracy: 0.0037 - val_question_output_loss: 6.6207 - val_question_type_output_accuracy: 0.3750 - val_question_type_output_loss: 1.2998\n" + "Epoch 1/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 752ms/step - answer_output_accuracy: 0.0239 - answer_output_loss: 6.2109 - loss: 13.5254 - question_output_accuracy: 4.3716e-04 - question_output_loss: 6.2153 - question_type_output_accuracy: 0.3332 - question_type_output_loss: 1.0988 - val_answer_output_accuracy: 0.1931 - val_answer_output_loss: 6.1791 - val_loss: 13.4936 - val_question_output_accuracy: 0.0056 - val_question_output_loss: 6.2104 - val_question_type_output_accuracy: 0.3125 - val_question_type_output_loss: 1.1042\n", + "Epoch 2/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 329ms/step - answer_output_accuracy: 0.1947 - answer_output_loss: 6.1534 - loss: 13.4319 - question_output_accuracy: 0.0122 - question_output_loss: 6.1985 - question_type_output_accuracy: 0.6445 - question_type_output_loss: 1.0791 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 6.0885 - val_loss: 13.4025 - val_question_output_accuracy: 0.0112 - val_question_output_loss: 6.2026 - val_question_type_output_accuracy: 0.3125 - val_question_type_output_loss: 1.1115\n", + "Epoch 3/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 316ms/step - answer_output_accuracy: 0.9831 - answer_output_loss: 6.0118 - loss: 13.2454 - question_output_accuracy: 0.0183 - question_output_loss: 6.1792 - question_type_output_accuracy: 0.6341 - question_type_output_loss: 1.0521 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 5.7557 - val_loss: 13.0783 - val_question_output_accuracy: 0.0106 - 
val_question_output_loss: 6.1923 - val_question_type_output_accuracy: 0.3125 - val_question_type_output_loss: 1.1303\n", + "Epoch 4/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 318ms/step - answer_output_accuracy: 0.9831 - answer_output_loss: 5.4126 - loss: 12.5932 - question_output_accuracy: 0.0159 - question_output_loss: 6.1526 - question_type_output_accuracy: 0.6132 - question_type_output_loss: 1.0133 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 4.0785 - val_loss: 11.4385 - val_question_output_accuracy: 0.0087 - val_question_output_loss: 6.1729 - val_question_type_output_accuracy: 0.4375 - val_question_type_output_loss: 1.1871\n", + "Epoch 5/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 321ms/step - answer_output_accuracy: 0.9833 - answer_output_loss: 3.5350 - loss: 10.6302 - question_output_accuracy: 0.0109 - question_output_loss: 6.0941 - question_type_output_accuracy: 0.5482 - question_type_output_loss: 0.9777 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 1.4486 - val_loss: 9.1339 - val_question_output_accuracy: 0.0069 - val_question_output_loss: 6.1108 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.5745\n", + "Epoch 6/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 324ms/step - answer_output_accuracy: 0.9830 - answer_output_loss: 1.3763 - loss: 8.3790 - question_output_accuracy: 0.0050 - question_output_loss: 5.8928 - question_type_output_accuracy: 0.5596 - question_type_output_loss: 1.0961 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.6301 - val_loss: 8.8752 - val_question_output_accuracy: 0.0050 - val_question_output_loss: 6.0297 - val_question_type_output_accuracy: 0.1250 - val_question_type_output_loss: 2.2154\n", + "Epoch 7/20\n", + "\u001b[1m2/2\u001b[0m 
\u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 320ms/step - answer_output_accuracy: 0.9827 - answer_output_loss: 0.8154 - loss: 7.6408 - question_output_accuracy: 0.0030 - question_output_loss: 5.5596 - question_type_output_accuracy: 0.5596 - question_type_output_loss: 1.2471 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5587 - val_loss: 7.8821 - val_question_output_accuracy: 0.0044 - val_question_output_loss: 6.0440 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.2793\n", + "Epoch 8/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 315ms/step - answer_output_accuracy: 0.9845 - answer_output_loss: 0.6699 - loss: 7.0558 - question_output_accuracy: 0.0025 - question_output_loss: 5.2922 - question_type_output_accuracy: 0.5159 - question_type_output_loss: 1.0964 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5644 - val_loss: 7.7566 - val_question_output_accuracy: 0.0044 - val_question_output_loss: 6.1598 - val_question_type_output_accuracy: 0.4375 - val_question_type_output_loss: 1.0324\n", + "Epoch 9/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 319ms/step - answer_output_accuracy: 0.9837 - answer_output_loss: 0.7007 - loss: 6.7585 - question_output_accuracy: 0.0021 - question_output_loss: 5.0688 - question_type_output_accuracy: 0.5804 - question_type_output_loss: 0.9895 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5536 - val_loss: 8.3754 - val_question_output_accuracy: 0.0044 - val_question_output_loss: 6.3426 - val_question_type_output_accuracy: 0.1250 - val_question_type_output_loss: 1.4793\n", + "Epoch 10/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 316ms/step - answer_output_accuracy: 0.9841 - answer_output_loss: 0.6571 - loss: 6.6996 - 
question_output_accuracy: 0.0020 - question_output_loss: 4.9654 - question_type_output_accuracy: 0.3769 - question_type_output_loss: 1.0726 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5333 - val_loss: 8.2096 - val_question_output_accuracy: 0.0044 - val_question_output_loss: 6.5258 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.1504\n", + "Epoch 11/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 313ms/step - answer_output_accuracy: 0.9846 - answer_output_loss: 0.5896 - loss: 6.3947 - question_output_accuracy: 0.0029 - question_output_loss: 4.8274 - question_type_output_accuracy: 0.5367 - question_type_output_loss: 0.9851 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5115 - val_loss: 8.4411 - val_question_output_accuracy: 0.0050 - val_question_output_loss: 6.6733 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.2564\n", + "Epoch 12/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 287ms/step - answer_output_accuracy: 0.9832 - answer_output_loss: 0.6274 - loss: 6.3656 - question_output_accuracy: 0.0030 - question_output_loss: 4.7141 - question_type_output_accuracy: 0.4950 - question_type_output_loss: 1.0145 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5007 - val_loss: 8.7380 - val_question_output_accuracy: 0.0037 - val_question_output_loss: 6.7743 - val_question_type_output_accuracy: 0.1875 - val_question_type_output_loss: 1.4630\n", + "Epoch 13/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 294ms/step - answer_output_accuracy: 0.9841 - answer_output_loss: 0.5365 - loss: 6.0931 - question_output_accuracy: 0.0028 - question_output_loss: 4.6340 - question_type_output_accuracy: 0.6330 - question_type_output_loss: 0.9268 - val_answer_output_accuracy: 0.9856 - 
val_answer_output_loss: 0.5095 - val_loss: 8.8004 - val_question_output_accuracy: 0.0050 - val_question_output_loss: 6.8402 - val_question_type_output_accuracy: 0.1875 - val_question_type_output_loss: 1.4508\n", + "Epoch 14/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 293ms/step - answer_output_accuracy: 0.9839 - answer_output_loss: 0.5214 - loss: 5.9535 - question_output_accuracy: 0.0038 - question_output_loss: 4.5068 - question_type_output_accuracy: 0.6023 - question_type_output_loss: 0.9284 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5292 - val_loss: 8.5136 - val_question_output_accuracy: 0.0050 - val_question_output_loss: 6.8903 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.0940\n", + "Epoch 15/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 294ms/step - answer_output_accuracy: 0.9838 - answer_output_loss: 0.5345 - loss: 5.8897 - question_output_accuracy: 0.0041 - question_output_loss: 4.4544 - question_type_output_accuracy: 0.5596 - question_type_output_loss: 0.9003 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5447 - val_loss: 8.5770 - val_question_output_accuracy: 0.0056 - val_question_output_loss: 6.9331 - val_question_type_output_accuracy: 0.3125 - val_question_type_output_loss: 1.0993\n", + "Epoch 16/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 292ms/step - answer_output_accuracy: 0.9832 - answer_output_loss: 0.5705 - loss: 5.8373 - question_output_accuracy: 0.0048 - question_output_loss: 4.3814 - question_type_output_accuracy: 0.6351 - question_type_output_loss: 0.8800 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5496 - val_loss: 8.8434 - val_question_output_accuracy: 0.0062 - val_question_output_loss: 6.9745 - val_question_type_output_accuracy: 0.2500 - 
val_question_type_output_loss: 1.3193\n", + "Epoch 17/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 288ms/step - answer_output_accuracy: 0.9832 - answer_output_loss: 0.5433 - loss: 5.6367 - question_output_accuracy: 0.0053 - question_output_loss: 4.2834 - question_type_output_accuracy: 0.6773 - question_type_output_loss: 0.8080 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5488 - val_loss: 8.9683 - val_question_output_accuracy: 0.0062 - val_question_output_loss: 7.0182 - val_question_type_output_accuracy: 0.1875 - val_question_type_output_loss: 1.4014\n", + "Epoch 18/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 292ms/step - answer_output_accuracy: 0.9843 - answer_output_loss: 0.4771 - loss: 5.4290 - question_output_accuracy: 0.0060 - question_output_loss: 4.1923 - question_type_output_accuracy: 0.6877 - question_type_output_loss: 0.7646 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5510 - val_loss: 9.0373 - val_question_output_accuracy: 0.0062 - val_question_output_loss: 7.0739 - val_question_type_output_accuracy: 0.2500 - val_question_type_output_loss: 1.4124\n", + "Epoch 19/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 305ms/step - answer_output_accuracy: 0.9846 - answer_output_loss: 0.4586 - loss: 5.3489 - question_output_accuracy: 0.0053 - question_output_loss: 4.1443 - question_type_output_accuracy: 0.6668 - question_type_output_loss: 0.7466 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5583 - val_loss: 9.1426 - val_question_output_accuracy: 0.0062 - val_question_output_loss: 7.1137 - val_question_type_output_accuracy: 0.1875 - val_question_type_output_loss: 1.4707\n", + "Epoch 20/20\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 305ms/step - 
answer_output_accuracy: 0.9830 - answer_output_loss: 0.5251 - loss: 5.2352 - question_output_accuracy: 0.0066 - question_output_loss: 4.0488 - question_type_output_accuracy: 0.7298 - question_type_output_loss: 0.6596 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5674 - val_loss: 9.4190 - val_question_output_accuracy: 0.0062 - val_question_output_loss: 7.1243 - val_question_type_output_accuracy: 0.1250 - val_question_type_output_loss: 1.7272\n" ] } ], @@ -273,7 +292,7 @@ "EMBEDDING_DIM = 300\n", "LSTM_UNITS = 256\n", "BATCH_SIZE = 32\n", - "EPOCHS = 10\n", + "EPOCHS = 20\n", "\n", "context_input = Input(shape=(MAX_LENGTH,), name=\"context_input\")\n", "context_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True, name=\"context_embedding\")(context_input)\n", @@ -325,12 +344,12 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 168, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -375,7 +394,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -383,65 +402,7 @@ "output_type": "stream", "text": [ "\n", - "=== Evaluation on Test Data ===\n", - "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 389ms/step\n", - "Classification Report for Question Type (Test Set):\n", - " precision recall f1-score support\n", - "\n", - " 0 0.00 0.00 0.00 4\n", - " 1 0.40 0.67 0.50 3\n", - " 2 0.20 0.33 0.25 3\n", - "\n", - " accuracy 0.30 10\n", - " macro avg 0.20 0.33 0.25 10\n", - "weighted avg 0.18 0.30 0.23 10\n", - "\n", - "Test Accuracy: 0.3\n", - "Test Precision: 0.18000000000000002\n", - "Test Recall: 0.3\n", - "BLEU Score for first test sample (question generation): 0.02664466031983166\n", - "BLEU Score for first test sample (answer generation): 0\n", - "\n", - "=== Evaluation on Validation Data ===\n", - "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 92ms/step\n", - "Classification Report for Question Type (Validation Set):\n", - " precision recall f1-score support\n", - "\n", - " 0 0.00 0.00 0.00 4\n", - " 1 0.50 1.00 0.67 3\n", - " 2 0.25 0.33 0.29 3\n", - "\n", - " accuracy 0.40 10\n", - " macro avg 0.25 0.44 0.32 10\n", - "weighted avg 0.23 0.40 0.29 10\n", - "\n", - "Validation Accuracy: 0.4\n", - "Validation Precision: 0.225\n", - "Validation Recall: 0.4\n", - "BLEU Score for first validation sample (question generation): 0.008991061769415444\n", - "BLEU Score for first validation sample (answer generation): 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n" + "=== Evaluation on Test Data ===\n" ] } ], diff --git a/uji.py b/uji.py index aabe8c7..e26256a 100644 --- a/uji.py +++ b/uji.py @@ -1,163 +1,163 @@ -import numpy as np -import pickle -import tensorflow as tf -from tensorflow.keras.preprocessing.sequence import pad_sequences -import nltk -import random -import string -import re -from nltk.tokenize import word_tokenize -from nltk.corpus import stopwords +# import numpy as np +# import pickle +# import tensorflow as tf +# from tensorflow.keras.preprocessing.sequence import pad_sequences +# import nltk +# import random +# import string +# import re +# from nltk.tokenize import word_tokenize +# from nltk.corpus import stopwords -# Ensure NLTK resources are available -nltk.download("punkt") -nltk.download("stopwords") +# # Ensure NLTK resources are available +# nltk.download("punkt") +# nltk.download("stopwords") -class QuestionGenerator: - def __init__( - self, model_path="lstm_multi_output_model.keras", tokenizer_path="tokenizer.pkl" - ): - """ - Initializes the QuestionGenerator by loading the trained model and tokenizer. 
- """ - # Load trained model - self.model = tf.keras.models.load_model(model_path) +# class QuestionGenerator: +# def __init__( +# self, model_path="lstm_multi_output_model.keras", tokenizer_path="tokenizer.pkl" +# ): +# """ +# Initializes the QuestionGenerator by loading the trained model and tokenizer. +# """ +# # Load trained model +# self.model = tf.keras.models.load_model(model_path) - # Load tokenizer - with open(tokenizer_path, "rb") as handle: - self.tokenizer = pickle.load(handle) +# # Load tokenizer +# with open(tokenizer_path, "rb") as handle: +# self.tokenizer = pickle.load(handle) - # Define question type mapping - self.question_type_dict = { - 0: "fill_in_the_blank", - 1: "true_false", - 2: "multiple_choice", - } +# # Define question type mapping +# self.question_type_dict = { +# 0: "fill_in_the_blank", +# 1: "true_false", +# 2: "multiple_choice", +# } - # Load Indonesian stopwords - self.stop_words = set(stopwords.words("indonesian")) +# # Load Indonesian stopwords +# self.stop_words = set(stopwords.words("indonesian")) - # Custom word normalization dictionary - self.normalization_dict = { - "yg": "yang", - "gokil": "kocak", - "kalo": "kalau", - "gue": "saya", - "elo": "kamu", - "nih": "ini", - "trs": "terus", - "tdk": "tidak", - "gmna": "bagaimana", - "tp": "tapi", - "jd": "jadi", - "aja": "saja", - "krn": "karena", - "blm": "belum", - "dgn": "dengan", - "skrg": "sekarang", - "msh": "masih", - "lg": "lagi", - "sy": "saya", - "sm": "sama", - "bgt": "banget", - "dr": "dari", - "kpn": "kapan", - "hrs": "harus", - "cm": "cuma", - "sbnrnya": "sebenarnya", - } +# # Custom word normalization dictionary +# self.normalization_dict = { +# "yg": "yang", +# "gokil": "kocak", +# "kalo": "kalau", +# "gue": "saya", +# "elo": "kamu", +# "nih": "ini", +# "trs": "terus", +# "tdk": "tidak", +# "gmna": "bagaimana", +# "tp": "tapi", +# "jd": "jadi", +# "aja": "saja", +# "krn": "karena", +# "blm": "belum", +# "dgn": "dengan", +# "skrg": "sekarang", +# "msh": "masih", +# 
"lg": "lagi", +# "sy": "saya", +# "sm": "sama", +# "bgt": "banget", +# "dr": "dari", +# "kpn": "kapan", +# "hrs": "harus", +# "cm": "cuma", +# "sbnrnya": "sebenarnya", +# } - def preprocess_text(self, text): - """ - Preprocesses the input text by: - - Converting to lowercase - - Removing punctuation - - Tokenizing - - Normalizing words - - Removing stopwords - """ - text = text.lower() - text = text.translate( - str.maketrans("", "", string.punctuation) - ) # Remove punctuation - text = re.sub(r"\s+", " ", text).strip() # Remove extra spaces - tokens = word_tokenize(text) # Tokenization - tokens = [ - self.normalization_dict.get(word, word) for word in tokens - ] # Normalize words - tokens = [ - word for word in tokens if word not in self.stop_words - ] # Remove stopwords - return " ".join(tokens) +# def preprocess_text(self, text): +# """ +# Preprocesses the input text by: +# - Converting to lowercase +# - Removing punctuation +# - Tokenizing +# - Normalizing words +# - Removing stopwords +# """ +# text = text.lower() +# text = text.translate( +# str.maketrans("", "", string.punctuation) +# ) # Remove punctuation +# text = re.sub(r"\s+", " ", text).strip() # Remove extra spaces +# tokens = word_tokenize(text) # Tokenization +# tokens = [ +# self.normalization_dict.get(word, word) for word in tokens +# ] # Normalize words +# tokens = [ +# word for word in tokens if word not in self.stop_words +# ] # Remove stopwords +# return " ".join(tokens) - def sequence_to_text(self, sequence): - """ - Converts a tokenized sequence back into readable text. - """ - return " ".join( - [ - self.tokenizer.index_word.get(idx, "") - for idx in sequence - if idx != 0 - ] - ) +# def sequence_to_text(self, sequence): +# """ +# Converts a tokenized sequence back into readable text. 
+# """ +# return " ".join( +# [ +# self.tokenizer.index_word.get(idx, "") +# for idx in sequence +# if idx != 0 +# ] +# ) - def generate_qa_from_paragraph(self, paragraph): - """ - Generates a question, answer, and question type from the given paragraph. - If it's a multiple-choice question, it also returns answer options. - """ - # Preprocess the input paragraph - processed_paragraph = self.preprocess_text(paragraph) +# def generate_qa_from_paragraph(self, paragraph): +# """ +# Generates a question, answer, and question type from the given paragraph. +# If it's a multiple-choice question, it also returns answer options. +# """ +# # Preprocess the input paragraph +# processed_paragraph = self.preprocess_text(paragraph) - # Convert text to sequence - input_seq = self.tokenizer.texts_to_sequences([processed_paragraph]) - input_seq = pad_sequences(input_seq, maxlen=100, padding="post") +# # Convert text to sequence +# input_seq = self.tokenizer.texts_to_sequences([processed_paragraph]) +# input_seq = pad_sequences(input_seq, maxlen=100, padding="post") - # Predict question, answer, and type - pred_question, pred_answer, pred_qtype = self.model.predict( - [input_seq, input_seq] - ) +# # Predict question, answer, and type +# pred_question, pred_answer, pred_qtype = self.model.predict( +# [input_seq, input_seq] +# ) - # Decode predictions - generated_question = self.sequence_to_text(np.argmax(pred_question[0], axis=-1)) - generated_answer = self.sequence_to_text(np.argmax(pred_answer[0], axis=-1)) - question_type_index = np.argmax(pred_qtype[0]) - generated_qtype = self.question_type_dict[question_type_index] +# # Decode predictions +# generated_question = self.sequence_to_text(np.argmax(pred_question[0], axis=-1)) +# generated_answer = self.sequence_to_text(np.argmax(pred_answer[0], axis=-1)) +# question_type_index = np.argmax(pred_qtype[0]) +# generated_qtype = self.question_type_dict[question_type_index] - # Handle multiple-choice options - options = None - if 
generated_qtype == "multiple_choice": - words = processed_paragraph.split() - random.shuffle(words) - distractors = [ - word for word in words if word.lower() != generated_answer.lower() - ] - options = [generated_answer] + distractors[:3] - random.shuffle(options) # Shuffle options +# # Handle multiple-choice options +# options = None +# if generated_qtype == "multiple_choice": +# words = processed_paragraph.split() +# random.shuffle(words) +# distractors = [ +# word for word in words if word.lower() != generated_answer.lower() +# ] +# options = [generated_answer] + distractors[:3] +# random.shuffle(options) # Shuffle options - # Return the generated data - return { - "generated_question": generated_question, - "generated_answer": generated_answer, - "question_type": generated_qtype, - "options": options if generated_qtype == "multiple_choice" else None, - } +# # Return the generated data +# return { +# "generated_question": generated_question, +# "generated_answer": generated_answer, +# "question_type": generated_qtype, +# "options": options if generated_qtype == "multiple_choice" else None, +# } -# Initialize the question generator -qg = QuestionGenerator() +# # Initialize the question generator +# qg = QuestionGenerator() -# Example input paragraph -sample_paragraph = "Samudra Pasifik adalah yang terbesar dan terdalam di antara divisi samudra di Bumi. Samudra ini membentang dari Samudra Arktik di utara hingga Samudra Selatan di selatan dan berbatasan dengan Asia dan Australia di barat serta Amerika di timur." +# # Example input paragraph +# sample_paragraph = "Samudra Pasifik adalah yang terbesar dan terdalam di antara divisi samudra di Bumi. Samudra ini membentang dari Samudra Arktik di utara hingga Samudra Selatan di selatan dan berbatasan dengan Asia dan Australia di barat serta Amerika di timur." 
-# Generate question, answer, and type -generated_result = qg.generate_qa_from_paragraph(sample_paragraph) +# # Generate question, answer, and type +# generated_result = qg.generate_qa_from_paragraph(sample_paragraph) -# Print output -print("Generated Question:", generated_result["generated_question"]) -print("Generated Answer:", generated_result["generated_answer"]) -print("Question Type:", generated_result["question_type"]) -if generated_result["options"]: - print("Options:", generated_result["options"]) +# # Print output +# print("Generated Question:", generated_result["generated_question"]) +# print("Generated Answer:", generated_result["generated_answer"]) +# print("Question Type:", generated_result["question_type"]) +# if generated_result["options"]: +# print("Options:", generated_result["options"])