feat: add dataset
parent ab0d260648
commit d1f94d5918
File diff suppressed because it is too large
BIN QC/lstm_qg.keras
Binary file not shown.
@@ -0,0 +1,73 @@
import json
import re


def tokenize(text):
    """Simple whitespace- and punctuation-based tokenization."""
    text = re.sub(r"([.,!?])", r" \1", text)
    return text.strip().split()


def normalize_entry(entry, idx):
    """Normalize one entry to the token-based format, with error details if any."""
    required_keys = {"tokens", "ner", "srl", "question", "answer", "type"}
    missing_keys = required_keys - entry.keys()

    if missing_keys:
        raise ValueError(f"index {idx}: missing keys {missing_keys}")

    # Validate data types
    if not isinstance(entry["tokens"], list):
        raise TypeError(f"index {idx}: 'tokens' must be a list")
    if not isinstance(entry["ner"], list):
        raise TypeError(f"index {idx}: 'ner' must be a list")
    if not isinstance(entry["srl"], list):
        raise TypeError(f"index {idx}: 'srl' must be a list")
    if not isinstance(entry["question"], (str, list)):
        raise TypeError(f"index {idx}: 'question' must be a string or a list")
    if not isinstance(entry["answer"], (str, list)):
        raise TypeError(f"index {idx}: 'answer' must be a string or a list")
    if not isinstance(entry["type"], str):
        raise TypeError(f"index {idx}: 'type' must be a string")

    question = (
        tokenize(entry["question"])
        if isinstance(entry["question"], str)
        else entry["question"]
    )
    answer = (
        tokenize(entry["answer"])
        if isinstance(entry["answer"], str)
        else entry["answer"]
    )

    return {
        "tokens": entry["tokens"],
        "ner": entry["ner"],
        "srl": entry["srl"],
        "question": question,
        "answer": answer,
        "type": entry["type"],
    }


# Load the original data
with open("QC/dataset_qc_tokenized.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# Normalize all entries
normalized_data = []
for idx, entry in enumerate(raw_data):
    try:
        normalized = normalize_entry(entry, idx)
        normalized_data.append(normalized)
    except Exception as e:
        print(f"❌ Error at index {idx}: {e}")

# Save to a new JSON file
with open("QC/normalized_dataset.json", "w", encoding="utf-8") as f:
    json.dump(normalized_data, f, ensure_ascii=False, indent=2)

print(
    f"\n✅ Successfully saved: {len(normalized_data)} entries to 'QC/normalized_dataset.json'"
)
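For review, a quick sanity check of the new helpers using a hypothetical entry (the field values and tag labels below are invented for illustration, not taken from the dataset):

# Hypothetical entry, for illustration only; real entries follow the same schema.
sample = {
    "tokens": ["Budi", "membaca", "buku", "."],
    "ner": ["B-PER", "O", "O", "O"],
    "srl": ["ARG0", "V", "ARG1", "O"],
    "question": "Siapa yang membaca buku?",  # a string, so it gets tokenized
    "answer": ["Budi"],                      # already a list, passed through unchanged
    "type": "who",
}

print(tokenize("Siapa yang membaca buku?"))
# ['Siapa', 'yang', 'membaca', 'buku', '?']
print(normalize_entry(sample, 0)["question"])
# ['Siapa', 'yang', 'membaca', 'buku', '?']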
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@@ -9,7 +9,7 @@ def infer_from_input(input_data, maxlen=50):
     with open("QC/tokenizers.pkl", "rb") as f:
         tokenizers = pickle.load(f)
 
-    model = load_model("QC/lstm_qg.keras")
+    model = load_model("QC/new_model_lstm_qg.keras")
 
     tok_token = tokenizers["token"]
     tok_ner = tokenizers["ner"]
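The inference script now hard-codes the renamed checkpoint. A minimal guard like the sketch below (paths as in the diff; the existence check and the TensorFlow-style import are assumptions, not part of this commit) fails fast with a clear message if the new file has not been produced yet:

import os
from tensorflow.keras.models import load_model

MODEL_PATH = "QC/new_model_lstm_qg.keras"  # path introduced by this commit

# Fail fast if the renamed checkpoint is missing, instead of erroring deep inside Keras.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"{MODEL_PATH} not found; was the retrained model saved under the new name?")
model = load_model(MODEL_PATH)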
Binary file not shown.