# TIF_E41211115_lstm-quiz-gen.../old/QC/normalize.py
#
# 74 lines
# 2.3 KiB
# Python

import json
import re
def tokenize(text):
    """Split text into whitespace-delimited tokens, isolating punctuation.

    Each of the characters ``.``, ``,``, ``!`` and ``?`` becomes its own
    token, whether or not it is surrounded by spaces in the input.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty when ``text`` is empty or contains
        only whitespace.
    """
    # Pad punctuation on BOTH sides: padding only the left side leaves the
    # mark glued to the following word when no space follows it
    # ("a,b" would yield "a" and ",b").
    spaced = re.sub(r"([.,!?])", r" \1 ", text)
    # split() with no argument already discards leading/trailing whitespace
    # and collapses the extra spaces introduced by the padding.
    return spaced.split()
def normalize_entry(entry, idx):
    """Validate one dataset entry and return it in token-based form.

    Args:
        entry: Mapping that must provide the keys ``tokens``, ``ner``,
            ``srl``, ``question``, ``answer`` and ``type``. Any other keys
            are ignored.
        idx: Position of the entry in the dataset; used only to label
            error messages.

    Returns:
        A new dict holding exactly the six required keys. ``question`` and
        ``answer`` are passed through ``tokenize`` when they are strings and
        are returned as-is when they are already lists. The remaining values
        are the same objects found in ``entry`` (no copy is made).

    Raises:
        TypeError: If ``entry`` is not mapping-like, or a field has the
            wrong type.
        ValueError: If one or more required keys are missing.
    """
    # (key, accepted type(s), wording used in the error message).
    # The order here is the order in which fields are validated.
    field_specs = (
        ("tokens", list, "list"),
        ("ner", list, "list"),
        ("srl", list, "list"),
        ("question", (str, list), "string atau list"),
        ("answer", (str, list), "string atau list"),
        ("type", str, "string"),
    )
    required_keys = {key for key, _, _ in field_specs}

    # A non-mapping entry (list, str, None, ...) would otherwise surface as a
    # bare AttributeError without the index of the offending entry.
    try:
        present_keys = entry.keys()
    except AttributeError:
        raise TypeError(f"index {idx}: entri harus dict") from None

    missing_keys = required_keys - present_keys
    if missing_keys:
        # Sorted so the message is identical from run to run; set iteration
        # order for strings is not stable across interpreter runs.
        raise ValueError(f"index {idx}: missing keys {sorted(missing_keys)}")

    for key, accepted, label in field_specs:
        if not isinstance(entry[key], accepted):
            raise TypeError(f"index {idx}: '{key}' harus {label}")

    def as_tokens(value):
        # Strings are tokenized; lists are assumed to be tokenized already.
        return tokenize(value) if isinstance(value, str) else value

    return {
        "tokens": entry["tokens"],
        "ner": entry["ner"],
        "srl": entry["srl"],
        "question": as_tokens(entry["question"]),
        "answer": as_tokens(entry["answer"]),
        "type": entry["type"],
    }
# Read the source dataset from disk.
with open("QC/dataset_qc_tokenized.json", encoding="utf-8") as src:
    raw_data = json.load(src)

# Normalize every entry; an entry that fails is reported and left out of
# the output instead of aborting the whole run.
normalized_data = []
for idx, entry in enumerate(raw_data):
    try:
        normalized = normalize_entry(entry, idx)
    except Exception as err:
        print(f"❌ Error pada index {idx}: {err}")
    else:
        normalized_data.append(normalized)

# Write the successfully normalized entries to a new JSON file.
with open("QC/normalized_dataset.json", "w", encoding="utf-8") as dst:
    json.dump(normalized_data, dst, ensure_ascii=False, indent=2)

print(
    f"\n✅ Berhasil disimpan: {len(normalized_data)} entri ke 'QC/normalized_dataset.json'"
)