feat: add dataset

This commit is contained in:
akhdanre 2025-04-29 13:02:07 +07:00
parent ab0d260648
commit d1f94d5918
12 changed files with 87468 additions and 2944 deletions

File diff suppressed because it is too large

Binary file not shown.

BIN QC/new_model_lstm_qg.keras (Normal file)

Binary file not shown.

QC/normalize.py (Normal file, 73 lines added)

@@ -0,0 +1,73 @@
import json
import re


def tokenize(text):
    """Simple whitespace- and punctuation-based tokenization."""
    text = re.sub(r"([.,!?])", r" \1", text)  # pad punctuation with a leading space
    return text.strip().split()


def normalize_entry(entry, idx):
    """Normalize one entry to the token-based format, with error details if any."""
    required_keys = {"tokens", "ner", "srl", "question", "answer", "type"}
    missing_keys = required_keys - entry.keys()
    if missing_keys:
        raise ValueError(f"index {idx}: missing keys {missing_keys}")

    # Validate data types
    if not isinstance(entry["tokens"], list):
        raise TypeError(f"index {idx}: 'tokens' must be a list")
    if not isinstance(entry["ner"], list):
        raise TypeError(f"index {idx}: 'ner' must be a list")
    if not isinstance(entry["srl"], list):
        raise TypeError(f"index {idx}: 'srl' must be a list")
    if not isinstance(entry["question"], (str, list)):
        raise TypeError(f"index {idx}: 'question' must be a string or a list")
    if not isinstance(entry["answer"], (str, list)):
        raise TypeError(f"index {idx}: 'answer' must be a string or a list")
    if not isinstance(entry["type"], str):
        raise TypeError(f"index {idx}: 'type' must be a string")

    # Tokenize question/answer only when they are still raw strings
    question = (
        tokenize(entry["question"])
        if isinstance(entry["question"], str)
        else entry["question"]
    )
    answer = (
        tokenize(entry["answer"])
        if isinstance(entry["answer"], str)
        else entry["answer"]
    )
    return {
        "tokens": entry["tokens"],
        "ner": entry["ner"],
        "srl": entry["srl"],
        "question": question,
        "answer": answer,
        "type": entry["type"],
    }


# Load original data
with open("QC/dataset_qc_tokenized.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# Normalize every entry, reporting failures without aborting the run
normalized_data = []
for idx, entry in enumerate(raw_data):
    try:
        normalized = normalize_entry(entry, idx)
        normalized_data.append(normalized)
    except Exception as e:
        print(f"❌ Error at index {idx}: {e}")

# Save to a new JSON file
with open("QC/normalized_dataset.json", "w", encoding="utf-8") as f:
    json.dump(normalized_data, f, ensure_ascii=False, indent=2)

print(
    f"\n✅ Successfully saved {len(normalized_data)} entries to 'QC/normalized_dataset.json'"
)
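
As a quick illustration of what the normalizer does, the sketch below feeds one hypothetical entry through normalize_entry (only the field names and behavior come from the script above; the values are invented):

# Illustrative only: the entry values are made up; the schema matches normalize.py.
entry = {
    "tokens": ["Budi", "membaca", "buku", "."],
    "ner": ["B-PER", "O", "O", "O"],
    "srl": ["ARG0", "V", "ARG1", "O"],
    "question": "Siapa yang membaca buku?",  # raw string, will be tokenized
    "answer": ["Budi"],                      # already a list, passes through
    "type": "who",
}
normalized = normalize_entry(entry, 0)
# tokenize() puts a space before .,!? and splits on whitespace:
print(normalized["question"])  # ['Siapa', 'yang', 'membaca', 'buku', '?']
print(normalized["answer"])    # ['Budi']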

QC/normalized_dataset.json (Normal file, 46490 lines added)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long


@@ -9,7 +9,7 @@ def infer_from_input(input_data, maxlen=50):
     with open("QC/tokenizers.pkl", "rb") as f:
         tokenizers = pickle.load(f)
-    model = load_model("QC/lstm_qg.keras")
+    model = load_model("QC/new_model_lstm_qg.keras")
     tok_token = tokenizers["token"]
     tok_ner = tokenizers["ner"]
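
The only functional change in this hunk swaps the checkpoint path: inference now loads QC/new_model_lstm_qg.keras, the binary added in this commit, instead of the old QC/lstm_qg.keras. A minimal smoke test, assuming a TensorFlow/Keras environment and only the paths visible in the diff:

import pickle
from tensorflow.keras.models import load_model

# Paths taken from the diff; the checks below are a sanity sketch, not
# part of the repository's code.
with open("QC/tokenizers.pkl", "rb") as f:
    tokenizers = pickle.load(f)
assert {"token", "ner"} <= tokenizers.keys()  # keys used by infer_from_input

model = load_model("QC/new_model_lstm_qg.keras")
model.summary()  # confirm the renamed checkpoint deserializes cleanly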

Binary file not shown.