feat: adding new dataset and doing some clean up
This commit is contained in:
parent
74b7dd177b
commit
668e659147
Binary file not shown.
Before Width: | Height: | Size: 62 KiB After Width: | Height: | Size: 62 KiB |
File diff suppressed because one or more lines are too long
|
@ -8,20 +8,97 @@ from pathlib import Path
|
|||
# Daftar label NER yang valid (bisa disesuaikan)
|
||||
VALID_NER_LABELS = {
|
||||
"O",
|
||||
"B-LOC", "I-LOC",
|
||||
"B-PER", "I-PER",
|
||||
"B-ORG", "I-ORG",
|
||||
"B-DATE", "I-DATE",
|
||||
"B-TIME", "I-TIME",
|
||||
"B-EVENT", "I-EVENT"
|
||||
"LOC",
|
||||
"LOC",
|
||||
"PER",
|
||||
"PER",
|
||||
"ORG",
|
||||
"ORG",
|
||||
"DATE",
|
||||
"DATE",
|
||||
"TIME",
|
||||
"TIME",
|
||||
"EVENT",
|
||||
"EVENT",
|
||||
"MISC",
|
||||
}
|
||||
|
||||
|
||||
# Daftar label NER yang valid (bisa disesuaikan)
|
||||
VALID_NER_LABELS = {"O", "LOC", "PER", "ORG", "DATE", "TIME", "EVENT", "MISC"}
|
||||
|
||||
# Daftar label SRL yang valid
|
||||
VALID_SRL_LABELS = {
|
||||
"ARG0",
|
||||
"ARG1",
|
||||
"ARG2",
|
||||
"ARG3",
|
||||
"ARGM-TMP",
|
||||
"ARGM-LOC",
|
||||
"ARGM-CAU",
|
||||
"ARGM-MNR",
|
||||
"ARGM-MOD",
|
||||
"ARGM-NEG",
|
||||
"V",
|
||||
"O",
|
||||
}
|
||||
|
||||
|
||||
# def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
|
||||
# with open(json_path, encoding="utf-8") as f:
|
||||
# records = json.load(f)
|
||||
|
||||
# seen_sentences: set[tuple[str, ...]] = set()
|
||||
|
||||
# with open(tsv_path, "w", encoding="utf-8", newline="") as f_out:
|
||||
# writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
|
||||
|
||||
# for idx, rec in enumerate(records):
|
||||
# contexxt = rec.get("context")
|
||||
# tokens = rec.get("tokens")
|
||||
# ner_tags = rec.get("ner")
|
||||
# srl_tags = rec.get("srl")
|
||||
|
||||
# if not (len(tokens) == len(ner_tags) == len(srl_tags)):
|
||||
# raise ValueError(
|
||||
# f"❌ Panjang tidak sama di record index {idx}:\n"
|
||||
# f" context ({len(contexxt)}): {contexxt}\n"
|
||||
# f" tokens ({len(tokens)}): {tokens}\n"
|
||||
# f" ner ({len(ner_tags)}): {ner_tags}\n"
|
||||
# f" srl ({len(srl_tags)}): {srl_tags}\n"
|
||||
# )
|
||||
|
||||
# # Validasi label NER
|
||||
# for i, ner_label in enumerate(ner_tags):
|
||||
# if ner_label not in VALID_NER_LABELS:
|
||||
# raise ValueError(
|
||||
# f"❌ Label NER tidak valid di record index {idx}, token ke-{i} ('{tokens[i]}'):\n"
|
||||
# f" ner_label: {ner_label}\n"
|
||||
# f" value: {tokens}"
|
||||
# )
|
||||
|
||||
# # Validasi label SRL
|
||||
# for i, srl_label in enumerate(srl_tags):
|
||||
# if srl_label not in VALID_SRL_LABELS:
|
||||
# raise ValueError(
|
||||
# f"❌ Label SRL tidak valid di record index {idx}, token ke-{i} ('{tokens[i]}'):\n"
|
||||
# f" srl_label: {srl_label}\n"
|
||||
# f" value: {tokens}"
|
||||
# )
|
||||
|
||||
# key = tuple(tokens)
|
||||
# if key in seen_sentences:
|
||||
# continue
|
||||
# seen_sentences.add(key)
|
||||
|
||||
# for tok, ner, srl in zip(tokens, ner_tags, srl_tags):
|
||||
# writer.writerow([tok, ner, srl])
|
||||
# writer.writerow([])
|
||||
|
||||
# print(f"✔️ TSV selesai, simpan di: {tsv_path}")
|
||||
|
||||
|
||||
def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
|
||||
"""
|
||||
Konversi data JSON (field: tokens, ner, srl, …) → TSV token\tNER\tSRL.
|
||||
Kalimat duplikat (urutan tokens persis sama) otomatis dilewati.
|
||||
Jika ada record yang tokens, ner, dan srl tidak sama panjang, atau ada label NER tidak valid, akan diberi info error lengkap.
|
||||
"""
|
||||
with open(json_path, encoding="utf-8") as f:
|
||||
records = json.load(f)
|
||||
|
||||
|
@ -31,26 +108,46 @@ def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
|
|||
writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
|
||||
|
||||
for idx, rec in enumerate(records):
|
||||
context = rec.get("context")
|
||||
tokens = rec.get("tokens")
|
||||
ner_tags = rec.get("ner")
|
||||
srl_tags = rec.get("srl")
|
||||
|
||||
if not (len(tokens) == len(ner_tags) == len(srl_tags)):
|
||||
raise ValueError(
|
||||
print(
|
||||
f"❌ Panjang tidak sama di record index {idx}:\n"
|
||||
f" context: {context}\n"
|
||||
f" tokens ({len(tokens)}): {tokens}\n"
|
||||
f" ner ({len(ner_tags)}): {ner_tags}\n"
|
||||
f" srl ({len(srl_tags)}): {srl_tags}\n"
|
||||
)
|
||||
continue
|
||||
|
||||
# Validasi label NER
|
||||
invalid_ner = False
|
||||
for i, ner_label in enumerate(ner_tags):
|
||||
if ner_label not in VALID_NER_LABELS:
|
||||
raise ValueError(
|
||||
print(
|
||||
f"❌ Label NER tidak valid di record index {idx}, token ke-{i} ('{tokens[i]}'):\n"
|
||||
f" ner_label: {ner_label}\n"
|
||||
f" value: {tokens}"
|
||||
)
|
||||
invalid_ner = True
|
||||
break
|
||||
if invalid_ner:
|
||||
continue
|
||||
|
||||
invalid_srl = False
|
||||
for i, srl_label in enumerate(srl_tags):
|
||||
if srl_label not in VALID_SRL_LABELS:
|
||||
print(
|
||||
f"❌ Label SRL tidak valid di record index {idx}, token ke-{i} ('{tokens[i]}'):\n"
|
||||
f" srl_label: {srl_label}\n"
|
||||
f" value: {tokens}"
|
||||
)
|
||||
invalid_srl = True
|
||||
break
|
||||
if invalid_srl:
|
||||
continue
|
||||
|
||||
key = tuple(tokens)
|
||||
if key in seen_sentences:
|
||||
|
@ -118,4 +215,4 @@ def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
|
|||
# CONTOH PEMAKAIAN
|
||||
# ---------------------------------------------------------------------------
|
||||
if __name__ == "__main__":
|
||||
json_to_tsv("QC/normalize_dataset.json", "QC/new_LNS.tsv")
|
||||
json_to_tsv("../dataset/stable_qg_qa_train_dataset.json", "new_LNS_2.tsv")
|
Binary file not shown.
Before Width: | Height: | Size: 54 KiB After Width: | Height: | Size: 55 KiB |
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,174 @@
|
|||
import json
|
||||
|
||||
def validate_and_sort_data(input_data, valid_output_file='valid_data.json', invalid_output_file='invalid_data.json'):
    """
    Validate records for token/NER/SRL length consistency and sort the valid ones.

    Each record must contain the fields 'context', 'tokens', 'ner' and 'srl',
    and the three tag lists must be the same length as the token list.
    Valid records are annotated in place with 'validation_status' and
    'token_count' and sorted ascending by token count; invalid records are
    annotated with a 'validation_error' (a string for missing fields, a dict
    for length mismatches).  Both groups are written to JSON files and a
    summary is printed.

    Args:
        input_data: List of record dicts to validate (records are mutated in place).
        valid_output_file: Path of the JSON file that receives valid records.
        invalid_output_file: Path of the JSON file that receives invalid records.

    Returns:
        Tuple (valid_data, invalid_data).
    """
    valid_data = []
    invalid_data = []

    for item in input_data:
        # Reject records that are missing any required field outright.
        required_fields = ['context', 'tokens', 'ner', 'srl']
        missing_fields = [field for field in required_fields if field not in item]

        if missing_fields:
            item['validation_error'] = f"Missing fields: {missing_fields}"
            invalid_data.append(item)
            continue

        # Consistency check: tokens, NER and SRL must be parallel lists.
        tokens_len = len(item['tokens'])
        ner_len = len(item['ner'])
        srl_len = len(item['srl'])

        if tokens_len == ner_len == srl_len:
            item['validation_status'] = 'valid'
            item['token_count'] = tokens_len
            valid_data.append(item)
        else:
            item['validation_status'] = 'invalid'
            item['validation_error'] = {
                'tokens_length': tokens_len,
                'ner_length': ner_len,
                'srl_length': srl_len,
                'issue': 'Length mismatch between tokens, NER, and SRL'
            }
            invalid_data.append(item)

    # Shortest sentences first.
    valid_data.sort(key=lambda x: x['token_count'])

    # Persist both partitions as JSON.
    with open(valid_output_file, 'w', encoding='utf-8') as f:
        json.dump(valid_data, f, ensure_ascii=False, indent=2)

    with open(invalid_output_file, 'w', encoding='utf-8') as f:
        json.dump(invalid_data, f, ensure_ascii=False, indent=2)

    # Print summary statistics.
    print("=== HASIL VALIDASI DATA ===")
    print(f"Total data: {len(input_data)}")
    print(f"Data valid: {len(valid_data)}")
    print(f"Data tidak valid: {len(invalid_data)}")
    print("\nFile output:")
    print(f"- Data valid: {valid_output_file}")
    print(f"- Data tidak valid: {invalid_output_file}")

    if invalid_data:
        print("\n=== DETAIL DATA TIDAK VALID ===")
        for i, item in enumerate(invalid_data):
            if 'validation_error' in item:
                if isinstance(item['validation_error'], dict):
                    error = item['validation_error']
                    print(f"Data {i+1}: {error['issue']}")
                    print(f" - Tokens: {error['tokens_length']}")
                    print(f" - NER: {error['ner_length']}")
                    print(f" - SRL: {error['srl_length']}")
                else:
                    print(f"Data {i+1}: {item['validation_error']}")
            print()

    return valid_data, invalid_data
|
||||
|
||||
def load_data_from_file(file_path):
    """
    Load a list of records from a JSON file.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The parsed data, or an empty list when the file is missing or
        contains invalid JSON (an explanatory message is printed).
    """
    try:
        handle = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"File {file_path} tidak ditemukan!")
        return []

    with handle:
        try:
            return json.load(handle)
        except json.JSONDecodeError:
            print(f"Error parsing JSON dari file {file_path}")
            return []
|
||||
|
||||
# Example usage
if __name__ == "__main__":
    # Sample records illustrating the expected schema: 'context' (str),
    # parallel 'tokens' / 'ner' / 'srl' lists, and a 'qas' list of QA pairs.
    # NOTE: sample_data is only used by the commented-out call below; the
    # live run loads records from a file instead.
    sample_data = [
        {
            "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
            "tokens": [
                "raden", "ajeng", "kartini", "lahir", "pada", "21",
                "april", "1879", "di", "jepara"
            ],
            "ner": ["PER", "PER", "PER", "O", "O", "DATE", "DATE", "DATE", "O", "LOC"],
            "srl": [
                "ARG0", "ARG0", "ARG0", "V", "O", "ARGM-TMP",
                "ARGM-TMP", "ARGM-TMP", "O", "ARGM-LOC"
            ],
            "qas": [
                {
                    "type": "isian",
                    "question": "Dimana kartini lahir ___",
                    "answer": "jepara",
                    "id": "qa_0_q1"
                },
                {
                    "type": "true_false",
                    "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
                    "options": ["true", "false"],
                    "answer": "false",
                    "id": "qa_0_q2"
                }
            ]
        },
        {
            "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
            "tokens": [
                "kerajaan", "majapahit", "berdiri", "pada",
                "tahun", "1293", "di", "trowulan"
            ],
            "ner": ["O", "ORG", "O", "O", "O", "DATE", "O", "LOC"],
            "srl": ["ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"],
            "qas": [
                {
                    "type": "opsi",
                    "question": "Dimana kerajaan majapahit berdiri ___",
                    "options": ["trowulan", "singasari", "kuta", "banten"],
                    "answer": "trowulan",
                    "id": "qa_1_q1"
                },
                {
                    "type": "true_false",
                    "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
                    "options": ["true", "false"],
                    "answer": "false",
                    "id": "qa_1_q2"
                }
            ]
        },
        # Example of an invalid record (mismatched list lengths)
        {
            "context": "contoh data tidak valid",
            "tokens": ["contoh", "data", "tidak"],
            "ner": ["O", "O"],  # Length differs from tokens
            "srl": ["ARG0", "ARG1", "V", "O"],  # Length differs from tokens
            "qas": []
        }
    ]

    # Run the validation on the sample data
    # valid, invalid = validate_and_sort_data(sample_data)

    # Or, to load the records from a file instead:
    data = load_data_from_file('need_clean_dataset.json')
    valid, invalid = validate_and_sort_data(data)
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Before Width: | Height: | Size: 52 KiB After Width: | Height: | Size: 53 KiB |
Loading…
Reference in New Issue