# Listing metadata: 174 lines, 6.2 KiB, Python
import json
|
|
|
|
# Fields every record must contain to be considered for validation.
_REQUIRED_FIELDS = ('context', 'tokens', 'ner', 'srl')

# Fields whose lengths must agree with each other.
_ALIGNED_FIELDS = ('tokens', 'ner', 'srl')


def _inspect_item(item):
    """Check one record and return ``(token_count, error)``.

    Exactly one of the two values is ``None``: ``error`` is ``None`` for a
    valid record, ``token_count`` is ``None`` for an invalid one. ``error`` is
    a string for structural problems and a dict for length mismatches.
    """
    missing_fields = [field for field in _REQUIRED_FIELDS if field not in item]
    if missing_fields:
        return None, f"Missing fields: {missing_fields}"

    lengths = {}
    unsized_fields = []
    for field in _ALIGNED_FIELDS:
        try:
            lengths[field] = len(item[field])
        except TypeError:
            # e.g. JSON null or a number: there is nothing to measure.
            unsized_fields.append(field)
    if unsized_fields:
        return None, f"Fields without a length: {unsized_fields}"

    if lengths['tokens'] == lengths['ner'] == lengths['srl']:
        return lengths['tokens'], None

    return None, {
        'tokens_length': lengths['tokens'],
        'ner_length': lengths['ner'],
        'srl_length': lengths['srl'],
        'issue': 'Length mismatch between tokens, NER, and SRL'
    }


def _print_report(total, valid_data, invalid_data, valid_output_file, invalid_output_file):
    """Print the summary statistics and the per-record details of invalid data."""
    print("=== HASIL VALIDASI DATA ===")
    print(f"Total data: {total}")
    print(f"Data valid: {len(valid_data)}")
    print(f"Data tidak valid: {len(invalid_data)}")
    print("\nFile output:")
    print(f"- Data valid: {valid_output_file}")
    print(f"- Data tidak valid: {invalid_output_file}")

    if not invalid_data:
        return

    print("\n=== DETAIL DATA TIDAK VALID ===")
    # The number shown is the position within the invalid list, not the
    # position in the original input.
    for position, item in enumerate(invalid_data, start=1):
        error = item['validation_error']
        if isinstance(error, dict):
            print(f"Data {position}: {error['issue']}")
            print(f" - Tokens: {error['tokens_length']}")
            print(f" - NER: {error['ner_length']}")
            print(f" - SRL: {error['srl_length']}")
        else:
            print(f"Data {position}: {error}")
        print()


def validate_and_sort_data(input_data, valid_output_file='valid_data.json', invalid_output_file='invalid_data.json'):
    """Validate records on token/NER/SRL length consistency and sort the valid ones.

    A record is valid when it is a dict containing ``context``, ``tokens``,
    ``ner`` and ``srl`` and the last three have the same length. Valid records
    are sorted by token count (ascending). Both groups are written to JSON
    files and a summary is printed.

    Side effects: dict records are annotated in place with
    ``validation_status`` plus either ``token_count`` (valid) or
    ``validation_error`` (invalid). Records that are not dicts cannot be
    annotated, so they are wrapped as ``{'item': <record>, ...}`` in the
    invalid list.

    Args:
        input_data: Iterable of records to validate.
        valid_output_file: Path of the JSON file receiving the valid records.
        invalid_output_file: Path of the JSON file receiving the invalid records.

    Returns:
        Tuple ``(valid_data, invalid_data)`` of two lists.
    """
    # Materialize once so generators are supported and can still be counted.
    records = list(input_data)

    valid_data = []
    invalid_data = []

    for item in records:
        if not isinstance(item, dict):
            invalid_data.append({
                'item': item,
                'validation_status': 'invalid',
                'validation_error': 'Item is not a JSON object',
            })
            continue

        token_count, error = _inspect_item(item)
        if error is None:
            item['validation_status'] = 'valid'
            item['token_count'] = token_count
            valid_data.append(item)
        else:
            # Every invalid record carries the status, whatever the reason.
            item['validation_status'] = 'invalid'
            item['validation_error'] = error
            invalid_data.append(item)

    # Sort valid records by token count (ascending).
    valid_data.sort(key=lambda record: record['token_count'])

    with open(valid_output_file, 'w', encoding='utf-8') as f:
        json.dump(valid_data, f, ensure_ascii=False, indent=2)

    with open(invalid_output_file, 'w', encoding='utf-8') as f:
        json.dump(invalid_data, f, ensure_ascii=False, indent=2)

    _print_report(len(records), valid_data, invalid_data, valid_output_file, invalid_output_file)

    return valid_data, invalid_data
|
|
|
|
def load_data_from_file(file_path):
    """Load a list of records from a JSON file.

    Problems are reported with a printed message and an empty list instead of
    an exception, so the caller can always iterate over the result.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded list, or ``[]`` when the file does not exist, is not valid
        JSON, or its top-level value is not a list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"File {file_path} tidak ditemukan!")
        return []
    except json.JSONDecodeError:
        print(f"Error parsing JSON dari file {file_path}")
        return []

    # The documented contract is a list; a top-level object or scalar would
    # otherwise be iterated key-by-key (or fail) in the caller.
    if not isinstance(data, list):
        print(f"Data pada file {file_path} bukan berupa list!")
        return []

    return data
|
|
|
|
# Contoh penggunaan
|
|
if __name__ == "__main__":
    # Example records in the expected input format: two consistent records
    # followed by one whose tag lists do not line up with its tokens.
    sample_data = [
        {
            "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
            "tokens": "raden ajeng kartini lahir pada 21 april 1879 di jepara".split(),
            "ner": "PER PER PER O O DATE DATE DATE O LOC".split(),
            "srl": "ARG0 ARG0 ARG0 V O ARGM-TMP ARGM-TMP ARGM-TMP O ARGM-LOC".split(),
            "qas": [
                {
                    "type": "isian",
                    "question": "Dimana kartini lahir ___",
                    "answer": "jepara",
                    "id": "qa_0_q1",
                },
                {
                    "type": "true_false",
                    "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
                    "options": ["true", "false"],
                    "answer": "false",
                    "id": "qa_0_q2",
                },
            ],
        },
        {
            "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
            "tokens": "kerajaan majapahit berdiri pada tahun 1293 di trowulan".split(),
            "ner": "O ORG O O O DATE O LOC".split(),
            "srl": "ARG1 ARG1 V O O ARGM-TMP O ARGM-LOC".split(),
            "qas": [
                {
                    "type": "opsi",
                    "question": "Dimana kerajaan majapahit berdiri ___",
                    "options": ["trowulan", "singasari", "kuta", "banten"],
                    "answer": "trowulan",
                    "id": "qa_1_q1",
                },
                {
                    "type": "true_false",
                    "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
                    "options": ["true", "false"],
                    "answer": "false",
                    "id": "qa_1_q2",
                },
            ],
        },
        # Deliberately invalid record: the three lists have different lengths.
        {
            "context": "contoh data tidak valid",
            "tokens": "contoh data tidak".split(),
            "ner": "O O".split(),  # shorter than tokens
            "srl": "ARG0 ARG1 V O".split(),  # longer than tokens
            "qas": [],
        },
    ]

    # To validate the in-memory sample instead of a file:
    # valid, invalid = validate_and_sort_data(sample_data)

    # Validate the dataset stored on disk.
    data = load_data_from_file('need_clean_dataset.json')
    valid, invalid = validate_and_sort_data(data)