# Source: TIF_E41211115_lstm-quiz-gen.../dataset/py_cleanup.py
# (174 lines, 6.2 KiB, Python)

import json
def validate_and_sort_data(
    input_data,
    valid_output_file='valid_data.json',
    invalid_output_file='invalid_data.json',
):
    """Validate records, sort the valid ones by token count, and save both sets.

    A record is valid when it is a dict that contains the ``context``,
    ``tokens``, ``ner`` and ``srl`` keys and whose ``tokens``, ``ner`` and
    ``srl`` values all have the same length.

    Dict records are annotated in place: valid ones receive
    ``validation_status='valid'`` and ``token_count``; invalid ones receive
    ``validation_status='invalid'`` and ``validation_error`` (a string, or a
    dict with the three lengths for a length mismatch). A record that is not
    a dict cannot be annotated, so it is reported as a new dict holding the
    original value under ``raw_item``.

    Args:
        input_data: Sized iterable of records to validate.
        valid_output_file: Path of the JSON file that receives valid records.
        invalid_output_file: Path of the JSON file that receives invalid
            records.

    Returns:
        Tuple ``(valid_data, invalid_data)``; ``valid_data`` is sorted by
        ``token_count`` in ascending order.
    """
    required_fields = ['context', 'tokens', 'ner', 'srl']
    valid_data = []
    invalid_data = []

    for item in input_data:
        if not isinstance(item, dict):
            # Item assignment would fail on a non-dict, so wrap it instead.
            invalid_data.append({
                'raw_item': item,
                'validation_status': 'invalid',
                'validation_error': f"Item is not an object: {type(item).__name__}",
            })
            continue

        missing_fields = [field for field in required_fields if field not in item]
        if missing_fields:
            item['validation_status'] = 'invalid'
            item['validation_error'] = f"Missing fields: {missing_fields}"
            invalid_data.append(item)
            continue

        try:
            tokens_len = len(item['tokens'])
            ner_len = len(item['ner'])
            srl_len = len(item['srl'])
        except TypeError:
            # A value such as null/number has no length; report it rather
            # than aborting the whole run.
            item['validation_status'] = 'invalid'
            item['validation_error'] = "Fields tokens, ner and srl must all be sequences"
            invalid_data.append(item)
            continue

        if tokens_len == ner_len == srl_len:
            item['validation_status'] = 'valid'
            item['token_count'] = tokens_len
            valid_data.append(item)
        else:
            item['validation_status'] = 'invalid'
            item['validation_error'] = {
                'tokens_length': tokens_len,
                'ner_length': ner_len,
                'srl_length': srl_len,
                'issue': 'Length mismatch between tokens, NER, and SRL',
            }
            invalid_data.append(item)

    # Shortest sequences first.
    valid_data.sort(key=lambda record: record['token_count'])

    with open(valid_output_file, 'w', encoding='utf-8') as f:
        json.dump(valid_data, f, ensure_ascii=False, indent=2)
    with open(invalid_output_file, 'w', encoding='utf-8') as f:
        json.dump(invalid_data, f, ensure_ascii=False, indent=2)

    # Summary statistics.
    print("=== HASIL VALIDASI DATA ===")
    print(f"Total data: {len(input_data)}")
    print(f"Data valid: {len(valid_data)}")
    print(f"Data tidak valid: {len(invalid_data)}")
    print("\nFile output:")
    print(f"- Data valid: {valid_output_file}")
    print(f"- Data tidak valid: {invalid_output_file}")

    if invalid_data:
        print("\n=== DETAIL DATA TIDAK VALID ===")
        for number, item in enumerate(invalid_data, start=1):
            # Every invalid record built above carries 'validation_error'.
            error = item['validation_error']
            if isinstance(error, dict):
                print(f"Data {number}: {error['issue']}")
                print(f" - Tokens: {error['tokens_length']}")
                print(f" - NER: {error['ner_length']}")
                print(f" - SRL: {error['srl_length']}")
            else:
                print(f"Data {number}: {error}")
            print()

    return valid_data, invalid_data
def load_data_from_file(file_path):
    """Load and parse a JSON file, returning an empty list on failure.

    The file is decoded as UTF-8; a leading byte-order mark, if present, is
    skipped so that files saved by BOM-writing editors still parse.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The parsed JSON content, or ``[]`` when the file does not exist,
        is not valid UTF-8, or is not valid JSON (a message is printed in
        each of those cases).
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 identically and strips a BOM when
        # one is present; with plain 'utf-8' json rejects the BOM.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File {file_path} tidak ditemukan!")
        return []
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Undecodable bytes are treated like malformed JSON: report, don't crash.
        print(f"Error parsing JSON dari file {file_path}")
        return []
# Example usage: runs only when the file is executed as a script.
if __name__ == "__main__":
    # Built-in example records: two consistent ones and one whose
    # tokens / ner / srl lengths disagree.
    sample_data = [
        {
            "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
            "tokens": [
                "raden",
                "ajeng",
                "kartini",
                "lahir",
                "pada",
                "21",
                "april",
                "1879",
                "di",
                "jepara",
            ],
            "ner": [
                "PER",
                "PER",
                "PER",
                "O",
                "O",
                "DATE",
                "DATE",
                "DATE",
                "O",
                "LOC",
            ],
            "srl": [
                "ARG0",
                "ARG0",
                "ARG0",
                "V",
                "O",
                "ARGM-TMP",
                "ARGM-TMP",
                "ARGM-TMP",
                "O",
                "ARGM-LOC",
            ],
            "qas": [
                {
                    "type": "isian",
                    "question": "Dimana kartini lahir ___",
                    "answer": "jepara",
                    "id": "qa_0_q1",
                },
                {
                    "type": "true_false",
                    "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
                    "options": ["true", "false"],
                    "answer": "false",
                    "id": "qa_0_q2",
                },
            ],
        },
        {
            "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
            "tokens": [
                "kerajaan",
                "majapahit",
                "berdiri",
                "pada",
                "tahun",
                "1293",
                "di",
                "trowulan",
            ],
            "ner": ["O", "ORG", "O", "O", "O", "DATE", "O", "LOC"],
            "srl": [
                "ARG1",
                "ARG1",
                "V",
                "O",
                "O",
                "ARGM-TMP",
                "O",
                "ARGM-LOC",
            ],
            "qas": [
                {
                    "type": "opsi",
                    "question": "Dimana kerajaan majapahit berdiri ___",
                    "options": ["trowulan", "singasari", "kuta", "banten"],
                    "answer": "trowulan",
                    "id": "qa_1_q1",
                },
                {
                    "type": "true_false",
                    "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
                    "options": ["true", "false"],
                    "answer": "false",
                    "id": "qa_1_q2",
                },
            ],
        },
        # Deliberately invalid record: the three sequences differ in length.
        {
            "context": "contoh data tidak valid",
            "tokens": ["contoh", "data", "tidak"],
            # Two labels for three tokens.
            "ner": ["O", "O"],
            # Four labels for three tokens.
            "srl": ["ARG0", "ARG1", "V", "O"],
            "qas": [],
        },
    ]

    # To validate the built-in examples instead of a file, use:
    #     valid, invalid = validate_and_sort_data(sample_data)

    # Load the records from disk and validate them.
    data = load_data_from_file('need_clean_dataset.json')
    valid, invalid = validate_and_sort_data(data)