import json

# Fields every record must contain, in the order they are reported when missing.
_REQUIRED_FIELDS = ('context', 'tokens', 'ner', 'srl')

# Fields whose lengths must agree for a record to be considered valid.
_ALIGNED_FIELDS = ('tokens', 'ner', 'srl')


def _safe_len(value):
    """Return ``len(value)``, or ``None`` when *value* has no length."""
    try:
        return len(value)
    except TypeError:
        return None


def _validate_item(item):
    """Return ``(record, is_valid)`` for a single input item.

    ``record`` is an annotated shallow copy of *item*; the caller's object is
    never modified. Items that are not dicts cannot be annotated, so they are
    wrapped in a new dict under the ``'item'`` key.
    """
    if not isinstance(item, dict):
        return {
            'item': item,
            'validation_status': 'invalid',
            'validation_error': f"Item is not an object: {type(item).__name__}",
        }, False

    record = dict(item)

    missing_fields = [field for field in _REQUIRED_FIELDS if field not in record]
    if missing_fields:
        record['validation_status'] = 'invalid'
        record['validation_error'] = f"Missing fields: {missing_fields}"
        return record, False

    lengths = {field: _safe_len(record[field]) for field in _ALIGNED_FIELDS}
    unsized_fields = [field for field in _ALIGNED_FIELDS if lengths[field] is None]
    if unsized_fields:
        # e.g. a JSON null where a list was expected; report it instead of crashing.
        record['validation_status'] = 'invalid'
        record['validation_error'] = f"Fields without a length: {unsized_fields}"
        return record, False

    tokens_len = lengths['tokens']
    ner_len = lengths['ner']
    srl_len = lengths['srl']

    if tokens_len == ner_len == srl_len:
        record['validation_status'] = 'valid'
        record['token_count'] = tokens_len
        return record, True

    record['validation_status'] = 'invalid'
    record['validation_error'] = {
        'tokens_length': tokens_len,
        'ner_length': ner_len,
        'srl_length': srl_len,
        'issue': 'Length mismatch between tokens, NER, and SRL'
    }
    return record, False


def _print_report(total, valid_data, invalid_data, valid_output_file, invalid_output_file):
    """Print the summary statistics and one entry per invalid record."""
    print("=== HASIL VALIDASI DATA ===")
    print(f"Total data: {total}")
    print(f"Data valid: {len(valid_data)}")
    print(f"Data tidak valid: {len(invalid_data)}")
    print("\nFile output:")
    print(f"- Data valid: {valid_output_file}")
    print(f"- Data tidak valid: {invalid_output_file}")

    if not invalid_data:
        return

    print("\n=== DETAIL DATA TIDAK VALID ===")
    # The number shown is the position within the invalid list, not the input.
    for i, item in enumerate(invalid_data):
        if 'validation_error' not in item:
            continue
        error = item['validation_error']
        if isinstance(error, dict):
            print(f"Data {i+1}: {error['issue']}")
            print(f"  - Tokens: {error['tokens_length']}")
            print(f"  - NER: {error['ner_length']}")
            print(f"  - SRL: {error['srl_length']}")
        else:
            print(f"Data {i+1}: {error}")
        print()


def validate_and_sort_data(input_data, valid_output_file='valid_data.json', invalid_output_file='invalid_data.json'):
    """
    Validate records and sort the valid ones by token count.

    A record is valid when it contains the fields 'context', 'tokens', 'ner'
    and 'srl', and 'tokens', 'ner' and 'srl' all have the same length. Every
    returned record is an annotated shallow copy of the input item:

    - valid records gain 'validation_status' = 'valid' and 'token_count';
    - invalid records gain 'validation_status' = 'invalid' and a
      'validation_error' (a string, or a dict of lengths for a mismatch).

    The input items themselves are left untouched. Both result lists are
    written as JSON files and a summary is printed to stdout.

    Args:
        input_data: Iterable of records to validate (must support ``len``).
        valid_output_file: Path of the JSON file for valid records.
        invalid_output_file: Path of the JSON file for invalid records.

    Returns:
        Tuple ``(valid_data, invalid_data)``; ``valid_data`` is sorted by
        ascending 'token_count', ``invalid_data`` keeps the input order.
    """
    valid_data = []
    invalid_data = []

    for item in input_data:
        record, is_valid = _validate_item(item)
        if is_valid:
            valid_data.append(record)
        else:
            invalid_data.append(record)

    # Sort valid records by token count (ascending); the sort is stable, so
    # records with equal counts keep their input order.
    valid_data.sort(key=lambda x: x['token_count'])

    # Persist both lists as JSON.
    with open(valid_output_file, 'w', encoding='utf-8') as f:
        json.dump(valid_data, f, ensure_ascii=False, indent=2)

    with open(invalid_output_file, 'w', encoding='utf-8') as f:
        json.dump(invalid_data, f, ensure_ascii=False, indent=2)

    _print_report(len(input_data), valid_data, invalid_data,
                  valid_output_file, invalid_output_file)

    return valid_data, invalid_data


def load_data_from_file(file_path):
    """
    Load data from a JSON file.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The parsed JSON content, or an empty list when the file does not
        exist or does not contain valid JSON (a message is printed in both
        cases).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File {file_path} tidak ditemukan!")
        return []
    except json.JSONDecodeError:
        print(f"Error parsing JSON dari file {file_path}")
        return []


# Example usage
if __name__ == "__main__":
    # Sample data taken from the user's input
    sample_data = [
        {
            "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
            "tokens": [
                "raden", "ajeng", "kartini", "lahir", "pada",
                "21", "april", "1879", "di", "jepara"
            ],
            "ner": ["PER", "PER", "PER", "O", "O", "DATE", "DATE", "DATE", "O", "LOC"],
            "srl": [
                "ARG0", "ARG0", "ARG0", "V", "O",
                "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O", "ARGM-LOC"
            ],
            "qas": [
                {
                    "type": "isian",
                    "question": "Dimana kartini lahir ___",
                    "answer": "jepara",
                    "id": "qa_0_q1"
                },
                {
                    "type": "true_false",
                    "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
                    "options": ["true", "false"],
                    "answer": "false",
                    "id": "qa_0_q2"
                }
            ]
        },
        {
            "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
            "tokens": [
                "kerajaan", "majapahit", "berdiri", "pada",
                "tahun", "1293", "di", "trowulan"
            ],
            "ner": ["O", "ORG", "O", "O", "O", "DATE", "O", "LOC"],
            "srl": ["ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"],
            "qas": [
                {
                    "type": "opsi",
                    "question": "Dimana kerajaan majapahit berdiri ___",
                    "options": ["trowulan", "singasari", "kuta", "banten"],
                    "answer": "trowulan",
                    "id": "qa_1_q1"
                },
                {
                    "type": "true_false",
                    "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
                    "options": ["true", "false"],
                    "answer": "false",
                    "id": "qa_1_q2"
                }
            ]
        },
        # Example of an invalid record (lengths do not match)
        {
            "context": "contoh data tidak valid",
            "tokens": ["contoh", "data", "tidak"],
            "ner": ["O", "O"],  # Length differs from tokens
            "srl": ["ARG0", "ARG1", "V", "O"],  # Length differs from tokens
            "qas": []
        }
    ]

    # Run the validation on the sample above:
    # valid, invalid = validate_and_sort_data(sample_data)

    # Or, to load the records from a file instead:
    data = load_data_from_file('need_clean_dataset.json')
    valid, invalid = validate_and_sort_data(data)