# File: TIFNGK_E41222719/src/utils/generate_data_sample.py
# Listing metadata: 100 lines, 3.5 KiB, Python

import pandas as pd
import os
def generate_final_grouped_sample(input_file: str, output_file: str, total_sample: int = 356):
    """Draw a class-stratified sample of reviews and export it for manual labelling.

    The CSV at ``input_file`` is split by its ``Sentiment`` column into the
    classes ``positif``, ``negatif`` and ``netral``. Each class contributes
    ``round(total_sample * proportion)`` rows (fewer if the class does not have
    that many rows), drawn with a fixed random seed so repeated runs give the
    same sample. The per-class samples are concatenated in that class order
    (not shuffled), an empty ``Manual_Label`` column is added, and the result
    is written to ``output_file`` as a formatted Excel sheet.

    Because each class target is rounded independently, the class targets are
    not guaranteed to add up to exactly ``total_sample`` for every value.

    Args:
        input_file: Path of the source CSV; it must contain a ``Sentiment``
            column.
        output_file: Path of the Excel workbook to create.
        total_sample: Desired total number of sampled rows.

    Returns:
        The sampled DataFrame (including the empty ``Manual_Label`` column),
        or ``None`` when the input file does not exist or lacks the
        ``Sentiment`` column.
    """
    if not os.path.exists(input_file):
        print(f"Error: File {input_file} tidak ditemukan!")
        return None

    # 1. Load the dataset.
    df = pd.read_csv(input_file)

    # Without the class column there is nothing to stratify on; report it the
    # same way as a missing file instead of failing with a KeyError below.
    if 'Sentiment' not in df.columns:
        print(f"Error: Kolom 'Sentiment' tidak ditemukan di {input_file}!")
        return None

    # 2. Class proportions (per the original author's note: chosen to follow
    #    the original data). Insertion order is the output order:
    #    positive, negative, neutral.
    proportions = {
        'positif': 0.50,
        'negatif': 0.307,
        'netral': 0.193,
    }

    print("=== Rincian Rencana Sampling (Stratified) ===")
    print(f"Total Populasi: {len(df)} data")
    print(f"Target Sampel: {total_sample} data\n")

    # 3. Sample each class separately.
    sampled_list = []
    summary_data = []
    for sentiment, weight in proportions.items():
        # Number of rows this class should contribute.
        n_target = int(round(total_sample * weight))

        class_data = df[df['Sentiment'] == sentiment]
        n_available = len(class_data)

        # Never request more rows than the class has; the fixed seed keeps
        # the sample reproducible.
        sample_class = class_data.sample(n=min(n_available, n_target), random_state=42)
        sampled_list.append(sample_class)

        # Keep the details for the summary table printed below.
        summary_data.append({
            'Kelas': sentiment.capitalize(),
            'Proporsi (%)': f"{weight*100:.1f}%",
            'Jumlah Target': n_target,
            'Tersedia': n_available,
        })

    # Combine in class order; deliberately not shuffled.
    sampled_df = pd.concat(sampled_list)

    # Print the summary table to the terminal.
    summary_df = pd.DataFrame(summary_data)
    print(summary_df.to_string(index=False))
    print("-" * 45)
    print(f"Total Data Berhasil Dibuat: {len(sampled_df)}\n")

    # 4. Add the empty manual-label column and keep only the known columns,
    #    in a fixed order. Columns missing from the input are skipped.
    sampled_df['Manual_Label'] = ""
    cols = ['Username', 'Rating', 'Sentiment', 'Manual_Label', 'Review', 'Cleaned_Review', 'Date']
    sampled_df = sampled_df[[c for c in cols if c in sampled_df.columns]]

    # 5. Save to Excel with tidy formatting.
    try:
        # The context manager closes (and thereby saves) the workbook even if
        # a formatting call raises, so the writer is never left open.
        with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
            sampled_df.to_excel(writer, index=False, sheet_name='Validasi_356')
            workbook = writer.book
            worksheet = writer.sheets['Validasi_356']

            header_fmt = workbook.add_format({'bold': True, 'bg_color': '#D7E4BC', 'border': 1, 'align': 'center'})
            wrap_fmt = workbook.add_format({'text_wrap': True, 'valign': 'top', 'font_size': 10})
            input_fmt = workbook.add_format({'bg_color': '#FFFF00', 'border': 2, 'valign': 'top', 'bold': True})

            # Width and cell format per column *name*. Styling is applied at
            # each column's actual position, so it stays on the right column
            # even when some of the expected columns are absent.
            column_layout = {
                'Username': (15, None),
                'Rating': (10, None),
                'Sentiment': (10, None),
                'Manual_Label': (18, input_fmt),  # yellow cell for the annotator
                'Review': (55, wrap_fmt),
                'Cleaned_Review': (55, wrap_fmt),
                'Date': (15, None),
            }

            worksheet.freeze_panes(1, 0)  # keep the header row visible
            for col_num, col_name in enumerate(sampled_df.columns):
                width, cell_fmt = column_layout[col_name]
                worksheet.set_column(col_num, col_num, width, cell_fmt)
                # Overwrite the header pandas wrote with the styled version.
                worksheet.write(0, col_num, col_name, header_fmt)

        print(f"File Berhasil Dibuat: {output_file}")
    except Exception as e:
        # Best effort: if the Excel export fails for any reason (e.g. the
        # xlsxwriter engine is unavailable), keep the sample as a plain CSV.
        print(f"Gagal memformat Excel: {e}")
        sampled_df.to_csv('backup_validasi.csv', index=False)

    return sampled_df
# Run the sampler only when this file is executed as a script, so that merely
# importing the module does not read the dataset or write any output file.
if __name__ == "__main__":
    generate_final_grouped_sample(
        input_file='robust_data/dataset/trimmed_sentiment_dataset.csv',
        output_file='Validasi_Manual_356_Proporsional.xlsx'
    )