TIFNJK_E41221588/utils/deteksi_kolom.py

42 lines
919 B
Python

import pandas as pd
def is_datetime_column(series: pd.Series) -> bool:
    """Return True when most non-missing values of *series* parse as datetimes.

    Args:
        series: Column to inspect.

    Returns:
        True if more than 80% of the non-missing values are accepted by
        ``pd.to_datetime``; False otherwise, including for numeric/boolean
        columns and for columns with no non-missing values.
    """
    # Plain numbers are not dates for our purposes: pd.to_datetime would read
    # them as offsets from the epoch and report every value as parsed.
    if pd.api.types.is_numeric_dtype(series):
        return False

    # Missing values say nothing about the column's type, so they are left
    # out of the ratio instead of counting as parse failures.
    values = series.dropna()
    if values.empty:
        return False

    parsed = pd.to_datetime(values, errors="coerce")
    # Cast so callers get a real bool rather than numpy.bool_.
    return bool(parsed.notna().mean() > 0.8)
def is_valid_text_column(series: pd.Series) -> bool:
    """Decide whether *series* looks like a column of free-form text.

    Missing values are dropped and the rest converted to ``str``. The column
    is rejected when any of the following holds:

    * nothing is left after dropping missing values;
    * ``is_datetime_column`` reports the values as datetimes;
    * the average string length is below 20 characters;
    * the average number of whitespace-separated words is below 5.

    Otherwise the result is whether the share of distinct words among all
    words exceeds 0.03.
    """
    cleaned = series.dropna().astype(str)
    if cleaned.empty:
        return False
    if is_datetime_column(cleaned):
        return False

    # Tokenise once; every remaining statistic is derived from this split.
    word_lists = cleaned.str.split()

    # Average length in characters per entry.
    mean_chars = cleaned.str.len().mean()
    # Average number of words per entry.
    mean_words = word_lists.apply(len).mean()
    if mean_chars < 20 or mean_words < 5:
        return False

    # One row per word; entries with no words become NaN, which both
    # nunique() and count() leave out.
    words = word_lists.explode()
    distinct_words = words.nunique()
    total_words = max(words.count(), 1)
    return distinct_words / total_words > 0.03
def detect_text_column(df: pd.DataFrame):
    """Return the label of the first column that holds free-form text.

    Columns are visited in order. Only columns whose dtype is ``object`` or a
    pandas string dtype are considered, and the first one accepted by
    ``is_valid_text_column`` wins.

    Args:
        df: Frame to scan.

    Returns:
        The label of the first matching column, or None when no column
        qualifies.
    """
    # items() yields every column as a Series even when labels are
    # duplicated, whereas df[label] would return a DataFrame in that case.
    for col, values in df.items():
        # Text may be stored as generic object dtype or as a dedicated string
        # dtype depending on the pandas version and how the frame was built.
        holds_text = (
            pd.api.types.is_object_dtype(values)
            or pd.api.types.is_string_dtype(values)
        )
        if holds_text and is_valid_text_column(values):
            return col
    return None