42 lines
919 B
Python
42 lines
919 B
Python
import pandas as pd
|
|
|
|
|
|
def is_datetime_column(series: pd.Series) -> bool:
|
|
parsed = pd.to_datetime(series, errors="coerce")
|
|
return parsed.notna().mean() > 0.8
|
|
|
|
|
|
def is_valid_text_column(series: pd.Series) -> bool:
|
|
texts = series.dropna().astype(str)
|
|
|
|
if len(texts) == 0:
|
|
return False
|
|
|
|
if is_datetime_column(texts):
|
|
return False
|
|
|
|
# rata-rata panjang kata
|
|
if texts.str.len().mean() < 20:
|
|
return False
|
|
|
|
# rata-rata jumlah kata
|
|
if texts.str.split().apply(len).mean() < 5:
|
|
return False
|
|
|
|
vocab_ratio = (
|
|
texts.str.split()
|
|
.explode()
|
|
.nunique() /
|
|
max(texts.str.split().explode().count(), 1)
|
|
)
|
|
|
|
return vocab_ratio > 0.03
|
|
|
|
|
|
def detect_text_column(df: pd.DataFrame):
|
|
for col in df.columns:
|
|
if df[col].dtype == object:
|
|
if is_valid_text_column(df[col]):
|
|
return col
|
|
return None
|