chore: cleaning the code
This commit is contained in:
parent
817924bd8c
commit
3613b1a120
2
run.py
2
run.py
|
|
@ -3,10 +3,8 @@ import sys
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Paksa penggunaan SelectorEventLoop di level paling dasar OS Windows
|
|
||||||
if sys.platform == 'win32':
|
if sys.platform == 'win32':
|
||||||
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
||||||
print("✅ Mesin Selector Loop Aktif (Anti-NotImplementedError)")
|
print("✅ Mesin Selector Loop Aktif (Anti-NotImplementedError)")
|
||||||
|
|
||||||
# Jalankan uvicorn dari sini, bukan dari terminal langsung
|
|
||||||
uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True)
|
uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True)
|
||||||
|
|
@ -11,7 +11,6 @@ def clean_product_name(name: str) -> str:
|
||||||
return name.strip()
|
return name.strip()
|
||||||
|
|
||||||
async def process_product_reviews(candidate: ProductCandidate, user_email: str, metric_id: int, brand_id: int, request: Request):
|
async def process_product_reviews(candidate: ProductCandidate, user_email: str, metric_id: int, brand_id: int, request: Request):
|
||||||
# 1. SETUP ASPEK (Initialize score 0 untuk setiap kategori)
|
|
||||||
aspect_stats = {
|
aspect_stats = {
|
||||||
aspect: {"positive": 0, "total": 0}
|
aspect: {"positive": 0, "total": 0}
|
||||||
for aspect in config.ASPECT_KEYWORDS.keys()
|
for aspect in config.ASPECT_KEYWORDS.keys()
|
||||||
|
|
@ -19,7 +18,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
||||||
|
|
||||||
print(f"🔍 Memulai Analisis ABSA: {candidate.name[:30]}...")
|
print(f"🔍 Memulai Analisis ABSA: {candidate.name[:30]}...")
|
||||||
|
|
||||||
# 2. DATABASE PRE-CHECK (Model & User)
|
|
||||||
model_db = await prisma.model.find_first(where={"modelName": "Model XGBoost (Baseline)"})
|
model_db = await prisma.model.find_first(where={"modelName": "Model XGBoost (Baseline)"})
|
||||||
if not model_db:
|
if not model_db:
|
||||||
print("❌ ERROR: Model XGBoost tidak ditemukan di database!")
|
print("❌ ERROR: Model XGBoost tidak ditemukan di database!")
|
||||||
|
|
@ -30,7 +28,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
||||||
print(f"⚠️ User {user_email} tidak ditemukan!")
|
print(f"⚠️ User {user_email} tidak ditemukan!")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# 3. PRODUCT PERSISTENCE
|
|
||||||
brand_name = clean_product_name(candidate.name.split()[0]) if candidate.name.strip() else "Unknown"
|
brand_name = clean_product_name(candidate.name.split()[0]) if candidate.name.strip() else "Unknown"
|
||||||
product_name = clean_product_name(candidate.name)
|
product_name = clean_product_name(candidate.name)
|
||||||
|
|
||||||
|
|
@ -49,7 +46,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# 4. NLP PREDICTION & ASPECT TAGGING LOOP
|
|
||||||
total_reviews = len(candidate.reviews)
|
total_reviews = len(candidate.reviews)
|
||||||
if total_reviews == 0: return None
|
if total_reviews == 0: return None
|
||||||
|
|
||||||
|
|
@ -69,7 +65,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
||||||
pred_idx = ml_core.model_optimized.predict(vec)[0]
|
pred_idx = ml_core.model_optimized.predict(vec)[0]
|
||||||
label = ml_core.label_encoder.inverse_transform([pred_idx])[0].lower()
|
label = ml_core.label_encoder.inverse_transform([pred_idx])[0].lower()
|
||||||
|
|
||||||
# Confidence Score dari XGBoost
|
|
||||||
try:
|
try:
|
||||||
prob = ml_core.model_optimized.predict_proba(vec)[0]
|
prob = ml_core.model_optimized.predict_proba(vec)[0]
|
||||||
confidence_score = float(max(prob))
|
confidence_score = float(max(prob))
|
||||||
|
|
@ -107,13 +102,11 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
||||||
"userId": user_db.id
|
"userId": user_db.id
|
||||||
})
|
})
|
||||||
|
|
||||||
# 5. DATABASE SYNC (Batch Operations)
|
|
||||||
if reviews_data_to_save:
|
if reviews_data_to_save:
|
||||||
async with prisma.tx() as transaction:
|
async with prisma.tx() as transaction:
|
||||||
await transaction.review.delete_many(where={"productId": product_db.productId})
|
await transaction.review.delete_many(where={"productId": product_db.productId})
|
||||||
await transaction.review.create_many(data=reviews_data_to_save)
|
await transaction.review.create_many(data=reviews_data_to_save)
|
||||||
|
|
||||||
# 6. CALCULATION & VERDICT GENERATION
|
|
||||||
final_aspect_scores = {}
|
final_aspect_scores = {}
|
||||||
for aspect, stat in aspect_stats.items():
|
for aspect, stat in aspect_stats.items():
|
||||||
score = (stat["positive"] / stat["total"] * 100) if stat["total"] > 0 else 0
|
score = (stat["positive"] / stat["total"] * 100) if stat["total"] > 0 else 0
|
||||||
|
|
@ -140,14 +133,12 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
||||||
else:
|
else:
|
||||||
verdict_label = "Kurang Disarankan"
|
verdict_label = "Kurang Disarankan"
|
||||||
|
|
||||||
# 1. Buat Analysis terlebih dahulu untuk mendapatkan ID-nya
|
|
||||||
new_analysis = await prisma.analysis.create(
|
new_analysis = await prisma.analysis.create(
|
||||||
data={
|
data={
|
||||||
"userId": user_db.id,
|
"userId": user_db.id,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# 2. Buat Metric dan hubungkan ke Analysis yang baru saja dibuat
|
|
||||||
await prisma.metric.create(
|
await prisma.metric.create(
|
||||||
data={
|
data={
|
||||||
"generalSentiment": general_sentiment_pct,
|
"generalSentiment": general_sentiment_pct,
|
||||||
|
|
|
||||||
|
|
@ -5,9 +5,6 @@ from pathlib import Path
|
||||||
from xgboost import XGBClassifier
|
from xgboost import XGBClassifier
|
||||||
from sklearn.metrics import classification_report, confusion_matrix
|
from sklearn.metrics import classification_report, confusion_matrix
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 1. KONFIGURASI PATH (PATHLIB)
|
|
||||||
# ==========================================
|
|
||||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||||
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
||||||
DATA_DIR = PROJECT_ROOT / "robust_data"
|
DATA_DIR = PROJECT_ROOT / "robust_data"
|
||||||
|
|
@ -20,9 +17,6 @@ PATHS = {
|
||||||
"le": DATA_DIR / "tokenize" / "label_encoder.pkl",
|
"le": DATA_DIR / "tokenize" / "label_encoder.pkl",
|
||||||
}
|
}
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 2. LOAD DATA
|
|
||||||
# ==========================================
|
|
||||||
print("\n--- MEMUAT DATA BASELINE ---")
|
print("\n--- MEMUAT DATA BASELINE ---")
|
||||||
data = {}
|
data = {}
|
||||||
|
|
||||||
|
|
@ -47,11 +41,6 @@ le = data['le']
|
||||||
|
|
||||||
print(f"\nDimensi Training (Imbalanced): {X_train.shape}")
|
print(f"\nDimensi Training (Imbalanced): {X_train.shape}")
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 3. SETUP MODEL BASELINE
|
|
||||||
# ==========================================
|
|
||||||
# Tanpa Grid Search, menggunakan settingan default XGBoost
|
|
||||||
# Default XGBoost biasanya: learning_rate=0.3, max_depth=6, n_estimators=100
|
|
||||||
model_baseline = XGBClassifier(
|
model_baseline = XGBClassifier(
|
||||||
objective='multi:softprob',
|
objective='multi:softprob',
|
||||||
num_class=3,
|
num_class=3,
|
||||||
|
|
@ -60,9 +49,6 @@ model_baseline = XGBClassifier(
|
||||||
use_label_encoder=False
|
use_label_encoder=False
|
||||||
)
|
)
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 4. EKSEKUSI TRAINING
|
|
||||||
# ==========================================
|
|
||||||
print("\n🔥 MULAI TRAINING BASELINE (SCENARIO 1)...")
|
print("\n🔥 MULAI TRAINING BASELINE (SCENARIO 1)...")
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
|
|
@ -71,31 +57,21 @@ model_baseline.fit(X_train, y_train)
|
||||||
duration = time.time() - start_time
|
duration = time.time() - start_time
|
||||||
print(f"\n✅ SELESAI! Waktu proses: {duration:.2f} detik")
|
print(f"\n✅ SELESAI! Waktu proses: {duration:.2f} detik")
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 5. MENAMPILKAN PARAMETER DEFAULT (BARU)
|
|
||||||
# ==========================================
|
|
||||||
# Karena tidak pakai GridSearch, kita ambil parameter langsung dari modelnya
|
|
||||||
print("\n" + "="*40)
|
print("\n" + "="*40)
|
||||||
print("PARAMETER YANG DIGUNAKAN (DEFAULT)")
|
print("PARAMETER YANG DIGUNAKAN (DEFAULT)")
|
||||||
print("="*40)
|
print("="*40)
|
||||||
|
|
||||||
# Mengambil seluruh parameter model
|
|
||||||
all_params = model_baseline.get_params()
|
all_params = model_baseline.get_params()
|
||||||
|
|
||||||
# Kita filter hanya parameter penting untuk dibandingkan dengan Skenario 2 & 3
|
|
||||||
key_params = ['learning_rate', 'max_depth', 'n_estimators', 'subsample', 'colsample_bytree']
|
key_params = ['learning_rate', 'max_depth', 'n_estimators', 'subsample', 'colsample_bytree']
|
||||||
shown_params = {k: all_params.get(k) for k in key_params}
|
shown_params = {k: all_params.get(k) for k in key_params}
|
||||||
|
|
||||||
# Jika n_estimators atau learning_rate None (karena default library), kita set nilai standarnya manual untuk info
|
|
||||||
if shown_params['n_estimators'] is None: shown_params['n_estimators'] = "100 (Default)"
|
if shown_params['n_estimators'] is None: shown_params['n_estimators'] = "100 (Default)"
|
||||||
if shown_params['learning_rate'] is None: shown_params['learning_rate'] = "Default"
|
if shown_params['learning_rate'] is None: shown_params['learning_rate'] = "Default"
|
||||||
|
|
||||||
print(shown_params)
|
print(shown_params)
|
||||||
print("(Gunakan nilai ini untuk perbandingan di Bab 4)")
|
print("(Gunakan nilai ini untuk perbandingan di Bab 4)")
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 6. EVALUASI & SIMPAN
|
|
||||||
# ==========================================
|
|
||||||
print("\n" + "="*40)
|
print("\n" + "="*40)
|
||||||
print("HASIL SKENARIO 1 (BASELINE)")
|
print("HASIL SKENARIO 1 (BASELINE)")
|
||||||
print("="*40)
|
print("="*40)
|
||||||
|
|
@ -111,7 +87,6 @@ print(classification_report(y_test_label, y_pred_label))
|
||||||
print("\nConfusion Matrix:")
|
print("\nConfusion Matrix:")
|
||||||
print(confusion_matrix(y_test_label, y_pred_label))
|
print(confusion_matrix(y_test_label, y_pred_label))
|
||||||
|
|
||||||
# Simpan Model Baseline
|
|
||||||
model_path = SCRIPT_DIR / 'new_xgboost_scenario1.pkl'
|
model_path = SCRIPT_DIR / 'new_xgboost_scenario1.pkl'
|
||||||
joblib.dump(model_baseline, model_path)
|
joblib.dump(model_baseline, model_path)
|
||||||
print(f"\n💾 Model baseline disimpan ke: {model_path}")
|
print(f"\n💾 Model baseline disimpan ke: {model_path}")
|
||||||
|
|
@ -6,9 +6,6 @@ from xgboost import XGBClassifier
|
||||||
from sklearn.model_selection import GridSearchCV
|
from sklearn.model_selection import GridSearchCV
|
||||||
from sklearn.metrics import classification_report, confusion_matrix
|
from sklearn.metrics import classification_report, confusion_matrix
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 1. KONFIGURASI PATH (PATHLIB)
|
|
||||||
# ==========================================
|
|
||||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||||
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
||||||
DATA_DIR = PROJECT_ROOT / "robust_data"
|
DATA_DIR = PROJECT_ROOT / "robust_data"
|
||||||
|
|
@ -21,9 +18,6 @@ PATHS = {
|
||||||
"le": DATA_DIR / "tokenize" / "label_encoder.pkl",
|
"le": DATA_DIR / "tokenize" / "label_encoder.pkl",
|
||||||
}
|
}
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 2. LOAD DATA
|
|
||||||
# ==========================================
|
|
||||||
print("\n--- MEMUAT DATA SCENARIO 2 ---")
|
print("\n--- MEMUAT DATA SCENARIO 2 ---")
|
||||||
data = {}
|
data = {}
|
||||||
|
|
||||||
|
|
@ -46,11 +40,6 @@ le = data['le']
|
||||||
|
|
||||||
print(f"\nDimensi Training (Imbalanced): {X_train.shape}")
|
print(f"\nDimensi Training (Imbalanced): {X_train.shape}")
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 3. SETUP GRID SEARCH (SAMA DENGAN SKENARIO 3)
|
|
||||||
# ==========================================
|
|
||||||
# Kita gunakan range parameter yang SAMA PERSIS dengan Skenario 3
|
|
||||||
# agar perbandingannya adil (apple-to-apple).
|
|
||||||
param_grid = {
|
param_grid = {
|
||||||
'learning_rate': [0.01, 0.1, 0.2],
|
'learning_rate': [0.01, 0.1, 0.2],
|
||||||
'max_depth': [3, 5, 7],
|
'max_depth': [3, 5, 7],
|
||||||
|
|
@ -67,7 +56,6 @@ xgb = XGBClassifier(
|
||||||
use_label_encoder=False
|
use_label_encoder=False
|
||||||
)
|
)
|
||||||
|
|
||||||
# Gunakan F1-Macro agar Grid Search mencoba adil ke kelas minoritas
|
|
||||||
grid_search = GridSearchCV(
|
grid_search = GridSearchCV(
|
||||||
estimator=xgb,
|
estimator=xgb,
|
||||||
param_grid=param_grid,
|
param_grid=param_grid,
|
||||||
|
|
@ -77,9 +65,6 @@ grid_search = GridSearchCV(
|
||||||
verbose=1
|
verbose=1
|
||||||
)
|
)
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 4. EKSEKUSI TRAINING
|
|
||||||
# ==========================================
|
|
||||||
print("\n🔥 MULAI TRAINING & GRID SEARCH (SCENARIO 2)...")
|
print("\n🔥 MULAI TRAINING & GRID SEARCH (SCENARIO 2)...")
|
||||||
print("Sedang mencari parameter terbaik untuk data Imbalanced...")
|
print("Sedang mencari parameter terbaik untuk data Imbalanced...")
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
@ -89,9 +74,6 @@ grid_search.fit(X_train, y_train)
|
||||||
duration = time.time() - start_time
|
duration = time.time() - start_time
|
||||||
print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit")
|
print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit")
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 5. EVALUASI & SIMPAN
|
|
||||||
# ==========================================
|
|
||||||
best_model = grid_search.best_estimator_
|
best_model = grid_search.best_estimator_
|
||||||
|
|
||||||
print("\n" + "="*40)
|
print("\n" + "="*40)
|
||||||
|
|
@ -114,7 +96,6 @@ print(classification_report(y_test_label, y_pred_label))
|
||||||
print("\nConfusion Matrix:")
|
print("\nConfusion Matrix:")
|
||||||
print(confusion_matrix(y_test_label, y_pred_label))
|
print(confusion_matrix(y_test_label, y_pred_label))
|
||||||
|
|
||||||
# Simpan Model Skenario 2
|
|
||||||
model_path = SCRIPT_DIR / 'new_model_xgboost_scenario2.pkl'
|
model_path = SCRIPT_DIR / 'new_model_xgboost_scenario2.pkl'
|
||||||
joblib.dump(best_model, model_path)
|
joblib.dump(best_model, model_path)
|
||||||
print(f"\n💾 Model Skenario 2 disimpan ke: {model_path}")
|
print(f"\n💾 Model Skenario 2 disimpan ke: {model_path}")
|
||||||
|
|
@ -11,9 +11,6 @@ from sklearn.metrics import classification_report, confusion_matrix
|
||||||
from imblearn.pipeline import Pipeline as ImbPipeline
|
from imblearn.pipeline import Pipeline as ImbPipeline
|
||||||
from imblearn.over_sampling import SMOTE
|
from imblearn.over_sampling import SMOTE
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# KONFIGURASI PATH
|
|
||||||
# ==========================================
|
|
||||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||||
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
||||||
DATA_DIR = PROJECT_ROOT / "robust_data"
|
DATA_DIR = PROJECT_ROOT / "robust_data"
|
||||||
|
|
@ -28,7 +25,6 @@ PATHS = {
|
||||||
|
|
||||||
print("--- MENYIAPKAN TRAINING SCENARIO 3 (PIPELINE: SMOTE + CHI2 + XGBOOST) ---")
|
print("--- MENYIAPKAN TRAINING SCENARIO 3 (PIPELINE: SMOTE + CHI2 + XGBOOST) ---")
|
||||||
|
|
||||||
# Load Data
|
|
||||||
data = {}
|
data = {}
|
||||||
for name, path in PATHS.items():
|
for name, path in PATHS.items():
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
|
|
@ -41,9 +37,6 @@ X_train, y_train = data["X_train"], data["y_train"]
|
||||||
X_test, y_test = data["X_test"], data["y_test"]
|
X_test, y_test = data["X_test"], data["y_test"]
|
||||||
le = data["le"]
|
le = data["le"]
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# REPORT PROPORSI DATA (SEBELUM & SESUDAH SMOTE)
|
|
||||||
# ==========================================
|
|
||||||
print("\n" + "="*40)
|
print("\n" + "="*40)
|
||||||
print("REPORT PROPORSI DATA")
|
print("REPORT PROPORSI DATA")
|
||||||
print("="*40)
|
print("="*40)
|
||||||
|
|
@ -57,16 +50,12 @@ def print_proportion(y, title):
|
||||||
|
|
||||||
print_proportion(y_train, "PROPORSI DATA AWAL (TRAIN)")
|
print_proportion(y_train, "PROPORSI DATA AWAL (TRAIN)")
|
||||||
|
|
||||||
# Simulasi SMOTE untuk melihat hasil akhir yang akan diproses Pipeline
|
|
||||||
sm_sim = SMOTE(random_state=42)
|
sm_sim = SMOTE(random_state=42)
|
||||||
_, y_resampled_sim = sm_sim.fit_resample(X_train, y_train)
|
_, y_resampled_sim = sm_sim.fit_resample(X_train, y_train)
|
||||||
print_proportion(y_resampled_sim, "ESTIMASI PROPORSI SETELAH SMOTE (DALAM PIPELINE)")
|
print_proportion(y_resampled_sim, "ESTIMASI PROPORSI SETELAH SMOTE (DALAM PIPELINE)")
|
||||||
|
|
||||||
print("\n" + "="*40)
|
print("\n" + "="*40)
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# DEFINISI PIPELINE
|
|
||||||
# ==========================================
|
|
||||||
pipeline = ImbPipeline([
|
pipeline = ImbPipeline([
|
||||||
('smote', SMOTE(random_state=42)),
|
('smote', SMOTE(random_state=42)),
|
||||||
('selector', SelectKBest(score_func=chi2, k=2000)),
|
('selector', SelectKBest(score_func=chi2, k=2000)),
|
||||||
|
|
@ -79,9 +68,6 @@ pipeline = ImbPipeline([
|
||||||
))
|
))
|
||||||
])
|
])
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# SETTING GRID SEARCH
|
|
||||||
# ==========================================
|
|
||||||
# param_grid = {
|
# param_grid = {
|
||||||
# 'clf__learning_rate': [0.1, 0.2],
|
# 'clf__learning_rate': [0.1, 0.2],
|
||||||
# 'clf__max_depth': [5, 7],
|
# 'clf__max_depth': [5, 7],
|
||||||
|
|
@ -106,9 +92,6 @@ grid_search = GridSearchCV(
|
||||||
verbose=2
|
verbose=2
|
||||||
)
|
)
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# EKSEKUSI TRAINING
|
|
||||||
# ==========================================
|
|
||||||
print(f"\n🔥 MULAI TRAINING... (Dimensi Awal: {X_train.shape})")
|
print(f"\n🔥 MULAI TRAINING... (Dimensi Awal: {X_train.shape})")
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
|
|
@ -117,9 +100,6 @@ grid_search.fit(X_train, y_train)
|
||||||
duration = time.time() - start_time
|
duration = time.time() - start_time
|
||||||
print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit")
|
print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit")
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# EVALUASI
|
|
||||||
# ==========================================
|
|
||||||
best_model = grid_search.best_estimator_
|
best_model = grid_search.best_estimator_
|
||||||
|
|
||||||
print("\n" + "="*40)
|
print("\n" + "="*40)
|
||||||
|
|
@ -129,7 +109,6 @@ print(grid_search.best_params_)
|
||||||
|
|
||||||
y_pred = best_model.predict(X_test)
|
y_pred = best_model.predict(X_test)
|
||||||
|
|
||||||
# Inverse Transform Label
|
|
||||||
y_test_label = le.inverse_transform(y_test)
|
y_test_label = le.inverse_transform(y_test)
|
||||||
y_pred_label = le.inverse_transform(y_pred)
|
y_pred_label = le.inverse_transform(y_pred)
|
||||||
|
|
||||||
|
|
@ -139,9 +118,6 @@ print(classification_report(y_test_label, y_pred_label))
|
||||||
print("\nConfusion Matrix:")
|
print("\nConfusion Matrix:")
|
||||||
print(confusion_matrix(y_test_label, y_pred_label))
|
print(confusion_matrix(y_test_label, y_pred_label))
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# SIMPAN MODEL
|
|
||||||
# ==========================================
|
|
||||||
MODEL_DIR = PROJECT_ROOT / "models"
|
MODEL_DIR = PROJECT_ROOT / "models"
|
||||||
MODEL_DIR.mkdir(exist_ok=True)
|
MODEL_DIR.mkdir(exist_ok=True)
|
||||||
model_path = MODEL_DIR / "xgboost_scenario3.pkl"
|
model_path = MODEL_DIR / "xgboost_scenario3.pkl"
|
||||||
|
|
|
||||||
|
|
@ -2,82 +2,58 @@ import joblib
|
||||||
import os
|
import os
|
||||||
from sklearn.feature_selection import SelectKBest, chi2
|
from sklearn.feature_selection import SelectKBest, chi2
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# KONFIGURASI
|
|
||||||
# ==========================================
|
|
||||||
base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
|
base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||||
|
|
||||||
# Input (Kita butuh Data Train hasil SMOTE dan Data Test asli)
|
|
||||||
input_X_train = 'new_X_train_smote.pkl'
|
input_X_train = 'new_X_train_smote.pkl'
|
||||||
input_y_train = 'new_y_train_smote.pkl'
|
input_y_train = 'new_y_train_smote.pkl'
|
||||||
input_X_test = 'X_test_tfidf.pkl' # Test set asli (belum diapa-apakan selain TFIDF)
|
input_X_test = 'X_test_tfidf.pkl'
|
||||||
|
|
||||||
# Output
|
|
||||||
# output_X_train = 'data/chi2/X_train_chi2.pkl'
|
|
||||||
# output_X_test = 'data/chi2/X_test_chi2.pkl'
|
|
||||||
# output_selector = 'data/chi2/chisquare_selector.pkl' # Simpan logikanya
|
|
||||||
output_X_train = 'X_train_chi2.pkl'
|
output_X_train = 'X_train_chi2.pkl'
|
||||||
output_X_test = 'X_test_chi2.pkl'
|
output_X_test = 'X_test_chi2.pkl'
|
||||||
output_selector = 'chisquare_selector.pkl' # Simpan logikanya
|
output_selector = 'chisquare_selector.pkl'
|
||||||
|
|
||||||
# JUMLAH FITUR YANG INGIN DIAMBIL (Parameter K)
|
|
||||||
# Silakan ubah angka ini. 1000 adalah angka start yang bagus untuk Skripsi S1.
|
|
||||||
# Jika fitur awal Anda < 1000, ubah jadi 'all' atau angka lebih kecil (misal 500).
|
|
||||||
K_FEATURES = 1000
|
K_FEATURES = 1000
|
||||||
|
|
||||||
print("--- MEMULAI FEATURE SELECTION (CHI-SQUARE) ---")
|
print("--- MEMULAI FEATURE SELECTION (CHI-SQUARE) ---")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 1. Load Data
|
|
||||||
print("1. Memuat data...")
|
print("1. Memuat data...")
|
||||||
# Load Train (SMOTE)
|
|
||||||
X_train = joblib.load(os.path.join(base_dir, input_X_train))
|
X_train = joblib.load(os.path.join(base_dir, input_X_train))
|
||||||
y_train = joblib.load(os.path.join(base_dir, input_y_train))
|
y_train = joblib.load(os.path.join(base_dir, input_y_train))
|
||||||
|
|
||||||
# Load Test (TF-IDF Asli)
|
|
||||||
# Kita butuh ini agar dimensi Test sama dengan Train nanti
|
|
||||||
X_test = joblib.load(os.path.join(base_dir, input_X_test))
|
X_test = joblib.load(os.path.join(base_dir, input_X_test))
|
||||||
|
|
||||||
print(f" - Dimensi Awal Train: {X_train.shape}")
|
print(f" - Dimensi Awal Train: {X_train.shape}")
|
||||||
print(f" - Dimensi Awal Test: {X_test.shape}")
|
print(f" - Dimensi Awal Test: {X_test.shape}")
|
||||||
|
|
||||||
# Cek jumlah fitur total
|
|
||||||
total_features = X_train.shape[1]
|
total_features = X_train.shape[1]
|
||||||
print(f" - Total kata/fitur saat ini: {total_features}")
|
print(f" - Total kata/fitur saat ini: {total_features}")
|
||||||
|
|
||||||
# Validasi K
|
|
||||||
if isinstance(K_FEATURES, int) and K_FEATURES > total_features:
|
if isinstance(K_FEATURES, int) and K_FEATURES > total_features:
|
||||||
print(f" ⚠️ WARNING: Target k={K_FEATURES} lebih besar dari total fitur ({total_features}). Mengambil semua fitur.")
|
print(f" ⚠️ WARNING: Target k={K_FEATURES} lebih besar dari total fitur ({total_features}). Mengambil semua fitur.")
|
||||||
k_final = 'all'
|
k_final = 'all'
|
||||||
else:
|
else:
|
||||||
k_final = K_FEATURES
|
k_final = K_FEATURES
|
||||||
|
|
||||||
# 2. Proses Chi-Square
|
|
||||||
print(f"\n2. Menjalankan Chi-Square (Mengambil Top {k_final} Fitur)...")
|
print(f"\n2. Menjalankan Chi-Square (Mengambil Top {k_final} Fitur)...")
|
||||||
|
|
||||||
# Inisialisasi SelectKBest dengan skor func chi2
|
|
||||||
selector = SelectKBest(score_func=chi2, k=k_final)
|
selector = SelectKBest(score_func=chi2, k=k_final)
|
||||||
|
|
||||||
# FIT hanya pada Data Train! (Pelajari mana kata penting dari data latih)
|
|
||||||
selector.fit(X_train, y_train)
|
selector.fit(X_train, y_train)
|
||||||
|
|
||||||
# TRANSFORM pada Train DAN Test
|
|
||||||
X_train_selected = selector.transform(X_train)
|
X_train_selected = selector.transform(X_train)
|
||||||
X_test_selected = selector.transform(X_test)
|
X_test_selected = selector.transform(X_test)
|
||||||
|
|
||||||
# 3. Validasi Hasil
|
|
||||||
print("\n3. Hasil Seleksi:")
|
print("\n3. Hasil Seleksi:")
|
||||||
print(f" - Dimensi Train Baru: {X_train_selected.shape}")
|
print(f" - Dimensi Train Baru: {X_train_selected.shape}")
|
||||||
print(f" - Dimensi Test Baru: {X_test_selected.shape}")
|
print(f" - Dimensi Test Baru: {X_test_selected.shape}")
|
||||||
|
|
||||||
# Menampilkan beberapa skor fitur (opsional, untuk info saja)
|
|
||||||
print(" - Proses seleksi selesai. Dimensi kolom (fitur) telah berkurang.")
|
print(" - Proses seleksi selesai. Dimensi kolom (fitur) telah berkurang.")
|
||||||
|
|
||||||
# 4. Simpan Data
|
|
||||||
print("\n4. Menyimpan hasil...")
|
print("\n4. Menyimpan hasil...")
|
||||||
joblib.dump(X_train_selected, output_X_train)
|
joblib.dump(X_train_selected, output_X_train)
|
||||||
joblib.dump(X_test_selected, output_X_test)
|
joblib.dump(X_test_selected, output_X_test)
|
||||||
joblib.dump(selector, output_selector) # Penting untuk prediksi data baru nanti
|
joblib.dump(selector, output_selector)
|
||||||
|
|
||||||
print("="*40)
|
print("="*40)
|
||||||
print(f"SUKSES! Data siap untuk Training XGBoost.")
|
print(f"SUKSES! Data siap untuk Training XGBoost.")
|
||||||
|
|
|
||||||
|
|
@ -4,58 +4,44 @@ from imblearn.over_sampling import SMOTE
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# KONFIGURASI
|
|
||||||
# ==========================================
|
|
||||||
# Gunakan relative path agar aman (sama seperti sebelumnya)
|
|
||||||
base_dir = os.path.dirname(os.path.abspath(__file__))
|
base_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
# Input files (hasil dari TF-IDF sebelumnya)
|
|
||||||
input_X = 'X_train_tfidf.pkl'
|
input_X = 'X_train_tfidf.pkl'
|
||||||
input_y = 'y_train.pkl'
|
input_y = 'y_train.pkl'
|
||||||
|
|
||||||
# Output files (hasil SMOTE)
|
|
||||||
output_X = 'new_X_train_smote.pkl'
|
output_X = 'new_X_train_smote.pkl'
|
||||||
output_y = 'new_y_train_smote.pkl'
|
output_y = 'new_y_train_smote.pkl'
|
||||||
|
|
||||||
print("--- MEMULAI PROSES SMOTE (Skenario 3) ---")
|
print("--- MEMULAI PROSES SMOTE (Skenario 3) ---")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 1. Load Data TF-IDF (Data Latih Saja)
|
|
||||||
print("1. Memuat data latih TF-IDF...")
|
print("1. Memuat data latih TF-IDF...")
|
||||||
# Cek apakah file ada di folder yang sama atau perlu path khusus
|
|
||||||
if os.path.exists(os.path.join(base_dir, input_X)):
|
if os.path.exists(os.path.join(base_dir, input_X)):
|
||||||
X_train = joblib.load(os.path.join(base_dir, input_X))
|
X_train = joblib.load(os.path.join(base_dir, input_X))
|
||||||
y_train = joblib.load(os.path.join(base_dir, input_y))
|
y_train = joblib.load(os.path.join(base_dir, input_y))
|
||||||
else:
|
else:
|
||||||
# Fallback jika file ada di current directory
|
|
||||||
X_train = joblib.load(input_X)
|
X_train = joblib.load(input_X)
|
||||||
y_train = joblib.load(input_y)
|
y_train = joblib.load(input_y)
|
||||||
|
|
||||||
print(f" - Dimensi Awal: {X_train.shape}")
|
print(f" - Dimensi Awal: {X_train.shape}")
|
||||||
print(f" - Distribusi Kelas Awal: {Counter(y_train)}")
|
print(f" - Distribusi Kelas Awal: {Counter(y_train)}")
|
||||||
# Contoh output: {0: 1964, 1: 485, 2: 303} (tergantung mapping label encoder)
|
|
||||||
|
|
||||||
# 2. Eksekusi SMOTE
|
|
||||||
print("\n2. Menjalankan SMOTE (Synthetic Minority Over-sampling)...")
|
print("\n2. Menjalankan SMOTE (Synthetic Minority Over-sampling)...")
|
||||||
print(" (Sedang membuat data sintetis untuk kelas minoritas...)")
|
print(" (Sedang membuat data sintetis untuk kelas minoritas...)")
|
||||||
|
|
||||||
smote = SMOTE(random_state=42)
|
smote = SMOTE(random_state=42)
|
||||||
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
|
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
|
||||||
|
|
||||||
# 3. Validasi Hasil
|
|
||||||
print("\n3. Validasi Hasil SMOTE:")
|
print("\n3. Validasi Hasil SMOTE:")
|
||||||
print(f" - Dimensi Setelah SMOTE: {X_train_resampled.shape}")
|
print(f" - Dimensi Setelah SMOTE: {X_train_resampled.shape}")
|
||||||
print(f" - Distribusi Kelas Baru: {Counter(y_train_resampled)}")
|
print(f" - Distribusi Kelas Baru: {Counter(y_train_resampled)}")
|
||||||
|
|
||||||
# Pastikan semua kelas jumlahnya sama
|
|
||||||
counts = list(Counter(y_train_resampled).values())
|
counts = list(Counter(y_train_resampled).values())
|
||||||
if len(set(counts)) == 1:
|
if len(set(counts)) == 1:
|
||||||
print(" ✅ SUCCESS: Dataset sekarang SEIMBANG!")
|
print(" ✅ SUCCESS: Dataset sekarang SEIMBANG!")
|
||||||
else:
|
else:
|
||||||
print(" ⚠️ WARNING: Dataset belum seimbang sempurna.")
|
print(" ⚠️ WARNING: Dataset belum seimbang sempurna.")
|
||||||
|
|
||||||
# 4. Simpan Data SMOTE
|
|
||||||
print("\n4. Menyimpan data hasil SMOTE...")
|
print("\n4. Menyimpan data hasil SMOTE...")
|
||||||
joblib.dump(X_train_resampled, output_X)
|
joblib.dump(X_train_resampled, output_X)
|
||||||
joblib.dump(y_train_resampled, output_y)
|
joblib.dump(y_train_resampled, output_y)
|
||||||
|
|
|
||||||
|
|
@ -14,16 +14,11 @@ from nltk.corpus import stopwords
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
from nltk.stem import WordNetLemmatizer
|
from nltk.stem import WordNetLemmatizer
|
||||||
import os
|
import os
|
||||||
# Download NLTK resources (Cukup sekali run)
|
|
||||||
# nltk.download('punkt')
|
|
||||||
# nltk.download('stopwords')
|
|
||||||
# nltk.download('wordnet')
|
|
||||||
|
|
||||||
|
|
||||||
class ReviewScraper:
|
class ReviewScraper:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
options = Options()
|
options = Options()
|
||||||
# options.add_argument("--headless")
|
|
||||||
options.add_argument("--start-maximized")
|
options.add_argument("--start-maximized")
|
||||||
options.add_argument("--disable-blink-features=AutomationControlled")
|
options.add_argument("--disable-blink-features=AutomationControlled")
|
||||||
options.add_experimental_option(
|
options.add_experimental_option(
|
||||||
|
|
@ -43,7 +38,6 @@ class ReviewScraper:
|
||||||
|
|
||||||
def get_review_data(self, container, source_url) -> dict:
|
def get_review_data(self, container, source_url) -> dict:
|
||||||
try:
|
try:
|
||||||
# 1. Username
|
|
||||||
username = "Anonymous"
|
username = "Anonymous"
|
||||||
user_elem = container.find(
|
user_elem = container.find(
|
||||||
'span', attrs={'data-testid': 'proName'})
|
'span', attrs={'data-testid': 'proName'})
|
||||||
|
|
@ -53,7 +47,6 @@ class ReviewScraper:
|
||||||
if user_elem:
|
if user_elem:
|
||||||
username = user_elem.text
|
username = user_elem.text
|
||||||
|
|
||||||
# 2. Rating (Ambil dari aria-label bintang)
|
|
||||||
rating = "5"
|
rating = "5"
|
||||||
rating_elem = container.find(
|
rating_elem = container.find(
|
||||||
'div', attrs={'data-testid': 'icnStarRating'})
|
'div', attrs={'data-testid': 'icnStarRating'})
|
||||||
|
|
@ -63,7 +56,6 @@ class ReviewScraper:
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# 3. Ulasan Text
|
|
||||||
ulasan = ""
|
ulasan = ""
|
||||||
ulasan_elem = container.find(
|
ulasan_elem = container.find(
|
||||||
'span', attrs={'data-testid': 'lblItemUlasan'})
|
'span', attrs={'data-testid': 'lblItemUlasan'})
|
||||||
|
|
@ -72,7 +64,6 @@ class ReviewScraper:
|
||||||
if ulasan_elem:
|
if ulasan_elem:
|
||||||
ulasan = ulasan_elem.text
|
ulasan = ulasan_elem.text
|
||||||
|
|
||||||
# Fallback jika ulasan kosong
|
|
||||||
if not ulasan:
|
if not ulasan:
|
||||||
paragraphs = container.find_all('p')
|
paragraphs = container.find_all('p')
|
||||||
for p in paragraphs:
|
for p in paragraphs:
|
||||||
|
|
@ -80,7 +71,6 @@ class ReviewScraper:
|
||||||
ulasan = p.text
|
ulasan = p.text
|
||||||
break
|
break
|
||||||
|
|
||||||
# 4. Tanggal
|
|
||||||
waktu_komentar = "Unknown"
|
waktu_komentar = "Unknown"
|
||||||
date_elem = container.find(
|
date_elem = container.find(
|
||||||
'p', class_=re.compile(r'timestamp|date', re.I))
|
'p', class_=re.compile(r'timestamp|date', re.I))
|
||||||
|
|
@ -92,7 +82,6 @@ class ReviewScraper:
|
||||||
waktu_komentar = span.text
|
waktu_komentar = span.text
|
||||||
break
|
break
|
||||||
|
|
||||||
# VALIDASI: Jangan simpan jika kosong
|
|
||||||
if not ulasan:
|
if not ulasan:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -118,13 +107,10 @@ class ReviewScraper:
|
||||||
"""
|
"""
|
||||||
print(f" ...Mencoba {action} filter Bintang {rating}...")
|
print(f" ...Mencoba {action} filter Bintang {rating}...")
|
||||||
|
|
||||||
# Scroll agar elemen masuk viewport
|
|
||||||
self.driver.execute_script("window.scrollBy(0, 400);")
|
self.driver.execute_script("window.scrollBy(0, 400);")
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
# STRATEGI XPATH
|
|
||||||
strategies = [
|
strategies = [
|
||||||
# Spesifik Tokped baru
|
|
||||||
f"//label[contains(@for, 'rating') and .//text()='{rating}']",
|
f"//label[contains(@for, 'rating') and .//text()='{rating}']",
|
||||||
f"//label[.//text()='{rating}' and .//*[name()='img' or name()='svg']]",
|
f"//label[.//text()='{rating}' and .//*[name()='img' or name()='svg']]",
|
||||||
f"//*[text()='Rating']/ancestor::div[2]//label[contains(., '{rating}')]",
|
f"//*[text()='Rating']/ancestor::div[2]//label[contains(., '{rating}')]",
|
||||||
|
|
@ -134,38 +120,31 @@ class ReviewScraper:
|
||||||
for attempt in range(max_retries):
|
for attempt in range(max_retries):
|
||||||
found_element = None
|
found_element = None
|
||||||
|
|
||||||
# Coba cari elemen dengan salah satu strategi
|
|
||||||
for xpath in strategies:
|
for xpath in strategies:
|
||||||
try:
|
try:
|
||||||
# Timeout dipendekkan ke 2 detik agar cepat skip jika tidak ada
|
|
||||||
found_element = WebDriverWait(self.driver, 2).until(
|
found_element = WebDriverWait(self.driver, 2).until(
|
||||||
EC.presence_of_element_located((By.XPATH, xpath))
|
EC.presence_of_element_located((By.XPATH, xpath))
|
||||||
)
|
)
|
||||||
|
|
||||||
# Cek apakah visible & clickable
|
|
||||||
if found_element.is_displayed():
|
if found_element.is_displayed():
|
||||||
# Cek apakah disabled (kelas CSS atau atribut)
|
|
||||||
if "disabled" in found_element.get_attribute("class") or found_element.get_attribute("disabled"):
|
if "disabled" in found_element.get_attribute("class") or found_element.get_attribute("disabled"):
|
||||||
print(
|
print(
|
||||||
f" [SKIP] Filter Bintang {rating} ada tapi DISABLED (Non-aktif).")
|
f" [SKIP] Filter Bintang {rating} ada tapi DISABLED (Non-aktif).")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Jika elemen ketemu, siap diklik
|
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
found_element = None # Ketemu di DOM tapi hidden
|
found_element = None
|
||||||
except TimeoutException:
|
except TimeoutException:
|
||||||
continue # Coba strategi xpath berikutnya
|
continue
|
||||||
|
|
||||||
# HASIL PENCARIAN
|
|
||||||
if found_element:
|
if found_element:
|
||||||
try:
|
try:
|
||||||
# KLIK!
|
|
||||||
self.driver.execute_script(
|
self.driver.execute_script(
|
||||||
"arguments[0].click();", found_element)
|
"arguments[0].click();", found_element)
|
||||||
print(
|
print(
|
||||||
f" [SUKSES] Filter Bintang {rating} berhasil di-{action}!")
|
f" [SUKSES] Filter Bintang {rating} berhasil di-{action}!")
|
||||||
time.sleep(3) # Tunggu loading data
|
time.sleep(3)
|
||||||
return True
|
return True
|
||||||
except Exception as click_error:
|
except Exception as click_error:
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
|
|
@ -178,8 +157,6 @@ class ReviewScraper:
|
||||||
f" [ERROR] Gagal klik filter setelah retry: {click_error}")
|
f" [ERROR] Gagal klik filter setelah retry: {click_error}")
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
# PENTING: Jika di attempt pertama tidak ketemu di semua strategi,
|
|
||||||
# asumsikan filter TIDAK ADA. Jangan retry.
|
|
||||||
print(
|
print(
|
||||||
f" [SKIP] Filter Bintang {rating} TIDAK DITEMUKAN (Mungkin 0 ulasan). Lanjut.")
|
f" [SKIP] Filter Bintang {rating} TIDAK DITEMUKAN (Mungkin 0 ulasan). Lanjut.")
|
||||||
return False
|
return False
|
||||||
|
|
@ -199,7 +176,6 @@ class ReviewScraper:
|
||||||
containers = soup.find_all("article")
|
containers = soup.find_all("article")
|
||||||
|
|
||||||
if not containers:
|
if not containers:
|
||||||
# Double check: kadang loading lambat
|
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
soup = BeautifulSoup(self.driver.page_source, "html.parser")
|
soup = BeautifulSoup(self.driver.page_source, "html.parser")
|
||||||
containers = soup.find_all(
|
containers = soup.find_all(
|
||||||
|
|
@ -214,7 +190,6 @@ class ReviewScraper:
|
||||||
for container in containers:
|
for container in containers:
|
||||||
review_data = self.get_review_data(container, url)
|
review_data = self.get_review_data(container, url)
|
||||||
if review_data:
|
if review_data:
|
||||||
# Validasi Rating sesuai Filter
|
|
||||||
if current_rating_context != "ALL" and review_data['Rating'] != current_rating_context:
|
if current_rating_context != "ALL" and review_data['Rating'] != current_rating_context:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
@ -229,12 +204,11 @@ class ReviewScraper:
|
||||||
else:
|
else:
|
||||||
print(f" . Halaman {page_number} tidak ada data baru.")
|
print(f" . Halaman {page_number} tidak ada data baru.")
|
||||||
empty_page_count += 1
|
empty_page_count += 1
|
||||||
if empty_page_count >= 2: # Stop jika 2 halaman berturut-turut zonk
|
if empty_page_count >= 2:
|
||||||
print(
|
print(
|
||||||
" [STOP] 2 halaman tanpa data baru. Pindah filter.")
|
" [STOP] 2 halaman tanpa data baru. Pindah filter.")
|
||||||
break
|
break
|
||||||
|
|
||||||
# Navigasi Next Button
|
|
||||||
try:
|
try:
|
||||||
next_button = self.driver.find_element(
|
next_button = self.driver.find_element(
|
||||||
By.CSS_SELECTOR, "button[aria-label^='Laman berikutnya']")
|
By.CSS_SELECTOR, "button[aria-label^='Laman berikutnya']")
|
||||||
|
|
@ -256,35 +230,27 @@ class ReviewScraper:
|
||||||
self.driver.execute_script("window.scrollBy(0, 800);")
|
self.driver.execute_script("window.scrollBy(0, 800);")
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
|
||||||
# TARGET: Negatif (1,2) & Netral (3)
|
|
||||||
target_filters = ['1', '2', '3']
|
target_filters = ['1', '2', '3']
|
||||||
|
|
||||||
for rating in target_filters:
|
for rating in target_filters:
|
||||||
# 1. KLIK FILTER
|
|
||||||
success = self.toggle_filter(rating, action="CHECK")
|
success = self.toggle_filter(rating, action="CHECK")
|
||||||
|
|
||||||
if success:
|
if success:
|
||||||
# 2. SCRAPE
|
|
||||||
self.scrape_pages_current_view(
|
self.scrape_pages_current_view(
|
||||||
url, current_rating_context=rating)
|
url, current_rating_context=rating)
|
||||||
|
|
||||||
# 3. UNCHECK (PENTING: Gunakan logic toggle yang sama)
|
|
||||||
# Scroll dikit ke atas biar tombol filter kelihatan lagi
|
|
||||||
self.driver.execute_script("window.scrollBy(0, -300);")
|
self.driver.execute_script("window.scrollBy(0, -300);")
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
uncheck_success = self.toggle_filter(
|
uncheck_success = self.toggle_filter(
|
||||||
rating, action="UNCHECK")
|
rating, action="UNCHECK")
|
||||||
if not uncheck_success:
|
if not uncheck_success:
|
||||||
# Jika gagal uncheck, refresh page adalah jalan ninja
|
|
||||||
print(
|
print(
|
||||||
" [REFRESH] Gagal uncheck, refresh halaman untuk reset filter...")
|
" [REFRESH] Gagal uncheck, refresh halaman untuk reset filter...")
|
||||||
self.driver.refresh()
|
self.driver.refresh()
|
||||||
time.sleep(4)
|
time.sleep(4)
|
||||||
self.driver.execute_script("window.scrollBy(0, 800);")
|
self.driver.execute_script("window.scrollBy(0, 800);")
|
||||||
else:
|
else:
|
||||||
# Jika toggle CHECK gagal/tidak ketemu -> LANJUT ke rating berikutnya
|
|
||||||
# Tidak perlu scrape, tidak perlu uncheck
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
@ -306,23 +272,18 @@ class ReviewScraper:
|
||||||
self.driver.quit()
|
self.driver.quit()
|
||||||
|
|
||||||
if self.data:
|
if self.data:
|
||||||
# 1. Siapkan Data Baru
|
|
||||||
df_new = pd.DataFrame(self.data)
|
df_new = pd.DataFrame(self.data)
|
||||||
df_new = self.label_data(df_new)
|
df_new = self.label_data(df_new)
|
||||||
|
|
||||||
filename = 'new_dataset_fix_balanced.csv'
|
filename = 'new_dataset_fix_balanced.csv'
|
||||||
|
|
||||||
# 2. Cek apakah file sudah ada (Smart Merge Logic)
|
|
||||||
if os.path.exists(filename):
|
if os.path.exists(filename):
|
||||||
try:
|
try:
|
||||||
print(f"\n[INFO] File '{filename}' ditemukan. Membaca data lama...")
|
print(f"\n[INFO] File '{filename}' ditemukan. Membaca data lama...")
|
||||||
df_old = pd.read_csv(filename)
|
df_old = pd.read_csv(filename)
|
||||||
|
|
||||||
# Gabungkan data lama dan baru
|
|
||||||
df_combined = pd.concat([df_old, df_new], ignore_index=True)
|
df_combined = pd.concat([df_old, df_new], ignore_index=True)
|
||||||
|
|
||||||
# 3. Hapus Duplikat
|
|
||||||
# Kita anggap duplikat jika Username, Review (yang sudah dibersihkan), dan Tanggal sama persis
|
|
||||||
total_before = len(df_combined)
|
total_before = len(df_combined)
|
||||||
df_combined.drop_duplicates(subset=['Username', 'Cleaned_Review', 'Date'], keep='first', inplace=True)
|
df_combined.drop_duplicates(subset=['Username', 'Cleaned_Review', 'Date'], keep='first', inplace=True)
|
||||||
total_after = len(df_combined)
|
total_after = len(df_combined)
|
||||||
|
|
@ -339,7 +300,6 @@ class ReviewScraper:
|
||||||
print(f"\n[INFO] File '{filename}' belum ada. Membuat file baru.")
|
print(f"\n[INFO] File '{filename}' belum ada. Membuat file baru.")
|
||||||
df_final = df_new
|
df_final = df_new
|
||||||
|
|
||||||
# 4. Simpan Hasil Akhir
|
|
||||||
print("\n=== TOTAL DATASET SETELAH UPDATE ===")
|
print("\n=== TOTAL DATASET SETELAH UPDATE ===")
|
||||||
print(df_final['Sentiment'].value_counts())
|
print(df_final['Sentiment'].value_counts())
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,24 +1,17 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
# 1. Load data
|
|
||||||
df = pd.read_csv('new_final_dataset.csv')
|
df = pd.read_csv('new_final_dataset.csv')
|
||||||
|
|
||||||
# 2. Pisahkan tiap kelas
|
|
||||||
df_pos = df[df['Sentiment'] == 'positif']
|
df_pos = df[df['Sentiment'] == 'positif']
|
||||||
df_neg = df[df['Sentiment'] == 'negatif']
|
df_neg = df[df['Sentiment'] == 'negatif']
|
||||||
df_net = df[df['Sentiment'] == 'netral']
|
df_net = df[df['Sentiment'] == 'netral']
|
||||||
|
|
||||||
# 3. Hitung target (Jumlah Negatif + Netral)
|
target_count = len(df_neg) + len(df_net)
|
||||||
target_count = len(df_neg) + len(df_net) # Hasilnya 1622
|
|
||||||
|
|
||||||
# 4. Ambil sampel acak dari kelas positif sebanyak target_count
|
|
||||||
df_pos_trimmed = df_pos.sample(n=target_count, random_state=42)
|
df_pos_trimmed = df_pos.sample(n=target_count, random_state=42)
|
||||||
|
|
||||||
# 5. Gabungkan kembali semua data
|
|
||||||
df_final = pd.concat([df_pos_trimmed, df_neg, df_net])
|
df_final = pd.concat([df_pos_trimmed, df_neg, df_net])
|
||||||
|
|
||||||
# 6. Acak urutan data agar tidak mengumpul
|
|
||||||
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
|
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
|
||||||
|
|
||||||
# Simpan hasil
|
|
||||||
df_final.to_csv('trimmed_sentiment_dataset.csv', index=False)
|
df_final.to_csv('trimmed_sentiment_dataset.csv', index=False)
|
||||||
|
|
@ -3,29 +3,22 @@ import seaborn as sns
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.metrics import confusion_matrix
|
from sklearn.metrics import confusion_matrix
|
||||||
|
|
||||||
# Data Confusion Matrix dari Skenario 3 (Pipeline + SMOTE)
|
|
||||||
# Baris: Aktual, Kolom: Prediksi
|
|
||||||
data_cm = np.array([
|
data_cm = np.array([
|
||||||
[146, 34, 19], # Aktual Negatif
|
[146, 34, 19],
|
||||||
[60, 36, 28], # Aktual Netral
|
[60, 36, 28],
|
||||||
[29, 16, 280] # Aktual Positif
|
[29, 16, 280]
|
||||||
])
|
])
|
||||||
|
|
||||||
# Label kategori
|
|
||||||
labels = ['Negatif', 'Netral', 'Positif']
|
labels = ['Negatif', 'Netral', 'Positif']
|
||||||
|
|
||||||
# Membuat plot
|
|
||||||
plt.figure(figsize=(8, 6))
|
plt.figure(figsize=(8, 6))
|
||||||
sns.set(font_scale=1.2) # Mengatur ukuran font
|
sns.set(font_scale=1.2)
|
||||||
|
|
||||||
# Membuat heatmap
|
|
||||||
ax = sns.heatmap(data_cm, annot=True, fmt='d', cmap='Blues',
|
ax = sns.heatmap(data_cm, annot=True, fmt='d', cmap='Blues',
|
||||||
xticklabels=labels, yticklabels=labels)
|
xticklabels=labels, yticklabels=labels)
|
||||||
|
|
||||||
# Menambahkan label dan judul
|
|
||||||
plt.xlabel('Prediksi', fontsize=14, labelpad=15)
|
plt.xlabel('Prediksi', fontsize=14, labelpad=15)
|
||||||
plt.ylabel('Aktual', fontsize=14, labelpad=15)
|
plt.ylabel('Aktual', fontsize=14, labelpad=15)
|
||||||
plt.title('Confusion Matrix Skenario 1 (Baseline)', fontsize=16, pad=20)
|
plt.title('Confusion Matrix Skenario 1 (Baseline)', fontsize=16, pad=20)
|
||||||
|
|
||||||
# Menampilkan plot
|
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
# Data dari hasil eksperimen Anda
|
|
||||||
scenarios = ['Skenario 1\n(Baseline)', 'Skenario 2\n(Tuned)', 'Skenario 3\n(Full Optimized)']
|
scenarios = ['Skenario 1\n(Baseline)', 'Skenario 2\n(Tuned)', 'Skenario 3\n(Full Optimized)']
|
||||||
accuracy = [0.78, 0.79, 0.77]
|
accuracy = [0.78, 0.79, 0.77]
|
||||||
macro_f1 = [0.65, 0.66, 0.66]
|
macro_f1 = [0.65, 0.66, 0.66]
|
||||||
|
|
@ -12,12 +11,10 @@ width = 0.25
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(10, 6))
|
fig, ax = plt.subplots(figsize=(10, 6))
|
||||||
|
|
||||||
# Membuat bar chart
|
|
||||||
rects1 = ax.bar(x - width, accuracy, width, label='Accuracy', color='#3498db')
|
rects1 = ax.bar(x - width, accuracy, width, label='Accuracy', color='#3498db')
|
||||||
rects2 = ax.bar(x, macro_f1, width, label='Macro Avg F1-Score', color='#2ecc71')
|
rects2 = ax.bar(x, macro_f1, width, label='Macro Avg F1-Score', color='#2ecc71')
|
||||||
rects3 = ax.bar(x + width, recall_netral, width, label='Recall Netral', color='#e74c3c')
|
rects3 = ax.bar(x + width, recall_netral, width, label='Recall Netral', color='#e74c3c')
|
||||||
|
|
||||||
# Menambahkan teks dan label
|
|
||||||
ax.set_ylabel('Scores')
|
ax.set_ylabel('Scores')
|
||||||
ax.set_title('Perbandingan Performa Model XGBoost antar Skenario')
|
ax.set_title('Perbandingan Performa Model XGBoost antar Skenario')
|
||||||
ax.set_xticks(x)
|
ax.set_xticks(x)
|
||||||
|
|
@ -25,7 +22,6 @@ ax.set_xticklabels(scenarios)
|
||||||
ax.legend(loc='lower right')
|
ax.legend(loc='lower right')
|
||||||
ax.set_ylim(0, 1.0)
|
ax.set_ylim(0, 1.0)
|
||||||
|
|
||||||
# Menambahkan label angka di atas bar
|
|
||||||
def autolabel(rects):
|
def autolabel(rects):
|
||||||
for rect in rects:
|
for rect in rects:
|
||||||
height = rect.get_height()
|
height = rect.get_height()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue