diff --git a/run.py b/run.py index deddaff..27ee630 100644 --- a/run.py +++ b/run.py @@ -3,10 +3,8 @@ import sys import uvicorn if __name__ == "__main__": - # Paksa penggunaan SelectorEventLoop di level paling dasar OS Windows if sys.platform == 'win32': asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) print("āœ… Mesin Selector Loop Aktif (Anti-NotImplementedError)") - # Jalankan uvicorn dari sini, bukan dari terminal langsung uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True) \ No newline at end of file diff --git a/services.py b/services.py index f81a993..59734a6 100644 --- a/services.py +++ b/services.py @@ -11,7 +11,6 @@ def clean_product_name(name: str) -> str: return name.strip() async def process_product_reviews(candidate: ProductCandidate, user_email: str, metric_id: int, brand_id: int, request: Request): - # 1. SETUP ASPEK (Initialize score 0 untuk setiap kategori) aspect_stats = { aspect: {"positive": 0, "total": 0} for aspect in config.ASPECT_KEYWORDS.keys() @@ -19,7 +18,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str, print(f"šŸ” Memulai Analisis ABSA: {candidate.name[:30]}...") - # 2. DATABASE PRE-CHECK (Model & User) model_db = await prisma.model.find_first(where={"modelName": "Model XGBoost (Baseline)"}) if not model_db: print("āŒ ERROR: Model XGBoost tidak ditemukan di database!") @@ -30,7 +28,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str, print(f"āš ļø User {user_email} tidak ditemukan!") return None - # 3. PRODUCT PERSISTENCE brand_name = clean_product_name(candidate.name.split()[0]) if candidate.name.strip() else "Unknown" product_name = clean_product_name(candidate.name) @@ -49,7 +46,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str, } ) - # 4. NLP PREDICTION & ASPECT TAGGING LOOP total_reviews = len(candidate.reviews) if total_reviews == 0: return None @@ -69,7 +65,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str, pred_idx = ml_core.model_optimized.predict(vec)[0] label = ml_core.label_encoder.inverse_transform([pred_idx])[0].lower() - # Confidence Score dari XGBoost try: prob = ml_core.model_optimized.predict_proba(vec)[0] confidence_score = float(max(prob)) @@ -107,13 +102,11 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str, "userId": user_db.id }) - # 5. DATABASE SYNC (Batch Operations) if reviews_data_to_save: async with prisma.tx() as transaction: await transaction.review.delete_many(where={"productId": product_db.productId}) await transaction.review.create_many(data=reviews_data_to_save) - # 6. CALCULATION & VERDICT GENERATION final_aspect_scores = {} for aspect, stat in aspect_stats.items(): score = (stat["positive"] / stat["total"] * 100) if stat["total"] > 0 else 0 @@ -140,14 +133,12 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str, else: verdict_label = "Kurang Disarankan" - # 1. Buat Analysis terlebih dahulu untuk mendapatkan ID-nya new_analysis = await prisma.analysis.create( data={ "userId": user_db.id, } ) - # 2. Buat Metric dan hubungkan ke Analysis yang baru saja dibuat await prisma.metric.create( data={ "generalSentiment": general_sentiment_pct, diff --git a/src/flow_1/main.py b/src/flow_1/main.py index 4ebcccd..30dc81d 100644 --- a/src/flow_1/main.py +++ b/src/flow_1/main.py @@ -5,9 +5,6 @@ from pathlib import Path from xgboost import XGBClassifier from sklearn.metrics import classification_report, confusion_matrix -# ========================================== -# 1. KONFIGURASI PATH (PATHLIB) -# ========================================== SCRIPT_DIR = Path(__file__).resolve().parent PROJECT_ROOT = SCRIPT_DIR.parents[1] DATA_DIR = PROJECT_ROOT / "robust_data" @@ -20,9 +17,6 @@ PATHS = { "le": DATA_DIR / "tokenize" / "label_encoder.pkl", } -# ========================================== -# 2. LOAD DATA -# ========================================== print("\n--- MEMUAT DATA BASELINE ---") data = {} @@ -47,11 +41,6 @@ le = data['le'] print(f"\nDimensi Training (Imbalanced): {X_train.shape}") -# ========================================== -# 3. SETUP MODEL BASELINE -# ========================================== -# Tanpa Grid Search, menggunakan settingan default XGBoost -# Default XGBoost biasanya: learning_rate=0.3, max_depth=6, n_estimators=100 model_baseline = XGBClassifier( objective='multi:softprob', num_class=3, @@ -60,9 +49,6 @@ model_baseline = XGBClassifier( use_label_encoder=False ) -# ========================================== -# 4. EKSEKUSI TRAINING -# ========================================== print("\nšŸ”„ MULAI TRAINING BASELINE (SCENARIO 1)...") start_time = time.time() @@ -71,31 +57,21 @@ model_baseline.fit(X_train, y_train) duration = time.time() - start_time print(f"\nāœ… SELESAI! Waktu proses: {duration:.2f} detik") -# ========================================== -# 5. MENAMPILKAN PARAMETER DEFAULT (BARU) -# ========================================== -# Karena tidak pakai GridSearch, kita ambil parameter langsung dari modelnya print("\n" + "="*40) print("PARAMETER YANG DIGUNAKAN (DEFAULT)") print("="*40) -# Mengambil seluruh parameter model all_params = model_baseline.get_params() -# Kita filter hanya parameter penting untuk dibandingkan dengan Skenario 2 & 3 key_params = ['learning_rate', 'max_depth', 'n_estimators', 'subsample', 'colsample_bytree'] shown_params = {k: all_params.get(k) for k in key_params} -# Jika n_estimators atau learning_rate None (karena default library), kita set nilai standarnya manual untuk info if shown_params['n_estimators'] is None: shown_params['n_estimators'] = "100 (Default)" if shown_params['learning_rate'] is None: shown_params['learning_rate'] = "Default" print(shown_params) print("(Gunakan nilai ini untuk perbandingan di Bab 4)") -# ========================================== -# 6. EVALUASI & SIMPAN -# ========================================== print("\n" + "="*40) print("HASIL SKENARIO 1 (BASELINE)") print("="*40) @@ -111,7 +87,6 @@ print(classification_report(y_test_label, y_pred_label)) print("\nConfusion Matrix:") print(confusion_matrix(y_test_label, y_pred_label)) -# Simpan Model Baseline model_path = SCRIPT_DIR / 'new_xgboost_scenario1.pkl' joblib.dump(model_baseline, model_path) print(f"\nšŸ’¾ Model baseline disimpan ke: {model_path}") \ No newline at end of file diff --git a/src/flow_2/main.py b/src/flow_2/main.py index 32a4c5e..82b43ec 100644 --- a/src/flow_2/main.py +++ b/src/flow_2/main.py @@ -6,9 +6,6 @@ from xgboost import XGBClassifier from sklearn.model_selection import GridSearchCV from sklearn.metrics import classification_report, confusion_matrix -# ========================================== -# 1. KONFIGURASI PATH (PATHLIB) -# ========================================== SCRIPT_DIR = Path(__file__).resolve().parent PROJECT_ROOT = SCRIPT_DIR.parents[1] DATA_DIR = PROJECT_ROOT / "robust_data" @@ -21,9 +18,6 @@ PATHS = { "le": DATA_DIR / "tokenize" / "label_encoder.pkl", } -# ========================================== -# 2. LOAD DATA -# ========================================== print("\n--- MEMUAT DATA SCENARIO 2 ---") data = {} @@ -46,11 +40,6 @@ le = data['le'] print(f"\nDimensi Training (Imbalanced): {X_train.shape}") -# ========================================== -# 3. SETUP GRID SEARCH (SAMA DENGAN SKENARIO 3) -# ========================================== -# Kita gunakan range parameter yang SAMA PERSIS dengan Skenario 3 -# agar perbandingannya adil (apple-to-apple). param_grid = { 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7], @@ -67,7 +56,6 @@ xgb = XGBClassifier( use_label_encoder=False ) -# Gunakan F1-Macro agar Grid Search mencoba adil ke kelas minoritas grid_search = GridSearchCV( estimator=xgb, param_grid=param_grid, @@ -77,9 +65,6 @@ grid_search = GridSearchCV( verbose=1 ) -# ========================================== -# 4. EKSEKUSI TRAINING -# ========================================== print("\nšŸ”„ MULAI TRAINING & GRID SEARCH (SCENARIO 2)...") print("Sedang mencari parameter terbaik untuk data Imbalanced...") start_time = time.time() @@ -89,9 +74,6 @@ grid_search.fit(X_train, y_train) duration = time.time() - start_time print(f"\nāœ… SELESAI! Waktu proses: {duration/60:.2f} menit") -# ========================================== -# 5. EVALUASI & SIMPAN -# ========================================== best_model = grid_search.best_estimator_ print("\n" + "="*40) @@ -114,7 +96,6 @@ print(classification_report(y_test_label, y_pred_label)) print("\nConfusion Matrix:") print(confusion_matrix(y_test_label, y_pred_label)) -# Simpan Model Skenario 2 model_path = SCRIPT_DIR / 'new_model_xgboost_scenario2.pkl' joblib.dump(best_model, model_path) print(f"\nšŸ’¾ Model Skenario 2 disimpan ke: {model_path}") \ No newline at end of file diff --git a/src/flow_3/pipeline.py b/src/flow_3/pipeline.py index d2bcdc4..0f7a4f6 100644 --- a/src/flow_3/pipeline.py +++ b/src/flow_3/pipeline.py @@ -11,9 +11,6 @@ from sklearn.metrics import classification_report, confusion_matrix from imblearn.pipeline import Pipeline as ImbPipeline from imblearn.over_sampling import SMOTE -# ========================================== -# KONFIGURASI PATH -# ========================================== SCRIPT_DIR = Path(__file__).resolve().parent PROJECT_ROOT = SCRIPT_DIR.parents[1] DATA_DIR = PROJECT_ROOT / "robust_data" @@ -28,7 +25,6 @@ PATHS = { print("--- MENYIAPKAN TRAINING SCENARIO 3 (PIPELINE: SMOTE + CHI2 + XGBOOST) ---") -# Load Data data = {} for name, path in PATHS.items(): if not path.exists(): @@ -41,9 +37,6 @@ X_train, y_train = data["X_train"], data["y_train"] X_test, y_test = data["X_test"], data["y_test"] le = data["le"] -# ========================================== -# REPORT PROPORSI DATA (SEBELUM & SESUDAH SMOTE) -# ========================================== print("\n" + "="*40) print("REPORT PROPORSI DATA") print("="*40) @@ -57,16 +50,12 @@ def print_proportion(y, title): print_proportion(y_train, "PROPORSI DATA AWAL (TRAIN)") -# Simulasi SMOTE untuk melihat hasil akhir yang akan diproses Pipeline sm_sim = SMOTE(random_state=42) _, y_resampled_sim = sm_sim.fit_resample(X_train, y_train) print_proportion(y_resampled_sim, "ESTIMASI PROPORSI SETELAH SMOTE (DALAM PIPELINE)") print("\n" + "="*40) -# ========================================== -# DEFINISI PIPELINE -# ========================================== pipeline = ImbPipeline([ ('smote', SMOTE(random_state=42)), ('selector', SelectKBest(score_func=chi2, k=2000)), @@ -79,9 +68,6 @@ pipeline = ImbPipeline([ )) ]) -# ========================================== -# SETTING GRID SEARCH -# ========================================== # param_grid = { # 'clf__learning_rate': [0.1, 0.2], # 'clf__max_depth': [5, 7], @@ -106,9 +92,6 @@ grid_search = GridSearchCV( verbose=2 ) -# ========================================== -# EKSEKUSI TRAINING -# ========================================== print(f"\nšŸ”„ MULAI TRAINING... (Dimensi Awal: {X_train.shape})") start_time = time.time() @@ -117,9 +100,6 @@ grid_search.fit(X_train, y_train) duration = time.time() - start_time print(f"\nāœ… SELESAI! Waktu proses: {duration/60:.2f} menit") -# ========================================== -# EVALUASI -# ========================================== best_model = grid_search.best_estimator_ print("\n" + "="*40) @@ -129,7 +109,6 @@ print(grid_search.best_params_) y_pred = best_model.predict(X_test) -# Inverse Transform Label y_test_label = le.inverse_transform(y_test) y_pred_label = le.inverse_transform(y_pred) @@ -139,9 +118,6 @@ print(classification_report(y_test_label, y_pred_label)) print("\nConfusion Matrix:") print(confusion_matrix(y_test_label, y_pred_label)) -# ========================================== -# SIMPAN MODEL -# ========================================== MODEL_DIR = PROJECT_ROOT / "models" MODEL_DIR.mkdir(exist_ok=True) model_path = MODEL_DIR / "xgboost_scenario3.pkl" diff --git a/src/flow_3/process_chisquare.py b/src/flow_3/process_chisquare.py index e378502..e55d9b3 100644 --- a/src/flow_3/process_chisquare.py +++ b/src/flow_3/process_chisquare.py @@ -2,82 +2,58 @@ import joblib import os from sklearn.feature_selection import SelectKBest, chi2 -# ========================================== -# KONFIGURASI -# ========================================== base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) -# Input (Kita butuh Data Train hasil SMOTE dan Data Test asli) input_X_train = 'new_X_train_smote.pkl' input_y_train = 'new_y_train_smote.pkl' -input_X_test = 'X_test_tfidf.pkl' # Test set asli (belum diapa-apakan selain TFIDF) +input_X_test = 'X_test_tfidf.pkl' -# Output -# output_X_train = 'data/chi2/X_train_chi2.pkl' -# output_X_test = 'data/chi2/X_test_chi2.pkl' -# output_selector = 'data/chi2/chisquare_selector.pkl' # Simpan logikanya output_X_train = 'X_train_chi2.pkl' output_X_test = 'X_test_chi2.pkl' -output_selector = 'chisquare_selector.pkl' # Simpan logikanya +output_selector = 'chisquare_selector.pkl' -# JUMLAH FITUR YANG INGIN DIAMBIL (Parameter K) -# Silakan ubah angka ini. 1000 adalah angka start yang bagus untuk Skripsi S1. -# Jika fitur awal Anda < 1000, ubah jadi 'all' atau angka lebih kecil (misal 500). K_FEATURES = 1000 print("--- MEMULAI FEATURE SELECTION (CHI-SQUARE) ---") try: - # 1. Load Data print("1. Memuat data...") - # Load Train (SMOTE) X_train = joblib.load(os.path.join(base_dir, input_X_train)) y_train = joblib.load(os.path.join(base_dir, input_y_train)) - # Load Test (TF-IDF Asli) - # Kita butuh ini agar dimensi Test sama dengan Train nanti X_test = joblib.load(os.path.join(base_dir, input_X_test)) print(f" - Dimensi Awal Train: {X_train.shape}") print(f" - Dimensi Awal Test: {X_test.shape}") - # Cek jumlah fitur total total_features = X_train.shape[1] print(f" - Total kata/fitur saat ini: {total_features}") - # Validasi K if isinstance(K_FEATURES, int) and K_FEATURES > total_features: print(f" āš ļø WARNING: Target k={K_FEATURES} lebih besar dari total fitur ({total_features}). Mengambil semua fitur.") k_final = 'all' else: k_final = K_FEATURES - # 2. Proses Chi-Square print(f"\n2. Menjalankan Chi-Square (Mengambil Top {k_final} Fitur)...") - # Inisialisasi SelectKBest dengan skor func chi2 selector = SelectKBest(score_func=chi2, k=k_final) - # FIT hanya pada Data Train! (Pelajari mana kata penting dari data latih) selector.fit(X_train, y_train) - # TRANSFORM pada Train DAN Test X_train_selected = selector.transform(X_train) X_test_selected = selector.transform(X_test) - # 3. Validasi Hasil print("\n3. Hasil Seleksi:") print(f" - Dimensi Train Baru: {X_train_selected.shape}") print(f" - Dimensi Test Baru: {X_test_selected.shape}") - # Menampilkan beberapa skor fitur (opsional, untuk info saja) print(" - Proses seleksi selesai. Dimensi kolom (fitur) telah berkurang.") - # 4. Simpan Data print("\n4. Menyimpan hasil...") joblib.dump(X_train_selected, output_X_train) joblib.dump(X_test_selected, output_X_test) - joblib.dump(selector, output_selector) # Penting untuk prediksi data baru nanti + joblib.dump(selector, output_selector) print("="*40) print(f"SUKSES! Data siap untuk Training XGBoost.") diff --git a/src/flow_3/process_smote.py b/src/flow_3/process_smote.py index 84e9199..4b48ccc 100644 --- a/src/flow_3/process_smote.py +++ b/src/flow_3/process_smote.py @@ -4,58 +4,44 @@ from imblearn.over_sampling import SMOTE from collections import Counter import os -# ========================================== -# KONFIGURASI -# ========================================== -# Gunakan relative path agar aman (sama seperti sebelumnya) base_dir = os.path.dirname(os.path.abspath(__file__)) -# Input files (hasil dari TF-IDF sebelumnya) input_X = 'X_train_tfidf.pkl' input_y = 'y_train.pkl' -# Output files (hasil SMOTE) output_X = 'new_X_train_smote.pkl' output_y = 'new_y_train_smote.pkl' print("--- MEMULAI PROSES SMOTE (Skenario 3) ---") try: - # 1. Load Data TF-IDF (Data Latih Saja) print("1. Memuat data latih TF-IDF...") - # Cek apakah file ada di folder yang sama atau perlu path khusus if os.path.exists(os.path.join(base_dir, input_X)): X_train = joblib.load(os.path.join(base_dir, input_X)) y_train = joblib.load(os.path.join(base_dir, input_y)) else: - # Fallback jika file ada di current directory X_train = joblib.load(input_X) y_train = joblib.load(input_y) print(f" - Dimensi Awal: {X_train.shape}") print(f" - Distribusi Kelas Awal: {Counter(y_train)}") - # Contoh output: {0: 1964, 1: 485, 2: 303} (tergantung mapping label encoder) - # 2. Eksekusi SMOTE print("\n2. Menjalankan SMOTE (Synthetic Minority Over-sampling)...") print(" (Sedang membuat data sintetis untuk kelas minoritas...)") smote = SMOTE(random_state=42) X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train) - # 3. Validasi Hasil print("\n3. Validasi Hasil SMOTE:") print(f" - Dimensi Setelah SMOTE: {X_train_resampled.shape}") print(f" - Distribusi Kelas Baru: {Counter(y_train_resampled)}") - # Pastikan semua kelas jumlahnya sama counts = list(Counter(y_train_resampled).values()) if len(set(counts)) == 1: print(" āœ… SUCCESS: Dataset sekarang SEIMBANG!") else: print(" āš ļø WARNING: Dataset belum seimbang sempurna.") - # 4. Simpan Data SMOTE print("\n4. Menyimpan data hasil SMOTE...") joblib.dump(X_train_resampled, output_X) joblib.dump(y_train_resampled, output_y) diff --git a/src/mining/main.py b/src/mining/main.py index 15a5f94..711b8fd 100644 --- a/src/mining/main.py +++ b/src/mining/main.py @@ -14,16 +14,11 @@ from nltk.corpus import stopwords from nltk.tokenize import word_tokenize from nltk.stem import WordNetLemmatizer import os -# Download NLTK resources (Cukup sekali run) -# nltk.download('punkt') -# nltk.download('stopwords') -# nltk.download('wordnet') class ReviewScraper: def __init__(self): options = Options() - # options.add_argument("--headless") options.add_argument("--start-maximized") options.add_argument("--disable-blink-features=AutomationControlled") options.add_experimental_option( @@ -43,7 +38,6 @@ class ReviewScraper: def get_review_data(self, container, source_url) -> dict: try: - # 1. Username username = "Anonymous" user_elem = container.find( 'span', attrs={'data-testid': 'proName'}) @@ -53,7 +47,6 @@ class ReviewScraper: if user_elem: username = user_elem.text - # 2. Rating (Ambil dari aria-label bintang) rating = "5" rating_elem = container.find( 'div', attrs={'data-testid': 'icnStarRating'}) @@ -63,7 +56,6 @@ class ReviewScraper: except: pass - # 3. Ulasan Text ulasan = "" ulasan_elem = container.find( 'span', attrs={'data-testid': 'lblItemUlasan'}) @@ -72,7 +64,6 @@ class ReviewScraper: if ulasan_elem: ulasan = ulasan_elem.text - # Fallback jika ulasan kosong if not ulasan: paragraphs = container.find_all('p') for p in paragraphs: @@ -80,7 +71,6 @@ class ReviewScraper: ulasan = p.text break - # 4. Tanggal waktu_komentar = "Unknown" date_elem = container.find( 'p', class_=re.compile(r'timestamp|date', re.I)) @@ -92,7 +82,6 @@ class ReviewScraper: waktu_komentar = span.text break - # VALIDASI: Jangan simpan jika kosong if not ulasan: return None @@ -118,13 +107,10 @@ class ReviewScraper: """ print(f" ...Mencoba {action} filter Bintang {rating}...") - # Scroll agar elemen masuk viewport self.driver.execute_script("window.scrollBy(0, 400);") time.sleep(1) - # STRATEGI XPATH strategies = [ - # Spesifik Tokped baru f"//label[contains(@for, 'rating') and .//text()='{rating}']", f"//label[.//text()='{rating}' and .//*[name()='img' or name()='svg']]", f"//*[text()='Rating']/ancestor::div[2]//label[contains(., '{rating}')]", @@ -134,38 +120,31 @@ class ReviewScraper: for attempt in range(max_retries): found_element = None - # Coba cari elemen dengan salah satu strategi for xpath in strategies: try: - # Timeout dipendekkan ke 2 detik agar cepat skip jika tidak ada found_element = WebDriverWait(self.driver, 2).until( EC.presence_of_element_located((By.XPATH, xpath)) ) - # Cek apakah visible & clickable if found_element.is_displayed(): - # Cek apakah disabled (kelas CSS atau atribut) if "disabled" in found_element.get_attribute("class") or found_element.get_attribute("disabled"): print( f" [SKIP] Filter Bintang {rating} ada tapi DISABLED (Non-aktif).") return False - # Jika elemen ketemu, siap diklik break else: - found_element = None # Ketemu di DOM tapi hidden + found_element = None except TimeoutException: - continue # Coba strategi xpath berikutnya + continue - # HASIL PENCARIAN if found_element: try: - # KLIK! self.driver.execute_script( "arguments[0].click();", found_element) print( f" [SUKSES] Filter Bintang {rating} berhasil di-{action}!") - time.sleep(3) # Tunggu loading data + time.sleep(3) return True except Exception as click_error: if attempt < max_retries - 1: @@ -178,8 +157,6 @@ class ReviewScraper: f" [ERROR] Gagal klik filter setelah retry: {click_error}") return False else: - # PENTING: Jika di attempt pertama tidak ketemu di semua strategi, - # asumsikan filter TIDAK ADA. Jangan retry. print( f" [SKIP] Filter Bintang {rating} TIDAK DITEMUKAN (Mungkin 0 ulasan). Lanjut.") return False @@ -199,7 +176,6 @@ class ReviewScraper: containers = soup.find_all("article") if not containers: - # Double check: kadang loading lambat time.sleep(2) soup = BeautifulSoup(self.driver.page_source, "html.parser") containers = soup.find_all( @@ -214,7 +190,6 @@ class ReviewScraper: for container in containers: review_data = self.get_review_data(container, url) if review_data: - # Validasi Rating sesuai Filter if current_rating_context != "ALL" and review_data['Rating'] != current_rating_context: continue @@ -229,12 +204,11 @@ class ReviewScraper: else: print(f" . Halaman {page_number} tidak ada data baru.") empty_page_count += 1 - if empty_page_count >= 2: # Stop jika 2 halaman berturut-turut zonk + if empty_page_count >= 2: print( " [STOP] 2 halaman tanpa data baru. Pindah filter.") break - # Navigasi Next Button try: next_button = self.driver.find_element( By.CSS_SELECTOR, "button[aria-label^='Laman berikutnya']") @@ -256,35 +230,27 @@ class ReviewScraper: self.driver.execute_script("window.scrollBy(0, 800);") time.sleep(2) - # TARGET: Negatif (1,2) & Netral (3) target_filters = ['1', '2', '3'] for rating in target_filters: - # 1. KLIK FILTER success = self.toggle_filter(rating, action="CHECK") if success: - # 2. SCRAPE self.scrape_pages_current_view( url, current_rating_context=rating) - # 3. UNCHECK (PENTING: Gunakan logic toggle yang sama) - # Scroll dikit ke atas biar tombol filter kelihatan lagi self.driver.execute_script("window.scrollBy(0, -300);") time.sleep(1) uncheck_success = self.toggle_filter( rating, action="UNCHECK") if not uncheck_success: - # Jika gagal uncheck, refresh page adalah jalan ninja print( " [REFRESH] Gagal uncheck, refresh halaman untuk reset filter...") self.driver.refresh() time.sleep(4) self.driver.execute_script("window.scrollBy(0, 800);") else: - # Jika toggle CHECK gagal/tidak ketemu -> LANJUT ke rating berikutnya - # Tidak perlu scrape, tidak perlu uncheck continue time.sleep(1) @@ -306,23 +272,18 @@ class ReviewScraper: self.driver.quit() if self.data: - # 1. Siapkan Data Baru df_new = pd.DataFrame(self.data) df_new = self.label_data(df_new) filename = 'new_dataset_fix_balanced.csv' - # 2. Cek apakah file sudah ada (Smart Merge Logic) if os.path.exists(filename): try: print(f"\n[INFO] File '{filename}' ditemukan. Membaca data lama...") df_old = pd.read_csv(filename) - # Gabungkan data lama dan baru df_combined = pd.concat([df_old, df_new], ignore_index=True) - # 3. Hapus Duplikat - # Kita anggap duplikat jika Username, Review (yang sudah dibersihkan), dan Tanggal sama persis total_before = len(df_combined) df_combined.drop_duplicates(subset=['Username', 'Cleaned_Review', 'Date'], keep='first', inplace=True) total_after = len(df_combined) @@ -339,7 +300,6 @@ class ReviewScraper: print(f"\n[INFO] File '{filename}' belum ada. Membuat file baru.") df_final = df_new - # 4. Simpan Hasil Akhir print("\n=== TOTAL DATASET SETELAH UPDATE ===") print(df_final['Sentiment'].value_counts()) diff --git a/trim_dataset.py b/trim_dataset.py index 671e2b6..05e6313 100644 --- a/trim_dataset.py +++ b/trim_dataset.py @@ -1,24 +1,17 @@ import pandas as pd -# 1. Load data df = pd.read_csv('new_final_dataset.csv') -# 2. Pisahkan tiap kelas df_pos = df[df['Sentiment'] == 'positif'] df_neg = df[df['Sentiment'] == 'negatif'] df_net = df[df['Sentiment'] == 'netral'] -# 3. Hitung target (Jumlah Negatif + Netral) -target_count = len(df_neg) + len(df_net) # Hasilnya 1622 +target_count = len(df_neg) + len(df_net) -# 4. Ambil sampel acak dari kelas positif sebanyak target_count df_pos_trimmed = df_pos.sample(n=target_count, random_state=42) -# 5. Gabungkan kembali semua data df_final = pd.concat([df_pos_trimmed, df_neg, df_net]) -# 6. Acak urutan data agar tidak mengumpul df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True) -# Simpan hasil df_final.to_csv('trimmed_sentiment_dataset.csv', index=False) \ No newline at end of file diff --git a/visualize_confusion.py b/visualize_confusion.py index b6638bd..fca9da7 100644 --- a/visualize_confusion.py +++ b/visualize_confusion.py @@ -3,29 +3,22 @@ import seaborn as sns import numpy as np from sklearn.metrics import confusion_matrix -# Data Confusion Matrix dari Skenario 3 (Pipeline + SMOTE) -# Baris: Aktual, Kolom: Prediksi data_cm = np.array([ - [146, 34, 19], # Aktual Negatif - [60, 36, 28], # Aktual Netral - [29, 16, 280] # Aktual Positif + [146, 34, 19], + [60, 36, 28], + [29, 16, 280] ]) -# Label kategori labels = ['Negatif', 'Netral', 'Positif'] -# Membuat plot plt.figure(figsize=(8, 6)) -sns.set(font_scale=1.2) # Mengatur ukuran font +sns.set(font_scale=1.2) -# Membuat heatmap ax = sns.heatmap(data_cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels) -# Menambahkan label dan judul plt.xlabel('Prediksi', fontsize=14, labelpad=15) plt.ylabel('Aktual', fontsize=14, labelpad=15) plt.title('Confusion Matrix Skenario 1 (Baseline)', fontsize=16, pad=20) -# Menampilkan plot plt.show() \ No newline at end of file diff --git a/visualize_scenario.py b/visualize_scenario.py index b83ae74..3b52426 100644 --- a/visualize_scenario.py +++ b/visualize_scenario.py @@ -1,7 +1,6 @@ import matplotlib.pyplot as plt import numpy as np -# Data dari hasil eksperimen Anda scenarios = ['Skenario 1\n(Baseline)', 'Skenario 2\n(Tuned)', 'Skenario 3\n(Full Optimized)'] accuracy = [0.78, 0.79, 0.77] macro_f1 = [0.65, 0.66, 0.66] @@ -12,12 +11,10 @@ width = 0.25 fig, ax = plt.subplots(figsize=(10, 6)) -# Membuat bar chart rects1 = ax.bar(x - width, accuracy, width, label='Accuracy', color='#3498db') rects2 = ax.bar(x, macro_f1, width, label='Macro Avg F1-Score', color='#2ecc71') rects3 = ax.bar(x + width, recall_netral, width, label='Recall Netral', color='#e74c3c') -# Menambahkan teks dan label ax.set_ylabel('Scores') ax.set_title('Perbandingan Performa Model XGBoost antar Skenario') ax.set_xticks(x) @@ -25,7 +22,6 @@ ax.set_xticklabels(scenarios) ax.legend(loc='lower right') ax.set_ylim(0, 1.0) -# Menambahkan label angka di atas bar def autolabel(rects): for rect in rects: height = rect.get_height()