import os
import random
import sys
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Get the affiliation ID from the command-line argument
if len(sys.argv) < 2:
    raise ValueError("An affiliation ID must be provided as an argument.")

affiliation_id = sys.argv[1]
print(f"[INFO] Using affiliation ID: {affiliation_id}")

# Chrome configuration
chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)

# Manual login: open the SINTA login page and poll the URL until it changes
# to a profile page, or give up after the timeout and scrape anyway.
browser.get("https://sinta.kemdikbud.go.id/logins")
print("[INFO] Please log in using the browser window that just opened...")
login_wait_timeout = 180  # seconds
elapsed = 0
while elapsed < login_wait_timeout:
    if "profile" in browser.current_url:
        print("\n[SUCCESS] Login successful, starting to scrape...")
        break
    print(f"[WAIT] Waiting for login... ({login_wait_timeout - elapsed} seconds left)", end="\r")
    time.sleep(2)
    elapsed += 2
else:
    # The while-else branch runs only if the loop ended without a break.
    print("\n[WARNING] Still not logged in after 3 minutes, scraping anyway...")

# Scrape the paginated Scopus publication list for the affiliation
base_url = (
    f"https://sinta.kemdikbud.go.id/affiliations/profile/{affiliation_id}"
    "?page={}&view=scopus"
)
all_scopus = []
page = 1
try:
    while True:
        browser.get(base_url.format(page))
        time.sleep(random.uniform(3, 5))  # randomized delay to avoid hammering the server
        print(f"Scraping page {page}...")

        items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")
        if not items:
            print("No more data. Done.")
            break

        for item in items:
            try:
                title = item.find_element(By.CSS_SELECTOR, "div.ar-title a").text.strip()
                quartile = item.find_element(By.CSS_SELECTOR, "a.ar-quartile").text.strip()
                journal = item.find_element(By.CSS_SELECTOR, "a.ar-pub").text.strip()

                # The creator name sits in one of the meta rows; scan the
                # anchors until the "Creator :" label is found.
                creator = "-"
                for meta in item.find_elements(By.CSS_SELECTOR, "div.ar-meta"):
                    for a in meta.find_elements(By.TAG_NAME, "a"):
                        if "Creator :" in a.text:
                            creator = a.text.replace("Creator :", "").strip()
                            break
                    if creator != "-":
                        break  # stop scanning once the creator has been found

                year = item.find_element(By.CSS_SELECTOR, "a.ar-year").text.strip()
                cited = item.find_element(By.CSS_SELECTOR, "a.ar-cited").text.strip()

                all_scopus.append([title, quartile, journal, creator, year, cited])
                print(f"{title} | {creator} ({year})")
            except Exception as e:
                print(f"Error extracting item data: {e}")

        page += 1
        time.sleep(random.uniform(2.5, 5))
except KeyboardInterrupt:
    print("\n[INTERRUPTED] Scraping stopped by the user (Ctrl+C or Chrome closed).")
finally:
    print("\n[SAVING DATA] Saving the results collected so far to Excel...")
    folder_path = "D:\\lecturertask"
    base_filename = "data_scopus"
    file_ext = ".xlsx"

    # Pick the next free data_scopusN.xlsx filename so earlier runs are not overwritten.
    existing_files = [
        f for f in os.listdir(folder_path)
        if f.startswith(base_filename) and f.endswith(file_ext)
    ]
    existing_nums = [
        int(f.replace(base_filename, "").replace(file_ext, ""))
        for f in existing_files
        if f.replace(base_filename, "").replace(file_ext, "").isdigit()
    ]
    next_num = max(existing_nums + [0]) + 1
    save_path = os.path.join(folder_path, f"{base_filename}{next_num}{file_ext}")

    if all_scopus:
        df = pd.DataFrame(
            all_scopus,
            columns=["Title", "Quartile", "Journal", "Creator", "Year", "Cited"],
        )
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(f"[SAVED] Data saved to '{save_path}' ({len(all_scopus)} items)")
    else:
        print("[INFO] No data to save.")

    # Close the browser even if the user already closed it.
    try:
        browser.quit()
    except Exception:
        pass
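
# Example invocation (a sketch; the filename "scrape_sinta.py" and the ID "1234"
# are assumptions, not part of the original source). The only argument is the
# SINTA affiliation ID whose Scopus publications should be scraped:
#
#     python scrape_sinta.py 1234
#
# The script opens Chrome, waits up to 3 minutes for a manual login on
# sinta.kemdikbud.go.id, walks the paginated Scopus list, and writes the rows
# to the next free D:\lecturertask\data_scopusN.xlsx file.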