import os
import sys
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"

# Configure Chrome to look like a regular browser session so the automated
# visit is less likely to be flagged.
chrome_options = Options()
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
chrome_options.add_argument(f"user-agent={user_agent}")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920,1080")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)

all_research = []

if len(sys.argv) < 2:
    raise ValueError("An affiliation ID must be provided as an argument.")
affiliation_id = sys.argv[1]

try:
    # Open the login page and give the user time to log in manually.
    browser.get("https://sinta.kemdikbud.go.id/logins")
    browser.implicitly_wait(10)

    login_wait_timeout = 50  # seconds
    elapsed = 0
    while elapsed < login_wait_timeout:
        if "profile" in browser.current_url:
            print("\n[SUCCESS] Logged in, continuing to scrape research data...")
            break
        print(
            f"[WAIT] Waiting for login... ({login_wait_timeout - elapsed} seconds left)",
            end="\r",
        )
        time.sleep(2)
        elapsed += 2
    else:
        # The while loop timed out without hitting break, i.e. no login detected.
        print(
            f"\n[WARNING] Not logged in after {login_wait_timeout} seconds, "
            "scraping anyway..."
        )

    url = (
        "https://sinta.kemdikbud.go.id/affiliations/profile/"
        f"{affiliation_id}/?view=researches"
    )
    browser.get(url)
    time.sleep(3)

    page = 1
    while True:
        print(f"\nScraping page {page}...")
        try:
            research_items = WebDriverWait(browser, 20).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "div.ar-list-item.mb-5")
                )
            )
        except TimeoutException:
            print("No data on this page, stopping the scrape.")
            break

        for item in research_items:
            try:
                title = item.find_element(By.CSS_SELECTOR, "div.ar-title").text.strip()

                # The first link in the meta row is the research leader.
                leader_elements = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
                leader = leader_elements[0].text.strip() if leader_elements else "No Leader"

                # Team members are the meta links that point to author profiles.
                personnel_elements = item.find_elements(
                    By.CSS_SELECTOR, "div.ar-meta a[href*='authors/profile']"
                )
                personnel_list = (
                    "; ".join(p.text.strip() for p in personnel_elements)
                    if personnel_elements
                    else "No Personnel"
                )

                research_type = item.find_element(By.CSS_SELECTOR, "a.ar-pub").text.strip()
                year = item.find_element(By.CSS_SELECTOR, "a.ar-year").text.strip().replace("📅 ", "")
                fund = item.find_element(By.CSS_SELECTOR, "a.ar-quartile").text.strip().replace("📊 ", "")

                all_research.append([title, leader, personnel_list, research_type, year, fund])
                print(title)
            except Exception as e:
                print(f"Failed to parse item: {e}")

        # Pagination: the next page is a link whose text is the page number.
        try:
            next_button = browser.find_element(By.LINK_TEXT, str(page + 1))
            next_button.click()
            WebDriverWait(browser, 10).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "div.ar-list-item.mb-5")
                )
            )
            page += 1
            time.sleep(2)
        except (NoSuchElementException, TimeoutException):
            print("No next page, scraping finished.")
            break
finally:
    # Save the data to Excel even if it is empty or an error occurred.
    folder_path = "D:\\lecturertask"
    base_filename = "data_penelitian"
    file_ext = ".xlsx"

    # Find the highest-numbered data_penelitian<N>.xlsx already in the folder
    # so each run writes to a fresh file instead of overwriting the last one.
    existing_files = [
        f for f in os.listdir(folder_path)
        if f.startswith(base_filename) and f.endswith(file_ext)
    ]
    max_num = 0
    for filename in existing_files:
        try:
            num = int(filename.replace(base_filename, "").replace(file_ext, ""))
            if num > max_num:
                max_num = num
        except ValueError:
            # File name has no numeric suffix; skip it.
            pass

    next_num = max_num + 1
    new_filename = f"{base_filename}{next_num}{file_ext}"
    save_path = os.path.join(folder_path, new_filename)

    df = pd.DataFrame(
        all_research,
        columns=["Title", "Leader", "Personnel", "Research Type", "Year", "Funding"],
    )
    df.to_excel(save_path, index=False, engine="openpyxl")
    print(f"\nScraping finished! {len(all_research)} records saved to '{save_path}'")

    browser.quit()
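
# Usage note (the script filename below is illustrative; save it under any name):
#   python sinta_research_scraper.py <affiliation_id>
# <affiliation_id> is the numeric ID taken from the affiliation's SINTA profile
# URL, i.e. https://sinta.kemdikbud.go.id/affiliations/profile/<affiliation_id>
# The script pauses up to login_wait_timeout seconds so you can log in by hand
# before it starts scraping.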