"""Scrape HKI (intellectual-property) records from a SINTA affiliation page.

Opens a visible Chrome window, gives the user up to 50 seconds to log in
manually, then pages through the affiliation's IPR listing
(``?view=iprs``) and saves every record to a sequentially numbered Excel
file under ``D:\\lecturertask`` (data_hki1.xlsx, data_hki2.xlsx, ...).
"""

import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"

chrome_options = Options()
# FIX: original UA contained "seperti Gecko" (Indonesian for "like").
# Servers fingerprint on the literal token "like Gecko"; the garbled UA
# made the spoofed agent stand out instead of blending in.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
chrome_options.add_argument(f"user-agent={user_agent}")
# Hide the "Chrome is being controlled by automated software" banner and
# the automation extension, to reduce bot detection.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
# FIX: Chrome's switch takes "width,height" — the original "1920x1080" was
# silently ignored.
chrome_options.add_argument("--window-size=1920,1080")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)

# Collected rows: [title, inventor, publication, year, application no., patent type]
all_hki = []


def _field(item, selector, prefix=""):
    """Return the stripped text of the first *selector* match inside *item*,
    with *prefix* removed; '-' when the element is missing or empty."""
    try:
        text = item.find_element(By.CSS_SELECTOR, selector).text.strip()
    except NoSuchElementException:
        return "-"
    if prefix:
        text = text.replace(prefix, "")
    return text if text else "-"


try:
    # Manual-login phase: open the login page and poll the URL every 2 s
    # until it redirects to a profile page (or the timeout elapses).
    browser.get("https://sinta.kemdikbud.go.id/logins")
    browser.implicitly_wait(10)

    login_wait_timeout = 50  # seconds
    elapsed = 0
    while elapsed < login_wait_timeout:
        if "profile" in browser.current_url:
            print("\n[SUCCESS] Login sukses, lanjut scraping HKI...")
            break
        print(
            f"[WAIT] Menunggu login... ({login_wait_timeout - elapsed} detik tersisa)",
            end='\r',
        )
        time.sleep(2)
        elapsed += 2
    else:  # loop exhausted without break — continue unauthenticated
        print("\n[WARNING] Belum login setelah waktu tunggu, lanjut scraping saja...")

    base_url = "https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=iprs"
    page = 1
    while True:
        print(f"\nScraping halaman {page}...")
        browser.get(f"{base_url}&page={page}" if page > 1 else base_url)
        time.sleep(3)  # give the page a moment to render

        try:
            hki_items = WebDriverWait(browser, 20).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "div.ar-list-item.mb-5")
                )
            )
        except TimeoutException:  # FIX: was a bare except
            print("Tidak ada data di halaman ini, scraping dihentikan.")
            break

        for item in hki_items:
            try:
                title = _field(item, "div.ar-title a")
                meta = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
                inventor = (
                    meta[0].text.strip().replace("Inventor : ", "")
                    if len(meta) > 0 and meta[0].text.strip() else "-"
                )
                publikasi = (
                    meta[1].text.strip()
                    if len(meta) > 1 and meta[1].text.strip() else "-"
                )
                year = _field(item, "a.ar-year", "📅 ")
                nomor_permohonan = _field(
                    item, "a.ar-cited", "🗒 Nomor Permohonan : "
                )
                jenis_paten = _field(item, "a.ar-quartile", "📊 ")
                all_hki.append(
                    [title, inventor, publikasi, year, nomor_permohonan, jenis_paten]
                )
            except Exception as e:
                # Best-effort per item: log and keep going with the rest.
                print(f"Gagal parsing item HKI: {e}")

        # Advance only while a pagination link for the next page number exists.
        try:
            browser.find_element(By.LINK_TEXT, str(page + 1))
            page += 1
        except NoSuchElementException:  # FIX: was a bare except
            print("Tidak ada halaman berikutnya, scraping selesai.")
            break
finally:
    # Persist whatever was collected — even on Ctrl-C or a mid-run crash —
    # and close the browser even if saving itself fails (FIX: quit() was
    # previously skipped when any save step raised).
    try:
        folder_path = "D:\\lecturertask"
        base_filename = "data_hki"
        file_ext = ".xlsx"
        existing_files = [
            f for f in os.listdir(folder_path)
            if f.startswith(base_filename) and f.endswith(file_ext)
        ]
        # Highest existing numeric suffix, so an earlier export is never
        # overwritten.
        max_num = 0
        for filename in existing_files:
            try:
                num = int(
                    filename.replace(base_filename, "").replace(file_ext, "")
                )
                max_num = max(max_num, num)
            except ValueError:  # FIX: was a bare except; e.g. "data_hki.xlsx" -> int("")
                pass
        new_filename = f"{base_filename}{max_num + 1}{file_ext}"
        save_path = os.path.join(folder_path, new_filename)

        df = pd.DataFrame(
            all_hki,
            columns=[
                "Judul", "Inventor", "Publikasi",
                "Tahun", "Nomor Permohonan", "Jenis Paten",
            ],
        )
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(
            f"\nScraping selesai! {len(all_hki)} data HKI disimpan di '{save_path}'"
        )
    finally:
        browser.quit()