# Original listing metadata: 126 lines, 4.9 KiB, Python.
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
CHROME_DRIVER_PATH = "D:\\lecturertask\\chromedriver.exe"

# Desktop Chrome user agent sent with every request. The token must read
# "like Gecko"; a translated variant does not match any real browser.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

LOGIN_URL = "https://sinta.kemdikbud.go.id/logins"
BASE_URL = "https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=iprs"

LOGIN_WAIT_TIMEOUT = 50  # seconds to wait for the user to log in manually
LOGIN_POLL_INTERVAL = 2  # seconds between checks of the current URL

OUTPUT_FOLDER = "D:\\lecturertask"
OUTPUT_BASENAME = "data_hki"
OUTPUT_EXT = ".xlsx"
OUTPUT_COLUMNS = [
    "Judul",
    "Inventor",
    "Publikasi",
    "Tahun",
    "Nomor Permohonan",
    "Jenis Paten",
]

# Placeholder written to the sheet when a field is missing or blank.
EMPTY_VALUE = "-"


def build_browser(driver_path=CHROME_DRIVER_PATH):
    """Create the Chrome WebDriver used for the scraping session.

    Args:
        driver_path: Filesystem path of the chromedriver executable.

    Returns:
        A started ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
    """
    chrome_options = Options()
    chrome_options.add_argument(f"user-agent={USER_AGENT}")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    # Chrome expects "width,height" here; "1920x1080" is not a valid value.
    chrome_options.add_argument("--window-size=1920,1080")
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)


def wait_for_login(browser, timeout=LOGIN_WAIT_TIMEOUT, poll=LOGIN_POLL_INTERVAL):
    """Poll the browser until its URL contains "profile" or time runs out.

    The user logs in by hand in the opened window; this only watches the URL.

    Args:
        browser: Object exposing a ``current_url`` attribute.
        timeout: Total number of seconds to keep polling.
        poll: Seconds to sleep between two checks.

    Returns:
        True when a "profile" URL was seen, False when the timeout elapsed.
        Scraping continues in both cases, so the result is informational.
    """
    elapsed = 0
    while elapsed < timeout:
        if "profile" in browser.current_url:
            print("\n[SUCCESS] Login sukses, lanjut scraping HKI...")
            return True
        # '\r' keeps the countdown on a single console line.
        print(f"[WAIT] Menunggu login... ({timeout - elapsed} detik tersisa)", end='\r')
        time.sleep(poll)
        elapsed += poll
    print("\n[WARNING] Belum login setelah waktu tunggu, lanjut scraping saja...")
    return False


def clean_field(text, prefix=""):
    """Normalise one scraped text field.

    Args:
        text: Raw text read from the page element.
        prefix: Optional label (for example ``"Inventor : "``) to remove.

    Returns:
        The trimmed text without the label, or ``"-"`` when nothing is left.
    """
    value = text.strip()
    if prefix:
        if value == prefix.strip():
            # Only the label is present (its trailing space was trimmed away).
            value = ""
        else:
            value = value.replace(prefix, "").strip()
    return value or EMPTY_VALUE


def child_text(item, selector, prefix=""):
    """Return the cleaned text of the first child matching *selector*.

    A missing child is not an error: the placeholder ``"-"`` is returned.
    """
    try:
        raw = item.find_element(By.CSS_SELECTOR, selector).text
    except NoSuchElementException:
        return EMPTY_VALUE
    return clean_field(raw, prefix)


def parse_hki_item(item):
    """Extract one table row from a single list-item element.

    Args:
        item: WebElement matching ``div.ar-list-item.mb-5``.

    Returns:
        ``[title, inventor, publikasi, year, nomor_permohonan, jenis_paten]``,
        in the same order as ``OUTPUT_COLUMNS``.
    """
    title = child_text(item, "div.ar-title a")

    # The first two meta links are read as inventor and publication, in that
    # order; either may be absent.
    meta = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
    inventor = clean_field(meta[0].text, "Inventor : ") if len(meta) > 0 else EMPTY_VALUE
    publikasi = clean_field(meta[1].text) if len(meta) > 1 else EMPTY_VALUE

    year = child_text(item, "a.ar-year", "📅 ")
    nomor_permohonan = child_text(item, "a.ar-cited", "🗒 Nomor Permohonan : ")
    jenis_paten = child_text(item, "a.ar-quartile", "📊 ")

    return [title, inventor, publikasi, year, nomor_permohonan, jenis_paten]


def scrape_hki(browser, rows, base_url=BASE_URL):
    """Walk the paginated list and append one row per item to *rows*.

    Rows are appended in place so that whatever was collected before an
    unexpected error can still be saved by the caller.

    Args:
        browser: Active WebDriver.
        rows: List that receives the parsed rows.
        base_url: URL of the first page; later pages add ``&page=N``.

    Returns:
        The same *rows* list.
    """
    page = 1
    while True:
        print(f"\nScraping halaman {page}...")
        browser.get(f"{base_url}&page={page}" if page > 1 else base_url)
        time.sleep(3)

        try:
            hki_items = WebDriverWait(browser, 20).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.ar-list-item.mb-5"))
            )
        except TimeoutException:
            # Only a timeout means "no items"; other errors must surface.
            print("Tidak ada data di halaman ini, scraping dihentikan.")
            break

        for item in hki_items:
            # Best effort per item: one malformed entry must not stop the run.
            try:
                row = parse_hki_item(item)
            except Exception as e:
                print(f"Gagal parsing item HKI: {e}")
                continue
            rows.append(row)
            print(f"{row[0]}")

        # A pagination link labelled with the next page number means more data.
        try:
            browser.find_element(By.LINK_TEXT, str(page + 1))
        except NoSuchElementException:
            print("Tidak ada halaman berikutnya, scraping selesai.")
            break
        page += 1

    return rows


def next_output_path(folder_path, base_filename=OUTPUT_BASENAME, file_ext=OUTPUT_EXT):
    """Return the first unused numbered output path inside *folder_path*.

    Existing files named ``<base_filename><N><file_ext>`` are scanned and the
    result uses ``max(N) + 1`` (``1`` when there is none). Files whose middle
    part is not an integer are ignored.

    Raises:
        OSError: If *folder_path* cannot be listed.
    """
    max_num = 0
    for filename in os.listdir(folder_path):
        if not (filename.startswith(base_filename) and filename.endswith(file_ext)):
            continue
        try:
            num = int(filename.replace(base_filename, "").replace(file_ext, ""))
        except ValueError:
            continue
        max_num = max(max_num, num)
    return os.path.join(folder_path, f"{base_filename}{max_num + 1}{file_ext}")


def save_results(rows, folder_path=OUTPUT_FOLDER):
    """Write *rows* to a new numbered Excel file and return its path.

    The folder is created when missing so a fresh machine does not lose the
    scraped data to a ``FileNotFoundError``.
    """
    os.makedirs(folder_path, exist_ok=True)
    save_path = next_output_path(folder_path)
    df = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)
    df.to_excel(save_path, index=False, engine="openpyxl")
    return save_path


def main():
    """Run the whole session: open Chrome, wait for login, scrape, save, quit."""
    all_hki = []
    browser = build_browser()
    try:
        try:
            browser.get(LOGIN_URL)
            browser.implicitly_wait(10)
            wait_for_login(browser)
            scrape_hki(browser, all_hki)
        finally:
            # Save whatever was collected, even after an error or Ctrl+C.
            save_path = save_results(all_hki)
            print(f"\nScraping selesai! {len(all_hki)} data HKI disimpan di '{save_path}'")
    finally:
        # Always release the browser, including when saving itself fails.
        browser.quit()


if __name__ == "__main__":
    main()