# Selenium scraper for SINTA affiliation "Pengabdian" (community service) records.
import os
import time

import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# --- BROWSER CONFIGURATION ---
chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"
chrome_options = Options()

# Spoof a normal desktop Chrome user agent to look less like a bot.
# BUG FIX: the canonical UA token is "like Gecko" -- the original said
# "seperti Gecko" (an accidental Indonesian translation), which makes the
# UA string non-standard and easy for servers to flag.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
chrome_options.add_argument(f"user-agent={user_agent}")

# Hide the "controlled by automated test software" banner and the automation
# extension -- both are common automation fingerprints.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
# BUG FIX: Chrome expects "width,height" (comma-separated); the original
# "1920x1080" is not parsed and the switch was silently ignored.
chrome_options.add_argument("--window-size=1920,1080")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)
# --- LOGIN ---
# Open the SINTA login page and give the user time to log in manually;
# a successful login redirects to a URL containing "profile".
browser.get("https://sinta.kemdikbud.go.id/logins")
browser.implicitly_wait(10)

login_wait_timeout = 50  # seconds allotted for the manual login
poll_interval = 2        # seconds between URL checks
elapsed = 0

while elapsed < login_wait_timeout:
    if "profile" in browser.current_url:
        print("\n[SUCCESS] Login sukses, lanjut scraping Pengabdian...")
        break
    print(f"[WAIT] Menunggu login... ({login_wait_timeout - elapsed} detik tersisa)", end='\r')
    time.sleep(poll_interval)
    elapsed += poll_interval
else:
    # BUG FIX: the original warning claimed "3 menit" (3 minutes), but the
    # loop only waits `login_wait_timeout` (50) seconds -- report the real value.
    print(f"\n[WARNING] Belum login setelah {login_wait_timeout} detik, lanjut scraping saja...")
# --- SCRAPE PENGABDIAN (community service) RECORDS ---
browser.get("https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=services")
time.sleep(3)

all_services = []  # rows of [title, leader, members, type, year, fund]
page = 1

while True:
    print(f"\nScraping halaman {page}...")

    try:
        # Wait until at least one result card is present on the page.
        service_items = WebDriverWait(browser, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.ar-list-item.mb-5"))
        )
    # BUG FIX: was a bare `except:` -- it also swallowed KeyboardInterrupt
    # and SystemExit; only the wait timeout means "no data here".
    except TimeoutException:
        print("Tidak ada data di halaman ini, scraping dihentikan.")
        break

    for item in service_items:
        try:
            title = item.find_element(By.CSS_SELECTOR, "div.ar-title").text.strip()

            # First link in the meta row is taken as the project leader (if any).
            leader_element = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
            leader = leader_element[0].text.strip() if leader_element else "Tidak Ada Leader"

            # Every author-profile link is treated as a team member.
            personil_elements = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a[href*='authors/profile']")
            personil_list = "; ".join(p.text.strip() for p in personil_elements) if personil_elements else "Tidak Ada Personil"

            service_type = item.find_element(By.CSS_SELECTOR, "a.ar-pub").text.strip()
            year = item.find_element(By.CSS_SELECTOR, "a.ar-year").text.strip().replace("📅 ", "")
            fund = item.find_element(By.CSS_SELECTOR, "a.ar-quartile").text.strip().replace("📊 ", "")

            all_services.append([title, leader, personil_list, service_type, year, fund])
            print(f"{title}")
        except Exception as e:
            # Skip a malformed card but keep scraping the rest of the page.
            print(f"Gagal parsing item: {e}")

    # Click the link whose text is the next page number; stop when absent.
    try:
        next_button = browser.find_element(By.LINK_TEXT, str(page + 1))
        next_button.click()
        WebDriverWait(browser, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.ar-list-item.mb-5"))
        )
        page += 1
        time.sleep(2)
    # BUG FIX: was a bare `except:`; a missing link or a timed-out wait are
    # the expected "last page" signals.
    except (NoSuchElementException, TimeoutException):
        print("Tidak ada halaman berikutnya, scraping selesai.")
        break
# --- SAVE TO EXCEL ---
folder_path = "D:\\lecturertask"
base_filename = "data_pengabdian"
file_ext = ".xlsx"

# Find the highest existing data_pengabdian<N>.xlsx so we never overwrite.
existing_files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith(file_ext)]
max_num = 0
for filename in existing_files:
    # BUG FIX: slice off the exact prefix/suffix instead of the original
    # `replace(base).replace(ext)`, which corrupts names where those
    # substrings recur; narrow the bare `except:` to the only expected error.
    try:
        num = int(filename[len(base_filename):-len(file_ext)])
    except ValueError:
        # e.g. "data_pengabdian.xlsx" (no number) -- ignore it.
        continue
    max_num = max(max_num, num)

next_num = max_num + 1
new_filename = f"{base_filename}{next_num}{file_ext}"
save_path = os.path.join(folder_path, new_filename)

df = pd.DataFrame(all_services, columns=["Judul", "Leader", "Personil", "Tipe Pengabdian", "Tahun", "Dana"])
df.to_excel(save_path, index=False, engine="openpyxl")

print(f"\nScraping selesai! {len(all_services)} data disimpan di '{save_path}'")

browser.quit()