# MIF_E31222544/scraper_pengabdian.py
# Scrapes community-service ("pengabdian") records from a SINTA
# affiliation profile and saves them to data_pengabdian.xlsx.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time, random
# --- Chrome / WebDriver configuration ------------------------------------
chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"

chrome_options = Options()
# Present a realistic desktop UA. Fix: the canonical token is
# "(KHTML, like Gecko)" — the original had "seperti Gecko" (an accidental
# Indonesian translation), which makes the UA string non-standard and
# easy for the site to flag as automated.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
chrome_options.add_argument(f"user-agent={user_agent}")
# Hide the "controlled by automated software" infobar and automation flags.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)

# SINTA requires authentication: open the login page and block until the
# operator confirms they have signed in manually.
browser.get("https://sinta.kemdikbud.go.id/logins")
input("Login secara manual, lalu tekan ENTER untuk melanjutkan...")

# Open the affiliation's community-service listing, with a jittered pause
# so the request pattern looks less bot-like.
browser.get("https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=services")
time.sleep(random.uniform(2, 6))
# --- Pagination loop: collect every service ("pengabdian") card ----------
all_services = []
page = 1
action = ActionChains(browser)

while True:
    print(f"📄 Scraping halaman {page}...")
    service_items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")
    if not service_items:
        print("❌ Tidak ada data di halaman ini, berhenti.")
        break

    for item in service_items:
        try:
            # Scroll each card into view with a human-ish pause so lazy
            # content is rendered before we read it.
            action.move_to_element(item).perform()
            time.sleep(random.uniform(0.5, 2))

            title = item.find_element(By.CSS_SELECTOR, "div.ar-title").text.strip()

            # First link in the meta row is taken as the leader, if any.
            leader_element = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
            leader = leader_element[0].text.strip() if leader_element else "Tidak Ada Leader"

            personil_elements = item.find_elements(
                By.CSS_SELECTOR, "div.ar-meta a[href*='authors/profile']"
            )
            personil_list = (
                "; ".join(p.text.strip() for p in personil_elements)
                if personil_elements
                else "Tidak Ada Personil"
            )

            service_type_element = item.find_elements(By.CSS_SELECTOR, "a.ar-pub")
            service_type = (
                service_type_element[0].text.strip()
                if service_type_element
                else "Tidak Diketahui"
            )

            # Strip the decorative emoji prefixes the site renders into
            # the year / funding badges.
            year_element = item.find_elements(By.CSS_SELECTOR, "a.ar-year")
            year = (
                year_element[0].text.strip().replace("📅 ", "")
                if year_element
                else "Tidak Ada Tahun"
            )

            fund_element = item.find_elements(By.CSS_SELECTOR, "a.ar-quartile")
            fund = (
                fund_element[0].text.strip().replace("📊 ", "")
                if fund_element
                else "Tidak Ada Dana"
            )

            all_services.append([title, leader, personil_list, service_type, year, fund])
        except Exception as e:
            # Skip a malformed card but keep scraping the rest of the page.
            print(f"⚠️ Error mengambil data: {e}")

    # Advance via the numeric pagination link for page+1. A wait timeout
    # (or a failed click) means there is no further page, so we stop.
    # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt /
    # SystemExit; narrowed to Exception.
    try:
        next_button = WebDriverWait(browser, random.uniform(3, 7)).until(
            EC.presence_of_element_located((By.LINK_TEXT, str(page + 1)))
        )
        next_button.click()
        time.sleep(random.uniform(3, 7))
        page += 1
    except Exception:
        print("✅ Tidak ada halaman berikutnya, selesai scraping.")
        break
# Persist everything collected above into a spreadsheet, then release
# the browser session.
output_file = "data_pengabdian.xlsx"
columns = ["Judul", "Leader", "Personil", "Tipe Pengabdian", "Tahun", "Dana"]
pd.DataFrame(all_services, columns=columns).to_excel(
    output_file, index=False, engine="openpyxl"
)
print(f"🎉 Scraping selesai! {len(all_services)} data disimpan di 'data_pengabdian.xlsx'")
browser.quit()