# MIF_E31222544/scraper/scraper_backup_important/scraper_pengabdian.py

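"""Scrape community-service (Pengabdian) records from a SINTA affiliation
profile and save them to an auto-numbered Excel file.

Flow: open the SINTA login page, poll until the user logs in manually (or a
timeout elapses), then walk the paginated services list and collect title,
leader, personnel, service type, year, and funding for each entry.
"""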
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
# --- BROWSER CONFIGURATION ---
chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"
chrome_options = Options()
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
chrome_options.add_argument(f"user-agent={user_agent}")
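# Mask the most obvious automation fingerprints: drop the "Chrome is being
# controlled by automated test software" infobar and the automation extension.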
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")
service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)
# --- LOGIN ---
browser.get("https://sinta.kemdikbud.go.id/logins")
browser.implicitly_wait(10)
login_wait_timeout = 50
elapsed = 0
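# Poll every 2 seconds for a manual login in the opened browser window;
# a current URL containing "profile" is treated as a successful sign-in.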
while elapsed < login_wait_timeout:
    if "profile" in browser.current_url:
        print("\n[SUCCESS] Login successful, continuing to scrape Pengabdian data...")
        break
    else:
        print(f"[WAIT] Waiting for login... ({login_wait_timeout - elapsed} seconds remaining)", end='\r')
        time.sleep(2)
        elapsed += 2
else:
    # while/else: runs only when the loop ends without a break (no login detected)
    print(f"\n[WARNING] Not logged in after {login_wait_timeout} seconds, scraping anyway...")
# --- SCRAPE PENGABDIAN (COMMUNITY SERVICE) DATA ---
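# The "?view=services" query selects the Pengabdian tab of the affiliation
# profile; 447 is the affiliation ID being scraped.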
browser.get("https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=services")
time.sleep(3)
all_services = []
page = 1
while True:
    print(f"\nScraping page {page}...")
    try:
        # Wait until the list of service entries is present on the page
        service_items = WebDriverWait(browser, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.ar-list-item.mb-5"))
        )
    except TimeoutException:
        print("No data on this page, stopping the scrape.")
        break
    for item in service_items:
        try:
            title = item.find_element(By.CSS_SELECTOR, "div.ar-title").text.strip()
            leader_element = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
            leader = leader_element[0].text.strip() if leader_element else "No Leader"
            personil_elements = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a[href*='authors/profile']")
            personil_list = "; ".join([p.text.strip() for p in personil_elements]) if personil_elements else "No Personnel"
            service_type = item.find_element(By.CSS_SELECTOR, "a.ar-pub").text.strip()
            year = item.find_element(By.CSS_SELECTOR, "a.ar-year").text.strip().replace("📅 ", "")
            fund = item.find_element(By.CSS_SELECTOR, "a.ar-quartile").text.strip().replace("📊 ", "")
            all_services.append([title, leader, personil_list, service_type, year, fund])
            print(f"{title}")
        except Exception as e:
            print(f"Failed to parse an item: {e}")
    # Try to click the pagination link for the next page number
    try:
        next_button = browser.find_element(By.LINK_TEXT, str(page + 1))
        next_button.click()
        WebDriverWait(browser, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.ar-list-item.mb-5"))
        )
        page += 1
        time.sleep(2)
    except (NoSuchElementException, TimeoutException):
        print("No next page link found, scraping finished.")
        break
# --- SAVE TO EXCEL ---
folder_path = "D:\\lecturertask"
base_filename = "data_pengabdian"
file_ext = ".xlsx"
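# Find the highest numeric suffix among existing data_pengabdian*.xlsx files
# so the new export gets the next number (data_pengabdian1.xlsx, 2, ...).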
existing_files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith(file_ext)]
max_num = 0
for filename in existing_files:
    try:
        num = int(filename.replace(base_filename, "").replace(file_ext, ""))
        if num > max_num:
            max_num = num
    except ValueError:
        # Skip names without a clean numeric suffix (e.g. "data_pengabdian.xlsx")
        pass
next_num = max_num + 1
new_filename = f"{base_filename}{next_num}{file_ext}"
save_path = os.path.join(folder_path, new_filename)
df = pd.DataFrame(all_services, columns=["Title", "Leader", "Personnel", "Service Type", "Year", "Fund"])
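# to_excel with engine="openpyxl" requires the openpyxl package to be installed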
df.to_excel(save_path, index=False, engine="openpyxl")
print(f"\nScraping selesai! {len(all_services)} data disimpan di '{save_path}'")
browser.quit()