# MIF_E31222544/scraper/scraper_penelitian.py
#
# Scrapes "Penelitian" (research) entries from a SINTA affiliation profile
# page using Selenium, then saves the results to an auto-numbered Excel file.
import os
import sys
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Path to the local ChromeDriver binary (Windows-specific; adjust per machine).
chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"

chrome_options = Options()
# BUG FIX: the standard UA token is "like Gecko"; the previous value carried
# the Indonesian mistranslation "seperti Gecko", which yields a malformed,
# easily-fingerprinted user agent — undermining the anti-automation flags below.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
chrome_options.add_argument(f"user-agent={user_agent}")
# Suppress the "Chrome is being controlled by automated software" banner
# and the automation extension, to reduce bot-detection signals.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)
# Rows collected during scraping:
# [judul, leader, personil, tipe penelitian, tahun, dana]
all_research = []

# The SINTA affiliation ID must arrive as the first CLI argument.
cli_args = sys.argv
if len(cli_args) < 2:
    raise ValueError("Affiliation ID harus disediakan sebagai argumen.")
affiliation_id = cli_args[1]
try:
    # Open the SINTA login page and give the user time to log in by hand.
    browser.get("https://sinta.kemdikbud.go.id/logins")
    browser.implicitly_wait(10)

    login_wait_timeout = 50  # seconds allotted for the manual login
    elapsed = 0
    while elapsed < login_wait_timeout:
        # A successful login redirects to a URL containing "profile".
        if "profile" in browser.current_url:
            print("\n[SUCCESS] Login sukses, lanjut scraping Penelitian...")
            break
        else:
            print(f"[WAIT] Menunggu login... ({login_wait_timeout - elapsed} detik tersisa)", end='\r')
            time.sleep(2)
            elapsed += 2
    else:
        # BUG FIX: the old message claimed "3 menit" although the timeout is
        # 50 seconds; derive the message from the actual timeout value.
        print(f"\n[WARNING] Belum login setelah {login_wait_timeout} detik, lanjut scraping saja...")

    url = f"https://sinta.kemdikbud.go.id/affiliations/profile/{affiliation_id}/?view=researches"
    browser.get(url)
    time.sleep(3)

    page = 1
    while True:
        print(f"\nScraping halaman {page}...")
        try:
            research_items = WebDriverWait(browser, 20).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.ar-list-item.mb-5"))
            )
        except TimeoutException:
            # BUG FIX: was a bare `except:`; only a wait timeout means
            # "no data on this page" — anything else should propagate.
            print("Tidak ada data di halaman ini, scraping dihentikan.")
            break

        for item in research_items:
            try:
                title = item.find_element(By.CSS_SELECTOR, "div.ar-title").text.strip()
                leader_element = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
                leader = leader_element[0].text.strip() if leader_element else "Tidak Ada Leader"
                personil_elements = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a[href*='authors/profile']")
                personil_list = "; ".join(p.text.strip() for p in personil_elements) if personil_elements else "Tidak Ada Personil"
                research_type = item.find_element(By.CSS_SELECTOR, "a.ar-pub").text.strip()
                year = item.find_element(By.CSS_SELECTOR, "a.ar-year").text.strip().replace("📅 ", "")
                fund = item.find_element(By.CSS_SELECTOR, "a.ar-quartile").text.strip().replace("📊 ", "")
                all_research.append([title, leader, personil_list, research_type, year, fund])
                print(f"{title}")
            except Exception as e:
                # Best-effort per card: skip a malformed item, keep the rest.
                print(f"Gagal parsing item: {e}")

        try:
            # SINTA pagination links are labelled with the page number itself.
            next_button = browser.find_element(By.LINK_TEXT, str(page + 1))
            next_button.click()
            WebDriverWait(browser, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.ar-list-item.mb-5"))
            )
            page += 1
            time.sleep(2)
        except (NoSuchElementException, TimeoutException):
            # BUG FIX: was a bare `except:`; only "no next link" or a wait
            # timeout should end pagination.
            print("Tidak ada halaman berikutnya, scraping selesai.")
            break
finally:
    # Persist whatever was collected (even on error / empty data), and make
    # sure the browser is released even if the save itself fails — the old
    # code leaked the Chrome process when to_excel raised.
    try:
        folder_path = "D:\\lecturertask"
        base_filename = "data_penelitian"
        file_ext = ".xlsx"
        # BUG FIX: os.listdir crashed when the folder did not exist yet.
        os.makedirs(folder_path, exist_ok=True)
        existing_files = [
            f for f in os.listdir(folder_path)
            if f.startswith(base_filename) and f.endswith(file_ext)
        ]
        # Find the highest numeric suffix among data_penelitian<N>.xlsx files.
        max_num = 0
        for filename in existing_files:
            try:
                num = int(filename.replace(base_filename, "").replace(file_ext, ""))
            except ValueError:
                # e.g. "data_penelitian.xlsx" -> int("") fails; skip it.
                continue
            if num > max_num:
                max_num = num
        new_filename = f"{base_filename}{max_num + 1}{file_ext}"
        save_path = os.path.join(folder_path, new_filename)

        df = pd.DataFrame(
            all_research,
            columns=["Judul", "Leader", "Personil", "Tipe Penelitian", "Tahun", "Dana"],
        )
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(f"\nScraping selesai! {len(all_research)} data disimpan di '{save_path}'")
    finally:
        browser.quit()