# MIF_E31222544/scraper/scraper_scopus.py

import sys
import os
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# Get the Affiliation ID from the command-line arguments
if len(sys.argv) < 2:
    raise ValueError("An Affiliation ID must be provided as an argument.")
affiliation_id = sys.argv[1]
print(f"[INFO] Using Affiliation ID: {affiliation_id}")
# Chrome configuration
chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)
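
# Note: with Selenium >= 4.6, Selenium Manager can resolve a matching
# chromedriver automatically, so the hard-coded driver path is optional:
#   browser = webdriver.Chrome(options=chrome_options)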
# Manual login
browser.get("https://sinta.kemdikbud.go.id/logins")
print("[INFO] Please log in via the browser window that just opened...")
login_wait_timeout = 180
elapsed = 0
while elapsed < login_wait_timeout:
    if "profile" in browser.current_url:
        print("\n[SUCCESS] Login successful, continuing with scraping...")
        break
    print(f"[WAIT] Waiting for login... ({login_wait_timeout - elapsed} seconds left)", end='\r')
    time.sleep(2)
    elapsed += 2
else:
    # while-else: this branch runs only if the loop finished without break,
    # i.e. the timeout expired before a login was detected.
    print("\n[WARNING] Still not logged in after 3 minutes, scraping anyway...")
# Scrape Scopus publications
base_url = f"https://sinta.kemdikbud.go.id/affiliations/profile/{affiliation_id}?page={{}}&view=scopus"
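# Note: the doubled braces {{}} are an escaped literal {} in the f-string,
# left as a placeholder that base_url.format(page) fills on each request.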
all_scopus = []
page = 1
try:
    while True:
        browser.get(base_url.format(page))
        time.sleep(random.uniform(3, 5))
        print(f"Scraping page {page}...")
        items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")
        if not items:
            print("No more data. Done.")
            break
        for item in items:
            try:
                title = item.find_element(By.CSS_SELECTOR, "div.ar-title a").text.strip()
                quartile = item.find_element(By.CSS_SELECTOR, "a.ar-quartile").text.strip()
                journal = item.find_element(By.CSS_SELECTOR, "a.ar-pub").text.strip()
                metas = item.find_elements(By.CSS_SELECTOR, "div.ar-meta")
                creator = "-"
                for meta in metas:
                    anchor_tags = meta.find_elements(By.TAG_NAME, "a")
                    for a in anchor_tags:
                        if "Creator :" in a.text:
                            creator = a.text.replace("Creator :", "").strip()
                            break
                year = item.find_element(By.CSS_SELECTOR, "a.ar-year").text.strip()
                cited = item.find_element(By.CSS_SELECTOR, "a.ar-cited").text.strip()
                all_scopus.append([title, quartile, journal, creator, year, cited])
                print(f"{title} | {creator} ({year})")
            except Exception as e:
                print(f"Error while extracting an item: {e}")
        page += 1
        # Random delay between pages to avoid hammering the server
        time.sleep(random.uniform(2.5, 5))
except KeyboardInterrupt:
    print("\n[INTERRUPTED] Scraping stopped by the user (Ctrl+C or Chrome closed).")
finally:
    print("\n[SAVING DATA] Writing interim results to Excel...")
    folder_path = "D:\\lecturertask"
    base_filename = "data_scopus"
    file_ext = ".xlsx"
    # Auto-increment the output filename: data_scopus1.xlsx, data_scopus2.xlsx, ...
    existing_files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith(file_ext)]
    suffixes = [f.replace(base_filename, "").replace(file_ext, "") for f in existing_files]
    max_num = max([int(s) for s in suffixes if s.isdigit()] + [0])
    next_num = max_num + 1
    new_filename = f"{base_filename}{next_num}{file_ext}"
    save_path = os.path.join(folder_path, new_filename)
    if all_scopus:
        df = pd.DataFrame(all_scopus, columns=["Title", "Quartile", "Journal", "Creator", "Year", "Citations"])
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(f"[SAVED] Data saved to '{save_path}' ({len(all_scopus)} items)")
    else:
        print("[INFO] No data to save.")
    try:
        browser.quit()
    except Exception:
        pass
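
# Hedged fallback sketch: df.to_excel above requires the optional openpyxl
# dependency; if it is not installed, pandas can write CSV with no extra
# packages (csv_path is a name invented here for illustration):
#   csv_path = os.path.splitext(save_path)[0] + ".csv"
#   df.to_csv(csv_path, index=False)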