import sys
import os
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

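# Note: the Excel export at the end uses pandas.DataFrame.to_excel with the
# "openpyxl" engine, so openpyxl must be installed alongside pandas and selenium.
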
# Read the affiliation ID from the command-line arguments
if len(sys.argv) < 2:
    raise ValueError("An affiliation ID must be provided as an argument.")

affiliation_id = sys.argv[1]
print(f"[INFO] Using Affiliation ID: {affiliation_id}")

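# NOTE: chromedriver.exe must match the locally installed Chrome version; adjust
# the path below if your driver is stored somewhere else.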
# Chrome configuration
chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)

# Manual login
browser.get("https://sinta.kemdikbud.go.id/logins")
print("[INFO] Please log in using the browser window that just opened...")

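# Poll the current URL every 2 seconds: once it contains "profile" the user has
# logged in and scraping can start; if 3 minutes pass without a login, the
# while/else falls through to the warning and scraping is attempted anyway.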
login_wait_timeout = 180
elapsed = 0
while elapsed < login_wait_timeout:
    if "profile" in browser.current_url:
        print("\n[SUCCESS] Login successful, continuing with scraping...")
        break
    print(f"[WAIT] Waiting for login... ({login_wait_timeout - elapsed} seconds left)", end='\r')
    time.sleep(2)
    elapsed += 2
else:
    print("\n[WARNING] Not logged in after 3 minutes, continuing with scraping anyway...")

# Scraping Scopus
base_url = f"https://sinta.kemdikbud.go.id/affiliations/profile/{affiliation_id}?page={{}}&view=scopus"
all_scopus = []
page = 1

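# Fetch page after page until one returns no publication cards. Ctrl+C stops the
# loop early; the finally block below still saves whatever was collected.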
try:
    while True:
        browser.get(base_url.format(page))
        time.sleep(random.uniform(3, 5))
        print(f"Scraping page {page}...")

        items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")
        if not items:
            print("No more data. Done.")
            break

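        # Extract the fields of each publication card; if a card is missing an
        # element, log the error and continue with the next card.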
        for item in items:
            try:
                title = item.find_element(By.CSS_SELECTOR, "div.ar-title a").text.strip()
                quartile = item.find_element(By.CSS_SELECTOR, "a.ar-quartile").text.strip()
                journal = item.find_element(By.CSS_SELECTOR, "a.ar-pub").text.strip()

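                # The creator name sits in one of the "ar-meta" rows as an anchor
                # whose text starts with the literal prefix "Creator :".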
                metas = item.find_elements(By.CSS_SELECTOR, "div.ar-meta")
                creator = "-"
                for meta in metas:
                    anchor_tags = meta.find_elements(By.TAG_NAME, "a")
                    for a in anchor_tags:
                        if "Creator :" in a.text:
                            creator = a.text.replace("Creator :", "").strip()
                            break

                year = item.find_element(By.CSS_SELECTOR, "a.ar-year").text.strip()
                cited = item.find_element(By.CSS_SELECTOR, "a.ar-cited").text.strip()

                all_scopus.append([title, quartile, journal, creator, year, cited])
                print(f"{title} | {creator} ({year})")
            except Exception as e:
                print(f"Error extracting item data: {e}")

        page += 1
        time.sleep(random.uniform(2.5, 5))

except KeyboardInterrupt:
    print("\n[INTERRUPTED] Scraping stopped by the user (Ctrl+C or Chrome closed).")

finally:
    print("\n[SAVING DATA] Saving the results collected so far to Excel...")

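    # Save to an auto-numbered file (data_scopus1.xlsx, data_scopus2.xlsx, ...)
    # so earlier runs are never overwritten.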
    folder_path = "D:\\lecturertask"
    base_filename = "data_scopus"
    file_ext = ".xlsx"

    existing_files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith(file_ext)]
    max_num = max([int(f.replace(base_filename, "").replace(file_ext, "")) for f in existing_files if f.replace(base_filename, "").replace(file_ext, "").isdigit()] + [0])
    next_num = max_num + 1
    new_filename = f"{base_filename}{next_num}{file_ext}"
    save_path = os.path.join(folder_path, new_filename)

    if all_scopus:
        df = pd.DataFrame(all_scopus, columns=["Title", "Quartile", "Journal", "Creator", "Year", "Citations"])
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(f"[SAVED] Data saved to '{save_path}' ({len(all_scopus)} items)")
    else:
        print("[INFO] No data to save.")

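    # Close the browser; ignore errors if the window was already closed by hand.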
    try:
        browser.quit()
    except Exception:
        pass