from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import random
import pandas as pd
import os

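# Hedged alternative (a sketch, not part of the original script): the
# webdriver-manager package can download a ChromeDriver matching the installed
# Chrome, avoiding the hard-coded chrome_driver_path below. Requires
# `pip install webdriver-manager`:
#   from webdriver_manager.chrome import ChromeDriverManager
#   service = Service(ChromeDriverManager().install())
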
# Path to a locally downloaded ChromeDriver binary; it must match the
# installed Chrome version.
chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)

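# Hedged note: Chrome also supports headless mode ("--headless=new"), but it
# is deliberately not used here because the script needs a visible window for
# the manual login step below.
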
# Open the SINTA login page and wait for a manual login.
browser.get("https://sinta.kemdikbud.go.id/logins")
print("[INFO] Please log in using the browser window that just opened...")

login_wait_timeout = 180  # seconds
elapsed = 0

# Poll the URL: a successful login redirects to a profile page.
while elapsed < login_wait_timeout:
    if "profile" in browser.current_url:
        print("\n[SUCCESS] Login detected, continuing to scraping...")
        break
    else:
        print(f"[WAIT] Waiting for login... ({login_wait_timeout - elapsed} seconds left)", end='\r')
        time.sleep(2)
        elapsed += 2
else:
    # This while-else branch runs only when the loop times out without a break.
    print("\n[WARNING] Not logged in after 3 minutes, continuing to scrape anyway...")

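# Hedged alternative to the polling loop above (a sketch, with an extra import
# assumed; WebDriverWait raises TimeoutException instead of printing a warning):
#   from selenium.webdriver.support.ui import WebDriverWait
#   WebDriverWait(browser, login_wait_timeout).until(
#       lambda d: "profile" in d.current_url
#   )
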
# Affiliation profile 447 on SINTA, Scopus view; {} is filled with the page number.
base_url = "https://sinta.kemdikbud.go.id/affiliations/profile/447?page={}&view=scopus"

all_scopus = []
page = 1

while True:
    browser.get(base_url.format(page))
    time.sleep(random.uniform(3, 5))  # randomized delay to look less bot-like
    print(f"Scraping page {page}...")

    items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")
    if not items:
        print("No more data. Done.")
        break

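    # Hedged alternative to the fixed sleep above (a sketch; assumes the same
    # selector and a WebDriverWait import): wait explicitly until the result
    # list is present instead of sleeping a fixed amount:
    #   from selenium.webdriver.support import expected_conditions as EC
    #   items = WebDriverWait(browser, 15).until(
    #       EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.ar-list-item.mb-5"))
    #   )
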
    for item in items:
        try:
            title = item.find_element(By.CSS_SELECTOR, "div.ar-title a").text.strip()
            quartile = item.find_element(By.CSS_SELECTOR, "a.ar-quartile").text.strip()
            journal = item.find_element(By.CSS_SELECTOR, "a.ar-pub").text.strip()

            # Scan the meta rows for the "Creator :" link and strip the label.
            metas = item.find_elements(By.CSS_SELECTOR, "div.ar-meta")
            creator = "-"
            for meta in metas:
                anchor_tags = meta.find_elements(By.TAG_NAME, "a")
                for a in anchor_tags:
                    if "Creator :" in a.text:
                        creator = a.text.replace("Creator :", "").strip()
                        break
                if creator != "-":
                    break

            year = item.find_element(By.CSS_SELECTOR, "a.ar-year").text.strip()
            cited = item.find_element(By.CSS_SELECTOR, "a.ar-cited").text.strip()

            all_scopus.append([title, quartile, journal, creator, year, cited])
            print(f"{title} | {creator} ({year})")
        except Exception as e:
            print(f"Error extracting item data: {e}")

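    # Hedged robustness sketch (hypothetical helper, not in the original):
    # optional fields could be read without raising by using find_elements,
    # which returns an empty list when nothing matches:
    #   def text_or_dash(el, css):
    #       found = el.find_elements(By.CSS_SELECTOR, css)
    #       return found[0].text.strip() if found else "-"
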
    page += 1
    time.sleep(random.uniform(2.5, 5))

# Pick the next free data_scopusN.xlsx filename so earlier runs are not overwritten.
folder_path = "D:\\lecturertask"
base_filename = "data_scopus"
file_ext = ".xlsx"

existing_files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith(file_ext)]

max_num = 0
for filename in existing_files:
    try:
        num = int(filename.replace(base_filename, "").replace(file_ext, ""))
        if num > max_num:
            max_num = num
    except ValueError:
        # Skip files whose name does not end in a plain number.
        pass

next_num = max_num + 1
new_filename = f"{base_filename}{next_num}{file_ext}"
save_path = os.path.join(folder_path, new_filename)

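# Hedged simpler alternative for the numbering above (a sketch):
#   next_num = 1
#   while os.path.exists(os.path.join(folder_path, f"{base_filename}{next_num}{file_ext}")):
#       next_num += 1
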
df = pd.DataFrame(all_scopus, columns=["Title", "Quartile", "Journal", "Creator", "Year", "Citations"])
df.to_excel(save_path, index=False, engine="openpyxl")
print(f"\n[SUCCESS] Scraping finished! {len(all_scopus)} records saved to '{save_path}'")

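# Hedged fallback (a sketch): if openpyxl is not installed, a CSV export needs
# only pandas itself:
#   df.to_csv(save_path.replace(file_ext, ".csv"), index=False)
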
browser.quit()