# MIF_E31222544/scraper/scraper_backup_important/scraper_scopus.py

import os
import random
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# Path to the local ChromeDriver binary.
chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)
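
# Note: with Selenium >= 4.6 the bundled Selenium Manager can usually resolve
# the driver automatically, so the explicit chromedriver path above becomes
# optional and the browser could be started with just:
#   browser = webdriver.Chrome(options=chrome_options)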
browser.get("https://sinta.kemdikbud.go.id/logins")
print("[INFO] Silakan login di browser yang terbuka...")
login_wait_timeout = 180
elapsed = 0
while elapsed < login_wait_timeout:
if "profile" in browser.current_url:
print("\n[SUCCESS] Login sukses, lanjut scraping...")
break
else:
print(f"[WAIT] Menunggu login... ({login_wait_timeout - elapsed} detik tersisa)", end='\r')
time.sleep(2)
elapsed += 2
else:
print("\n[WARNING] Belum login setelah 3 menit, lanjut scraping saja...")

# Affiliation profile 447, Scopus view; {} is filled with the page number.
base_url = "https://sinta.kemdikbud.go.id/affiliations/profile/447?page={}&view=scopus"
all_scopus = []
page = 1

while True:
    browser.get(base_url.format(page))
    time.sleep(random.uniform(3, 5))  # randomized delay to avoid hammering the server
    print(f"Scraping page {page}...")

    items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")
    if not items:
        print("No more data. Done.")
        break

    for item in items:
        try:
            title = item.find_element(By.CSS_SELECTOR, "div.ar-title a").text.strip()
            quartile = item.find_element(By.CSS_SELECTOR, "a.ar-quartile").text.strip()
            journal = item.find_element(By.CSS_SELECTOR, "a.ar-pub").text.strip()

            # The creator is one of several anchors inside the item's meta
            # rows; scan them until a "Creator :" label turns up.
            metas = item.find_elements(By.CSS_SELECTOR, "div.ar-meta")
            creator = "-"
            for meta in metas:
                anchor_tags = meta.find_elements(By.TAG_NAME, "a")
                for a in anchor_tags:
                    if "Creator :" in a.text:
                        creator = a.text.replace("Creator :", "").strip()
                        break
                if creator != "-":
                    break

            year = item.find_element(By.CSS_SELECTOR, "a.ar-year").text.strip()
            cited = item.find_element(By.CSS_SELECTOR, "a.ar-cited").text.strip()

            all_scopus.append([title, quartile, journal, creator, year, cited])
            print(f"{title} | {creator} ({year})")
        except Exception as e:
            print(f"Error while extracting an item: {e}")

    page += 1
    time.sleep(random.uniform(2.5, 5))  # polite delay before the next page
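
# Hypothetical refinement (not used above): find_element raises
# NoSuchElementException when a field is absent, so one missing field currently
# discards the whole item. A small wrapper could default missing fields instead:
#   from selenium.common.exceptions import NoSuchElementException
#   def text_or_dash(el, selector):
#       try:
#           return el.find_element(By.CSS_SELECTOR, selector).text.strip()
#       except NoSuchElementException:
#           return "-"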

# Save to D:\lecturertask\data_scopusN.xlsx, where N is one higher than the
# largest number used by any existing data_scopus*.xlsx file.
folder_path = "D:\\lecturertask"
base_filename = "data_scopus"
file_ext = ".xlsx"

existing_files = [f for f in os.listdir(folder_path)
                  if f.startswith(base_filename) and f.endswith(file_ext)]
max_num = 0
for filename in existing_files:
    try:
        num = int(filename.replace(base_filename, "").replace(file_ext, ""))
        if num > max_num:
            max_num = num
    except ValueError:
        # Skip files whose suffix is not a plain number, e.g. "data_scopus.xlsx".
        pass

next_num = max_num + 1
new_filename = f"{base_filename}{next_num}{file_ext}"
save_path = os.path.join(folder_path, new_filename)

df = pd.DataFrame(all_scopus,
                  columns=["Title", "Quartile", "Journal", "Creator", "Year", "Cited"])
df.to_excel(save_path, index=False, engine="openpyxl")
print(f"\n[SUCCESS] Scraping finished! {len(all_scopus)} records saved to '{save_path}'")

browser.quit()
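
# Note: if scraping raises partway through, browser.quit() above never runs; a
# try/finally around the scraping and saving steps would guarantee cleanup:
#   try:
#       ...  # scrape and save
#   finally:
#       browser.quit()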