# === scraper_onescholar.py ===
import sys
import os
import time
import random

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Validate arguments
if len(sys.argv) < 2:
    raise ValueError("A Google Scholar profile link must be provided as an argument.")

scholar_link = sys.argv[1]
print(f"[INFO] Google Scholar link: {scholar_link}")

# Browser configuration
CHROME_DRIVER_PATH = "D:\\lecturertask\\chromedriver.exe"

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
)

service = Service(CHROME_DRIVER_PATH)
browser = webdriver.Chrome(service=service, options=chrome_options)

hasil_scraping = []

# Warm-up visit so subsequent requests carry normal session cookies
browser.get("https://scholar.google.com/")
time.sleep(1)


def scrape_teks(xpath, default="-"):
    """Return the stripped text of the element at `xpath`, or `default` on timeout."""
    try:
        return WebDriverWait(browser, 5).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        ).text.strip()
    except Exception:
        return default


def klik_tampilkan_lagi():
    """Keep clicking the 'Show more' button until it disappears or is disabled."""
    while True:
        try:
            tombol = WebDriverWait(browser, 2).until(
                EC.element_to_be_clickable((By.ID, "gsc_bpf_more"))
            )
            tombol.click()
            print("Loading more...")
            time.sleep(random.uniform(1.2, 2))
        except Exception:
            break


try:
    print(f"\nOpening profile: {scholar_link}")
    browser.get(scholar_link)
    WebDriverWait(browser, 6).until(
        EC.presence_of_element_located((By.ID, "gsc_a_t"))
    )
    klik_tampilkan_lagi()

    # Collect the detail-page link of every publication in the table
    elements = browser.find_elements(By.CSS_SELECTOR, "tr.gsc_a_tr td.gsc_a_t a.gsc_a_at")
    penelitian_links = [el.get_attribute("href") for el in elements]
    print(f"Total publications: {len(penelitian_links)}")

    for link in penelitian_links:
        try:
            browser.get(link)
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "gsc_oci_title"))
            )
            # The field labels below are the Indonesian-locale strings rendered
            # by Scholar (hl=id); they will not match on an English-locale page.
            judul = scrape_teks("//div[@id='gsc_oci_title']")
            pengarang = scrape_teks("//div[@class='gsc_oci_field' and text()='Pengarang']/following-sibling::div")
            tahun = scrape_teks("//div[@class='gsc_oci_field' and text()='Tanggal terbit']/following-sibling::div")
            jurnal = scrape_teks("//div[@class='gsc_oci_field' and text()='Jurnal']/following-sibling::div")
            jilid = scrape_teks("//div[@class='gsc_oci_field' and text()='Jilid']/following-sibling::div")
            terbitan = scrape_teks("//div[@class='gsc_oci_field' and text()='Terbitan']/following-sibling::div")
            halaman = scrape_teks("//div[@class='gsc_oci_field' and text()='Halaman']/following-sibling::div")
            penerbit = scrape_teks("//div[@class='gsc_oci_field' and text()='Penerbit']/following-sibling::div")
            kutipan = scrape_teks("//div[@class='gsc_oci_field' and text()='Total kutipan']/following-sibling::div")

            hasil_scraping.append([
                scholar_link, judul, pengarang, tahun, jurnal,
                jilid, terbitan, halaman, penerbit, kutipan
            ])
            print(f"✓ {judul}")
        except Exception as e:
            print(f"Failed to fetch publication details: {e}")

except KeyboardInterrupt:
    print("\nScraping stopped by the user.")

finally:
    # Save to a new, numbered file so earlier runs are never overwritten
    folder_path = "D:\\lecturertask"
    os.makedirs(folder_path, exist_ok=True)
    base_filename = "data_scholar"
    file_ext = ".xlsx"

    existing_files = [
        f for f in os.listdir(folder_path)
        if f.startswith(base_filename) and f.endswith(file_ext)
    ]
    max_num = max(
        [int(f.replace(base_filename, "").replace(file_ext, ""))
         for f in existing_files
         if f.replace(base_filename, "").replace(file_ext, "").isdigit()]
        + [0]
    )
    new_filename = f"{base_filename}{max_num + 1}{file_ext}"
    save_path = os.path.join(folder_path, new_filename)

    if hasil_scraping:
        df = pd.DataFrame(hasil_scraping, columns=[
            "Link", "Title", "Authors", "Year", "Journal",
            "Volume", "Issue", "Pages", "Publisher", "Total Citations"
        ])
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(f"\nScraping finished/stopped! {len(hasil_scraping)} records saved to '{save_path}'")
    else:
        print("\nScraping failed/returned no data. No file was written.")

    browser.quit()
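
# Example invocation (the user ID below is a placeholder, not a real profile):
#   python scraper_onescholar.py "https://scholar.google.com/citations?user=XXXXXXXXXXXX&hl=id"
# The hl=id parameter matters: the detail-page labels matched above
# ('Pengarang', 'Tanggal terbit', ...) are the Indonesian-locale strings,
# so an English-locale profile page would yield only the "-" defaults.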