import os
import random
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

CHROME_DRIVER_PATH = "D:\\lecturertask\\chromedriver.exe"

# Configure Chrome to look less like an automated browser.
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
)

service = Service(CHROME_DRIVER_PATH)
browser = webdriver.Chrome(service=service, options=chrome_options)
browser.get("https://scholar.google.com/")

# One Google Scholar profile URL per line.
with open("dosenTI.txt", "r", encoding="utf-8") as file:
    list_dosen = [line.strip() for line in file if line.strip()]
print(f"Number of lecturers: {len(list_dosen)}")


def scrape_teks(xpath, default="not found"):
    """Return the stripped text of the element at `xpath`, or `default` on timeout."""
    try:
        return WebDriverWait(browser, 5).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        ).text.strip()
    except TimeoutException:
        return default


def klik_tampilkan_lagi():
    """Keep clicking Scholar's "Show more" button until no more rows load."""
    while True:
        try:
            tombol = WebDriverWait(browser, 2).until(
                EC.element_to_be_clickable((By.ID, "gsc_bpf_more"))
            )
            tombol.click()
            print("Loading more publications...")
            time.sleep(random.uniform(1.2, 2))
        except WebDriverException:
            break


hasil_scraping = []
try:
    for dosen in list_dosen:
        print(f"\nOpening lecturer profile: {dosen}")
        try:
            browser.get(dosen)
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "gsc_a_t"))
            )
        except WebDriverException:
            print("Failed to open lecturer profile.")
            continue

        # Re-sort the publication table by year via the header link.
        try:
            sort_link = WebDriverWait(browser, 3).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[@id='gsc_a_ha']/a[contains(@class, 'gsc_a_a')]")
                )
            )
            browser.get(sort_link.get_attribute("href"))
            print("Sorted by year.")
        except TimeoutException:
            print("Sorting failed.")

        klik_tampilkan_lagi()

        # Collect the detail-page link of every publication row.
        try:
            elements = WebDriverWait(browser, 3).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "tr.gsc_a_tr td.gsc_a_t a.gsc_a_at")
                )
            )
            penelitian_links = [el.get_attribute("href") for el in elements]
        except TimeoutException:
            penelitian_links = []
        print(f"Total publications: {len(penelitian_links)}")

        for link in penelitian_links:
            try:
                browser.get(link)
                WebDriverWait(browser, 3).until(
                    EC.presence_of_element_located((By.ID, "gsc_oci_title"))
                )
                # The field labels below match Scholar's Indonesian-locale UI.
                judul = scrape_teks("//div[@id='gsc_oci_title']")
                pengarang = scrape_teks("//div[@class='gsc_oci_field' and text()='Pengarang']/following-sibling::div")
                tahun = scrape_teks("//div[@class='gsc_oci_field' and text()='Tanggal terbit']/following-sibling::div")
                jurnal = scrape_teks("//div[@class='gsc_oci_field' and text()='Jurnal']/following-sibling::div")
                jilid = scrape_teks("//div[@class='gsc_oci_field' and text()='Jilid']/following-sibling::div", "-")
                terbitan = scrape_teks("//div[@class='gsc_oci_field' and text()='Terbitan']/following-sibling::div", "-")
                halaman = scrape_teks("//div[@class='gsc_oci_field' and text()='Halaman']/following-sibling::div", "-")
                penerbit = scrape_teks("//div[@class='gsc_oci_field' and text()='Penerbit']/following-sibling::div")
                kutipan = scrape_teks("//div[@class='gsc_oci_field' and text()='Total kutipan']/following-sibling::div")

                hasil_scraping.append([
                    dosen, judul, pengarang, tahun, jurnal,
                    jilid, terbitan, halaman, penerbit, kutipan,
                ])
                print(f"✓ {judul}")
            except Exception as e:
                print(f"Failed to fetch detail data: {e}")
            finally:
                # Return to the previous page before the next detail link.
                browser.back()
except KeyboardInterrupt:
    print("\nScraping stopped by the user.")
finally:
    # Save under an auto-incremented filename: data_scholar1.xlsx, data_scholar2.xlsx, ...
    folder_path = "D:\\lecturertask"
    os.makedirs(folder_path, exist_ok=True)

    base_filename = "data_scholar"
    file_ext = ".xlsx"
    existing_files = [
        f for f in os.listdir(folder_path)
        if f.startswith(base_filename) and f.endswith(file_ext)
    ]
    max_num = 0
    for f in existing_files:
        try:
            num = int(f.replace(base_filename, "").replace(file_ext, ""))
            max_num = max(max_num, num)
        except ValueError:
            continue
    next_filename = f"{base_filename}{max_num + 1}{file_ext}"
    save_path = os.path.join(folder_path, next_filename)

    if hasil_scraping:
        df = pd.DataFrame(hasil_scraping, columns=[
            "Link", "Title", "Authors", "Year", "Journal",
            "Volume", "Issue", "Pages", "Publisher", "Total Citations",
        ])
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(f"\nScraping finished/stopped! {len(hasil_scraping)} records saved to '{save_path}'")
    else:
        print("\nScraping failed/returned no data. No file was created.")

    browser.quit()
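
# Usage note (a sketch, assuming the layout implied by the script above, not
# part of the original): chromedriver.exe at CHROME_DRIVER_PATH must match the
# installed Chrome version, and dosenTI.txt must sit in the working directory
# with one Scholar profile URL per line. Because the XPaths match Indonesian
# field labels ('Pengarang', 'Tanggal terbit', ...), each URL should use the
# Indonesian locale, e.g. (user ID is a placeholder):
#   https://scholar.google.com/citations?user=XXXXXXXXXXXX&hl=id
# Dependencies:
#   pip install selenium pandas openpyxl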