# MIF_E31222544/scraper/scraper_onescholar.py
# === scraper_onescholar.py ===
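#
# Scrapes every publication from a Google Scholar profile and saves the results
# to an auto-numbered Excel file. The profile URL below is a placeholder.
#
# Usage:
#   python scraper_onescholar.py "https://scholar.google.com/citations?user=XXXXXXXX"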
import sys
import os
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Validate command-line arguments
if len(sys.argv) < 2:
    raise ValueError("A Google Scholar profile link must be provided as an argument.")

scholar_link = sys.argv[1]
print(f"[INFO] Google Scholar link: {scholar_link}")
# Browser configuration
CHROME_DRIVER_PATH = "D:\\lecturertask\\chromedriver.exe"
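# NOTE: hard-coded local ChromeDriver path. With Selenium 4.6+ one could instead
# let Selenium Manager resolve the driver and construct Service() with no path.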
chrome_options = Options()
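# The flags and custom user agent below reduce obvious automation fingerprints
# that commonly get scrapers blocked by Scholar.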
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, seperti Gecko) Chrome/110.0.0.0 Safari/537.36")
service = Service(CHROME_DRIVER_PATH)
browser = webdriver.Chrome(service=service, options=chrome_options)
hasil_scraping = []
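# Visit the Scholar home page first, presumably to establish a normal-looking
# session before navigating straight to the profile.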
browser.get("https://scholar.google.com/")
time.sleep(1)
def scrape_teks(xpath, default="-"):
    """Return the stripped text of the element at `xpath`, or `default` if it is missing."""
    try:
        return WebDriverWait(browser, 5).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        ).text.strip()
    except Exception:
        return default
def klik_tampilkan_lagi():
    """Click the 'Show more' button repeatedly until every publication row is loaded."""
    while True:
        try:
            tombol = WebDriverWait(browser, 2).until(
                EC.element_to_be_clickable((By.ID, "gsc_bpf_more"))
            )
            tombol.click()
            print("Loading more...")
            time.sleep(random.uniform(1.2, 2))
        except Exception:
            # Button is gone or disabled: all rows are loaded
            break
try:
    print(f"\nOpening profile: {scholar_link}")
    browser.get(scholar_link)
    WebDriverWait(browser, 6).until(EC.presence_of_element_located((By.ID, "gsc_a_t")))
    klik_tampilkan_lagi()

    # Collect the detail-page links of every listed publication
    elements = browser.find_elements(By.CSS_SELECTOR, "tr.gsc_a_tr td.gsc_a_t a.gsc_a_at")
    penelitian_links = [el.get_attribute("href") for el in elements]
    print(f"Total publications: {len(penelitian_links)}")

    for link in penelitian_links:
        try:
            browser.get(link)
            WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.ID, "gsc_oci_title")))

            # NOTE: the field labels ('Pengarang', 'Tanggal terbit', ...) match the
            # Indonesian-locale Scholar UI and must be adjusted for other locales.
            judul = scrape_teks("//div[@id='gsc_oci_title']")
            pengarang = scrape_teks("//div[@class='gsc_oci_field' and text()='Pengarang']/following-sibling::div")
            tahun = scrape_teks("//div[@class='gsc_oci_field' and text()='Tanggal terbit']/following-sibling::div")
            jurnal = scrape_teks("//div[@class='gsc_oci_field' and text()='Jurnal']/following-sibling::div")
            jilid = scrape_teks("//div[@class='gsc_oci_field' and text()='Jilid']/following-sibling::div")
            terbitan = scrape_teks("//div[@class='gsc_oci_field' and text()='Terbitan']/following-sibling::div")
            halaman = scrape_teks("//div[@class='gsc_oci_field' and text()='Halaman']/following-sibling::div")
            penerbit = scrape_teks("//div[@class='gsc_oci_field' and text()='Penerbit']/following-sibling::div")
            kutipan = scrape_teks("//div[@class='gsc_oci_field' and text()='Total kutipan']/following-sibling::div")

            hasil_scraping.append([
                scholar_link, judul, pengarang, tahun, jurnal,
                jilid, terbitan, halaman, penerbit, kutipan
            ])
            print(f"{judul}")
        except Exception as e:
            print(f"Failed to fetch publication details: {e}")
except KeyboardInterrupt:
    print("\nScraping stopped by the user.")
finally:
    folder_path = "D:\\lecturertask"
    os.makedirs(folder_path, exist_ok=True)

    # Auto-number the output file (data_scholar1.xlsx, data_scholar2.xlsx, ...)
    # so a new run never overwrites earlier results
    base_filename = "data_scholar"
    file_ext = ".xlsx"
    existing_files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith(file_ext)]
    max_num = max(
        [int(f.replace(base_filename, "").replace(file_ext, ""))
         for f in existing_files
         if f.replace(base_filename, "").replace(file_ext, "").isdigit()] + [0]
    )
    new_filename = f"{base_filename}{max_num + 1}{file_ext}"
    save_path = os.path.join(folder_path, new_filename)

    if hasil_scraping:
        # Column headers mirror the Indonesian field labels scraped above
        df = pd.DataFrame(hasil_scraping, columns=[
            "Link", "Judul", "Pengarang", "Tahun", "Jurnal",
            "Jilid", "Terbitan", "Halaman", "Penerbit", "Total Kutipan"
        ])
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(f"\nScraping finished/stopped! {len(hasil_scraping)} rows saved to '{save_path}'")
    else:
        print("\nScraping failed or returned no data. No file was created.")

    browser.quit()