# === scraper_onescholar.py ===
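"""Scrape every publication listed on one Google Scholar profile via Selenium.

Usage:
    python scraper_onescholar.py <scholar_profile_url>

Rows are collected one per publication and written to an auto-numbered
data_scholarN.xlsx under D:\\lecturertask when the run ends.
"""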
import sys
import os
import time
import random

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Validate arguments
if len(sys.argv) < 2:
    raise ValueError("A Google Scholar profile link must be provided as an argument.")

scholar_link = sys.argv[1]
print(f"[INFO] Google Scholar link: {scholar_link}")

# Browser configuration
CHROME_DRIVER_PATH = "D:\\lecturertask\\chromedriver.exe"
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36")

service = Service(CHROME_DRIVER_PATH)
browser = webdriver.Chrome(service=service, options=chrome_options)
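# Portability note (assumes Selenium >= 4.6): Selenium Manager can resolve a
# matching driver automatically, so the hard-coded path above could be dropped:
#   browser = webdriver.Chrome(options=chrome_options)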

hasil_scraping = []  # one row per scraped publication

# Open the Scholar homepage before navigating to the profile
browser.get("https://scholar.google.com/")
time.sleep(1)


def scrape_teks(xpath, default="-"):
    """Return the stripped text at `xpath`, or `default` if it never appears."""
    try:
        return WebDriverWait(browser, 5).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        ).text.strip()
    except Exception:  # a bare except here would also swallow KeyboardInterrupt
        return default
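# Example: the title XPath used in the loop below returns the publication
# title, or "-" if the element never appears within the 5-second wait:
#   judul = scrape_teks("//div[@id='gsc_oci_title']")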


def klik_tampilkan_lagi():
    """Click Scholar's "Show more" button until no more publications load."""
    while True:
        try:
            tombol = WebDriverWait(browser, 2).until(
                EC.element_to_be_clickable((By.ID, "gsc_bpf_more"))
            )
            tombol.click()
            print("Loading more...")
            time.sleep(random.uniform(1.2, 2))  # randomized delay between clicks
        except Exception:
            break  # wait timed out: button gone or disabled, all rows loaded
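# Scholar loads publications in batches behind the "Show more" button
# (id "gsc_bpf_more"), so large profiles need several clicks before the
# table holds every row.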


try:
    print(f"\nOpening profile: {scholar_link}")
    browser.get(scholar_link)
    # Wait for the publications table to render
    WebDriverWait(browser, 6).until(EC.presence_of_element_located((By.ID, "gsc_a_t")))
    klik_tampilkan_lagi()

    # Collect the detail-page link of every publication row
    elements = browser.find_elements(By.CSS_SELECTOR, "tr.gsc_a_tr td.gsc_a_t a.gsc_a_at")
    penelitian_links = [el.get_attribute("href") for el in elements]
    print(f"Total publications: {len(penelitian_links)}")

    for link in penelitian_links:
        try:
            browser.get(link)
            WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.ID, "gsc_oci_title")))

            # The field labels below are the Indonesian Scholar UI strings, so
            # they only match when the page is served in Indonesian (hl=id)
            judul = scrape_teks("//div[@id='gsc_oci_title']")
            pengarang = scrape_teks("//div[@class='gsc_oci_field' and text()='Pengarang']/following-sibling::div")
            tahun = scrape_teks("//div[@class='gsc_oci_field' and text()='Tanggal terbit']/following-sibling::div")
            jurnal = scrape_teks("//div[@class='gsc_oci_field' and text()='Jurnal']/following-sibling::div")
            jilid = scrape_teks("//div[@class='gsc_oci_field' and text()='Jilid']/following-sibling::div")
            terbitan = scrape_teks("//div[@class='gsc_oci_field' and text()='Terbitan']/following-sibling::div")
            halaman = scrape_teks("//div[@class='gsc_oci_field' and text()='Halaman']/following-sibling::div")
            penerbit = scrape_teks("//div[@class='gsc_oci_field' and text()='Penerbit']/following-sibling::div")
            kutipan = scrape_teks("//div[@class='gsc_oci_field' and text()='Total kutipan']/following-sibling::div")

            hasil_scraping.append([
                scholar_link, judul, pengarang, tahun, jurnal,
                jilid, terbitan, halaman, penerbit, kutipan
            ])
            print(f"✓ {judul}")
        except Exception as e:
            print(f"Failed to fetch publication details: {e}")
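    # Locale-independent alternative (sketch only, not used above): pair each
    # label cell with its value cell instead of matching translated labels.
    # This assumes the value cells carry the "gsc_oci_value" class, the usual
    # sibling of "gsc_oci_field" on Scholar detail pages:
    #   fields = browser.find_elements(By.CSS_SELECTOR, ".gsc_oci_field")
    #   values = browser.find_elements(By.CSS_SELECTOR, ".gsc_oci_value")
    #   detail = {f.text.strip(): v.text.strip() for f, v in zip(fields, values)}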

except KeyboardInterrupt:
    print("\nScraping stopped by the user.")

finally:
    # Always persist whatever was scraped, then close the browser
    folder_path = "D:\\lecturertask"
    os.makedirs(folder_path, exist_ok=True)
    base_filename = "data_scholar"
    file_ext = ".xlsx"

    # Pick the next free data_scholarN.xlsx (N = highest existing number + 1)
    existing_files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith(file_ext)]
    suffixes = [f[len(base_filename):-len(file_ext)] for f in existing_files]
    max_num = max([int(s) for s in suffixes if s.isdigit()] + [0])
    new_filename = f"{base_filename}{max_num + 1}{file_ext}"
    save_path = os.path.join(folder_path, new_filename)
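    # Example progression: data_scholar1.xlsx on the first run,
    # data_scholar2.xlsx on the second, and so on.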

    if hasil_scraping:
        # Column headers mirror the Indonesian field names scraped above
        df = pd.DataFrame(hasil_scraping, columns=[
            "Link", "Judul", "Pengarang", "Tahun", "Jurnal",
            "Jilid", "Terbitan", "Halaman", "Penerbit", "Total Kutipan"
        ])
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(f"\nScraping finished/stopped! {len(hasil_scraping)} rows saved to '{save_path}'")
    else:
        print("\nScraping failed or returned no data. No file was created.")

    browser.quit()