MIF_E31222544/scraper/scraper_backup_important/scraper_scholar.py

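"""Scrape publication metadata from the Google Scholar profiles listed in
dosenTI.txt and save the results to an auto-numbered Excel workbook.

Requires: selenium, pandas, openpyxl, and a ChromeDriver binary that matches
the installed Chrome version.
"""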

import os
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
CHROME_DRIVER_PATH = "D:\\lecturertask\\chromedriver.exe"  # adjust to your local ChromeDriver binary
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36")
service = Service(CHROME_DRIVER_PATH)
browser = webdriver.Chrome(service=service, options=chrome_options)
browser.get("https://scholar.google.com/")
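# dosenTI.txt is expected to contain one Google Scholar profile URL per line.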
with open("dosenTI.txt", "r", encoding="utf-8") as file:
    list_dosen = [line.strip() for line in file if line.strip()]
print(f"Number of lecturers: {len(list_dosen)}")
def scrape_teks(xpath, default="tidak ditemukan"):
    """Return the stripped text of the first element matching xpath, or default on timeout."""
    try:
        return WebDriverWait(browser, 5).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        ).text.strip()
    except TimeoutException:
        return default
def klik_tampilkan_lagi():
    """Click the "Show more" button until every publication row is loaded."""
    while True:
        try:
            tombol = WebDriverWait(browser, 2).until(
                EC.element_to_be_clickable((By.ID, "gsc_bpf_more"))
            )
            tombol.click()
            print("Loading more publications...")
            time.sleep(random.uniform(1.2, 2))
        except WebDriverException:
            # Button disabled or gone: all rows are loaded.
            break
hasil_scraping = []
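# Each row: [profile URL, title, authors, year, journal, volume, issue, pages, publisher, citations]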
try:
    for dosen in list_dosen:
        print(f"\nOpening lecturer profile: {dosen}")
        try:
            browser.get(dosen)
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "gsc_a_t"))
            )
        except WebDriverException:
            print("Failed to open lecturer profile.")
            continue
        # Re-request the publication table sorted by year instead of by citation count.
        try:
            sort_link = WebDriverWait(browser, 3).until(
                EC.presence_of_element_located((By.XPATH, "//span[@id='gsc_a_ha']/a[contains(@class, 'gsc_a_a')]"))
            )
            browser.get(sort_link.get_attribute("href"))
            print("Sorted by year.")
        except WebDriverException:
            print("Sorting failed.")
        klik_tampilkan_lagi()
        try:
            elements = WebDriverWait(browser, 3).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.gsc_a_tr td.gsc_a_t a.gsc_a_at"))
            )
            penelitian_links = [el.get_attribute("href") for el in elements]
        except TimeoutException:
            penelitian_links = []
        print(f"Total publications: {len(penelitian_links)}")
        for link in penelitian_links:
            try:
                browser.get(link)
                WebDriverWait(browser, 3).until(
                    EC.presence_of_element_located((By.ID, "gsc_oci_title"))
                )
                judul = scrape_teks("//div[@id='gsc_oci_title']")
                # The field labels below match the Indonesian-locale labels Scholar
                # renders ('Pengarang' = authors, 'Tanggal terbit' = publish date, etc.),
                # so they must stay untranslated.
                pengarang = scrape_teks("//div[@class='gsc_oci_field' and text()='Pengarang']/following-sibling::div")
                tahun = scrape_teks("//div[@class='gsc_oci_field' and text()='Tanggal terbit']/following-sibling::div")
                jurnal = scrape_teks("//div[@class='gsc_oci_field' and text()='Jurnal']/following-sibling::div")
                jilid = scrape_teks("//div[@class='gsc_oci_field' and text()='Jilid']/following-sibling::div", "-")
                terbitan = scrape_teks("//div[@class='gsc_oci_field' and text()='Terbitan']/following-sibling::div", "-")
                halaman = scrape_teks("//div[@class='gsc_oci_field' and text()='Halaman']/following-sibling::div", "-")
                penerbit = scrape_teks("//div[@class='gsc_oci_field' and text()='Penerbit']/following-sibling::div")
                kutipan = scrape_teks("//div[@class='gsc_oci_field' and text()='Total kutipan']/following-sibling::div")
                hasil_scraping.append([
                    dosen, judul, pengarang, tahun, jurnal,
                    jilid, terbitan, halaman, penerbit, kutipan
                ])
                print(judul)
            except Exception as e:
                print(f"Failed to fetch publication details: {e}")
            finally:
                browser.back()
except KeyboardInterrupt:
    print("\nScraping stopped by user.")
finally:
    # Save whatever was collected, even after an interrupt or crash.
    folder_path = "D:\\lecturertask"
    os.makedirs(folder_path, exist_ok=True)
    base_filename = "data_scholar"
    file_ext = ".xlsx"
    # Find the highest existing data_scholar<N>.xlsx so the next file never overwrites one.
    existing_files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith(file_ext)]
    max_num = 0
    for f in existing_files:
        try:
            num = int(f.replace(base_filename, "").replace(file_ext, ""))
            max_num = max(max_num, num)
        except ValueError:
            continue
    next_filename = f"{base_filename}{max_num + 1}{file_ext}"
    save_path = os.path.join(folder_path, next_filename)
    if hasil_scraping:
        # Column headers mirror the Indonesian Scholar field labels scraped above.
        df = pd.DataFrame(hasil_scraping, columns=[
            "Link", "Judul", "Pengarang", "Tahun", "Jurnal",
            "Jilid", "Terbitan", "Halaman", "Penerbit", "Total Kutipan"
        ])
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(f"\nScraping finished/stopped! {len(hasil_scraping)} rows saved to '{save_path}'")
    else:
        print("\nScraping failed/returned no data. No file was created.")
    browser.quit()