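# Scrapes publication details from the Google Scholar profiles listed in
# dosenTI.txt (one profile URL per line) and saves them to a numbered
# Excel file under D:\lecturertask. The field labels in the XPaths below
# ("Pengarang", "Tanggal terbit", ...) are the Indonesian-locale Scholar
# UI strings, so the browser session must render Scholar in Indonesian.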
import os
import time
import random

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Path to a chromedriver whose major version matches the installed Chrome.
CHROME_DRIVER_PATH = "D:\\lecturertask\\chromedriver.exe"

# Reduce the obvious automation fingerprints (navigator.webdriver flag,
# "Chrome is being controlled" infobar) and send a regular desktop UA.
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36")

service = Service(CHROME_DRIVER_PATH)
browser = webdriver.Chrome(service=service, options=chrome_options)

# Warm up with a plain visit to Scholar before hitting profile pages.
browser.get("https://scholar.google.com/")

# One Scholar profile URL per line; blank lines are skipped.
with open("dosenTI.txt", "r", encoding="utf-8") as file:
    list_dosen = [line.strip() for line in file if line.strip()]

print(f"Number of lecturers: {len(list_dosen)}")

def scrape_teks(xpath, default="not found"):
    """Return the stripped text of the first element matching xpath,
    or `default` if it does not appear within 5 seconds."""
    try:
        return WebDriverWait(browser, 5).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        ).text.strip()
    except Exception:  # a bare except here would also swallow KeyboardInterrupt
        return default

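# scrape_teks is used below against Scholar's label/value layout: each
# detail row is a <div class="gsc_oci_field">label</div> followed by a
# sibling value div. A sketch of the call shape (the label text is
# whatever the page's locale renders, Indonesian here):
#
#   pengarang = scrape_teks(
#       "//div[@class='gsc_oci_field' and text()='Pengarang']"
#       "/following-sibling::div"
#   )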
def klik_tampilkan_lagi():
    """Keep clicking Scholar's "Show more" button (#gsc_bpf_more) until it
    stops being clickable, so the full publication list is loaded."""
    while True:
        try:
            tombol = WebDriverWait(browser, 2).until(
                EC.element_to_be_clickable((By.ID, "gsc_bpf_more"))
            )
            tombol.click()
            print("Loading more...")
            # Small randomized pause between clicks to look less bot-like.
            time.sleep(random.uniform(1.2, 2))
        except Exception:
            # Button gone or no longer clickable: all rows are loaded.
            break

hasil_scraping = []

try:
    for dosen in list_dosen:
        print(f"\nOpening lecturer profile: {dosen}")
        try:
            browser.get(dosen)
            # Wait for the publications table (#gsc_a_t) as a readiness check.
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "gsc_a_t"))
            )
        except TimeoutException:
            print("Failed to open lecturer profile.")
            continue

        # Re-sort the publication list by year via the column-header link.
        try:
            sort_link = WebDriverWait(browser, 3).until(
                EC.presence_of_element_located((By.XPATH, "//span[@id='gsc_a_ha']/a[contains(@class, 'gsc_a_a')]"))
            )
            browser.get(sort_link.get_attribute("href"))
            print("Sorted by year.")
        except TimeoutException:
            print("Sorting failed.")

        klik_tampilkan_lagi()

        # Collect every publication-title link from the fully loaded table.
        try:
            elements = WebDriverWait(browser, 3).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.gsc_a_tr td.gsc_a_t a.gsc_a_at"))
            )
            penelitian_links = [el.get_attribute("href") for el in elements]
        except TimeoutException:
            penelitian_links = []

        print(f"Total publications: {len(penelitian_links)}")

        for link in penelitian_links:
            try:
                browser.get(link)
                WebDriverWait(browser, 3).until(
                    EC.presence_of_element_located((By.ID, "gsc_oci_title"))
                )

                # The field labels below are Indonesian-locale Scholar UI
                # strings matched literally; do not translate them.
                judul = scrape_teks("//div[@id='gsc_oci_title']")
                pengarang = scrape_teks("//div[@class='gsc_oci_field' and text()='Pengarang']/following-sibling::div")
                tahun = scrape_teks("//div[@class='gsc_oci_field' and text()='Tanggal terbit']/following-sibling::div")
                jurnal = scrape_teks("//div[@class='gsc_oci_field' and text()='Jurnal']/following-sibling::div")
                jilid = scrape_teks("//div[@class='gsc_oci_field' and text()='Jilid']/following-sibling::div", "-")
                terbitan = scrape_teks("//div[@class='gsc_oci_field' and text()='Terbitan']/following-sibling::div", "-")
                halaman = scrape_teks("//div[@class='gsc_oci_field' and text()='Halaman']/following-sibling::div", "-")
                penerbit = scrape_teks("//div[@class='gsc_oci_field' and text()='Penerbit']/following-sibling::div")
                kutipan = scrape_teks("//div[@class='gsc_oci_field' and text()='Total kutipan']/following-sibling::div")

                hasil_scraping.append([
                    dosen, judul, pengarang, tahun, jurnal,
                    jilid, terbitan, halaman, penerbit, kutipan
                ])
                print(f"✓ {judul}")
            except Exception as e:
                print(f"Failed to fetch detail data: {e}")
            finally:
                # Return to the publication list before the next link.
                browser.back()

except KeyboardInterrupt:
    print("\nScraping stopped by user.")

finally:
    # Always save whatever was collected, even on Ctrl+C or an error.
    folder_path = "D:\\lecturertask"
    os.makedirs(folder_path, exist_ok=True)
    base_filename = "data_scholar"
    file_ext = ".xlsx"

    # Find the highest existing data_scholar<N>.xlsx so this run writes
    # data_scholar<N+1>.xlsx instead of overwriting earlier output.
    existing_files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith(file_ext)]
    max_num = 0
    for f in existing_files:
        try:
            num = int(f.replace(base_filename, "").replace(file_ext, ""))
            max_num = max(max_num, num)
        except ValueError:
            continue

    next_filename = f"{base_filename}{max_num + 1}{file_ext}"
    save_path = os.path.join(folder_path, next_filename)

    if hasil_scraping:
        df = pd.DataFrame(hasil_scraping, columns=[
            "Link", "Judul", "Pengarang", "Tahun", "Jurnal",
            "Jilid", "Terbitan", "Halaman", "Penerbit", "Total Kutipan"
        ])
        df.to_excel(save_path, index=False, engine="openpyxl")
        print(f"\nScraping finished/stopped! {len(hasil_scraping)} rows saved to '{save_path}'")
    else:
        print("\nScraping failed/no data collected. No file was created.")

    browser.quit()
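
# Usage sketch (assumptions: the script is saved as, e.g., scrape_scholar.py,
# dosenTI.txt sits in the working directory, and the chromedriver.exe at
# CHROME_DRIVER_PATH matches the installed Chrome's major version):
#
#   python scrape_scholar.py
#
# Requires: selenium, pandas, openpyxl (pip install selenium pandas openpyxl);
# openpyxl is needed because to_excel is called with engine="openpyxl".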