commit
4b251ec50c
|
@ -0,0 +1,35 @@
|
||||||
|
attrs==25.1.0
|
||||||
|
beautifulsoup4==4.13.3
|
||||||
|
certifi==2025.1.31
|
||||||
|
cffi==1.17.1
|
||||||
|
charset-normalizer==3.4.1
|
||||||
|
dnspython==2.7.0
|
||||||
|
et_xmlfile==2.0.0
|
||||||
|
exceptiongroup==1.2.2
|
||||||
|
h11==0.14.0
|
||||||
|
idna==3.10
|
||||||
|
numpy==2.2.3
|
||||||
|
openpyxl==3.1.5
|
||||||
|
outcome==1.3.0.post0
|
||||||
|
packaging==24.2
|
||||||
|
pandas==2.2.3
|
||||||
|
pycparser==2.22
|
||||||
|
pymongo==4.11.1
|
||||||
|
PySocks==1.7.1
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
python-dotenv==1.0.1
|
||||||
|
pytz==2025.1
|
||||||
|
requests==2.32.3
|
||||||
|
selenium==4.28.1
|
||||||
|
six==1.17.0
|
||||||
|
sniffio==1.3.1
|
||||||
|
sortedcontainers==2.4.0
|
||||||
|
soupsieve==2.6
|
||||||
|
trio==0.29.0
|
||||||
|
trio-websocket==0.12.1
|
||||||
|
typing_extensions==4.12.2
|
||||||
|
tzdata==2025.1
|
||||||
|
urllib3==2.3.0
|
||||||
|
webdriver-manager==4.0.2
|
||||||
|
websocket-client==1.8.0
|
||||||
|
wsproto==1.2.0
|
|
@ -0,0 +1,67 @@
|
||||||
|
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time, random

# Scrape the IPR (intellectual property) list of SINTA affiliation #447 and
# save it to data_iprs.xlsx. Requires a manual login before scraping starts.

chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"  # local chromedriver binary

chrome_options = Options()
# NOTE: the original passed "--headful", which is not a recognized Chrome
# switch (headful is already the default mode) and was silently ignored,
# so it has been removed.
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)

try:
    # SINTA requires an authenticated session; log in by hand first.
    browser.get("https://sinta.kemdikbud.go.id/logins")
    input("Login secara manual, lalu tekan ENTER untuk melanjutkan...")

    browser.get("https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=iprs")
    time.sleep(random.uniform(2, 6))  # randomized delay to look less bot-like

    all_iprs = []
    page = 1

    while True:
        print(f"📄 Scraping halaman {page}...")

        ipr_items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")
        if not ipr_items:
            print("❌ Tidak ada data di halaman ini, berhenti.")
            break

        for item in ipr_items:
            try:
                title = item.find_element(By.CSS_SELECTOR, "div.ar-title a").text.strip()

                # find_elements returns [] when a field is absent; the original
                # indexed [0] unguarded, raising IndexError and dropping the
                # whole row. Guard each optional field instead, matching the
                # sibling research/services scrapers.
                inventor_el = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
                inventor = inventor_el[0].text.strip() if inventor_el else ""

                publication_el = item.find_elements(By.CSS_SELECTOR, "a.ar-pub")
                publication = publication_el[0].text.strip() if publication_el else ""

                year_el = item.find_elements(By.CSS_SELECTOR, "a.ar-year")
                year = year_el[0].text.strip() if year_el else ""

                app_no_el = item.find_elements(By.CSS_SELECTOR, "a.ar-cited")
                application_number = app_no_el[0].text.strip() if app_no_el else ""

                patent_type_el = item.find_elements(By.CSS_SELECTOR, "a.ar-quartile")
                patent_type = patent_type_el[0].text.strip() if patent_type_el else ""

                all_iprs.append([title, inventor, publication, year, application_number, patent_type])
                print(f"✅ Data berhasil diambil: {title}")
            except Exception as e:
                # Keep going on a single bad card; report and continue.
                print(f"⚠️ Error mengambil data: {e}")

        try:
            # Pagination: the "next page" link is labelled with the page number.
            next_button = WebDriverWait(browser, random.uniform(3, 7)).until(
                EC.presence_of_element_located((By.LINK_TEXT, str(page + 1)))
            )
            next_button.click()
            time.sleep(random.uniform(3, 7))
            page += 1
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt /
            # SystemExit; Exception still covers selenium's TimeoutException.
            print("✅ Tidak ada halaman berikutnya, selesai scraping.")
            break

    df = pd.DataFrame(all_iprs, columns=["Judul", "Inventor", "Publikasi", "Tahun", "Nomor Permohonan", "Jenis Paten"])
    df.to_excel("data_iprs.xlsx", index=False, engine="openpyxl")

    print(f"🎉 Scraping selesai! {len(all_iprs)} data disimpan di 'data_iprs.xlsx'")
finally:
    # Always release the browser process, even if scraping fails midway.
    browser.quit()
|
|
@ -0,0 +1,86 @@
|
||||||
|
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time, random

# Scrape the research-project list of SINTA affiliation #447 and save it to
# data_penelitian.xlsx. Requires a manual login before scraping starts.

chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"  # local chromedriver binary

chrome_options = Options()

# BUG FIX: the UA previously read "KHTML, seperti Gecko" — a machine
# translation of the literal token "like Gecko" — which made the spoofed
# user agent trivially detectable as fake.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
chrome_options.add_argument(f"user-agent={user_agent}")
# Hide the "Chrome is being controlled by automated software" signals.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)

try:
    # SINTA requires an authenticated session; log in by hand first.
    browser.get("https://sinta.kemdikbud.go.id/logins")
    input("Login secara manual, lalu tekan ENTER untuk melanjutkan...")

    browser.get("https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=researches")
    time.sleep(random.uniform(2, 6))  # randomized delay to look less bot-like

    all_research = []
    page = 1
    action = ActionChains(browser)

    while True:
        print(f"📄 Scraping halaman {page}...")

        research_items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")
        if not research_items:
            print("❌ Tidak ada data di halaman ini, berhenti.")
            break

        for item in research_items:
            try:
                # Human-like hover over each card before reading it.
                action.move_to_element(item).perform()
                time.sleep(random.uniform(0.5, 2))

                title = item.find_element(By.CSS_SELECTOR, "div.ar-title").text.strip()

                # Optional fields: find_elements returns [] when absent, so
                # every [0] access is guarded with an Indonesian fallback label.
                leader_element = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
                leader = leader_element[0].text.strip() if leader_element else "Tidak Ada Leader"

                personil_elements = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a[href*='authors/profile']")
                personil_list = "; ".join([p.text.strip() for p in personil_elements]) if personil_elements else "Tidak Ada Personil"

                research_type_element = item.find_elements(By.CSS_SELECTOR, "a.ar-pub")
                research_type = research_type_element[0].text.strip() if research_type_element else "Tidak Diketahui"

                year_element = item.find_elements(By.CSS_SELECTOR, "a.ar-year")
                year = year_element[0].text.strip().replace("📅 ", "") if year_element else "Tidak Ada Tahun"

                fund_element = item.find_elements(By.CSS_SELECTOR, "a.ar-quartile")
                fund = fund_element[0].text.strip().replace("📊 ", "") if fund_element else "Tidak Ada Dana"

                all_research.append([title, leader, personil_list, research_type, year, fund])
            except Exception as e:
                # Keep going on a single bad card; report and continue.
                print(f"⚠️ Error mengambil data: {e}")

        try:
            # Pagination: the "next page" link is labelled with the page number.
            next_button = WebDriverWait(browser, random.uniform(3, 7)).until(
                EC.presence_of_element_located((By.LINK_TEXT, str(page + 1)))
            )
            next_button.click()
            time.sleep(random.uniform(3, 7))
            page += 1
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt /
            # SystemExit; Exception still covers selenium's TimeoutException.
            print("✅ Tidak ada halaman berikutnya, selesai scraping.")
            break

    df = pd.DataFrame(all_research, columns=["Judul", "Leader", "Personil", "Tipe Penelitian", "Tahun", "Dana"])
    df.to_excel("data_penelitian.xlsx", index=False, engine="openpyxl")

    print(f"🎉 Scraping selesai! {len(all_research)} data disimpan di 'data_penelitian.xlsx'")
finally:
    # Always release the browser process, even if scraping fails midway.
    browser.quit()
|
|
@ -0,0 +1,85 @@
|
||||||
|
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time, random

# Scrape the community-service ("pengabdian") list of SINTA affiliation #447
# and save it to data_pengabdian.xlsx. Requires a manual login first.

chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"  # local chromedriver binary

chrome_options = Options()

# BUG FIX: the UA previously read "KHTML, seperti Gecko" — a machine
# translation of the literal token "like Gecko" — which made the spoofed
# user agent trivially detectable as fake.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
chrome_options.add_argument(f"user-agent={user_agent}")
# Hide the "Chrome is being controlled by automated software" signals.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")

service = Service(chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)

try:
    # SINTA requires an authenticated session; log in by hand first.
    browser.get("https://sinta.kemdikbud.go.id/logins")
    input("Login secara manual, lalu tekan ENTER untuk melanjutkan...")

    browser.get("https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=services")
    time.sleep(random.uniform(2, 6))  # randomized delay to look less bot-like

    all_services = []
    page = 1
    action = ActionChains(browser)

    while True:
        print(f"📄 Scraping halaman {page}...")

        service_items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")
        if not service_items:
            print("❌ Tidak ada data di halaman ini, berhenti.")
            break

        for item in service_items:
            try:
                # Human-like hover over each card before reading it.
                action.move_to_element(item).perform()
                time.sleep(random.uniform(0.5, 2))

                title = item.find_element(By.CSS_SELECTOR, "div.ar-title").text.strip()

                # Optional fields: find_elements returns [] when absent, so
                # every [0] access is guarded with an Indonesian fallback label.
                leader_element = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
                leader = leader_element[0].text.strip() if leader_element else "Tidak Ada Leader"

                personil_elements = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a[href*='authors/profile']")
                personil_list = "; ".join([p.text.strip() for p in personil_elements]) if personil_elements else "Tidak Ada Personil"

                service_type_element = item.find_elements(By.CSS_SELECTOR, "a.ar-pub")
                service_type = service_type_element[0].text.strip() if service_type_element else "Tidak Diketahui"

                year_element = item.find_elements(By.CSS_SELECTOR, "a.ar-year")
                year = year_element[0].text.strip().replace("📅 ", "") if year_element else "Tidak Ada Tahun"

                fund_element = item.find_elements(By.CSS_SELECTOR, "a.ar-quartile")
                fund = fund_element[0].text.strip().replace("📊 ", "") if fund_element else "Tidak Ada Dana"

                all_services.append([title, leader, personil_list, service_type, year, fund])
            except Exception as e:
                # Keep going on a single bad card; report and continue.
                print(f"⚠️ Error mengambil data: {e}")

        try:
            # Pagination: the "next page" link is labelled with the page number.
            next_button = WebDriverWait(browser, random.uniform(3, 7)).until(
                EC.presence_of_element_located((By.LINK_TEXT, str(page + 1)))
            )
            next_button.click()
            time.sleep(random.uniform(3, 7))
            page += 1
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt /
            # SystemExit; Exception still covers selenium's TimeoutException.
            print("✅ Tidak ada halaman berikutnya, selesai scraping.")
            break

    df = pd.DataFrame(all_services, columns=["Judul", "Leader", "Personil", "Tipe Pengabdian", "Tahun", "Dana"])
    df.to_excel("data_pengabdian.xlsx", index=False, engine="openpyxl")

    print(f"🎉 Scraping selesai! {len(all_services)} data disimpan di 'data_pengabdian.xlsx'")
finally:
    # Always release the browser process, even if scraping fails midway.
    browser.quit()
|
Loading…
Reference in New Issue