commit 4b251ec50cf2c2f6331e99608c31ee9c34ebe1a9
Author: Sultan A <125938835+SamforWisdom@users.noreply.github.com>
Date:   Fri Feb 21 15:01:37 2025 +0700

    upload scraper 1.0

    scraper sinta

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..20f839c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,35 @@
+attrs==25.1.0
+beautifulsoup4==4.13.3
+certifi==2025.1.31
+cffi==1.17.1
+charset-normalizer==3.4.1
+dnspython==2.7.0
+et_xmlfile==2.0.0
+exceptiongroup==1.2.2
+h11==0.14.0
+idna==3.10
+numpy==2.2.3
+openpyxl==3.1.5
+outcome==1.3.0.post0
+packaging==24.2
+pandas==2.2.3
+pycparser==2.22
+pymongo==4.11.1
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2025.1
+requests==2.32.3
+selenium==4.28.1
+six==1.17.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+trio==0.29.0
+trio-websocket==0.12.1
+typing_extensions==4.12.2
+tzdata==2025.1
+urllib3==2.3.0
+webdriver-manager==4.0.2
+websocket-client==1.8.0
+wsproto==1.2.0
diff --git a/scraper_HKI.py b/scraper_HKI.py
new file mode 100644
index 0000000..f58a13d
--- /dev/null
+++ b/scraper_HKI.py
@@ -0,0 +1,67 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import pandas as pd
+import time, random
+
+chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"
+chrome_options = Options()
+# Keep the browser visible: the manual login step below needs a real window.
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("--no-sandbox")
+chrome_options.add_argument("--window-size=1920,1080")
+
+service = Service(chrome_driver_path)
+browser = webdriver.Chrome(service=service, options=chrome_options)
+
+browser.get("https://sinta.kemdikbud.go.id/logins")
+input("Log in manually, then press ENTER to continue...")
+
+browser.get("https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=iprs")
+time.sleep(random.uniform(2, 6))
+
+all_iprs = []
+page = 1
+
+while True:
+    print(f"📄 Scraping page {page}...")
+
+    service_items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")  # one card per IPR record
+    if not service_items:
+        print("❌ No data on this page, stopping.")
+        break
+
+    for item in service_items:
+        try:
+            title = item.find_element(By.CSS_SELECTOR, "div.ar-title a").text.strip()
+            inventor = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")[0].text.strip()
+            publication = item.find_elements(By.CSS_SELECTOR, "a.ar-pub")[0].text.strip()
+            year = item.find_elements(By.CSS_SELECTOR, "a.ar-year")[0].text.strip()
+            application_number = item.find_elements(By.CSS_SELECTOR, "a.ar-cited")[0].text.strip()
+            patent_type = item.find_elements(By.CSS_SELECTOR, "a.ar-quartile")[0].text.strip()
+
+            all_iprs.append([title, inventor, publication, year, application_number, patent_type])
+            print(f"✅ Data scraped: {title}")
+        except Exception as e:
+            print(f"⚠️ Error extracting data: {e}")
+
+    try:
+        next_button = WebDriverWait(browser, random.uniform(3, 7)).until(
+            EC.element_to_be_clickable((By.LINK_TEXT, str(page + 1)))
+        )
+        next_button.click()
+        time.sleep(random.uniform(3, 7))
+        page += 1
+    except Exception:
+        print("✅ No next page, scraping finished.")
+        break
+
+df = pd.DataFrame(all_iprs, columns=["Title", "Inventor", "Publication", "Year", "Application Number", "Patent Type"])
+df.to_excel("data_iprs.xlsx", index=False, engine="openpyxl")
+
+print(f"🎉 Scraping finished! {len(all_iprs)} records saved to 'data_iprs.xlsx'")
+
+browser.quit()
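A note on field extraction: scraper_HKI.py above indexes directly into each find_elements() result, so a card with any missing field is skipped whole by the except clause. A minimal sketch of a more forgiving accessor, using only the selenium calls already imported above (the helper name get_field is an illustration, not part of this commit):

from selenium.webdriver.common.by import By

def get_field(item, css_selector, fallback="N/A"):
    # Return the first matching element's text, or the fallback when the
    # selector matches nothing, instead of raising IndexError.
    elements = item.find_elements(By.CSS_SELECTOR, css_selector)
    return elements[0].text.strip() if elements else fallback

# e.g. application_number = get_field(item, "a.ar-cited", fallback="No Application Number")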
diff --git a/scraper_penelitian.py b/scraper_penelitian.py
new file mode 100644
index 0000000..d21e28c
--- /dev/null
+++ b/scraper_penelitian.py
@@ -0,0 +1,86 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.action_chains import ActionChains
+import pandas as pd
+import time, random
+
+chrome_driver_path = "D:\\lecturertask\\chromedriver.exe"
+chrome_options = Options()
+
+user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+chrome_options.add_argument(f"user-agent={user_agent}")
+chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])  # hide the automation banner
+chrome_options.add_experimental_option("useAutomationExtension", False)
+
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("--no-sandbox")
+chrome_options.add_argument("--window-size=1920,1080")
+
+service = Service(chrome_driver_path)
+browser = webdriver.Chrome(service=service, options=chrome_options)
+
+browser.get("https://sinta.kemdikbud.go.id/logins")
+input("Log in manually, then press ENTER to continue...")
+
+browser.get("https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=researches")
+time.sleep(random.uniform(2, 6))
+
+all_research = []
+page = 1
+action = ActionChains(browser)
+
+while True:
+    print(f"📄 Scraping page {page}...")
+
+    research_items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5")
+    if not research_items:
+        print("❌ No data on this page, stopping.")
+        break
+
+    for item in research_items:
+        try:
+            action.move_to_element(item).perform()  # scroll the card into view like a human reader
+            time.sleep(random.uniform(0.5, 2))
+
+            title = item.find_element(By.CSS_SELECTOR, "div.ar-title").text.strip()
+
+            leader_element = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a")
+            leader = leader_element[0].text.strip() if leader_element else "No Leader"
+
+            personil_elements = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a[href*='authors/profile']")
+            personil_list = "; ".join([p.text.strip() for p in personil_elements]) if personil_elements else "No Personnel"
+
+            research_type_element = item.find_elements(By.CSS_SELECTOR, "a.ar-pub")
+            research_type = research_type_element[0].text.strip() if research_type_element else "Unknown"
+
+            year_element = item.find_elements(By.CSS_SELECTOR, "a.ar-year")
+            year = year_element[0].text.strip().replace("📅 ", "") if year_element else "No Year"
+
+            fund_element = item.find_elements(By.CSS_SELECTOR, "a.ar-quartile")
+            fund = fund_element[0].text.strip().replace("📊 ", "") if fund_element else "No Funding"
+
+            all_research.append([title, leader, personil_list, research_type, year, fund])
+        except Exception as e:
+            print(f"⚠️ Error extracting data: {e}")
+
+    try:
+        next_button = WebDriverWait(browser, random.uniform(3, 7)).until(
+            EC.element_to_be_clickable((By.LINK_TEXT, str(page + 1)))
+        )
+        next_button.click()
+        time.sleep(random.uniform(3, 7))
+        page += 1
+    except Exception:
+        print("✅ No next page, scraping finished.")
+        break
+
+df = pd.DataFrame(all_research, columns=["Title", "Leader", "Personnel", "Research Type", "Year", "Funding"])
+df.to_excel("data_penelitian.xlsx", index=False, engine="openpyxl")
+
+print(f"🎉 Scraping finished! {len(all_research)} records saved to 'data_penelitian.xlsx'")
+
+browser.quit()
\ No newline at end of file
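The pagination step is identical in all three scrapers: wait for a link whose text is the next page number, click it, then sleep. A sketch of how it could be shared, assuming the SINTA pager keeps rendering page numbers as link text (go_to_next_page is an illustrative name, not part of this commit):

import time, random
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def go_to_next_page(browser, page):
    # Click the link labeled with the next page number; return False when
    # the wait times out, i.e. when there is no further page.
    try:
        next_button = WebDriverWait(browser, random.uniform(3, 7)).until(
            EC.element_to_be_clickable((By.LINK_TEXT, str(page + 1)))
        )
        next_button.click()
        time.sleep(random.uniform(3, 7))
        return True
    except TimeoutException:
        return False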
"Tahun", "Dana"]) +df.to_excel("data_penelitian.xlsx", index=False, engine="openpyxl") + +print(f"🎉 Scraping selesai! {len(all_research)} data disimpan di 'data_penelitian.xlsx'") + +browser.quit() \ No newline at end of file diff --git a/scraper_pengabdian.py b/scraper_pengabdian.py new file mode 100644 index 0000000..ace723e --- /dev/null +++ b/scraper_pengabdian.py @@ -0,0 +1,85 @@ +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.action_chains import ActionChains +import pandas as pd +import time, random + +chrome_driver_path = "D:\\lecturertask\\chromedriver.exe" +chrome_options = Options() + +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, seperti Gecko) Chrome/120.0.0.0 Safari/537.36" +chrome_options.add_argument(f"user-agent={user_agent}") +chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) +chrome_options.add_experimental_option("useAutomationExtension", False) +chrome_options.add_argument("--disable-gpu") +chrome_options.add_argument("--no-sandbox") +chrome_options.add_argument("--window-size=1920x1080") + +service = Service(chrome_driver_path) +browser = webdriver.Chrome(service=service, options=chrome_options) + +browser.get("https://sinta.kemdikbud.go.id/logins") +input("Login secara manual, lalu tekan ENTER untuk melanjutkan...") + +browser.get("https://sinta.kemdikbud.go.id/affiliations/profile/447/?view=services") +time.sleep(random.uniform(2, 6)) + +all_services = [] +page = 1 +action = ActionChains(browser) + +while True: + print(f"📄 Scraping halaman {page}...") + + service_items = browser.find_elements(By.CSS_SELECTOR, "div.ar-list-item.mb-5") + if not service_items: + print("❌ Tidak ada data di halaman ini, berhenti.") + break + + for item in service_items: + try: + action.move_to_element(item).perform() + time.sleep(random.uniform(0.5, 2)) + + title = item.find_element(By.CSS_SELECTOR, "div.ar-title").text.strip() + + leader_element = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a") + leader = leader_element[0].text.strip() if leader_element else "Tidak Ada Leader" + + personil_elements = item.find_elements(By.CSS_SELECTOR, "div.ar-meta a[href*='authors/profile']") + personil_list = "; ".join([p.text.strip() for p in personil_elements]) if personil_elements else "Tidak Ada Personil" + + service_type_element = item.find_elements(By.CSS_SELECTOR, "a.ar-pub") + service_type = service_type_element[0].text.strip() if service_type_element else "Tidak Diketahui" + + year_element = item.find_elements(By.CSS_SELECTOR, "a.ar-year") + year = year_element[0].text.strip().replace("📅 ", "") if year_element else "Tidak Ada Tahun" + + fund_element = item.find_elements(By.CSS_SELECTOR, "a.ar-quartile") + fund = fund_element[0].text.strip().replace("📊 ", "") if fund_element else "Tidak Ada Dana" + + all_services.append([title, leader, personil_list, service_type, year, fund]) + except Exception as e: + print(f"⚠️ Error mengambil data: {e}") + + try: + next_button = WebDriverWait(browser, random.uniform(3, 7)).until( + EC.presence_of_element_located((By.LINK_TEXT, str(page + 1))) + ) + next_button.click() + time.sleep(random.uniform(3, 7)) + page += 1 + except: + print("✅ Tidak ada halaman berikutnya, selesai scraping.") + break + 
+df = pd.DataFrame(all_services, columns=["Judul", "Leader", "Personil", "Tipe Pengabdian", "Tahun", "Dana"]) +df.to_excel("data_pengabdian.xlsx", index=False, engine="openpyxl") + +print(f"🎉 Scraping selesai! {len(all_services)} data disimpan di 'data_pengabdian.xlsx'") + +browser.quit() \ No newline at end of file
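scraper_penelitian.py and scraper_pengabdian.py differ only in the ?view= query parameter, the output filename, and one column label, so the export step could be driven from one table. A sketch under that assumption (VIEWS and export_view are illustrative names, not part of this commit):

import pandas as pd

VIEWS = {
    "researches": ("data_penelitian.xlsx", "Research Type"),
    "services": ("data_pengabdian.xlsx", "Service Type"),
}

def export_view(view, rows):
    # Write the scraped rows for one profile view using the shared column layout.
    out_file, type_label = VIEWS[view]
    columns = ["Title", "Leader", "Personnel", type_label, "Year", "Funding"]
    pd.DataFrame(rows, columns=columns).to_excel(out_file, index=False, engine="openpyxl")

The profile URL would likewise be parameterized as f"https://sinta.kemdikbud.go.id/affiliations/profile/447/?view={view}", leaving only one scraping loop to maintain.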