import os
import shutil
import time

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Library imports are above; all paths are resolved relative to this file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DOCUMENTS_PATH = os.path.join(BASE_DIR, "documents")
CHROMA_PATH = os.path.join(BASE_DIR, "chromadb")


def _build_chunk_ids(chunks) -> list:
    """Return one deterministic id per chunk, in the same order as *chunks*.

    The id is built from the chunk's ``source`` metadata (empty string when
    absent) and the chunk's running position within that source.  Because the
    same PDF always yields the same ids, writing the chunks again replaces the
    earlier entries instead of adding duplicates next to them.
    """
    next_position = {}
    ids = []
    for chunk in chunks:
        source = str(chunk.metadata.get("source", ""))
        position = next_position.get(source, 0)
        next_position[source] = position + 1
        ids.append(f"{source}#{position}")
    return ids


# The ingestion pipeline starts here.
def main() -> None:
    """Rebuild the Chroma vector store from the PDFs in ``DOCUMENTS_PATH``.

    Steps:
      1. Ensure the documents folder exists and list its PDF files
         (extension matched case-insensitively).
      2. If there are no PDFs, try to delete the existing store and stop.
      3. Load every PDF; a file that fails to load is reported and skipped.
      4. Split the pages into chunks (750 characters, 100 overlap) and embed
         them with the ``all-MiniLM-L6-v2`` model.
      5. Try to delete the old store, then write the chunks with
         deterministic ids so that a store which could not be deleted is
         overwritten rather than filled with duplicates.

    Progress and errors are reported with ``print``; nothing is returned and
    store-related exceptions are reported instead of propagated.
    """
    print("MULAI INGESTION")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(DOCUMENTS_PATH, exist_ok=True)

    # Case-insensitive match so "REPORT.PDF" is ingested too; sorted so the
    # processing order does not depend on the directory listing order.
    pdf_files = sorted(
        name for name in os.listdir(DOCUMENTS_PATH)
        if name.lower().endswith(".pdf")
    )

    if not pdf_files:
        print("[INFO] Folder dokumen kosong.")
        # No source documents left: drop the store so it does not keep
        # serving chunks of removed files.
        if os.path.exists(CHROMA_PATH):
            try:
                shutil.rmtree(CHROMA_PATH)
                print("[SUKSES] Database lama berhasil dihapus.")
            except PermissionError:
                print("[WARNING] Tidak bisa menghapus folder 'chromadb' karena sedang dipakai Flask.")
            except Exception as e:
                print(f"[ERROR] Error saat menghapus DB: {e}")
        return

    documents = []
    print(f"[INFO] Memproses {len(pdf_files)} file PDF...")

    for filename in pdf_files:
        pdf_path = os.path.join(DOCUMENTS_PATH, filename)
        try:
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            print(f" [ERROR] Gagal baca {filename}: {e}")

    if not documents:
        print("[WARNING] Dokumen terbaca kosong.")
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=100)
    chunks = text_splitter.split_documents(documents)

    if not chunks:
        # Pages were loaded but produced no text chunks; keep the existing
        # store instead of deleting it and then failing on an empty write.
        print("[WARNING] Dokumen terbaca kosong.")
        return

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    if os.path.exists(CHROMA_PATH):
        try:
            shutil.rmtree(CHROMA_PATH)
            print("Database lama berhasil dihapus.")
            # NOTE(review): presumably gives the OS a moment to release the
            # deleted files before the store is recreated - confirm.
            time.sleep(1)
        except PermissionError:
            print("[WARNING] Database sedang dikunci oleh Flask (Windows Lock).")
            print(" mencoba menimpa/menambah data (Upsert).")
        except Exception as e:
            print(f"Gagal mengakses database: {e}")

    try:
        # Deterministic ids turn the write into a real upsert when the old
        # store is still on disk.
        Chroma.from_documents(
            chunks,
            embeddings,
            ids=_build_chunk_ids(chunks),
            persist_directory=CHROMA_PATH,
        )
    except Exception as e:
        print(f"Gagal menyimpan Database: {e}")
    else:
        print(f"[SUKSES] {len(chunks)} potongan data tersimpan.")


if __name__ == "__main__":
    main()