# MIF_E31230671/rag_engine/ingest.py
#
# 73 lines
# 2.5 KiB
# Python

import os
import shutil
import time
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
# Filesystem layout, anchored at this module's own directory so the paths do
# not depend on the working directory the script is launched from.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
# Folder that is scanned for the PDF files to ingest.
DOCUMENTS_PATH = os.path.join(BASE_DIR, "documents")
# Folder passed to Chroma as its persist directory.
CHROMA_PATH = os.path.join(BASE_DIR, "chromadb")
# Library imports and path constants are defined above.
# The ingestion pipeline starts here.
def _chunk_ids(chunks):
    """Return one stable, unique ID string per chunk, in the same order.

    The ID is built from the chunk's source file name, its page number and
    its ordinal position on that page, so re-ingesting the same files yields
    the same IDs and an existing store is overwritten instead of duplicated.

    NOTE(review): assumes the loader fills the ``source`` and ``page``
    metadata keys (PyPDFLoader normally does); missing keys fall back to
    ``""`` and ``0`` and the per-page ordinal still keeps the IDs unique.
    """
    next_ordinal = {}
    ids = []
    for chunk in chunks:
        meta = chunk.metadata or {}
        source = os.path.basename(str(meta.get("source", "")))
        page = meta.get("page", 0)
        key = (source, page)
        ordinal = next_ordinal.get(key, 0)
        next_ordinal[key] = ordinal + 1
        ids.append(f"{source}:{page}:{ordinal}")
    return ids


def main():
    """Rebuild the Chroma vector store from the PDFs in ``DOCUMENTS_PATH``.

    Steps:
      1. Make sure the documents folder exists and list its PDF files
         (the extension check is case-insensitive).
      2. If there are no PDFs, try to delete the existing store and stop.
      3. Load every PDF; a file that fails to load is reported and skipped.
      4. Split the loaded pages into overlapping text chunks.
      5. Try to delete the old store, then write the chunks and their
         embeddings to ``CHROMA_PATH``.

    All outcomes are reported with ``print``; the function returns ``None``
    and does not raise for the failures it handles itself.
    """
    print("MULAI INGESTION")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(DOCUMENTS_PATH, exist_ok=True)

    # Sorted for a deterministic processing order; lower() so that files
    # named "*.PDF" are not silently ignored.
    pdf_files = sorted(
        name
        for name in os.listdir(DOCUMENTS_PATH)
        if name.lower().endswith(".pdf")
    )

    if not pdf_files:
        print("[INFO] Folder dokumen kosong.")
        # Nothing to ingest: drop the stale store so it cannot keep serving
        # content from documents that no longer exist.
        if os.path.exists(CHROMA_PATH):
            try:
                shutil.rmtree(CHROMA_PATH)
                print("[SUKSES] Database lama berhasil dihapus.")
            except PermissionError:
                print("[WARNING] Tidak bisa menghapus folder 'chromadb' karena sedang dipakai Flask.")
            except OSError as e:
                print(f"[ERROR] Error saat menghapus DB: {e}")
        return

    documents = []
    print(f"[INFO] Memproses {len(pdf_files)} file PDF...")
    for file in pdf_files:
        pdf_path = os.path.join(DOCUMENTS_PATH, file)
        try:
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            print(f" [ERROR] Gagal baca {file}: {e}")

    if not documents:
        print("[WARNING] Dokumen terbaca kosong.")
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=100)
    chunks = text_splitter.split_documents(documents)
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    if os.path.exists(CHROMA_PATH):
        try:
            shutil.rmtree(CHROMA_PATH)
            print("Database lama berhasil dihapus.")
            # Presumably gives the OS time to release the deleted files
            # before the folder is recreated -- TODO confirm it is needed.
            time.sleep(1)
        except PermissionError:
            print("[WARNING] Database sedang dikunci oleh Flask (Windows Lock).")
            print(" mencoba menimpa/menambah data (Upsert).")
        except OSError as e:
            print(f"Gagal mengakses database: {e}")

    try:
        # Deterministic IDs make the write a real upsert when the old store
        # could not be deleted; without them every run would append
        # duplicates under fresh random IDs.
        Chroma.from_documents(
            chunks,
            embeddings,
            ids=_chunk_ids(chunks),
            persist_directory=CHROMA_PATH,
        )
        print(f"[SUKSES] {len(chunks)} potongan data tersimpan.")
    except Exception as e:
        print(f"Gagal menyimpan Database: {e}")
# Run the ingestion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()