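# PDF ingestion script: loads the PDFs in ./documents, splits them into
# chunks and stores their embeddings in a local Chroma database (./chromadb).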
import os
|
|
import shutil
|
|
import time
|
|
from langchain_community.document_loaders import PyPDFLoader
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
from langchain_huggingface import HuggingFaceEmbeddings
|
|
from langchain_chroma import Chroma
|
|
|
|
# Absolute path of the directory that contains this script; the data
# locations below are resolved relative to it rather than to the current
# working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Folder that is scanned for input PDF files.
DOCUMENTS_PATH = os.path.join(BASE_DIR, "documents")
# Folder passed to Chroma as its persist directory.
CHROMA_PATH = os.path.join(BASE_DIR, "chromadb")
|
|
# End of library imports and path configuration.
|
|
|
|
# The ingestion work starts here.
|
|
def _list_pdf_files(directory):
    """Return the sorted names of the PDF entries found directly in *directory*.

    The extension check is case-insensitive so that names such as
    ``REPORT.PDF`` are picked up as well; sorting makes the processing order
    deterministic (``os.listdir`` returns entries in arbitrary order).
    """
    return sorted(
        name for name in os.listdir(directory) if name.lower().endswith(".pdf")
    )


def _drop_database_for_empty_folder():
    """Delete the Chroma directory when there are no documents left to index.

    Failures are reported on stdout and otherwise ignored (best effort).
    """
    if not os.path.exists(CHROMA_PATH):
        return
    try:
        shutil.rmtree(CHROMA_PATH)
        print("[SUKSES] Database lama berhasil dihapus.")
    except PermissionError:
        print("[WARNING] Tidak bisa menghapus folder 'chromadb' karena sedang dipakai Flask.")
    except Exception as e:
        print(f"[ERROR] Error saat menghapus DB: {e}")


def _load_pdf_documents(pdf_files):
    """Load every PDF named in *pdf_files* from DOCUMENTS_PATH.

    A file that cannot be read is reported and skipped so that one broken
    PDF does not abort the whole run. Returns the list of loaded documents.
    """
    documents = []
    for file in pdf_files:
        pdf_path = os.path.join(DOCUMENTS_PATH, file)
        try:
            documents.extend(PyPDFLoader(pdf_path).load())
        except Exception as e:
            print(f" [ERROR] Gagal baca {file}: {e}")
    return documents


def _drop_database_before_rebuild():
    """Try to delete the existing Chroma directory before it is rebuilt.

    Failures are reported on stdout only; the caller goes on to write the
    new data either way.
    """
    if not os.path.exists(CHROMA_PATH):
        return
    try:
        shutil.rmtree(CHROMA_PATH)
        print("Database lama berhasil dihapus.")
        # Presumably gives the OS a moment to finish releasing the deleted
        # directory before Chroma recreates it -- TODO confirm it is needed.
        time.sleep(1)
    except PermissionError:
        print("[WARNING] Database sedang dikunci oleh Flask (Windows Lock).")
        print(" mencoba menimpa/menambah data (Upsert).")
    except Exception as e:
        print(f"Gagal mengakses database: {e}")


def main():
    """Rebuild the Chroma vector store from the PDFs in DOCUMENTS_PATH.

    Steps:
      1. Make sure DOCUMENTS_PATH exists and list the PDF files in it.
      2. If there are none, try to delete the existing database and stop.
      3. Load the PDFs, split them into chunks of 750 characters with an
         overlap of 100, and embed them with "all-MiniLM-L6-v2".
      4. Try to delete the old database, then write the chunks to CHROMA_PATH.

    Progress and errors are reported with ``print``; the function returns
    None and does not raise for the failures it handles.
    """
    print("MULAI INGESTION")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(DOCUMENTS_PATH, exist_ok=True)

    pdf_files = _list_pdf_files(DOCUMENTS_PATH)

    if not pdf_files:
        print("[INFO] Folder dokumen kosong.")
        _drop_database_for_empty_folder()
        return

    print(f"[INFO] Memproses {len(pdf_files)} file PDF...")
    documents = _load_pdf_documents(pdf_files)

    if not documents:
        print("[WARNING] Dokumen terbaca kosong.")
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=100)
    chunks = text_splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    _drop_database_before_rebuild()

    # NOTE(review): when the old directory could not be removed, the chunks
    # are written into the existing store. No explicit ids are passed here,
    # so this may add duplicates rather than upsert -- verify against the
    # Chroma.from_documents documentation.
    try:
        Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_PATH)
        print(f"[SUKSES] {len(chunks)} potongan data tersimpan.")
    except Exception as e:
        print(f"Gagal menyimpan Database: {e}")
|
|
|
|
# Run the ingestion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()