# Source code for dllmforge.rag_preprocess_documents

"""
This module provides document preprocessing functionality for RAG (Retrieval-Augmented Generation) pipelines.
It includes document loading and text chunking for PDF files.
"""
import os
import re
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from pypdf import PdfReader


class DocumentLoader(ABC):
    """Abstract base class for document loaders."""

    @abstractmethod
    def load(self, file_path: Path) -> Tuple[List[Tuple[int, str]], str, Any]:
        """
        Load a document and return its extracted contents.

        Args:
            file_path: Path to the document file

        Returns:
            Tuple of (pages_with_text, file_name, metadata) where
            pages_with_text is a list of (page_number, text) pairs,
            file_name is the basename of the source file, and metadata
            is whatever document metadata the backend provides (may be None).

        NOTE(review): the original annotation promised only
        List[Tuple[int, str]], but the concrete PDFLoader implementation
        returns a 3-tuple; the contract is aligned here so subclasses and
        callers agree.
        """
        pass
class PDFLoader(DocumentLoader):
    """Loader for PDF documents using pypdf."""

    def load(self, file_path: Path) -> Tuple[List[Tuple[int, str]], str, Any]:
        """
        Load a PDF document and extract text from its pages.

        Args:
            file_path: Path to the PDF file

        Returns:
            Tuple of (pages_with_text, file_name, metadata) where
            pages_with_text is a list of (page_number, text) pairs
            (1-based page numbers, empty pages skipped), file_name is the
            basename of the input path, and metadata is the document
            metadata reported by pypdf (may be None).
        """
        file_name = os.path.basename(file_path)
        pages_with_text: List[Tuple[int, str]] = []
        with open(file_path, "rb") as pdf_file:
            pdf_reader = PdfReader(pdf_file)
            metadata = pdf_reader.metadata
            for page_number, page in enumerate(pdf_reader.pages, start=1):
                text = page.extract_text()
                # extract_text() may return None for pages without a text
                # layer; guard before strip() and skip pages with no text.
                if text and text.strip():
                    pages_with_text.append((page_number, text))
        return pages_with_text, file_name, metadata
class TextChunker:
    """Class for chunking text into smaller segments with overlap.

    For detailed information about chunking strategies in RAG applications,
    including:
    - Why chunking is important
    - How to choose chunk size and overlap
    - Different splitting techniques
    - Evaluation methods

    See: https://www.mongodb.com/developer/products/atlas/choosing-chunking-strategy-rag/
    """

    def __init__(self, chunk_size: int = 1000, overlap_size: int = 200):
        """
        Initialize the TextChunker.

        Args:
            chunk_size: Maximum size of each chunk in characters
            overlap_size: Number of characters to overlap between chunks
                (recommended: 5-20% of chunk_size)
        """
        self.chunk_size = chunk_size
        self.overlap_size = overlap_size

    def chunk_text(self,
                   pages_with_text: List[Tuple[int, str]],
                   file_name: Optional[str] = None,
                   metadata: Optional[dict] = None) -> List[Dict[str, Any]]:
        """
        Split text into chunks while preserving sentence boundaries.

        Args:
            pages_with_text: List of tuples containing (page_number, text) pairs
            file_name: Name of the source file (optional)
            metadata: Metadata information extracted from the document (optional)

        Returns:
            List of dictionaries containing chunks with metadata:
            {
                'text': str,          # The chunk text
                'page_number': int,   # Source page number
                'chunk_index': int,   # Index of the chunk (global across pages)
                'total_chunks': int,  # Total number of chunks from this document
                'file_name': str,     # Name of the source file (or None)
                'metadata': Any       # Document metadata passed in (or None)
            }
        """
        chunks: List[Dict[str, Any]] = []

        def _emit(chunk_text: str, page_number: int) -> None:
            # Append one chunk record; chunk_index is the running count.
            chunks.append({
                'text': chunk_text,
                'page_number': page_number,
                'chunk_index': len(chunks),
                'total_chunks': None,  # Updated after all chunks are created
                'file_name': file_name,
                'metadata': metadata if metadata else None,
            })

        for page_number, text in pages_with_text:
            # Split on sentence-ending punctuation followed by spaces, keeping
            # the punctuation attached to the preceding sentence.
            sentences = re.split(r'(?<=[.!?]) +', text)
            current_chunk = ""
            for sentence in sentences:
                if len(current_chunk) + len(sentence) <= self.chunk_size:
                    current_chunk += sentence + " "
                else:
                    if current_chunk.strip():
                        _emit(current_chunk.strip(), page_number)
                    # Start a new chunk, carrying the tail of the previous
                    # chunk forward as overlap for context continuity.
                    current_chunk = current_chunk[-self.overlap_size:].strip() + " " + sentence + " "
            # Flush any remaining text on this page as a final chunk.
            if current_chunk.strip():
                _emit(current_chunk.strip(), page_number)

        # Backfill total_chunks now that the full count is known.
        total_chunks = len(chunks)
        for chunk in chunks:
            chunk['total_chunks'] = total_chunks
        return chunks
if __name__ == "__main__":
    import sys

    # Example usage. The PDF path may be supplied on the command line;
    # otherwise fall back to the original hard-coded test file.
    default_data_dir = Path(r'c:\Users\deng_jg\work\16centralized_agents\test_data')
    pdf_path = Path(sys.argv[1]) if len(sys.argv) > 1 else default_data_dir / "lstm_low_flow.pdf"

    # Load the PDF document
    loader = PDFLoader()
    pages, file_name, metadata = loader.load(pdf_path)

    # Create chunks with custom settings. Pass metadata through so each
    # chunk carries it (the original loaded metadata but never used it).
    chunker = TextChunker(chunk_size=1000, overlap_size=200)
    chunks = chunker.chunk_text(pages, file_name, metadata)

    # Print some information about the chunks
    print(f"Generated {len(chunks)} chunks from file: {file_name}")
    for i, chunk in enumerate(chunks[:2]):  # Print first two chunks as example
        print(f"\nChunk {i+1}:")
        print(f"File: {chunk['file_name']}")
        print(f"Page: {chunk['page_number']}")
        print(f"Index: {chunk['chunk_index']} of {chunk['total_chunks']}")
        print(f"Text preview: {chunk['text'][:100]}...")