# Source code for dllmforge.rag_preprocess_documents

"""
This module provides document preprocessing functionality for RAG (Retrieval-Augmented Generation) pipelines.
It includes document loading and text chunking for PDF files.
"""
import os
from pathlib import Path
from typing import List, Tuple, Dict, Any
from abc import ABC, abstractmethod
import re

from pypdf import PdfReader



# [docs]
class DocumentLoader(ABC):
    """Abstract base class for document loaders.

    Subclasses implement :meth:`load` for one concrete file format.
    """

    @abstractmethod
    def load(self, file_path: Path) -> Tuple[List[Tuple[int, str]], str, Any]:
        """
        Load a document and return its page texts together with file information.
        Args:
            file_path: Path to the document file
        Returns:
            Tuple of (pages_with_text, file_name, metadata) where pages_with_text is a
            list of (page_number, text) pairs, file_name is the name of the source
            file and metadata is whatever document metadata the loader extracted
            (may be None).
        """
        pass





# [docs]
class PDFLoader(DocumentLoader):
    """Loader for PDF documents using pypdf."""

    def load(self, file_path: Path) -> Tuple[List[Tuple[int, str]], str, Any]:
        """
        Load a PDF document and extract text from its pages.
        Args:
            file_path: Path to the PDF file
        Returns:
            Tuple containing (pages_with_text, file_name, metadata) where
            pages_with_text is a list of (page_number, text) pairs, file_name is
            the base name of file_path and metadata is the reader's document
            metadata object. Page numbers start at 1; pages without extractable
            text are omitted, so the numbers of the remaining pages are unchanged.
        """
        file_name = os.path.basename(file_path)

        pages_with_text: List[Tuple[int, str]] = []
        with open(file_path, "rb") as pdf_file:
            pdf_reader = PdfReader(pdf_file)
            # Fetch the metadata while the underlying file is still open.
            metadata = pdf_reader.metadata
            for page_number, page in enumerate(pdf_reader.pages, start=1):
                text = page.extract_text()
                # Skip pages with no usable text (None, empty or whitespace-only).
                if text and text.strip():
                    pages_with_text.append((page_number, text))
        return pages_with_text, file_name, metadata





# [docs]
class TextChunker:
    """Class for chunking text into smaller segments with overlap.
    For detailed information about chunking strategies in RAG applications, including:
    - Why chunking is important
    - How to choose chunk size and overlap
    - Different splitting techniques
    - Evaluation methods
    See: https://www.mongodb.com/developer/products/atlas/choosing-chunking-strategy-rag/
    """

    # Sentence boundary: one or more spaces preceded by '.', '!' or '?'.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?]) +')

    def __init__(self, chunk_size: int = 1000, overlap_size: int = 200):
        """
        Initialize the TextChunker.
        Args:
            chunk_size: Maximum size of each chunk in characters
            overlap_size: Number of characters to overlap between chunks (recommended: 5-20% of chunk_size)
        Raises:
            ValueError: If chunk_size is not positive or overlap_size is negative
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap_size < 0:
            raise ValueError("overlap_size must not be negative")
        self.chunk_size = chunk_size
        self.overlap_size = overlap_size

    @staticmethod
    def _build_chunk(text: str, page_number: int, chunk_index: int, file_name: str,
                     metadata: dict) -> Dict[str, Any]:
        """Build one chunk record; 'total_chunks' is filled in once all chunks exist."""
        return {
            'text': text,
            'page_number': page_number,
            'chunk_index': chunk_index,
            'total_chunks': None,
            'file_name': file_name,
            'metadata': metadata if metadata else None
        }

    def chunk_text(self,
                   pages_with_text: List[Tuple[int, str]],
                   file_name: str = None,
                   metadata: dict = None) -> List[Dict[str, Any]]:
        """
        Split text into chunks while preserving sentence boundaries.

        Each page is chunked on its own, so a chunk never spans two pages.
        Sentences are accumulated until adding the next one would exceed
        chunk_size. A new chunk starts with the last overlap_size characters
        of the previous chunk (a character-based tail that may begin
        mid-sentence) followed by the next sentence. A single sentence longer
        than chunk_size is kept whole rather than being cut.
        Args:
            pages_with_text: List of tuples containing (page_number, text) pairs
            file_name: Name of the source file (optional)
            metadata: Metadata information extracted from the document (optional)
        Returns:
            List of dictionaries containing chunks with metadata:
            {
                'text': str,           # The chunk text
                'page_number': int,    # Source page number
                'chunk_index': int,    # Index of the chunk (running across all pages)
                'total_chunks': int,   # Total number of chunks from this document
                'file_name': str,      # Name of the source file
                'metadata': dict       # Document metadata, or None when empty/not given
            }
        """
        chunks: List[Dict[str, Any]] = []

        for page_number, text in pages_with_text:
            if not text:
                # Nothing to split on this page.
                continue

            current_chunk = ""
            for sentence in self._SENTENCE_BOUNDARY.split(text):
                if len(current_chunk) + len(sentence) <= self.chunk_size:
                    current_chunk += sentence + " "
                    continue

                # The sentence does not fit: flush what has been collected so far.
                finished = current_chunk.strip()
                if finished:
                    chunks.append(self._build_chunk(finished, page_number, len(chunks), file_name, metadata))

                # Start a new chunk with overlap. A plain [-0:] slice would return
                # the whole string, so a zero overlap must be handled explicitly.
                overlap = current_chunk[-self.overlap_size:] if self.overlap_size > 0 else ""
                current_chunk = overlap.strip() + " " + sentence + " "

            # Add any remaining text as a chunk
            remainder = current_chunk.strip()
            if remainder:
                chunks.append(self._build_chunk(remainder, page_number, len(chunks), file_name, metadata))

        # Update total_chunks in all chunks
        total_chunks = len(chunks)
        for chunk in chunks:
            chunk['total_chunks'] = total_chunks

        return chunks




if __name__ == "__main__":
    # Example usage: load a PDF, chunk it and print a short preview.
    import sys

    # Default sample document; pass another PDF path as the first command-line
    # argument to run the example on a different file.
    data_dir = Path(r'c:\Users\deng_jg\work\16centralized_agents\test_data')
    pdf_path = Path(sys.argv[1]) if len(sys.argv) > 1 else data_dir / "lstm_low_flow.pdf"

    # Load the PDF document
    loader = PDFLoader()
    pages, file_name, metadata = loader.load(pdf_path)

    # Create chunks with custom settings; forward the metadata so it is
    # attached to every chunk instead of being silently dropped.
    chunker = TextChunker(chunk_size=1000, overlap_size=200)
    chunks = chunker.chunk_text(pages, file_name, metadata)

    # Print some information about the chunks
    print(f"Generated {len(chunks)} chunks from file: {file_name}")
    for i, chunk in enumerate(chunks[:2], start=1):  # Print first two chunks as example
        print(f"\nChunk {i}:")
        print(f"File: {chunk['file_name']}")
        print(f"Page: {chunk['page_number']}")
        print(f"Index: {chunk['chunk_index']} of {chunk['total_chunks']}")
        print(f"Text preview: {chunk['text'][:100]}...")
# Source code for dllmforge.rag_preprocess_documents
#
# dllmforge
#
# Navigation
#
# Related Topics