Source code for dllmforge.rag_embedding_open_source

"""
This module provides embedding functionality for RAG (Retrieval-Augmented Generation) pipelines.
It can be used to 1) vectorize document chunks, and 2) vectorize user queries.
The module uses Hugging Face embedding models through LangChain's ``HuggingFaceEmbeddings`` wrapper as an example
of using open-source embedding models. Note you need the optional ``langchain_huggingface`` package installed to use this module.
"""

from typing import List, Any, Union, Dict

try:
    from langchain_huggingface import HuggingFaceEmbeddings
    HUGGINGFACE_AVAILABLE = True
except ImportError:
    HUGGINGFACE_AVAILABLE = False
    HuggingFaceEmbeddings = None



[docs]
class LangchainHFEmbeddingModel:
    """Class for embedding queries and document chunks using LangChain's HuggingFaceEmbeddings."""


[docs]
    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """
        Initialize the HuggingFaceEmbeddings from LangChain.

        Args:
            model_name: Name or path of the Hugging Face model (default: "sentence-transformers/all-MiniLM-L6-v2").

        Raises:
            ImportError: If the optional ``langchain_huggingface`` package could not be imported.
        """
        # The module-level import guard sets HuggingFaceEmbeddings to None when the
        # package is missing; fail here with an actionable message instead of an
        # opaque "'NoneType' object is not callable" TypeError.
        if HuggingFaceEmbeddings is None:
            raise ImportError(
                "langchain_huggingface is required to use LangchainHFEmbeddingModel. "
                "Install it with `pip install langchain-huggingface`."
            )

        # kwargs forwarded to the encoder; adjust as needed
        encode_kwargs = {"normalize_embeddings": False}
        self.embeddings = HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)



[docs]
    @staticmethod
    def validate_embedding(embedding: List[float]) -> bool:
        """Validate that the embedding vector is non-empty and numeric."""
        if not embedding:
            return False
        if not all(isinstance(x, (int, float)) for x in embedding):
            return False
        return True



[docs]
    def embed(self, query_or_chunks: Union[str, List[Dict[str, Any]]]) -> Union[List[float], List[Dict[str, Any]]]:
        """
        Embed a single query string or a list of document chunks.

        Args:
            query_or_chunks: A string (query) or list of dicts with keys:
                            "text", "file_name", "page_number".

        Returns:
            - For a string query: list of floats (vector embedding).
            - For document chunks: list of dicts with keys:
              "chunk_id", "chunk", "page_number", "file_name", "text_vector".
        """
        # Single query
        if isinstance(query_or_chunks, str):
            vectorized_query = self.embeddings.embed_query(query_or_chunks)
            if not self.validate_embedding(vectorized_query):
                raise ValueError("Invalid embedding generated for query.")
            return vectorized_query

        # List of chunks
        if isinstance(query_or_chunks, list) and all(isinstance(c, dict) for c in query_or_chunks):
            texts = [chunk["text"] for chunk in query_or_chunks]
            embeddings = self.embeddings.embed_documents(texts)
            vectorized_chunks = []
            for chunk, emb in zip(query_or_chunks, embeddings):
                if not self.validate_embedding(emb):
                    raise ValueError(f"Invalid embedding for chunk: {chunk}")
                vectorized_chunks.append({
                    "chunk_id": f"{chunk['file_name']}_i{chunk['chunk_index']}",
                    "chunk": chunk["text"],
                    "page_number": chunk["page_number"],
                    "file_name": chunk["file_name"],
                    "text_vector": emb
                })
            return vectorized_chunks

        raise ValueError("Input must be a string or a list of dictionaries.")
Source code for dllmforge.rag_embedding_open_source

dllmforge

Navigation

Related Topics