# rixdagen/_chromadb/chroma_client.py

import chromadb
import os
import sys
# Import-time side effects: switch the process working directory to
# /home/lasse/riksdagen and append the same directory to sys.path.
# Presumably this lets the project-local imports further down (`bootstrap`,
# `config`) resolve regardless of where the interpreter was started.
# NOTE(review): the path is absolute and machine-specific, and the `bootstrap`
# import below is commented as also setting sys.path and the working
# directory — confirm whether both mechanisms are needed.
os.chdir("/home/lasse/riksdagen")
sys.path.append("/home/lasse/riksdagen")

from chromadb import Collection
from chromadb.api import ClientAPI

import bootstrap  # Ensure sys.path and working directory are set
from config import chromadb_path, embedding_model
import re
from typing import Dict, List, Any, Tuple, Optional
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction

class ChromaClient:
    def __init__(
        self,
        path: str | None = chromadb_path,
        embedding_url: str = '192.168.1.10:33405',
    ):
        """
        Open the persistent Chroma store and build the embedding function.

        Args:
            path (str | None): Storage location handed to ``chromadb.PersistentClient``.
                ``None`` falls back to ``chromadb_path`` from ``config``.
            embedding_url (str): Address passed verbatim as ``url`` to
                ``OllamaEmbeddingFunction``. Defaults to the address that used to be
                hard-coded here, so existing callers are unaffected.
        """
        # An explicit None must not reach PersistentClient; use the configured default instead.
        self.path: str = chromadb_path if path is None else path
        self._client: ClientAPI = self._init_client()
        self.embedding_function = OllamaEmbeddingFunction(
            model_name=embedding_model,
            url=embedding_url,
        )

    def _init_client(self) -> ClientAPI:
        """
        Build the persistent Chroma client for ``self.path``.

        Returns:
            ClientAPI: The client object stored on ``self._client`` by ``__init__``.
        """
        return chromadb.PersistentClient(path=self.path)

    def get_collection(self, name: str) -> Collection:
        available_collections = {col.name for col in self._client.list_collections()}
        if name not in available_collections:
            self.create_collection(name=name)
        return self._client.get_collection(name=name)

    def create_collection(self, name: str) -> Collection:
        return self._client.get_or_create_collection(name=name, embedding_function=self.embedding_function)

    def parse_search_query(self, query: str) -> Tuple[Optional[Dict], Optional[Dict]]:
        """
        Parse a Google-like search query into ChromaDB metadata and document filters.

        Supports syntax like:
        - Field searches: author:Smith, year:2020, category:politics
        - Comparisons: year:>2020, year:<=2024, year:>=2020
        - Ranges: year:2020..2024 (equivalent to year:>=2020 AND year:<=2024)
        - Document content: document_contains:"climate change", document_regex:"\\d{4}"
        - Logical operators: AND, OR, NOT
        - Grouping: (author:Smith OR author:Johnson) AND year:>2020
        - Quoted phrases: author:"John Smith" (handles spaces in values)

        Args:
            query (str): The search query string in Google-like syntax

        Returns:
            Tuple[Optional[Dict], Optional[Dict]]: A tuple containing:
                - metadata_filter: ChromaDB metadata filter dict (or None if no metadata filters)
                - document_filter: ChromaDB document filter dict (or None if no document filters)

        Examples:
            >>> client.parse_search_query("author:Smith AND year:>2020")
            ({"$and": [{"author": "Smith"}, {"year": {"$gt": 2020}}]}, None)

            >>> client.parse_search_query("year:2020..2024 AND document_contains:'climate'")
            ({"$and": [{"year": {"$gte": 2020}}, {"year": {"$lte": 2024}}]}, {"$contains": "climate"})
        """
        if not query or not query.strip():
            return None, None

        # Normalize the query - convert to uppercase for operators, but preserve field values
        query = query.strip()

        # Split the query into tokens, preserving quoted strings and operators
        # This regex matches: quoted strings, field:value pairs, operators, parentheses
        tokens = re.findall(r'(?:"[^"]*"|\'[^\']*\'|\([^)]*\)|[^\s()]+)', query)

        metadata_conditions = []
        document_conditions = []

        # Process tokens and convert to conditions
        i = 0
        while i < len(tokens):
            token = tokens[i].strip()

            # Skip logical operators and parentheses - they'll be handled in a more advanced parser
            if token.upper() in ['AND', 'OR', 'NOT', '(', ')']:
                i += 1
                continue

            # Look for field:value patterns
            if ':' in token:
                field, value = token.split(':', 1)
                field = field.lower().strip()
                value = value.strip().strip('"\'')  # Remove quotes if present

                # Handle document content searches
                if field in ['document_contains', 'doc_contains', 'contains']:
                    document_conditions.append({"$contains": value})
                elif field in ['document_regex', 'doc_regex', 'regex']:
                    document_conditions.append({"$regex": value})
                else:
                    # Handle metadata field searches
                    condition = self._parse_field_condition(field, value)
                    if condition:
                        metadata_conditions.append(condition)

            i += 1

        # Build final filters
        metadata_filter = None
        if len(metadata_conditions) == 1:
            metadata_filter = metadata_conditions[0]
        elif len(metadata_conditions) > 1:
            # For now, combine all metadata conditions with AND
            # A more advanced parser could handle OR/NOT operators
            metadata_filter = {"$and": metadata_conditions}

        document_filter = None
        if len(document_conditions) == 1:
            document_filter = document_conditions[0]
        elif len(document_conditions) > 1:
            # Combine document conditions with AND
            document_filter = {"$and": document_conditions}

        return metadata_filter, document_filter

    def _parse_field_condition(self, field: str, value: str) -> Optional[Dict]:
        """
        Parse a single field:value condition into a ChromaDB filter condition.

        Handles various syntaxes:
        - Simple equality: field:value -> {"field": "value"}
        - Comparisons: field:>10 -> {"field": {"$gt": 10}}
        - Ranges: field:2020..2024 -> expands to two conditions for >=2020 AND <=2024

        Args:
            field (str): The field name (e.g., "year", "author", "category")
            value (str): The field value, possibly with operators or range syntax

        Returns:
            Optional[Dict]: ChromaDB filter condition dict, or None if parsing fails
        """
        # Handle range syntax: field:start..end
        if '..' in value:
            try:
                start_str, end_str = value.split('..', 1)
                start_val = self._convert_value(start_str.strip())
                end_val = self._convert_value(end_str.strip())

                # Return as two separate conditions - caller should handle combining them
                # For now, we'll return the >= condition and let the caller handle the <= part
                # This is a limitation of this simple parser
                return {"field": field, "$gte": start_val, "$lte": end_val}
            except (ValueError, TypeError):
                # If range parsing fails, treat as literal string
                return {field: value}

        # Handle comparison operators: >, <, >=, <=, !=
        comparison_match = re.match(r'^(>=|<=|>|<|!=)(.+)$', value)
        if comparison_match:
            operator, op_value = comparison_match.groups()
            op_value = op_value.strip()

            # Convert operator to ChromaDB syntax
            operator_map = {
                '>': '$gt',
                '<': '$lt',
                '>=': '$gte',
                '<=': '$lte',
                '!=': '$ne'
            }

            chroma_op = operator_map.get(operator)
            if chroma_op:
                converted_value = self._convert_value(op_value)
                return {field: {chroma_op: converted_value}}

        # Simple equality condition
        converted_value = self._convert_value(value)
        return {field: converted_value}

    def _convert_value(self, value: str) -> Any:
        """
        Convert a string value to the appropriate Python type.

        Tries to convert to int, then float, then keeps as string.
        Handles quoted strings by removing quotes.

        Args:
            value (str): The string value to convert

        Returns:
            Any: The converted value (int, float, or str)
        """
        if not value:
            return value

        # Remove surrounding quotes if present
        if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
            return value[1:-1]

        # Try to convert to number
        try:
            if '.' in value:
                return float(value)
            else:
                return int(value)
        except ValueError:
            return value

    def query_collection(
        self,
        collection: Collection,
        query_texts: list[str],
        n_results: int = 5,
        query_embeddings: list[list[float]] | None = None,
        where: dict | None = None,
        where_document: dict | None = None,
    ) -> list[dict]:
        """
        Query a ChromaDB collection with text queries and return formatted results.
        Use this to search for similar texts based on text or pre-computed embeddings.

        Args:
            collection (Collection): The ChromaDB collection to query against.
            query_texts (list[str]): List of text strings to search for in the collection.
            n_results (int, optional): Maximum number of results to return per query. Defaults to 5.
            query_embeddings (list[list[float]] | None, optional): Pre-computed embeddings
                for the queries. If None, embeddings will be computed from query_texts.
                Defaults to None.
            where (dict | None, optional): Metadata filter to apply to the search results.
                Examples:
                - Simple filter: {"author": "John Doe"}
                - Comparison: {"page": {"$gt": 10}}
                - Logical AND: {"$and": [{"author": "John"}, {"year": {"$gte": 2020}}]}
                - Logical OR: {"$or": [{"category": "news"}, {"category": "politics"}]}
                - Inclusion: {"status": {"$in": ["published", "draft"]}}
                If None, no filtering is applied. Defaults to None.
            where_document (dict | None, optional): Full-text search filter for document content.
                Examples:
                - Contains text: {"$contains": "climate change"}
                - Regex pattern: {"$regex": r"\\b\\d{4}\\b"}  # matches 4-digit years
                If None, no document filtering is applied. Defaults to None.

        Returns:
            list[dict]: A flattened list of unique dictionaries containing query results
                from all queries, where each dictionary has keys 'metadata', 'document',
                'distance', and 'id'. Duplicates (same id) are removed, keeping the
                result with the best (lowest) distance score.
        """

        assert query_texts or query_embeddings, "Either query_texts or query_embeddings must be provided."
        # Build query parameters - only include optional parameters if they're provided

        query_params = {
            'n_results': n_results,
        }

        if query_texts:
            query_params['query_texts'] = query_texts
        elif query_embeddings:
            query_params['query_embeddings'] = query_embeddings
        if where is not None:
            query_params['where'] = where

        if where_document is not None:
            query_params['where_document'] = where_document

        # Execute the query with the constructed parameters
        results = collection.query(**query_params)

        # Dictionary to store unique results by id, keeping the one with best distance
        unique_results = {}

        # Process results from all queries
        metadatas = results.get("metadatas", [])
        documents = results.get("documents", [])
        distances = results.get("distances", [])
        ids = results.get("ids", [])

        # Iterate through each query's results
        for query_idx in range(len(metadatas)):
            query_metadatas = metadatas[query_idx] if query_idx < len(metadatas) else []
            query_documents = documents[query_idx] if query_idx < len(documents) else []
            query_distances = distances[query_idx] if query_idx < len(distances) else []
            query_ids = ids[query_idx] if query_idx < len(ids) else []

            # Process each result in this query
            for metadata, document, distance, identifier in zip(
                query_metadatas, query_documents, query_distances, query_ids
            ):
                # Keep the result with the best (lowest) distance if we have duplicates
                if identifier not in unique_results or distance < unique_results[identifier]['distance']:
                    unique_results[identifier] = {
                        'metadata': metadata,
                        'document': document,
                        'distance': distance,
                        'id': identifier
                    }

        # Convert to list and sort by distance (best results first)
        results_list = list(unique_results.values())
        results_list.sort(key=lambda x: x['distance'])

        return results_list


# Shared module-level client. It is created as soon as this module is imported,
# using the default arguments of ChromaClient.__init__, so importing the module
# already builds the persistent client and the embedding function.
chroma_db = ChromaClient()
# --- Manual smoke test (only runs when this file is executed as a script) ---


if __name__ == "__main__":
    # Manual smoke test: run one query through the wrapper and once directly
    # against the collection, and print both so they can be compared by eye.
    collection_name = os.getenv("CHROMA_TALK_COLLECTION")
    if not collection_name:
        # Without this check None would be passed on as the collection name.
        raise SystemExit("CHROMA_TALK_COLLECTION is not set; cannot run the smoke test.")

    collection: Collection = chroma_db.get_collection(collection_name)
    print(collection.count())

    query = 'betyg grundskola'

    # 1) Through the wrapper: de-duplicated and sorted by distance.
    results = chroma_db.query_collection(
        collection=collection,
        query_texts=[query],
        n_results=3,
    )
    for res in results:
        print(res['document'])
    print('---')

    # 2) Straight against the collection: a sample of stored items, then the raw query rows.
    print(collection.get(limit=10))
    raw_results = collection.query(query_texts=[query], n_results=3)
    for row in zip(
        raw_results['metadatas'][0],
        raw_results['documents'][0],
        raw_results['distances'][0],
        raw_results['ids'][0],
    ):
        print(row)