"""Persistent ChromaDB client wrapper with a small Google-like query parser."""

import os
import re
import sys
from typing import Any, Dict, List, Optional, Tuple

import chromadb
from chromadb import Collection
from chromadb.api import ClientAPI
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction

# Set /home/lasse/riksdagen as working directory
os.chdir("/home/lasse/riksdagen")
sys.path.append("/home/lasse/riksdagen")

import bootstrap  # noqa: E402,F401  # Ensure sys.path and working directory are set
from config import chromadb_path, embedding_model  # noqa: E402

# Default address of the Ollama server used to embed documents and queries.
DEFAULT_EMBEDDING_URL = '192.168.1.10:33405'

# Tokenizer for the search syntax. Alternatives are ordered so that a quoted
# value stays attached to its field and parentheses become their own tokens.
_TOKEN_RE = re.compile(
    r"""
      [^\s():"']+:"[^"]*"     # field:"double quoted value"
    | [^\s():"']+:'[^']*'     # field:'single quoted value'
    | "[^"]*"                 # bare double-quoted phrase
    | '[^']*'                 # bare single-quoted phrase
    | [()]                    # grouping parentheses, one token each
    | [^\s()]+                # anything else up to whitespace or a parenthesis
    """,
    re.VERBOSE,
)
_COMPARISON_RE = re.compile(r'^(>=|<=|>|<|!=)(.+)$')
_COMPARISON_OPERATORS = {
    '>': '$gt',
    '<': '$lt',
    '>=': '$gte',
    '<=': '$lte',
    '!=': '$ne',
}
_LOGICAL_TOKENS = frozenset({'AND', 'OR', 'NOT', '(', ')'})
_CONTAINS_FIELDS = frozenset({'document_contains', 'doc_contains', 'contains'})
_REGEX_FIELDS = frozenset({'document_regex', 'doc_regex', 'regex'})


def _flatten_and(condition: dict) -> list:
    """Return the members of a pure ``{"$and": [...]}`` filter, else ``[condition]``."""
    members = condition.get("$and")
    if len(condition) == 1 and isinstance(members, list):
        return list(members)
    return [condition]


def _combine_with_and(conditions: list) -> Optional[Dict]:
    """Collapse a list of filter dicts into one filter (``None`` when empty)."""
    if not conditions:
        return None
    if len(conditions) == 1:
        return conditions[0]
    return {"$and": conditions}


class ChromaClient:
    """Wrapper around a persistent ChromaDB client using Ollama embeddings."""

    def __init__(
        self,
        path: str | None = chromadb_path,
        embedding_url: str = DEFAULT_EMBEDDING_URL,
    ):
        """
        Open the persistent database and prepare the embedding function.

        Args:
            path (str | None): Directory of the persistent ChromaDB store.
            embedding_url (str): Address of the Ollama server that computes
                embeddings. Defaults to the previously hard-coded address.
        """
        self.path: str | None = path
        self._client: ClientAPI = self._init_client()
        self.embedding_function = OllamaEmbeddingFunction(
            model_name=embedding_model, url=embedding_url
        )

    def _init_client(self) -> ClientAPI:
        """Create the persistent ChromaDB client for ``self.path``."""
        return chromadb.PersistentClient(path=self.path)

    def get_collection(self, name: str) -> Collection:
        """
        Return the collection called ``name``, creating it if it does not exist.

        The embedding function is always passed along, so text queries on the
        returned handle are embedded the same way for new and existing
        collections.
        NOTE(review): the previous implementation fetched existing collections
        without an embedding function; confirm against the installed ChromaDB
        version that passing it for pre-existing collections is accepted.

        Args:
            name (str): Collection name.

        Returns:
            Collection: The existing or newly created collection.
        """
        return self.create_collection(name=name)

    def create_collection(self, name: str) -> Collection:
        """Get or create the collection ``name`` bound to the Ollama embedding function."""
        return self._client.get_or_create_collection(
            name=name, embedding_function=self.embedding_function
        )

    def parse_search_query(self, query: str) -> Tuple[Optional[Dict], Optional[Dict]]:
        """
        Parse a Google-like search query into ChromaDB metadata and document filters.

        Supports syntax like:
        - Field searches: author:Smith, year:2020, category:politics
        - Comparisons: year:>2020, year:<=2024, year:>=2020, year:!=2020
        - Ranges: year:2020..2024 (equivalent to year:>=2020 AND year:<=2024)
        - Document content: document_contains:"climate change", document_regex:"\\d{4}"
        - Quoted phrases: author:"John Smith" (handles spaces in values)

        Limitations of this simple parser:
        - AND, OR, NOT and parentheses are recognised as tokens but ignored;
          every condition is combined with AND.
        - Field names are lower-cased.
        - Quotes around a value are removed before type conversion, so
          year:"2020" yields the integer 2020.
        - Quoted phrases without a ``field:`` prefix are ignored.

        Args:
            query (str): The search query string in Google-like syntax

        Returns:
            Tuple[Optional[Dict], Optional[Dict]]: A tuple containing:
                - metadata_filter: ChromaDB metadata filter dict (or None if no metadata filters)
                - document_filter: ChromaDB document filter dict (or None if no document filters)

        Examples:
            >>> client.parse_search_query("author:Smith AND year:>2020")
            ({"$and": [{"author": "Smith"}, {"year": {"$gt": 2020}}]}, None)

            >>> client.parse_search_query("year:2020..2024 AND document_contains:'climate'")
            ({"$and": [{"year": {"$gte": 2020}}, {"year": {"$lte": 2024}}]}, {"$contains": "climate"})
        """
        if not query or not query.strip():
            return None, None

        metadata_conditions: list = []
        document_conditions: list = []

        for token in _TOKEN_RE.findall(query):
            # Logical operators and parentheses would need a real expression parser.
            if token.upper() in _LOGICAL_TOKENS:
                continue
            # A bare quoted phrase has no field, even if it contains a colon.
            if token[0] in "\"'" or ':' not in token:
                continue

            field, _, value = token.partition(':')
            field = field.lower().strip()
            value = value.strip().strip('"\'')  # Remove quotes if present
            if not field:
                continue

            if field in _CONTAINS_FIELDS:
                if value:
                    document_conditions.append({"$contains": value})
            elif field in _REGEX_FIELDS:
                if value:
                    document_conditions.append({"$regex": value})
            else:
                condition = self._parse_field_condition(field, value)
                if condition:
                    # A range comes back as its own $and; merge its members
                    # into the top-level AND instead of nesting.
                    metadata_conditions.extend(_flatten_and(condition))

        return (
            _combine_with_and(metadata_conditions),
            _combine_with_and(document_conditions),
        )

    def _parse_field_condition(self, field: str, value: str) -> Optional[Dict]:
        """
        Parse a single field:value condition into a ChromaDB filter condition.

        Handles various syntaxes:
        - Simple equality: field:value -> {"field": value}
        - Comparisons: field:>10 -> {"field": {"$gt": 10}}
        - Ranges: field:2020..2024 ->
          {"$and": [{"field": {"$gte": 2020}}, {"field": {"$lte": 2024}}]}

        A value containing ".." whose two sides are not both numbers is
        treated as a literal string match.

        Args:
            field (str): The field name (e.g., "year", "author", "category")
            value (str): The field value, possibly with operators or range syntax

        Returns:
            Optional[Dict]: ChromaDB filter condition dict
        """
        # Handle range syntax: field:start..end
        if '..' in value:
            start_str, end_str = value.split('..', 1)
            start_val = self._convert_value(start_str.strip())
            end_val = self._convert_value(end_str.strip())
            if isinstance(start_val, (int, float)) and isinstance(end_val, (int, float)):
                return {
                    "$and": [
                        {field: {"$gte": start_val}},
                        {field: {"$lte": end_val}},
                    ]
                }
            # Not a numeric range: treat the whole value as a literal string.
            return {field: value}

        # Handle comparison operators: >, <, >=, <=, !=
        comparison_match = _COMPARISON_RE.match(value)
        if comparison_match:
            operator, op_value = comparison_match.groups()
            chroma_op = _COMPARISON_OPERATORS[operator]
            return {field: {chroma_op: self._convert_value(op_value.strip())}}

        # Simple equality condition
        return {field: self._convert_value(value)}

    def _convert_value(self, value: str) -> Any:
        """
        Convert a string value to the appropriate Python type.

        A value wrapped in matching quotes is returned as a string without the
        quotes. Otherwise a value containing "." is tried as float and any
        other value as int; if that fails the string is returned unchanged.

        Args:
            value (str): The string value to convert

        Returns:
            Any: The converted value (int, float, or str)
        """
        if not value:
            return value

        # Remove surrounding quotes if present; quoted values stay strings.
        if (value.startswith('"') and value.endswith('"')) or (
            value.startswith("'") and value.endswith("'")
        ):
            return value[1:-1]

        # Try to convert to number
        try:
            if '.' in value:
                return float(value)
            return int(value)
        except ValueError:
            return value

    def query_collection(
        self,
        collection: Collection,
        query_texts: list[str] | str | None,
        n_results: int = 5,
        query_embeddings: list[list[float]] | None = None,
        where: dict | None = None,
        where_document: dict | None = None,
    ) -> list[dict]:
        """
        Query a ChromaDB collection and return de-duplicated, sorted results.

        Args:
            collection (Collection): The ChromaDB collection to query against.
            query_texts (list[str] | str | None): Text(s) to search for. A single
                string is treated as a one-element list. When non-empty it takes
                precedence over ``query_embeddings``.
            n_results (int, optional): Maximum number of results to return per query.
                Defaults to 5.
            query_embeddings (list[list[float]] | None, optional): Pre-computed
                embeddings, used only when ``query_texts`` is empty. Defaults to None.
            where (dict | None, optional): Metadata filter to apply to the search results.
                Examples:
                - Simple filter: {"author": "John Doe"}
                - Comparison: {"page": {"$gt": 10}}
                - Logical AND: {"$and": [{"author": "John"}, {"year": {"$gte": 2020}}]}
                - Logical OR: {"$or": [{"category": "news"}, {"category": "politics"}]}
                - Inclusion: {"status": {"$in": ["published", "draft"]}}
                If None, no filtering is applied. Defaults to None.
            where_document (dict | None, optional): Full-text search filter for document content.
                Examples:
                - Contains text: {"$contains": "climate change"}
                - Regex pattern: {"$regex": r"\\b\\d{4}\\b"}  # matches 4-digit years
                If None, no document filtering is applied. Defaults to None.

        Returns:
            list[dict]: A flattened list of unique dictionaries containing query results
                from all queries, where each dictionary has keys 'metadata', 'document',
                'distance', and 'id'. Duplicates (same id) are removed, keeping the result
                with the best (lowest) distance score. Sorted by ascending distance.

        Raises:
            ValueError: If neither ``query_texts`` nor ``query_embeddings`` is provided.
        """
        if not query_texts and not query_embeddings:
            raise ValueError("Either query_texts or query_embeddings must be provided.")

        if isinstance(query_texts, str):
            query_texts = [query_texts]

        # Build query parameters - only include optional parameters if they're provided
        query_params: dict = {'n_results': n_results}
        if query_texts:
            query_params['query_texts'] = query_texts
        else:
            query_params['query_embeddings'] = query_embeddings
        if where is not None:
            query_params['where'] = where
        if where_document is not None:
            query_params['where_document'] = where_document

        results = collection.query(**query_params)

        # A field may be absent or None; treat both as "no rows".
        ids = results.get("ids") or []
        metadatas = results.get("metadatas") or []
        documents = results.get("documents") or []
        distances = results.get("distances") or []

        # Unique results by id, keeping the one with the best (lowest) distance.
        best_by_id: dict = {}
        for query_ids, query_metadatas, query_documents, query_distances in zip(
            ids, metadatas, documents, distances
        ):
            for identifier, metadata, document, distance in zip(
                query_ids, query_metadatas, query_documents, query_distances
            ):
                current = best_by_id.get(identifier)
                if current is None or distance < current['distance']:
                    best_by_id[identifier] = {
                        'metadata': metadata,
                        'document': document,
                        'distance': distance,
                        'id': identifier,
                    }

        # Best results first.
        return sorted(best_by_id.values(), key=lambda item: item['distance'])


chroma_db = ChromaClient()


def _run_demo() -> None:
    """Manual smoke test against the collection named by CHROMA_TALK_COLLECTION."""
    collection_name = os.getenv("CHROMA_TALK_COLLECTION")
    if not collection_name:
        raise SystemExit("CHROMA_TALK_COLLECTION is not set")

    collection = chroma_db.get_collection(collection_name)
    print(collection.count())

    query = 'betyg grundskola'
    results = chroma_db.query_collection(
        collection=collection,
        query_texts=[query],
        n_results=3,
    )
    for res in results:
        print(res['document'])
        print('---')

    col: Collection = chroma_db.get_collection(collection_name)
    print(col.get(limit=10))
    raw_results = col.query(query_texts=[query], n_results=3)
    for row in zip(
        raw_results['metadatas'][0],
        raw_results['documents'][0],
        raw_results['distances'][0],
        raw_results['ids'][0],
    ):
        print(row)


# --- Tests ---
if __name__ == "__main__":
    _run_demo()