rixdagen/backend/services/llm_tools.py

from contextvars import ContextVar, Token
from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
from colorprinter import *
import os
from _llm import register_tool, get_tools
from _chromadb.chroma_client import chroma_db
from arango_client import arango
from arango.exceptions import AQLQueryExecuteError
from backend.services.search import (
    SearchService,
)  # Import SearchService for use in the tool
from utils import detect_sql_syntax
from _llm import LLM
from pydantic import BaseModel, Field

# * When to use an AQL tool vs vector/semantic search*
# - Use AQL for exact predicates, structured filters, joins, grouping, aggregations,
#   date-range queries, or ArangoSearch indexed text search.
#   Examples:
#     • Exact matches (by id, date, party, speaker).
#     • Aggregations (counts, sums, min/max) and grouping (COLLECT).
#     • Joins across collections with nested FOR.
#     • Range queries, pagination, sorted results and server-side window functions.
# - Prefer vector/semantic search when you need fuzzy or semantic similarity
#   (e.g., "find speeches similar in meaning to this paragraph"). Vector search is
#   complementary to AQL, not a replacement for structured queries.


@register_tool()
def search_documents(query: str):
    """
    With this tool, you can search in **natural language** among all speeches in the Riksdagen database.
    Use it when you want to get eg. **statistics or aggregations** – things you can’t do with vector search or arango_search.
    You will likely use this tool when a user has asked questions like:
    - "How many ...?"
    - "What is the most common ...?"
    - "Who spoke the most about ...?"
    - "What parties mentioned ... the most?"
    You might also use this **as a fallback if arango_search or vector_search_talks do not return relevant results**. It can do complex searches and aggregations that the other tools cannot.

    It's important that you **formulate your query as clearly as possible**.
    Please specify things like:
    - What you want to count or aggregate (e.g., count of speeches, sum of occurrences).
    - Any filters or conditions (e.g., specific words, parties, speakers, date ranges).
    - The desired output format (e.g., list of counts, how many results).

    Args:
        query: Your formulated query in natural language. Try to be as specific as possible.

    Returns:
        The result of the query, typically a list of dictionaries with keys and values depending on the query.
    """

    class AQLResponseModel(BaseModel):
        query: str = Field(..., description="The generated AQL query string.")
        output_explanation: str = Field(
            ..., description="A *very* short explanation of the expected output format, and how to interpret the results.",
            examples=["Using COUNT INTO c, the result is a single integer count of matching documents.",]
        )

    tools = get_tools(specific_tools=["aql_query"])
    aql_query_description = tools[0]['function']['description']

    system_message = f"""You are an expert in converting natural language queries into AQL (ArangoDB Query Language) queries for the Riksdagen database.
The user will provide a query in natural language, and you must translate it into a valid AQL query that can be executed against the database.

You are an expert at writing AQL (ArangoDB Query Language) queries for *reading* (retrieving) data. Follow these rules:

- Every query must use the `RETURN` keyword to output results.
- Only write one query per string (no semicolons, no multiple queries).
- Use these keywords to build queries: FOR, IN, RETURN, LET, FILTER, SORT, LIMIT, COLLECT, SEARCH.
- Prefer using the ArangoSearch view `talks_search` and the `SEARCH` keyword for full-text or relevance-based queries.
- Use `FILTER` for structured filters (like year, party, etc).
- Use `COLLECT` for grouping and aggregations.
- Use `SORT` and `LIMIT` for ordering and slicing results.
- Never use SQL syntax or keywords like SELECT, JOIN, GROUP BY, ORDER BY, SUM.
- Collection, attribute, and variable names are case-sensitive. If a name is a reserved keyword or contains special characters, wrap it in backticks: \`name\`.
- Always use fully qualified attribute names (e.g., `doc.year`).
- Always end with a RETURN statement, returning only the fields needed.

**Examples:**

Count speeches mentioning a word:
FOR doc IN talks_search
  SEARCH ANALYZER(doc.anforandetext IN TOKENS("miljö", "text_sv"), "text_sv")
  COLLECT WITH COUNT INTO c
  RETURN c

List speeches from a year:
FOR doc IN talks_search
  FILTER doc.year == 2022
  RETURN doc

Aggregation by party:
FOR d IN talks
  FILTER d.year == 2022
  COLLECT parti = d.parti WITH COUNT INTO cnt
  RETURN {{ parti, count: cnt }}

Join with people collection:
FOR d IN talks
  FOR p IN people
    FILTER p._key == d.intressent_id
    RETURN {{ _id: d._id, talare: d.talare, born: p.fodd_ar }}

The AQL query you produce will be usd in a tool with this description:

'''
{aql_query_description}
'''

Please have that in mind when writing your queries.

**Summary:**
Write AQL queries for reading data using only the allowed keywords and patterns above. Do not use SQL syntax. Always return results with RETURN.
    """
    messages = [
        {"role": "system", "content": system_message},
        {
            "role": "user",
            "content": f"Convert the following natural language query into an AQL query:\n\n{query}\n\nOnly return the AQL query, nothing else.",
        },
    ]
    tools = get_tools(specific_tools=["aql_query"])
    llm = LLM(model="vllm", temperature=0, tools=tools)
    response = llm.generate(messages=messages, format=AQLResponseModel)
    query = response.content.query
    output_explanation = response.content.output_explanation
    result = aql_query(query)
    if len(result) == 1: #TODO Is it a good idea to return as a single item if only one item?
        result = result[0]

    result = f"The AQL used to answer your request was:\n```\n{query}\n```\n{output_explanation}\n\nThe result of the query is:\n{result}"
    return result


@register_tool()
def aql_query(query: str) -> List[Dict[str, Any]]:


    def remove_fields(doc: Dict[str, Any], fields_to_remove: List[str]) -> Dict[str, Any]:
        """Recursively remove specified fields from a document."""
        if not isinstance(doc, dict):
            return doc

        for key in list(doc.keys()):
            if key in fields_to_remove:
                del doc[key]

        return doc


    """
    Execute a read-only AQL query against the Riksdag talks database.

    Args;
        query (str): The AQL query string to execute. Must be in *AQL" syntax (*not* SQL), se instructions.

    Purpose
    - Use this tool for exact matches, joins, aggregations, grouping, and structured queries that cannot be handled by higher-level search helpers.

    SUMMARY PRINCIPLE (MANDATORY)
    - **Always prefer ArangoSearch `SEARCH` on the view `talks_search`** for any query that can be answered by the view (full-text, phrase, or relevance-oriented queries).
    - **Only** fall back to collection-level queries (`FOR d IN talks`) when you need joins, aggregations, grouping, non-view indexes, or full-document access that the view does not support.

    WHEN TO USE `SEARCH` (PREFERRED)
    - Full-text and phrase matches, language-aware tokenization, and relevance ranking.
    - Boolean/phrase queries using `TOKENS(...)` or `PHRASE(...)` together with `ANALYZER(..., "text_sv")`.
    - Relevance sorting using `SORT BM25(doc) DESC` with deterministic tie-breakers (e.g., date or `_key`).

    WHEN TO USE COLLECTION-LEVEL FILTER (FALLBACK)
    - Joins across collections (e.g., enrich `talks` with `people` metadata).
    - Aggregations and grouping (`COLLECT`, `COUNT`, `SUM`) when results depend on structured fields or non-view indexes.
    - Operations that require full-document access not available from the view.

    VIEWS & COLLECTIONS
    - **talks_search**: ArangoSearch view on `talks` with full-text indexing (including `anforandetext`).
    - **talks**: collection with all speeches.
    - **people**: collection containing speaker metadata.

    DOCUMENT SCHEMA (talks / talks_search)
    - `_id` (str)
    - `_key` (str)
    - `talare` (str)
    - `parti` (str)
    - `intressent_id` (str)
    - `dok_datum` (str, e.g. "YYYY-MM-DD")
    - `year` (int)
    - `anforandetext` (str)
    - `debate` (str)

    ANALYZERS
    - `text_sv` — use for natural-language fields (e.g., `anforandetext`).
    - `identity` — use for exact-token fields (ids, codes).

    COMMON PATTERNS & TEMPLATES
    1) **Count documents that mention a term** (how many speeches mention X)
    ```aql
    FOR doc IN talks_search
      SEARCH ANALYZER(doc.anforandetext IN TOKENS(<term>, "text_sv"), "text_sv")
      COLLECT WITH COUNT INTO c
      RETURN c
    ```


    2) **Search + metadata range** (use SEARCH for text + FILTER for date/number ranges)
    ```aql
    FOR doc IN talks_search
      FILTER doc.year >= <start_year> AND doc.year <= <end_year>
      SEARCH ANALYZER(doc.anforandetext IN TOKENS(<term>, "text_sv"), "text_sv")
      SORT BM25(doc) DESC, doc._key ASC
      LIMIT <offset>, <count>
      RETURN { _id: doc._id, talare: doc.talare, parti: doc.parti, dok_datum: doc.dok_datum, score: BM25(doc) }
    ```

    3) **Aggregation by party (collection-level)**
    ```aql
    FOR d IN talks
      FILTER d.dok_datum >= "2016-01-01" AND d.dok_datum <= "2016-12-31"
      COLLECT parti = d.parti WITH COUNT INTO cnt
      SORT cnt DESC
      RETURN { parti, count: cnt }
    ```

    4) **Join-like enrichment (talks + people)**
    ```aql
    FOR d IN talks
      FOR p IN people
        FILTER p._key == d.intressent_id
        RETURN { _id: d._id, talare: d.talare, born: p.fodd_ar, valkrets: p.valkrets }
    ```

    BEST PRACTICES (SHORT)
    - Prefer `talks_search` + `SEARCH` for text and relevance tasks — this is the default.
    - Use `FILTER` on `talks_search` for structured constraints (dates, numeric ranges) and `COLLECT` on `talks` for aggregations.
    - Use `SORT BM25(doc) DESC` when relevance is primary; add deterministic tie-breakers.
    - Return only required fields to keep results compact.
    - Avoid `CONTAINS(...)` on large text fields; prefer the view and analyzers.

    EXAMPLES
    - "How many speeches mention 'korallrev'?" → use template (1).
    - "How many times has 'korallrev' been mentioned in total?" → use template (2).


    *AQL keyword rules (very important!)*
    ✅ Allowed AQL keywords and functions:
    FOR, IN, RETURN, LET, FILTER, SORT, LIMIT, COLLECT,
    SEARCH, BM25, ANALYZER, TOKENS, PHRASE, OFFSET_INFO,
    AND, OR, NOT, ==, !=, >, >=, <, <=, IN,
    LOWER, SUBSTRING, CONCAT, MERGE,

    ❌ Never use SQL-like syntax: SELECT, JOIN, GROUP BY, ORDER BY, SUM

    Returns
    - `list[dict]` — rows produced by the AQL query (JSON-serializable).
    """
    try:
        docs = []
        for doc in arango.execute_aql(query):
            docs.append(remove_fields(doc, ['chunks'])) #TODO And other fields to remove?
        return docs

    except AQLQueryExecuteError as e:
        test = detect_sql_syntax(query)
        if test["is_sql"]:
            import ollama

            issues = test["issues"]
            print_red(
                f"[Tools] Detected SQL syntax in AQL query: {query}. Issues: {issues}. Attempting to rewrite using LLM..."
            )

            class AQL(BaseModel):
                aql_query: str

            system_message = f"""
    You are an expert in AQL (ArangoDB Query Language). The user has provided a query that a tool has detected to contain SQL-like syntax and you must rewrite it using AQL syntax.
    Please change the query AS LITTLE AS POSSIBLE to make it valid AQL syntax, while preserving the original intent of the query as much as possible.

    Here are some important rules to follow when rewriting the query:
    ✅ Allowed AQL keywords and functions:
    FOR, IN, RETURN, LET, FILTER, SORT, LIMIT, COLLECT,
    SEARCH, BM25, ANALYZER, TOKENS, PHRASE, OFFSET_INFO,
    AND, OR, NOT, ==, !=, >, >=, <, <=, IN,
    LOWER, SUBSTRING, CONCAT, MERGE,

    ❌ Never use SQL-like syntax: SELECT, JOIN, GROUP BY, ORDER BY, SUM

    Don't overthink it, just fix the syntax errors and return valid AQL.
            """

            prompt = f"""The query is: {query}. A tools has detected the following issues with the query:

            {issues}

            Please rewrite it using *AQL syntax*."""
            llm = ollama.Client(host="http://192.168.1.12:33401")
            model = "codegemma:2b"
            response = llm.chat(
                messages=[
                    {"role": "system", "content": system_message},
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
                format=AQL.model_json_schema(),
                model=model,
            )
            query = AQL.model_validate_json(response.message.content).aql_query
            print_green(f"[Tools] Rewritten AQL query: {query}")
        try:
            docs = []
            for doc in arango.execute_aql(query):
                docs.append(remove_fields(doc, ['chunks'])) #TODO And other fields to remove?
            return docs
        except AQLQueryExecuteError as e2:
            print_red(f"[Tools] Still got AQL execution error after rewrite: {str(e2)}")
            result = detect_sql_syntax(query)
            if result["is_sql"]:
                return f"""ERROR: Detected SQL syntax in your AQL query: {query}. {result['issues']}.\n Please rewrite using AQL syntax. See the aql_query tool documentation for examples!"""
            else:
                print_blue("NOT SQL SYNTAX")
                return f"ERROR executing AQL query: {str(e)}.\nPlease see the aql_query tool documentation for correct usage and examples!"
    except Exception as e:
        import traceback
        tb = traceback.format_exc()
        print_red(f"[Tools] Unexpected error executing AQL query: {str(e)}\n{tb}")
        return f"ERROR executing AQL query: {str(e)}.\nPlease see the aql_query tool documentation for correct usage and examples!"


@register_tool()
def vector_search_talks(query: str, limit: int = 8) -> List[Dict[str, Any]]:
    """
    Använd det här verktyget för att göra en semantisk sökning bland anföranden i Riksdagen.
    Använd när du vill:
    - Hitta relevanta anföranden baserat på innebörden i en fråga eller ett ämne.
    - Få sammanfattningar eller utdrag från anföranden som är relaterade till en specifik fråga.
    - Söka tematiskt snarare än med exakta nyckelord.
    När du genererar query-parametern, försök att formulera den som en naturlig språkfråga eller ett uttalande som fångar det du vill veta.

    Args:
        query: The user's question.
        limit: Number of hits to return. Default 8.

    Returns:
        List of speech snippets most relevant to the query.
    """
    print_yellow(f"[Tools] vector_search_talks → query='{query}' (top_k={limit}).")
    collection = chroma_db.get_collection(os.getenv("CHROMA_TALK_COLLECTION"))
    results = collection.query(
        query_texts=[query],
        n_results=limit,
    )

    metadatas = results.get("metadatas") or []
    documents = results.get("documents") or []
    ids = results.get("ids") or []
    distances = results.get("distances") or []

    metadata_rows = metadatas[0] if metadatas else []
    document_rows = documents[0] if documents else []
    id_rows = ids[0] if ids else []
    distance_rows = distances[0] if distances else []

    def _as_int(value: Any, default: int = -1) -> int:
        """
        Normalize chunk indices returned by Chroma so downstream Pydantic validation succeeds.
        """
        if isinstance(value, bool):
            return default
        if isinstance(value, int):
            return value
        if isinstance(value, float) and value.is_integer():
            return int(value)
        if isinstance(value, str):
            stripped = value.strip()
            if stripped.startswith("+"):
                stripped = stripped[1:]
            if stripped.lstrip("-").isdigit():
                return int(stripped)
        return default

    max_len = max(
        len(metadata_rows),
        len(document_rows),
        len(id_rows),
        len(distance_rows),
        0,
    )

    hits: List[Dict[str, Any]] = []
    for idx in range(max_len):
        metadata = metadata_rows[idx] if idx < len(metadata_rows) else {}
        if not isinstance(metadata, dict):
            metadata = {}
        _id = metadata.get("_id") or (id_rows[idx] if idx < len(id_rows) else None)
        if not _id:
            continue
        chunk_index_raw = (
            metadata.get("chunk_index")
            or metadata.get("index")
            or metadata.get("chunkId")
        )
        chunk_index = _as_int(chunk_index_raw)
        snippet_candidates: List[str] = []
        for candidate in (
            metadata.get("snippet"),
            metadata.get("text"),
            document_rows[idx] if idx < len(document_rows) else "",
        ):
            if isinstance(candidate, str) and candidate.strip():
                snippet_candidates.append(candidate.strip())
        snippet = snippet_candidates[0] if snippet_candidates else ""
        hit = {
            "_id": _id,
            "_id": _id,
            "chunk_index": chunk_index,
            "heading": metadata.get("heading") or metadata.get("title") or metadata.get("talare"),
            "snippet": snippet,
            "debateurl": metadata.get("debateurl") or metadata.get("debate_url"),
            "score": distance_rows[idx] if idx < len(distance_rows) else None,
        }
        hits.append(hit)
    print_purple(f"[Tools] vector_search_talks assembled {len(hits)} hits.")
    return hits


@register_tool()
def fetch_documents(_ids: list[str], collection: str = None, fields: dict = {}) -> list[Dict[str, Any]]:
    """
    Fetches documents from the database by their IDs, with optional collection prefix and field filtering.

    Args:
        _ids (list[str]): List of document IDs to fetch. If a single ID is provided, it will be converted to a list.
        collection (str, optional): Collection name to prefix to IDs if not already present. Defaults to None.
        fields (dict, optional): Dictionary specifying which fields to include in the returned documents. If empty, all fields are returned. Defaults to {}.

    Returns:
        list[Dict[str, Any]]: List of documents fetched from the database. If 'fields' is specified, only those fields are included in each document.

    Raises:
        ValueError: If document IDs do not include the collection prefix and no collection is specified.
        """


    if not isinstance(_ids, list):
        if isinstance(_ids, str):
            if '[' in _ids and ']' in _ids:
                import json
                try:
                    _ids = json.loads(_ids)
                except Exception as e:
                    print_red(f"[Tools] Error parsing _ids as JSON list: {str(e)}. Treating as single ID.")
                    _ids = [_ids]
        _ids = [_ids]

    _ids = [_id.replace('\\', "/") for _id in _ids]
    if collection and '/' not in _ids[0]:
        _ids = [f"{collection}/{_id.split('/')[-1]}" for _id in _ids]
    elif '/' not in _ids[0]:
        return f"ERROR FROM TOOL: When fetching documents by _id, you **must** include the collection prefix (e.g., 'talks/12345'). Or specify the collection parameter."

    query = f"""
    FOR id IN @document_ids
        RETURN DOCUMENT(id)
    """
    document_ids_string = f"""[{",".join(f'"{_id}"' for _id in _ids)}]"""
    print_blue(f"[Tools] Fetch {query}, bind_vars={{'document_ids': {document_ids_string}}}")
    docs = arango.execute_aql(query, bind_vars={"document_ids": document_ids_string})

    if fields:
        l = []
        for doc in docs:
            filtered = {k: doc.get(k) for k in fields if k in doc}
            l.append(filtered)
        return l
    else:
        for _id in _ids:
            if _id.startswith("talks/"):
                # If fetching a talk, also fetch its chunks
                for doc in docs:
                    if "chunks" in doc:
                        del doc["chunks"]
        return list(docs)


def _normalize_arango_search_args(
    query: str,
    parties: Optional[Union[str, List[str]]] = None,
    people: Optional[Union[str, List[str]]] = None,
    debates: Optional[Union[str, List[str]]] = None,
    from_year: Optional[Union[str, int]] = None,
    to_year: Optional[Union[str, int]] = None,
    limit: Optional[Union[str, int]] = 10,
    speaker_ids: Optional[Union[str, List[str], bool]] = None,
) -> Dict[str, Any]:
    """
    Helper to ensure all arango_search arguments are in the correct format.
    Converts single strings to lists, and string numbers to integers.
    """

    def to_list(val):
        if val is None:
            return []
        if isinstance(val, list):
            return val
        if isinstance(val, str):
            # Split comma-separated string, or wrap single string in list
            if "," in val:
                return [v.strip() for v in val.split(",") if v.strip()]
            return [val.strip()]
        return [val]

    def to_int(val):
        if val is None:
            return None
        if isinstance(val, int):
            return val
        if isinstance(val, str):
            try:
                return int(val)
            except ValueError:
                return None
        return None

    return {
        "query": str(query) if query is not None else "",
        "parties": to_list(parties),
        "people": to_list(people),
        "debates": to_list(debates),
        "from_year": to_int(from_year),
        "to_year": to_int(to_year),
        "limit": to_int(limit) if limit is not None else 10,
        "speaker_ids": to_list(speaker_ids),
    }


@register_tool()
def arango_search(
    query: str,
    parties: Optional[list[str]] = None,
    people: Optional[list[str]] = None,
    from_year: Optional[int] = None,
    to_year: Optional[int] = None,
    limit: int = 20,
    return_snippets: bool = False,
    results_to_user: bool = False,
    focus_ids: Optional[List[str]] = None,
    intressent_ids: Optional[Union[str, List[str], bool]] = None,
) -> List[Dict[str, Any]]:
    """
    Perform a full-text and metadata search in the Riksdagen 'talks' collection using ArangoSearch, using "google-like" syntax.
    Possible to `use return_snippets` to only return snippets with highlights instead of full documents, which can be useful to get an overview of the results.
    If searching for specific words or phrases, consider using quotes (") for phrases, AND/OR/NOT operators, and year ranges (e.g., år:2018-2022).
    Always use a limit to avoid too many results resulting in a truncated response. Hits will be ranked by relevance (BM25).
    This tool can also be used to help the user do a search, in which case you should set `results_to_user=True` so the results are sent to the user as they are. This is useful if you want to show the user the results of a search directly

    This tool uses advanced text search (with stemming, language analysis, and ranking) and can also filter by party, speaker, debate type, and year range.
    Use this tool when you want:
      - To find speeches containing specific words, phrases, or combinations (with support for AND, OR, NOT, and phrases).
      - To filter results by party, speaker, debate type, or year.
      - To get ranked, relevant snippets (with highlights) from the parliamentary database.
      - To perform more flexible or "Google-like" search than exact AQL queries, but more structured than pure vector/semantic search.
      - Help the user do a search, remember to set results_to_user=True so the results are sent to the user.

    When NOT to use this tool:
      - If you need fuzzy/semantic similarity (use vector_search_talks).
      - If you need exact aggregations, joins, or advanced AQL features (use aql_query).

    **Good practices for using this tool**
    - Always use a limit to avoid too many results.
    - If you want to get an overview of results, use `return_snippets=True` to get highlighted snippets. After that, decide what _id:s to fetch in full.
    - If the user has asked for e.g. "a list of talks mentioning...", "I want to see...", "give me all speeches about..." – or in other ways indicates they want to see the actual results – use `results_to_user=True` so the results are sent to the user as they are.

    **This tools has four special features/parameters:**
    1) `return_snippets=True` – If you want to get an overview of the results, use this parameter to get highlighted snippets instead of full documents. This is useful if you want to quickly see what the results are about, and then decide which _id:s to fetch in full.
    2) `results_to_user=True` – If the user has asked for e.g. "a list of talks mentioning...", "I want to see...", "give me all speeches about..." – or in other ways indicates they want to see the actual results – use this parameter so the results are sent to the user as they are.
    3) `focus_ids` – If you want to do a search within the ID:s from the last search, set this parameter to True. This is only useful if you have done a previous where you've used `results_to_user=True`, and the user has then asked a follow-up question that requires a more specific search within the previous results.
    4) `intressent_ids` – If you want to filter the search by specific speaker IDs, use this parameter. It should be a list of speaker IDs (intressent_id).

    Args:
        query (str): The search string (supports AND, OR, NOT, phrases in quotes, and year ranges like år:2018-2022).
        parties (list[str], optional): List of party codes to filter by (e.g., ["S", "M"]).
        people (list[str], optional): List of speaker names to filter by.
        from_year (int, optional): Start year for filtering.
        to_year (int, optional): End year for filtering.
        limit (int, optional): Maximum number of results to return (default 20).
        return_snippets (bool, optional): If True, return only snippets with highlights instead of full documents (default False).
        results_to_user (bool, optional): If True, the results will be sent to the user as they are. (default False).
        focus_ids (list[str], optional): Restrict the search to these specific document ids.
        intressent_ids (list[str] | str | bool, optional): List of speaker IDs to filter by.

    Returns:
        List[dict]: List of search result snippets, each as a dictionary with keys like '_id', 'text', 'snippet', 'speaker', 'party', etc.

    Example:
        arango_search(
            query='"kärnkraft" AND energi',
            parties=["M", "S"],
            from_year=2010,
            to_year=2022,
            limit=10
        )
    """
    # Normalize all arguments to expected types
    args = _normalize_arango_search_args(
        query=query,
        parties=parties,
        people=people,
        from_year=from_year,
        to_year=to_year,
        limit=limit,
        speaker_ids=intressent_ids,
    )

    class Payload:
        def __init__(
            self,
            q: str,
            parties: Optional[list[str]],
            people: Optional[list[str]],
            debates: Optional[list[str]],
            from_year: Optional[int],
            to_year: Optional[int],
            limit: int,
            return_snippets: bool = False,
            focus_ids: Optional[List[str]] = None,
            speaker_ids: Optional[List[str]] = None,
        ):
            """Lightweight holder passed to SearchService.search."""
            self.q = q
            self.parties = parties or []
            self.people = people or []
            self.debates = debates or []
            self.from_year = from_year
            self.to_year = to_year
            self.limit = limit
            self.return_snippets = return_snippets
            self.focus_ids = focus_ids or []
            self.speaker_ids = speaker_ids

    focus_id_list: List[str] = []
    if focus_ids:
        if isinstance(focus_ids, list):
            focus_id_list = [str(item) for item in focus_ids if isinstance(item, (str, int))]
        elif isinstance(focus_ids, str):
            try:
                parsed = json.loads(focus_ids)
                if isinstance(parsed, list):
                    focus_id_list = [str(item) for item in parsed if isinstance(item, (str, int))]
            except json.JSONDecodeError:
                focus_id_list = [focus_ids]
        elif focus_ids is True:
            # The chat service replaces True with the stored list before calling this tool,
            # so reaching this branch means there was no list to reuse.
            focus_id_list = []

    search_service = SearchService()
    results, stats, limit_reached = search_service.search(
        payload=Payload(
            q=args["query"],
            parties=args["parties"],
            people=args["people"],
            debates=args["debates"],
            from_year=args["from_year"],
            to_year=args["to_year"],
            limit=args["limit"],
            return_snippets=return_snippets,
            focus_ids=focus_id_list,
            speaker_ids=intressent_ids,
        ),
        include_snippets=True,
        return_snippets=return_snippets,
    )
    result_ids = [
        hit["_id"] for hit in results if isinstance(hit, dict) and hit.get("_id")
    ]
    payload = {
        "results": results,
        "stats": stats,
        "limit_reached": limit_reached,
        "return_snippets": return_snippets,
        "focus_ids": result_ids,
    }
    if results_to_user:
        return {
            "type": "search_results",
            "payload": payload,
        }
    return payload


if __name__ == "__main__":
    print(vector_search_talks("klimatförändringar", limit=3))