From 88e0244429382d03b0d84dbd44f5d3293a5dafad Mon Sep 17 00:00:00 2001 From: Lasse Server Date: Sun, 8 Mar 2026 13:36:16 +0100 Subject: [PATCH] Add provider templates, scripts for syncing and embedding, and test cases - Created a template for providers.yaml to define API providers and models. - Added a new providers.yaml file with initial provider configurations. - Implemented fix_things.py to update chunk documents in ArangoDB. - Developed make_arango_embeddings.py to generate embeddings for talks and store them in ArangoDB. - Introduced sync_talks.py to synchronize new speeches from riksdagen.se and process them. - Added notes.md for documentation on riksdagsgruppen login details. - Created test_make_arango_embeddings.py for integration testing of embedding generation. - Implemented test_gpu.py to test image input handling with vLLM. --- 20240201-113043-pivpnwgbackup.tgz | Bin 0 -> 1393 bytes Makefile | 10 +- _chromadb/chroma_client.py | 23 +- arango_client.py | 55 +++- backend/app.py | 3 +- backend/services/chat.py | 43 ++- backend/services/llm_tools.py | 368 +++++++++++++++---------- backend/services/monitor_script.py | 144 ++++++++++ config.py | 2 +- etc/riksdagen-sync.service | 19 ++ etc/riksdagen-sync.timer | 11 + frontend/src/App.tsx | 16 +- frontend/src/components/ChatPanel.tsx | 76 ++++- frontend/src/components/TalkView.tsx | 24 +- frontend/src/styles.css | 37 +++ home/Lasse/configs/mac_studion.conf | 10 + mcp_server/__init__.py | 6 + mcp_server/auth.py | 24 ++ mcp_server/check_cert.py | 130 +++++++++ mcp_server/requirements.txt | 0 mcp_server/server.py | 114 ++++++++ mcp_server/test_mcp_client.py | 168 +++++++++++ mcp_server/test_mcp_ping.py | 81 ++++++ mcp_server/test_tools.py.py | 75 +++++ mcp_server/tools.py | 360 ++++++++++++++++++++++++ page_063.png | Bin 0 -> 156497 bytes providers.template.yaml | 113 ++++++++ providers.yaml | 20 ++ scripts/convert_embeddings_to_lists.py | 182 ------------ scripts/debates.py | 30 +- scripts/documents_to_arango.py | 4 +- scripts/fix_things.py | 25 ++ scripts/make_arango_embeddings.py | 173 ++++++++++++ scripts/notes.md | 5 + scripts/sync_talks.py | 177 ++++++++++++ scripts/test_make_arango_embeddings.py | 57 ++++ test_gpu | 70 +++++ 37 files changed, 2282 insertions(+), 373 deletions(-) create mode 100644 20240201-113043-pivpnwgbackup.tgz create mode 100644 backend/services/monitor_script.py create mode 100644 etc/riksdagen-sync.service create mode 100644 etc/riksdagen-sync.timer create mode 100644 home/Lasse/configs/mac_studion.conf create mode 100644 mcp_server/__init__.py create mode 100644 mcp_server/auth.py create mode 100644 mcp_server/check_cert.py create mode 100644 mcp_server/requirements.txt create mode 100644 mcp_server/server.py create mode 100644 mcp_server/test_mcp_client.py create mode 100644 mcp_server/test_mcp_ping.py create mode 100644 mcp_server/test_tools.py.py create mode 100644 mcp_server/tools.py create mode 100644 page_063.png create mode 100644 providers.template.yaml create mode 100644 providers.yaml delete mode 100644 scripts/convert_embeddings_to_lists.py create mode 100644 scripts/fix_things.py create mode 100644 scripts/make_arango_embeddings.py create mode 100644 scripts/notes.md create mode 100644 scripts/sync_talks.py create mode 100644 scripts/test_make_arango_embeddings.py create mode 100644 test_gpu diff --git a/20240201-113043-pivpnwgbackup.tgz b/20240201-113043-pivpnwgbackup.tgz new file mode 100644 index 0000000000000000000000000000000000000000..d465dfbdd094aaa249526daf448b11ed58a13bd7 GIT binary patch literal 1393 zcmV-%1&;b3iwFSYfV*V?1MOOCQ{p@r^jErQ7%ufZVOIcRCiuXt1k6citLXXdTmAB0yS;PHoPF)$D1URX*F3JDo&ph^98NE9Cw6f{{!6D-$Mc&b*2$ZB zx^D?PQ^BTw#vDx$J6v}icgptMF8JHf9Qj(uqMoo*6b`DjR-glB}b>V5B^eS3yaLG7#i4IkA924zk6iu4j5awfz%HN1A0$C~FFF`lHGBKftp8AJCZ>e-aSJe~hEz z|J?tJDDeN12zdNY0&DE=QFlV!>k;$&@r)71e}ta5KMvzRl#s~2LHuXApv>cc5_ok$ zFEw|$zx;Yw!~dbCUwi)8FdaJ{C-8dyLkUPh{YNknIscQuOsRFm(&g2ld9^>j)b0?n z>U%|1nhY!3L)B2uOMTcco&(XkKEc4^q(LkaQ~kGNN?J2DYMaVbmtywPN&q*-CdR2lUn)2 z7>T3S5xHz#5mEJ@H+cIP|5E>P{wITWQNg#P=9w#2$M;aXr|s^Me2o2) zWY$gGKCT+~hTCk^JGCC?{};qE|AU&(?8W^EFrfG=p8qYp|0BtSzyFg6*4tmVOqMNs z8UHSh{~YE2fk;;UKVZ$d|4#r5!Cu%z3lsp!=Lr-M5(UTu!>7f1{x9r`7<9lW{v!fc z_%8z}asDTPMg4DC>D%PN;F59(Fv$Rt%fKu|X{AyRu4cl|5B4LFZopbWXroe8W#jcV zmm2fF8>O)edQm=?0g!!V$fk@R^rC0PCSk5NeYC~m#WMe6m;OUU>Hio>bNyeESVO@0 zj}YYgelW*i&*vlV%*L@ z6`lWKi2gkP1t_fl34i}189bl=ZSrF1+WZd$oByoQ46Y)^#M6+5s^bP+os)BGYfI?R zo@onKj+`64s`8fn@U_GaW>0P7$=pBMrkh}y|3l3hInp~D zQizYX(R-rr3~BaM^E~=a;(zK<{}1^NWDFM1i~9e9{v%0p()xe61~0UWy8lVB#{Th& zf2l|R|GfR5>VE{|r1QUTYJaZ(eQ*7bgM)*EgM)*EgM)*E<9pyY$p45>08jt`kdE(+ literal 0 HcmV?d00001 diff --git a/Makefile b/Makefile index ae1f41b..6376a7c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: frontend backend reload nginx +.PHONY: frontend backend reload nginx install-sync # install deps and build the React app frontend: @@ -14,3 +14,11 @@ nginx: # build everything and reload nginx all: frontend nginx + +# install and enable the daily sync timer +install-sync: + sudo cp etc/riksdagen-sync.service /etc/systemd/system/ + sudo cp etc/riksdagen-sync.timer /etc/systemd/system/ + sudo systemctl daemon-reload + sudo systemctl enable --now riksdagen-sync.timer + @echo "Timer installed. Check status with: systemctl list-timers | grep riksdagen" diff --git a/_chromadb/chroma_client.py b/_chromadb/chroma_client.py index f70b5cb..f637812 100644 --- a/_chromadb/chroma_client.py +++ b/_chromadb/chroma_client.py @@ -13,12 +13,17 @@ from config import chromadb_path, embedding_model import re from typing import Dict, List, Any, Tuple, Optional from chromadb.utils.embedding_functions import OllamaEmbeddingFunction +from chromadb.utils.embedding_functions import OllamaEmbeddingFunction + +from env_manager import set_env +set_env() class ChromaClient: def __init__(self, path: str | None = chromadb_path): self.path: str = path self._client: ClientAPI = self._init_client() - self.embedding_function = OllamaEmbeddingFunction(model_name=embedding_model, url='192.168.1.10:33405') + embedding_url = os.getenv('LLM_EMBEDDINGS_PORT', '192.168.1.12:33405') + self.embedding_function = OllamaEmbeddingFunction(model_name=embedding_model, url=embedding_url) def _init_client(self) -> chromadb.PersistentClient: return chromadb.PersistentClient(path=self.path) @@ -308,8 +313,8 @@ chroma_db = ChromaClient() if __name__ == "__main__": collection = chroma_db.get_collection(os.getenv("CHROMA_TALK_COLLECTION")) - print(collection.count()) - query = 'betyg grundskola' + query = 'betyg grundskola uppförande' + print(f"Querying for: {query}") results = chroma_db.query_collection( collection=collection, query_texts=query, @@ -317,15 +322,3 @@ if __name__ == "__main__": ) for res in results: print(res['document']) - print('---') - col: Collection = chroma_db.get_collection(os.getenv("CHROMA_TALK_COLLECTION")) - print(col.get(limit=10)) - results = col.query(query_texts=query, n_results=3) - for i in zip( - results['metadatas'][0], - results['documents'][0], - results['distances'][0], - results['ids'][0], - ): - print(i) - diff --git a/arango_client.py b/arango_client.py index 0ff11a9..978ef99 100644 --- a/arango_client.py +++ b/arango_client.py @@ -1,7 +1,37 @@ -from _arango._arango import Arango import os +from typing import List + +from ollama import Client +from arango.collection import Collection + +from _arango._arango import Arango + + +class CustomArango(Arango): + def __init__(self, db_name = 'riksdagen', user=None, password=None): + super().__init__(db_name, user, password) + -arango = Arango( + def make_embeddings(self, texts: List[str]) -> List[List[float]]: + """ + Generate embeddings for a list of texts using Ollama. + + Args: + texts (List[str]): List of text strings to embed. + + Returns: + List[List[float]]: List of embedding vectors. + """ + ollama_client = Client(host='192.168.1.12:33405') + + embeddings = ollama_client.embed( + model="qwen3-embedding:latest", + input=texts, + dimensions=384 + ) + return embeddings.embeddings + +arango = CustomArango( db_name="riksdagen", user='riksdagen', password=os.getenv("ARANGO_PWD"), @@ -9,4 +39,23 @@ arango = Arango( if __name__ == "__main__": - print(arango.db.collections()) \ No newline at end of file + + embeddings = arango.make_embeddings(["Vilka åtgärder bör vidtas för att hantera klimatförändringar?"]) + query = """LET query = @query_embedding + +FOR doc IN chunks + LET score = APPROX_NEAR_COSINE(doc.embedding, query) + SORT score DESC + LIMIT 5 + RETURN { + _key: doc._key, + debate: doc.debate, + text: doc.text, + similarity: score + } + """ + result = arango.db.aql.execute(query=query, bind_vars={"query_embedding": embeddings[0]}) + for doc in result: + print(doc) + print('---') + diff --git a/backend/app.py b/backend/app.py index 9d8e96e..d4516d2 100644 --- a/backend/app.py +++ b/backend/app.py @@ -161,7 +161,8 @@ async def get_talk(talk_id: str) -> dict: "anforande_nummer", "replik", "url_session", - "url_audio" + "url_audio", + "summary" ] ) diff --git a/backend/services/chat.py b/backend/services/chat.py index 33d1be9..8e4d1fa 100644 --- a/backend/services/chat.py +++ b/backend/services/chat.py @@ -69,9 +69,20 @@ You can only request a tool use, not use it directly. After you request a tool, **When giving your final answer:** - Always start with a short summary of your findings, before any detailed analysis or tables. - Respond concisely, the user is not here for small talk. -- Make sure to include sources for your answer, but don't use the internal _id or chunk_index fields; instead, use date, title, etc. -- When refering to a source, use foot notes like [1], [2], etc. at the end of the sentence where you mention it. *Remember to include a short bibliography at the end of your answer, listing all sources you used.* -- Always format your final answer using Markdown (it will be translated to HTML by the frontend). +- **IMPORTANT: Always format your answer using Markdown.** The frontend will convert it to HTML automatically. +- **IMPORTANT: Use inline citation numbers for ALL source citations.** Use the format `[1]`, `[2]`, etc. directly after the statement that references the source. +- **CRITICAL: Citations must be plain square brackets with numbers inside: `[1]`, `[2]`, `[3]`. Do NOT use Markdown footnote syntax like `[^1]` or special Unicode brackets like `【1】`.** +- **IMPORTANT: Always include a "Källor" (Sources) section at the end** with a numbered list matching your citations. Format each source as: `[1] Speaker name – Date – Brief context or quote` +- Example of correct citation format: + ``` + ROT-avdraget infördes 2009[1] och hade som syfte att minska svartarbete[2]. + + ## Källor + [1] Eva Andersson – 2009-01-15 – Debatt om ROT-avdrag + [2] Per Svensson – 2009-02-20 – Diskussion om byggbranschen + ``` +- Make sure citation numbers are sequential ([1], [2], [3]...) and that every citation has a matching entry in the Källor section. +- Don't use internal _id or chunk_index fields in your answer; use human-readable information (speaker, date, topic). - Don't ever make up quotes or facts; if you don't have enough information, say that you don't know, or call another tool to find more information. - Answer in Swedish. """ @@ -198,10 +209,11 @@ You can only request a tool use, not use it directly. After you request a tool, # This avoids issues if there are multiple ChatCompletionMessage classes in the project # The following code should NOT be inside the if-block! - try: - print_blue("Thinking:", response.reasoning_content) - except Exception as e: - print_red(f"[ChatService] Error printing thinking response: {e}") + # Use getattr so this doesn't raise AttributeError when the model + # doesn't return a reasoning/thinking block (which is the normal case). + thinking = getattr(response, "reasoning_content", None) + if thinking: + print_blue("Thinking:", thinking) try: print_purple("Content:", response.content) except Exception as e: @@ -209,11 +221,12 @@ You can only request a tool use, not use it directly. After you request a tool, tool_calls = getattr(response, "tool_calls", None) if tool_calls: - if response.reasoning_content: - if isinstance(response.reasoning_content, dict) and "content" in response.reasoning_content: - reasoning_content = response.reasoning_content["content"] + reasoning_content_attr = getattr(response, "reasoning_content", None) + if reasoning_content_attr: + if isinstance(reasoning_content_attr, dict) and "content" in reasoning_content_attr: + reasoning_content = reasoning_content_attr["content"] else: - reasoning_content = str(response.reasoning_content) + reasoning_content = str(reasoning_content_attr) current_messages.append( { "role": "assistant", @@ -319,7 +332,7 @@ You can only request a tool use, not use it directly. After you request a tool, f"{tool_result_string[:12000]} (...) [truncated]" ) - reminder = '\n\n**Remember that you can only use the information you get using the tools when giving your final answer. Do not make up any facts or quotes. If you do not have enough information, say that you do not know, or call another tool to find more information. Always give you final answer in Swedish.**' + reminder = '\n\n**Remember:**\n- You can only use information from tool results when giving your final answer.\n- Do not make up facts or quotes.\n- If you lack information, say so or call another tool.\n- **Always use inline citations in the format [1], [2], [3] etc. Do NOT use [^1] or 【1】.**\n- **Always include a "Källor" section at the end with matching numbered sources.**\n- Always format your final answer in Markdown.\n- Always answer in Swedish.' tool_message = { "role": "tool", "name": tool_name, @@ -338,7 +351,11 @@ You can only request a tool use, not use it directly. After you request a tool, continue elif response.content: final_content = getattr(response, "content", "") - return final_content, collected_tables, active_focus_ids + final_message = FinalAnswer( + final_answer=final_content, + explanation="Model provided a direct answer without requiring additional tools." + ) + return final_message, collected_tables, active_focus_ids def _get_tool_function(self, tool_name: str): diff --git a/backend/services/llm_tools.py b/backend/services/llm_tools.py index 89e7008..38440b3 100644 --- a/backend/services/llm_tools.py +++ b/backend/services/llm_tools.py @@ -8,22 +8,114 @@ from arango_client import arango from arango.exceptions import AQLQueryExecuteError from backend.services.search import ( SearchService, -) # Import SearchService for use in the tool +) from utils import detect_sql_syntax from _llm import LLM from pydantic import BaseModel, Field -# * When to use an AQL tool vs vector/semantic search* -# - Use AQL for exact predicates, structured filters, joins, grouping, aggregations, -# date-range queries, or ArangoSearch indexed text search. -# Examples: -# • Exact matches (by id, date, party, speaker). -# • Aggregations (counts, sums, min/max) and grouping (COLLECT). -# • Joins across collections with nested FOR. -# • Range queries, pagination, sorted results and server-side window functions. -# - Prefer vector/semantic search when you need fuzzy or semantic similarity -# (e.g., "find speeches similar in meaning to this paragraph"). Vector search is -# complementary to AQL, not a replacement for structured queries. + +class HitDocument(BaseModel): + """ + HitDocument is a Pydantic model that provides a normalized representation of a search hit across various tools, enabling consistent downstream handling. + + Attributes: + id (Optional[str]): Fully qualified ArangoDB document identifier. + key (Optional[str]): Document key without collection prefix. + speaker (Optional[str]): Name of the speaker associated with the hit. + party (Optional[str]): Party affiliation of the speaker. + date (Optional[str]): ISO formatted document date (YYYY-MM-DD). + snippet (Optional[str]): Contextual snippet or highlight from the document. + text (Optional[str]): Full text of the document when available. + score (Optional[float]): Relevance score supplied by the executing tool. + metadata (Dict[str, Any]): Additional metadata specific to the originating tool that should be preserved. + + Methods: + to_string() -> str: + Renders the hit as a human-readable string with uppercase labels, including all present fields and metadata. + """ + + """Normalized representation of a search hit across tools to enable consistent downstream handling.""" + id: Optional[str] = Field( + default=None, description="Fully qualified ArangoDB document identifier." + ) + key: Optional[str] = Field( + default=None, description="Document key without collection prefix." + ) + speaker: Optional[str] = Field( + default=None, description="Name of the speaker associated with the hit." + ) + party: Optional[str] = Field( + default=None, description="Party affiliation of the speaker." + ) + date: Optional[str] = Field( + default=None, description="ISO formatted document date (YYYY-MM-DD)." + ) + snippet: Optional[str] = Field( + default=None, description="Contextual snippet or highlight from the document." + ) + text: Optional[str] = Field( + default=None, description="Full text of the document when available." + ) + score: Optional[float] = Field( + default=None, description="Relevance score supplied by the executing tool." + ) + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="Additional metadata specific to the originating tool that should be preserved.", + ) + + def to_string(self, include_metadata: bool = True) -> str: + """ + Render the object as a human-readable string with uppercase labels. + + Args: + include_metadata (bool, optional): Whether to include metadata fields in the output. Defaults to True. + + Returns: + str: A formatted string representation of the object, with each field and its value separated by double newlines, and field names in uppercase. + """ + data: Dict[str, Any] = self.model_dump(exclude_none=True) + metadata: Dict[str, Any] = data.pop("metadata", {}) + segments: List[str] = [] + for field_name, field_value in data.items(): + segments.append(f"{field_name.upper()}\n{field_value}") + for meta_key, meta_value in metadata.items(): + segments.append(f"{meta_key.upper()}\n{meta_value}") + return "\n\n".join(segments) + + +class HitsResponse(BaseModel): + """ + HitsResponse is a Pydantic model that serves as a container for multiple HitDocument instances, providing utility methods for formatting and rendering the collection. + + Attributes: + hits (List[HitDocument]): A list of collected search hits. + + Methods: + to_string() -> str: + Returns a string representation of all hits, separated by a visual divider. If there are no hits, returns an empty string. + """ + + hits: List[HitDocument] = Field( + default_factory=list, description="Collected search hits." + ) + + def to_string(self, include_metadata=True) -> str: + """ + Render all hits as a single string, separated by a visual divider. + + Args: + include_metadata (bool, optional): Whether to include metadata in each hit's string representation. Defaults to True. + + Returns: + str: A single string containing all hits, separated by "\n\n---\n\n". Returns an empty string if there are no hits. + """ + """Render all hits as a single string separated by a visual divider.""" + if not self.hits: + return "" + return "\n\n---\n\n".join( + hit.to_string(include_metadata=include_metadata) for hit in self.hits + ) @register_tool() @@ -54,12 +146,15 @@ def search_documents(query: str): class AQLResponseModel(BaseModel): query: str = Field(..., description="The generated AQL query string.") output_explanation: str = Field( - ..., description="A *very* short explanation of the expected output format, and how to interpret the results.", - examples=["Using COUNT INTO c, the result is a single integer count of matching documents.",] + ..., + description="A *very* short explanation of the expected output format, and how to interpret the results.", + examples=[ + "Using COUNT INTO c, the result is a single integer count of matching documents.", + ], ) tools = get_tools(specific_tools=["aql_query"]) - aql_query_description = tools[0]['function']['description'] + aql_query_description = tools[0]["function"]["description"] system_message = f"""You are an expert in converting natural language queries into AQL (ArangoDB Query Language) queries for the Riksdagen database. The user will provide a query in natural language, and you must translate it into a valid AQL query that can be executed against the database. @@ -127,7 +222,9 @@ Write AQL queries for reading data using only the allowed keywords and patterns query = response.content.query output_explanation = response.content.output_explanation result = aql_query(query) - if len(result) == 1: #TODO Is it a good idea to return as a single item if only one item? + if ( + len(result) == 1 + ): # TODO Is it a good idea to return as a single item if only one item? result = result[0] result = f"The AQL used to answer your request was:\n```\n{query}\n```\n{output_explanation}\n\nThe result of the query is:\n{result}" @@ -137,18 +234,18 @@ Write AQL queries for reading data using only the allowed keywords and patterns @register_tool() def aql_query(query: str) -> List[Dict[str, Any]]: - - def remove_fields(doc: Dict[str, Any], fields_to_remove: List[str]) -> Dict[str, Any]: + def remove_fields( + doc: Dict[str, Any], fields_to_remove: List[str] + ) -> Dict[str, Any]: """Recursively remove specified fields from a document.""" if not isinstance(doc, dict): return doc - + for key in list(doc.keys()): if key in fields_to_remove: del doc[key] return doc - """ Execute a read-only AQL query against the Riksdag talks database. @@ -257,7 +354,9 @@ def aql_query(query: str) -> List[Dict[str, Any]]: try: docs = [] for doc in arango.execute_aql(query): - docs.append(remove_fields(doc, ['chunks'])) #TODO And other fields to remove? + docs.append( + remove_fields(doc, ["chunks"]) + ) # TODO And other fields to remove? return docs except AQLQueryExecuteError as e: @@ -312,7 +411,9 @@ def aql_query(query: str) -> List[Dict[str, Any]]: try: docs = [] for doc in arango.execute_aql(query): - docs.append(remove_fields(doc, ['chunks'])) #TODO And other fields to remove? + docs.append( + remove_fields(doc, ["chunks"]) + ) # TODO And other fields to remove? return docs except AQLQueryExecuteError as e2: print_red(f"[Tools] Still got AQL execution error after rewrite: {str(e2)}") @@ -324,150 +425,130 @@ def aql_query(query: str) -> List[Dict[str, Any]]: return f"ERROR executing AQL query: {str(e)}.\nPlease see the aql_query tool documentation for correct usage and examples!" except Exception as e: import traceback + tb = traceback.format_exc() print_red(f"[Tools] Unexpected error executing AQL query: {str(e)}\n{tb}") return f"ERROR executing AQL query: {str(e)}.\nPlease see the aql_query tool documentation for correct usage and examples!" @register_tool() -def vector_search_talks(query: str, limit: int = 8) -> List[Dict[str, Any]]: +def vector_search_talks(query: str, limit: int = 8) -> str: """ - Använd det här verktyget för att göra en semantisk sökning bland anföranden i Riksdagen. - Använd när du vill: - - Hitta relevanta anföranden baserat på innebörden i en fråga eller ett ämne. - - Få sammanfattningar eller utdrag från anföranden som är relaterade till en specifik fråga. - - Söka tematiskt snarare än med exakta nyckelord. - När du genererar query-parametern, försök att formulera den som en naturlig språkfråga eller ett uttalande som fångar det du vill veta. + Semantic search among speeches in the Riksdagen database. Args: - query: The user's question. - limit: Number of hits to return. Default 8. + query (str): The user's question. + limit (int): Number of hits to return. Default 8. Returns: - List of speech snippets most relevant to the query. + str: Formatted string containing the top hits separated by dividers. """ print_yellow(f"[Tools] vector_search_talks → query='{query}' (top_k={limit}).") - collection = chroma_db.get_collection(os.getenv("CHROMA_TALK_COLLECTION")) - results = collection.query( - query_texts=[query], - n_results=limit, - ) - - metadatas = results.get("metadatas") or [] - documents = results.get("documents") or [] - ids = results.get("ids") or [] - distances = results.get("distances") or [] - metadata_rows = metadatas[0] if metadatas else [] - document_rows = documents[0] if documents else [] - id_rows = ids[0] if ids else [] - distance_rows = distances[0] if distances else [] - - def _as_int(value: Any, default: int = -1) -> int: - """ - Normalize chunk indices returned by Chroma so downstream Pydantic validation succeeds. - """ - if isinstance(value, bool): - return default - if isinstance(value, int): - return value - if isinstance(value, float) and value.is_integer(): - return int(value) - if isinstance(value, str): - stripped = value.strip() - if stripped.startswith("+"): - stripped = stripped[1:] - if stripped.lstrip("-").isdigit(): - return int(stripped) - return default - - max_len = max( - len(metadata_rows), - len(document_rows), - len(id_rows), - len(distance_rows), - 0, + embeddings = arango.make_embeddings([query])[0] + query = """ + LET query = @query_embedding + + FOR doc IN chunks + LET score = APPROX_NEAR_COSINE(doc.embedding, query) + SORT score DESC + LIMIT 5 + RETURN { + text: doc.text, + parent_id: doc.parent_id, + collection: doc.collection, + index: doc.index, + score: score + } + """ + results = list( + arango.execute_aql( + query=query, bind_vars={"query_embedding": embeddings}, batch_size=limit + ) ) - - hits: List[Dict[str, Any]] = [] - for idx in range(max_len): - metadata = metadata_rows[idx] if idx < len(metadata_rows) else {} - if not isinstance(metadata, dict): - metadata = {} - _id = metadata.get("_id") or (id_rows[idx] if idx < len(id_rows) else None) - if not _id: - continue - chunk_index_raw = ( - metadata.get("chunk_index") - or metadata.get("index") - or metadata.get("chunkId") + parent_ids = [doc["parent_id"] for doc in results] + parent_docs = fetch_documents( + parent_ids, fields=["_id", "_key", "talare", "parti", "dok_datum", "chunks"] + ) + hits: List[HitDocument] = [] + for n in range(len(results)): + chunks_doc = results[n] + parent_doc = parent_docs[n] + parent_chunks = parent_doc["chunks"] + chunk_index = chunks_doc["index"] + + # Build snippet from neighboring chunks, extracting the 'text' field from each chunk dict + snippet_segments: List[str] = [] + + def get_chunk_text(chunk: Any) -> str: + """Helper to extract text from a chunk dict or return the string itself.""" + if isinstance(chunk, dict): + return chunk.get("text", "") + elif isinstance(chunk, str): + return chunk + return str(chunk) + + if chunk_index > 0: + snippet_segments.append(get_chunk_text(parent_chunks[chunk_index - 1])) + snippet_segments.append(get_chunk_text(chunks_doc)) # current chunk + if chunk_index < len(parent_chunks) - 1: + snippet_segments.append(get_chunk_text(parent_chunks[chunk_index + 1])) + + hits.append( + HitDocument( + id=parent_doc.get("_id"), + key=parent_doc.get("_key"), + speaker=parent_doc.get("talare"), + party=parent_doc.get("parti"), + date=parent_doc.get("dok_datum"), + snippet=" ".join(snippet_segments), + score=chunks_doc.get("score"), + metadata={ + "collection": chunks_doc.get("collection"), + "parent_id": chunks_doc.get("parent_id"), + "chunk_index": chunk_index, + }, + ) ) - chunk_index = _as_int(chunk_index_raw) - snippet_candidates: List[str] = [] - for candidate in ( - metadata.get("snippet"), - metadata.get("text"), - document_rows[idx] if idx < len(document_rows) else "", - ): - if isinstance(candidate, str) and candidate.strip(): - snippet_candidates.append(candidate.strip()) - snippet = snippet_candidates[0] if snippet_candidates else "" - hit = { - "_id": _id, - "_id": _id, - "chunk_index": chunk_index, - "heading": metadata.get("heading") or metadata.get("title") or metadata.get("talare"), - "snippet": snippet, - "debateurl": metadata.get("debateurl") or metadata.get("debate_url"), - "score": distance_rows[idx] if idx < len(distance_rows) else None, - } - hits.append(hit) - print_purple(f"[Tools] vector_search_talks assembled {len(hits)} hits.") - return hits + return HitsResponse(hits=hits).to_string() @register_tool() -def fetch_documents(_ids: list[str], collection: str = None, fields: dict = {}) -> list[Dict[str, Any]]: - """ - Fetches documents from the database by their IDs, with optional collection prefix and field filtering. +def fetch_documents(_ids: list[str], collection: str = "", fields: list = []) -> str: + """ + Fetches full documents by their _id from ArangoDB. Args: - _ids (list[str]): List of document IDs to fetch. If a single ID is provided, it will be converted to a list. - collection (str, optional): Collection name to prefix to IDs if not already present. Defaults to None. - fields (dict, optional): Dictionary specifying which fields to include in the returned documents. If empty, all fields are returned. Defaults to {}. + _ids: List of document IDs (e.g., ["talks/abc123", "talks/def456"]) + collection: Optional collection name (not used, kept for compatibility) + fields: Optional list to specify which fields to return - Returns: - list[Dict[str, Any]]: List of documents fetched from the database. If 'fields' is specified, only those fields are included in each document. + Note: + - If `collection` is provided, it will be prepended to any IDs in `_ids` that do not already contain a collection prefix. + - If collection is `talks` and fields is empty, the `chunks` field will be removed from the returned documents to reduce payload size. - Raises: - ValueError: If document IDs do not include the collection prefix and no collection is specified. - """ + Returns: + JSON string with document data or error message + """ + # Check that collection is provided or items in _ids contain collection prefix + assert collection or all( + "/" in i for i in _ids + ), "Either collection must be provided or _ids must contain collection prefix." + if collection: + l = [] + for i in _ids: + if "/" not in i: + l.append(f"{collection}/{i}") + else: + l.append(i) - if not isinstance(_ids, list): - if isinstance(_ids, str): - if '[' in _ids and ']' in _ids: - import json - try: - _ids = json.loads(_ids) - except Exception as e: - print_red(f"[Tools] Error parsing _ids as JSON list: {str(e)}. Treating as single ID.") - _ids = [_ids] - _ids = [_ids] - - _ids = [_id.replace('\\', "/") for _id in _ids] - if collection and '/' not in _ids[0]: - _ids = [f"{collection}/{_id.split('/')[-1]}" for _id in _ids] - elif '/' not in _ids[0]: - return f"ERROR FROM TOOL: When fetching documents by _id, you **must** include the collection prefix (e.g., 'talks/12345'). Or specify the collection parameter." - - query = f""" + query = """ FOR id IN @document_ids RETURN DOCUMENT(id) """ - document_ids_string = f"""[{",".join(f'"{_id}"' for _id in _ids)}]""" - print_blue(f"[Tools] Fetch {query}, bind_vars={{'document_ids': {document_ids_string}}}") - docs = arango.execute_aql(query, bind_vars={"document_ids": document_ids_string}) + + docs = arango.execute_aql(query, bind_vars={"document_ids": _ids}) if fields: l = [] @@ -478,7 +559,6 @@ def fetch_documents(_ids: list[str], collection: str = None, fields: dict = {}) else: for _id in _ids: if _id.startswith("talks/"): - # If fetching a talk, also fetch its chunks for doc in docs: if "chunks" in doc: del doc["chunks"] @@ -575,7 +655,7 @@ def arango_search( **This tools has four special features/parameters:** 1) `return_snippets=True` – If you want to get an overview of the results, use this parameter to get highlighted snippets instead of full documents. This is useful if you want to quickly see what the results are about, and then decide which _id:s to fetch in full. - 2) `results_to_user=True` – If the user has asked for e.g. "a list of talks mentioning...", "I want to see...", "give me all speeches about..." – or in other ways indicates they want to see the actual results – use this parameter so the results are sent to the user as they are. + 2) `results_to_user=True` – If the user has asked for e.g. "a list of talks mentioning...", "I want to see...", "give me all speeches about..." – or in other ways indicates they want to see the actual results – use this parameter so the results are sent to the user. 3) `focus_ids` – If you want to do a search within the ID:s from the last search, set this parameter to True. This is only useful if you have done a previous where you've used `results_to_user=True`, and the user has then asked a follow-up question that requires a more specific search within the previous results. 4) `intressent_ids` – If you want to filter the search by specific speaker IDs, use this parameter. It should be a list of speaker IDs (intressent_id). @@ -643,12 +723,16 @@ def arango_search( focus_id_list: List[str] = [] if focus_ids: if isinstance(focus_ids, list): - focus_id_list = [str(item) for item in focus_ids if isinstance(item, (str, int))] + focus_id_list = [ + str(item) for item in focus_ids if isinstance(item, (str, int)) + ] elif isinstance(focus_ids, str): try: parsed = json.loads(focus_ids) if isinstance(parsed, list): - focus_id_list = [str(item) for item in parsed if isinstance(item, (str, int))] + focus_id_list = [ + str(item) for item in parsed if isinstance(item, (str, int)) + ] except json.JSONDecodeError: focus_id_list = [focus_ids] elif focus_ids is True: diff --git a/backend/services/monitor_script.py b/backend/services/monitor_script.py new file mode 100644 index 0000000..895095d --- /dev/null +++ b/backend/services/monitor_script.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +System Resource Monitor - Logs system stats to help diagnose SSH connectivity issues. + +This script monitors: +- CPU usage +- Memory usage +- Disk usage +- Network connectivity +- SSH service status +- System load +- Active connections + +Run continuously to capture when the system becomes unreachable. +""" + +import psutil +import time +import logging +from datetime import datetime +from pathlib import Path + +# Setup logging to file with rotation +log_file = Path("/var/log/system_monitor.log") +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(log_file), + logging.StreamHandler() # Also print to console + ] +) + +def check_ssh_service() -> dict: + """ + Check if SSH service is running. + + Returns: + dict: Service status information + """ + try: + import subprocess + result = subprocess.run( + ['systemctl', 'is-active', 'ssh'], + capture_output=True, + text=True, + timeout=5 + ) + return { + 'running': result.returncode == 0, + 'status': result.stdout.strip() + } + except Exception as e: + return {'running': False, 'error': str(e)} + +def get_system_stats() -> dict: + """ + Collect current system statistics. + + Returns: + dict: System statistics including CPU, memory, disk, network + """ + # CPU usage + cpu_percent = psutil.cpu_percent(interval=1) + cpu_count = psutil.cpu_count() + + # Memory usage + memory = psutil.virtual_memory() + swap = psutil.swap_memory() + + # Disk usage + disk = psutil.disk_usage('/') + + # Network stats + net_io = psutil.net_io_counters() + + # System load (1, 5, 15 minute averages) + load_avg = psutil.getloadavg() + + # Number of connections + connections = len(psutil.net_connections()) + + return { + 'cpu_percent': cpu_percent, + 'cpu_count': cpu_count, + 'memory_percent': memory.percent, + 'memory_available_gb': memory.available / (1024**3), + 'swap_percent': swap.percent, + 'disk_percent': disk.percent, + 'disk_free_gb': disk.free / (1024**3), + 'network_bytes_sent': net_io.bytes_sent, + 'network_bytes_recv': net_io.bytes_recv, + 'load_1min': load_avg[0], + 'load_5min': load_avg[1], + 'load_15min': load_avg[2], + 'connections': connections + } + +def monitor_loop(interval_seconds: int = 60): + """ + Main monitoring loop that logs system stats at regular intervals. + + Args: + interval_seconds: How often to log stats (default: 60 seconds) + """ + logging.info("Starting system monitoring...") + + while True: + try: + stats = get_system_stats() + ssh_status = check_ssh_service() + + # Log current stats + log_message = ( + f"CPU: {stats['cpu_percent']:.1f}% | " + f"MEM: {stats['memory_percent']:.1f}% ({stats['memory_available_gb']:.2f}GB free) | " + f"DISK: {stats['disk_percent']:.1f}% ({stats['disk_free_gb']:.2f}GB free) | " + f"LOAD: {stats['load_1min']:.2f} {stats['load_5min']:.2f} {stats['load_15min']:.2f} | " + f"CONN: {stats['connections']} | " + f"SSH: {ssh_status.get('status', 'unknown')}" + ) + + # Warning thresholds + if stats['cpu_percent'] > 90: + logging.warning(f"HIGH CPU! {log_message}") + elif stats['memory_percent'] > 90: + logging.warning(f"HIGH MEMORY! {log_message}") + elif stats['disk_percent'] > 90: + logging.warning(f"HIGH DISK USAGE! {log_message}") + elif stats['load_1min'] > stats['cpu_count'] * 2: + logging.warning(f"HIGH LOAD! {log_message}") + elif not ssh_status.get('running'): + logging.error(f"SSH SERVICE DOWN! {log_message}") + else: + logging.info(log_message) + + time.sleep(interval_seconds) + + except Exception as e: + logging.error(f"Error in monitoring loop: {e}") + time.sleep(interval_seconds) + +if __name__ == "__main__": + monitor_loop(interval_seconds=60) # Log every 60 seconds \ No newline at end of file diff --git a/config.py b/config.py index 1e31a81..676b431 100644 --- a/config.py +++ b/config.py @@ -6,7 +6,7 @@ embedding_model = os.getenv("LLM_MODEL_EMBEDDINGS") if embedding_model == "embeddinggemma": embedding_dimensions = 768 else: - embedding_dimensions = None + embedding_dimensions = 384 _llm_api_url = os.getenv("LLM_API_URL") llm_base_url = (_llm_api_url).rstrip("/") llm_api_key = os.getenv("LLM_API_KEY", os.getenv("LLM_API_PWD_LASSE", "not-set")) diff --git a/etc/riksdagen-sync.service b/etc/riksdagen-sync.service new file mode 100644 index 0000000..23569f5 --- /dev/null +++ b/etc/riksdagen-sync.service @@ -0,0 +1,19 @@ +[Unit] +Description=Riksdagen daily talk sync +# Wait for network before starting +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +User=lasse +WorkingDirectory=/home/lasse/riksdagen +# Loads ARANGO_PWD and other env vars from the project .env file +EnvironmentFile=/home/lasse/riksdagen/.env +ExecStart=/home/lasse/riksdagen/.venv/bin/python /home/lasse/riksdagen/scripts/sync_talks.py +# Log stdout/stderr to the systemd journal (view with: journalctl -u riksdagen-sync) +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=multi-user.target diff --git a/etc/riksdagen-sync.timer b/etc/riksdagen-sync.timer new file mode 100644 index 0000000..6d1b129 --- /dev/null +++ b/etc/riksdagen-sync.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Run riksdagen daily talk sync at 06:00 + +[Timer] +# Run every day at 06:00 +OnCalendar=*-*-* 06:00:00 +# If the server was off at 06:00, run the job as soon as it comes back up +Persistent=true + +[Install] +WantedBy=timers.target diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index c5ec4d8..c32c9ab 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -256,7 +256,6 @@ function SearchView() { const parsed = JSON.parse(stored) as { query?: string; filters?: SearchFilters; - results?: TalkHit[]; sortMode?: "relevance" | "date"; visibleCount?: number; hasSearched?: boolean; @@ -274,13 +273,22 @@ function SearchView() { to_year: parsed.filters.to_year ?? undefined, }); } - if (Array.isArray(parsed.results)) setResults(parsed.results); if (parsed.sortMode === "date" || parsed.sortMode === "relevance") setSortMode(parsed.sortMode); if (typeof parsed.visibleCount === "number") setVisibleCount(parsed.visibleCount); if ("hasSearched" in parsed) setHasSearched(Boolean(parsed.hasSearched)); if ("lastError" in parsed) setLastError(parsed.lastError ?? null); setSpeaker(parsed.speaker ?? null); setSpeakerIds(Array.isArray(parsed.speakerIds) ? parsed.speakerIds : []); + // If we have a query and filters, re-run the search automatically + if (parsed.query && parsed.filters) { + const cleanQuery = stripMentions(parsed.query); + searchMutation.mutate({ + q: cleanQuery, + ...parsed.filters, + speaker_ids: parsed.speakerIds && parsed.speakerIds.length > 0 ? parsed.speakerIds : undefined, + include_snippets: true, + }); + } } catch (error) { console.error("Failed to restore search state from session storage:", error); window.sessionStorage.removeItem(SEARCH_STATE_KEY); @@ -295,7 +303,6 @@ function SearchView() { const persistableState = { query, filters, - results, sortMode, visibleCount, hasSearched, @@ -303,8 +310,9 @@ function SearchView() { speaker, speakerIds, }; + // Only store lightweight state (no results) window.sessionStorage.setItem(SEARCH_STATE_KEY, JSON.stringify(persistableState)); - }, [query, filters, results, sortMode, visibleCount, hasSearched, lastError, speaker, speakerIds, hasHydratedSearch]); + }, [query, filters, sortMode, visibleCount, hasSearched, lastError, speaker, speakerIds, hasHydratedSearch]); useEffect(() => { // Any filter change while the user is interacting should reset pagination back to the first page. diff --git a/frontend/src/components/ChatPanel.tsx b/frontend/src/components/ChatPanel.tsx index 03f44bc..61eb94f 100644 --- a/frontend/src/components/ChatPanel.tsx +++ b/frontend/src/components/ChatPanel.tsx @@ -297,13 +297,77 @@ export const ChatPanel = forwardRef(function ChatPanel( } }; + /** + * Converts Markdown to HTML and replaces [1], [2], ... with 1 citations, + * but only outside of , ,
,