import logging
import os
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep
from typing import Dict, List

# Silence the per-request HTTP logs from the ollama/httpx library
logging.getLogger("httpx").setLevel(logging.WARNING)

# The project modules (arango_client, utils) live in this directory; the chdir
# and sys.path entry must happen BEFORE the project imports below.
os.chdir("/home/lasse/riksdagen")
sys.path.append("/home/lasse/riksdagen")

from arango.collection import Collection
from ollama import Client as Ollama

from arango_client import arango
from utils import TextChunker


def make_embeddings(texts: List[str]) -> List[List[float]]:
    """
    Generate embeddings for a list of texts using Ollama.

    Args:
        texts (List[str]): List of text strings to embed.

    Returns:
        List[List[float]]: List of embedding vectors, one per input text.
    """
    # A fresh client per call keeps this function safe to run from multiple
    # threads without sharing connection state.
    ollama_client = Ollama(host='192.168.1.12:33405')
    response = ollama_client.embed(
        model="qwen3-embedding:latest",
        input=texts,
        dimensions=384,
    )
    return response.embeddings


def process_chunk_batch(chunk_batch: List[Dict]) -> List[Dict]:
    """
    Generate embeddings for a batch of chunks and attach them.

    Args:
        chunk_batch (List[Dict]): List of chunk dicts, each with a 'text' field.

    Returns:
        List[Dict]: Same list with an 'embedding' field added to each dict.

    Raises:
        ValueError: If the embedding backend returns a different number of
            vectors than texts sent — without this check a short response
            would silently misalign embeddings and chunks.
    """
    sleep(1)  # crude rate limit so parallel workers don't flood the Ollama host
    texts = [chunk['text'] for chunk in chunk_batch]
    vectors = make_embeddings(texts)
    if len(vectors) != len(chunk_batch):
        raise ValueError(
            f"Expected {len(chunk_batch)} embeddings, got {len(vectors)}"
        )
    for chunk, vector in zip(chunk_batch, vectors):
        chunk['embedding'] = vector
    return chunk_batch


def make_arango_embeddings() -> int:
    """
    Chunks and embeds all talks that are not yet represented in the
    'chunks' collection.

    For each talk that has no chunks in the collection yet:
      - If the talk document already has a 'chunks' field (legacy path),
        those are used.
      - Otherwise the speech text is split into chunks using TextChunker.
    Embedding vectors are generated via Ollama and stored in the 'chunks'
    collection.

    Each chunk document in ArangoDB has:
        _key      : "{talk_key}:{chunk_index}"  (unique within the collection)
        text      : the chunk text
        index     : chunk index within the talk
        parent_id : "talks/{talk_key}"          (links back to the source talk)
        collection: "talks"
        embedding : the vector (list of floats)

    Returns:
        int: Total number of chunk documents inserted/updated.
    """
    if not arango.db.has_collection("chunks"):
        chunks_collection: Collection = arango.db.create_collection("chunks")
    else:
        chunks_collection: Collection = arango.db.collection("chunks")

    # Find every talk that has no entry yet in the chunks collection.
    # The inner FOR loop returns [] if no match exists (acts as NOT EXISTS).
    cursor = arango.db.aql.execute(
        """
        FOR p IN talks
            FILTER p.anforandetext != null AND p.anforandetext != ""
            FILTER (
                FOR c IN chunks
                    FILTER c.parent_id == p._id
                    LIMIT 1
                    RETURN 1
            ) == []
            RETURN {
                _key: p._key,
                _id: p._id,
                anforandetext: p.anforandetext,
                chunks: p.chunks
            }
        """,
        batch_size=1000,
        ttl=360,
    )

    n = 0
    embed_batch_size = 20  # Number of chunks per Ollama call
    chunk_batches: List[List[Dict]] = []

    for talk in cursor:
        talk_key = talk["_key"]
        parent_id = f"talks/{talk_key}"

        if talk.get("chunks"):
            # Legacy path: chunks were previously generated and stored on the
            # talk document. Strip out the old ChromaDB-specific fields and
            # assign a proper _key. Empty-text chunks are skipped for
            # consistency with the TextChunker path below (and because empty
            # strings are pointless to embed).
            _chunks = []
            for chunk in talk["chunks"]:
                text = chunk.get("text")
                if not text or not text.strip():
                    continue
                idx = chunk.get("index", 0)
                _chunks.append({
                    "_key": f"{talk_key}:{idx}",
                    "text": text,
                    "index": idx,
                    "parent_id": parent_id,
                    "collection": "talks",
                })
        else:
            # New path: chunk the speech text directly with TextChunker.
            text = (talk.get("anforandetext") or "").strip()
            text_chunks = TextChunker(chunk_limit=500).chunk(text)
            _chunks = [
                {
                    "_key": f"{talk_key}:{idx}",
                    "text": content,
                    "index": idx,
                    "parent_id": parent_id,
                    "collection": "talks",
                }
                for idx, content in enumerate(text_chunks)
                if content and content.strip()
            ]

        # Split into batches for embedding
        for i in range(0, len(_chunks), embed_batch_size):
            batch = _chunks[i : i + embed_batch_size]
            if batch:
                chunk_batches.append(batch)

    # Embed all batches in parallel (Ollama calls are I/O-bound, threads are fine)
    total_batches = len(chunk_batches)
    completed_batches = 0
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(process_chunk_batch, batch) for batch in chunk_batches]
        processed_chunks: List[Dict] = []
        for future in as_completed(futures):
            result = future.result()
            completed_batches += 1
            processed_chunks.extend(result)
            print(f"Embedding batches: {completed_batches}/{total_batches} | chunks ready to insert: {len(processed_chunks)}", end="\r")

            # Insert in batches of 100 to keep HTTP payloads small
            if len(processed_chunks) >= 100:
                n += len(processed_chunks)
                chunks_collection.insert_many(processed_chunks, overwrite=True)
                processed_chunks = []

    if processed_chunks:
        n += len(processed_chunks)
        chunks_collection.insert_many(processed_chunks, overwrite=True)

    print(f"\nDone. Inserted/updated {n} chunks in ArangoDB.")
    return n


if __name__ == "__main__":
    make_arango_embeddings()