# NOTE(review): the original capture of this file included repository-viewer
# page chrome here ("topics" help text and duplicated "173 lines / 6.0 KiB"
# stats); it was extraction residue, not source code, and has been removed.
import os

import sys

import logging

# Silence the per-request HTTP logs from the ollama/httpx library
logging.getLogger("httpx").setLevel(logging.WARNING)

# Run from the project root and make its modules importable.
# NOTE(review): hard-coded absolute path — assumes this exact deployment
# host/user; confirm before reusing elsewhere.
os.chdir("/home/lasse/riksdagen")

sys.path.append("/home/lasse/riksdagen")

# Project and third-party imports are intentionally placed AFTER the
# sys.path tweak above: `arango_client` and `utils` live in the project
# directory and are not importable until the path is extended.
from arango_client import arango

from ollama import Client as Ollama

from arango.collection import Collection

from concurrent.futures import ThreadPoolExecutor, as_completed

from typing import List, Dict

from time import sleep

from utils import TextChunker
|
|
|
|
|
def make_embeddings(
    texts: List[str],
    *,
    host: str = "192.168.1.12:33405",
    model: str = "qwen3-embedding:latest",
    dimensions: int = 384,
) -> List[List[float]]:
    """
    Generate embeddings for a list of texts using Ollama.

    The connection details were previously hard-coded; they are now
    keyword-only parameters whose defaults preserve the old behavior,
    so existing callers are unaffected.

    Args:
        texts (List[str]): List of text strings to embed.
        host (str): Ollama server address as "host:port".
        model (str): Name of the embedding model to use.
        dimensions (int): Requested size of each embedding vector.

    Returns:
        List[List[float]]: One embedding vector per input text.
    """
    # A fresh client per call keeps the function self-contained; the
    # HTTP connection setup is cheap relative to the embedding request.
    ollama_client = Ollama(host=host)
    response = ollama_client.embed(
        model=model,
        input=texts,
        dimensions=dimensions,
    )
    return response.embeddings
|
|
|
|
|
def process_chunk_batch(chunk_batch: List[Dict]) -> List[Dict]:
    """
    Embed one batch of chunk dicts and attach the vectors in place.

    Args:
        chunk_batch (List[Dict]): Chunk dicts, each carrying a 'text' field.

    Returns:
        List[Dict]: The same list, with an 'embedding' field added to
        every dict.
    """
    # Brief pause so many worker threads don't hammer the embedding
    # server with perfectly synchronized requests.
    sleep(1)
    vectors = make_embeddings([chunk["text"] for chunk in chunk_batch])
    for chunk, vector in zip(chunk_batch, vectors):
        chunk["embedding"] = vector
    return chunk_batch
|
|
|
|
|
def make_arango_embeddings() -> int:
    """
    Chunks and embeds all talks that are not yet represented in the 'chunks' collection.

    For each talk that has no chunks in the collection yet:
      - If the talk document already has a 'chunks' field (legacy path), those are used.
      - Otherwise the speech text is split into chunks using TextChunker.
    Embedding vectors are generated via Ollama and stored in the 'chunks' collection.

    Each chunk document in ArangoDB has:
        _key      : "{talk_key}:{chunk_index}" (unique within the collection)
        text      : the chunk text
        index     : chunk index within the talk
        parent_id : "talks/{talk_key}" (links back to the source talk)
        collection: "talks"
        embedding : the vector (list of floats)

    Returns:
        int: Total number of chunk documents inserted/updated.
    """
    # Create the target collection on first run; reuse it afterwards.
    if not arango.db.has_collection("chunks"):
        chunks_collection: Collection = arango.db.create_collection("chunks")
    else:
        chunks_collection: Collection = arango.db.collection("chunks")

    # Find every talk that has no entry yet in the chunks collection.
    # The inner FOR loop returns [] if no match exists (acts as NOT EXISTS).
    # NOTE(review): without an index on chunks.parent_id this subquery scans
    # the chunks collection per talk — verify an index exists for large data.
    cursor = arango.db.aql.execute(
        """
        FOR p IN talks
            FILTER p.anforandetext != null AND p.anforandetext != ""
            FILTER (
                FOR c IN chunks
                    FILTER c.parent_id == p._id
                    LIMIT 1
                    RETURN 1
            ) == []
            RETURN {
                _key: p._key,
                _id: p._id,
                anforandetext: p.anforandetext,
                chunks: p.chunks
            }
        """,
        batch_size=1000,
        ttl=360,  # generous cursor lifetime: the loop below can be slow
    )

    n = 0
    embed_batch_size = 20  # Number of chunks per Ollama call
    chunk_batches: List[List[Dict]] = []

    for talk in cursor:
        talk_key = talk["_key"]
        parent_id = f"talks/{talk_key}"

        if talk.get("chunks"):
            # Legacy path: chunks were previously generated and stored on the talk document.
            # Strip out the old ChromaDB-specific fields and assign a proper _key.
            _chunks = []
            for chunk in talk["chunks"]:
                idx = chunk.get("index", 0)
                _chunks.append({
                    "_key": f"{talk_key}:{idx}",
                    "text": chunk["text"],
                    "index": idx,
                    "parent_id": parent_id,
                    "collection": "talks",
                })
        else:
            # New path: chunk the speech text directly with TextChunker.
            text = (talk.get("anforandetext") or "").strip()
            text_chunks = TextChunker(chunk_limit=500).chunk(text)
            # Skip empty/whitespace-only chunks so no blank text is embedded.
            _chunks = [
                {
                    "_key": f"{talk_key}:{idx}",
                    "text": content,
                    "index": idx,
                    "parent_id": parent_id,
                    "collection": "talks",
                }
                for idx, content in enumerate(text_chunks)
                if content and content.strip()
            ]

        # Split into batches for embedding
        for i in range(0, len(_chunks), embed_batch_size):
            batch = _chunks[i : i + embed_batch_size]
            if batch:
                chunk_batches.append(batch)

    # Embed all batches in parallel (Ollama calls are I/O-bound, threads are fine)
    total_batches = len(chunk_batches)
    completed_batches = 0
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(process_chunk_batch, batch) for batch in chunk_batches]
        processed_chunks: List[Dict] = []
        # as_completed yields batches in finish order, not submit order;
        # order does not matter here since each chunk carries its own _key.
        for future in as_completed(futures):
            result = future.result()
            completed_batches += 1
            processed_chunks.extend(result)
            print(f"Embedding batches: {completed_batches}/{total_batches} | chunks ready to insert: {len(processed_chunks)}", end="\r")
            # Insert in batches of 100 to keep HTTP payloads small
            if len(processed_chunks) >= 100:
                n += len(processed_chunks)
                # overwrite=True makes re-runs idempotent: an existing _key
                # is replaced instead of raising a conflict error.
                chunks_collection.insert_many(processed_chunks, overwrite=True)
                processed_chunks = []
        # Flush whatever is left after the final future completes.
        if processed_chunks:
            n += len(processed_chunks)
            chunks_collection.insert_many(processed_chunks, overwrite=True)

    print(f"\nDone. Inserted/updated {n} chunks in ArangoDB.")
    return n
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the full chunk-and-embed pipeline once.
    make_arango_embeddings()