from typing import Dict, List

from arango.collection import Collection

from arango_client import arango
from scripts.make_arango_embeddings import process_chunk_batch


def test_full_make_arango_embeddings_for_one_talk() -> None:
    """
    Integration test for the full make_arango_embeddings chain.

    Steps:
    - Fetches one specific talk document from the 'talks' collection.
    - Rewrites each of its chunks into the shape stored in 'chunks'
      (``_key``, ``parent_id``, ``collection``; chroma bookkeeping removed).
    - Passes the chunks through ``process_chunk_batch`` to get embeddings.
    - Inserts/overwrites those chunks in the 'chunks' collection.
    - Reads every chunk back and checks it carries a list-valued
      ``embedding`` field.

    This test requires ArangoDB and Ollama to be running and accessible
    (the Ollama requirement comes from the original author's note; the
    embedding backend is hidden behind ``process_chunk_batch``).
    """
    # The _id of the talk we want to process.
    target_id: str = "talks/000004cc-b896-e611-9441-00262d0d7125"

    talks_collection: Collection = arango.db.collection("talks")
    chunks_collection: Collection = arango.db.collection("chunks")

    # Fail early with a readable message if the fixture talk is absent
    # or carries no chunks to embed.
    talk = talks_collection.get(target_id)
    assert talk is not None, f"Talk with _id {target_id} not found"
    assert "chunks" in talk and talk["chunks"], "Talk has no chunks"

    # Prepare chunks for embedding.
    processed_chunks: List[Dict] = []
    for chunk in talk["chunks"]:
        # The chunk key is whatever follows the last "/" and then the last
        # ":" of the chroma id. Popping the id also drops the field, which
        # is not wanted in the stored chunk.
        chroma_id: str = chunk.pop("chroma_id")
        chunk["_key"] = chroma_id.split("/")[-1].split(":")[-1]
        chunk["parent_id"] = target_id
        chunk["collection"] = "talks"
        # NOTE(review): "chroma_collecton" looks like a typo of
        # "chroma_collection", but it is kept verbatim because it may match
        # the field name actually present in the data; confirm.
        chunk.pop("chroma_collecton", None)
        processed_chunks.append(chunk)

    # Generate embeddings for all chunks.
    processed_chunks = process_chunk_batch(processed_chunks)

    # Insert/update chunks in the 'chunks' collection.
    chunks_collection.insert_many(processed_chunks, overwrite=True)

    # Verify that the chunks were updated in ArangoDB.
    for chunk in processed_chunks:
        chunk_key = chunk["_key"]
        db_chunk = chunks_collection.get(chunk_key)
        # Report the key of the chunk being checked, not the talk's key.
        assert db_chunk is not None, f"Chunk {chunk_key} not found in DB"
        assert "embedding" in db_chunk, "Chunk missing embedding in DB"
        assert isinstance(db_chunk["embedding"], list), "Embedding is not a list"
        print(f"Chunk {chunk['_key']} updated with embedding of length {len(db_chunk['embedding'])}")


# Run directly as a script only; an unguarded call would execute the whole
# integration test at import time (e.g. during pytest collection).
if __name__ == "__main__":
    test_full_make_arango_embeddings_for_one_talk()