You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

57 lines
2.4 KiB

from arango_client import arango
from scripts.make_arango_embeddings import process_chunk_batch
from arango.collection import Collection
from typing import List, Dict
def test_full_make_arango_embeddings_for_one_talk() -> None:
    """
    Integration test for the full make_arango_embeddings chain.

    Steps:
    - Fetches a specific talk document from ArangoDB.
    - Processes its chunks to generate embeddings.
    - Inserts/updates those chunks in the 'chunks' collection.
    - Verifies that the chunks were updated in ArangoDB.

    This test requires ArangoDB and Ollama to be running and accessible.

    Raises:
        AssertionError: If the talk is missing, has no chunks, or a stored
            chunk is missing or lacks a list-valued "embedding" field.
        KeyError: If a chunk of the talk has no "chroma_id" field.
    """
    # The _id of the talk we want to process
    target_id: str = "talks/000004cc-b896-e611-9441-00262d0d7125"

    # Get the talks and chunks collections
    talks_collection: Collection = arango.db.collection("talks")
    chunks_collection: Collection = arango.db.collection("chunks")

    # Fetch the talk document
    talk: Dict = talks_collection.get(target_id)
    assert talk is not None, f"Talk with _id {target_id} not found"
    assert "chunks" in talk and talk["chunks"], "Talk has no chunks"

    # Prepare chunks for embedding
    processed_chunks: List[Dict] = []
    for chunk in talk["chunks"]:
        # The chunk's document key is the last "/"-separated segment of its
        # chroma_id, further reduced to the part after the final ":".
        # Popping also drops the field, which is not needed for embedding.
        chroma_id: str = chunk.pop("chroma_id")
        chunk["_key"] = chroma_id.split("/")[-1].split(":")[-1]
        chunk["parent_id"] = target_id
        chunk["collection"] = "talks"
        # NOTE(review): "chroma_collecton" looks like a misspelling of
        # "chroma_collection" - confirm against the stored chunk schema
        # before changing the key.
        chunk.pop("chroma_collecton", None)
        processed_chunks.append(chunk)

    # Generate embeddings for all chunks
    processed_chunks = process_chunk_batch(processed_chunks)

    # Insert/update chunks in the 'chunks' collection
    chunks_collection.insert_many(processed_chunks, overwrite=True)

    # Verify that the chunks were updated in ArangoDB
    for chunk in processed_chunks:
        chunk_key = chunk["_key"]
        db_chunk = chunks_collection.get(chunk_key)
        # Report the key of the chunk that is missing, not the talk's key.
        assert db_chunk is not None, f"Chunk {chunk_key} not found in DB"
        assert "embedding" in db_chunk, "Chunk missing embedding in DB"
        assert isinstance(db_chunk["embedding"], list), "Embedding is not a list"
        print(f"Chunk {chunk_key} updated with embedding of length {len(db_chunk['embedding'])}")
# Run the integration test only when this file is executed as a script.
# Without the guard, merely importing the module (e.g. during pytest
# collection) would execute the whole test as an import-time side effect.
if __name__ == "__main__":
    test_full_make_arango_embeddings_for_one_talk()