You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
57 lines
2.4 KiB
57 lines
2.4 KiB
from arango_client import arango |
|
from scripts.make_arango_embeddings import process_chunk_batch |
|
from arango.collection import Collection |
|
from typing import List, Dict |
|
|
|
def test_full_make_arango_embeddings_for_one_talk() -> None:
    """Run the make_arango_embeddings chain end to end for a single talk.

    Steps:
    - Fetch one specific talk document from the 'talks' collection.
    - Turn its embedded chunks into standalone chunk documents.
    - Generate embeddings for them with ``process_chunk_batch``.
    - Insert/overwrite them in the 'chunks' collection.
    - Read every chunk back and check that it carries an embedding list.

    This is an integration test: it requires ArangoDB and Ollama to be
    running and accessible.

    Raises:
        AssertionError: If the talk is missing, has no chunks, or a stored
            chunk is missing or lacks a list-valued "embedding" field.
        KeyError: If a chunk of the talk has no "chroma_id" field.
    """
    # The _id of the talk we want to process
    target_id: str = "talks/000004cc-b896-e611-9441-00262d0d7125"

    # Get the talks and chunks collections
    talks_collection: Collection = arango.db.collection("talks")
    chunks_collection: Collection = arango.db.collection("chunks")

    # Fetch the talk document (checked against None right below)
    talk = talks_collection.get(target_id)
    assert talk is not None, f"Talk with _id {target_id} not found"
    assert "chunks" in talk and talk["chunks"], "Talk has no chunks"

    # Prepare chunks for embedding
    processed_chunks: List[Dict] = []
    for chunk in talk["chunks"]:
        # "chroma_id" is only needed to derive the document key, so it is
        # removed from the chunk while being read. The key is whatever
        # follows the last "/" and then the last ":" in that id.
        chroma_id: str = chunk.pop("chroma_id")
        chunk["_key"] = chroma_id.split("/")[-1].split(":")[-1]
        chunk["parent_id"] = target_id
        chunk["collection"] = "talks"
        # NOTE(review): the key is spelled "chroma_collecton" (no second
        # "i") exactly as in the original code. If the stored field is
        # actually "chroma_collection", this removal is a no-op -- confirm
        # against the real documents before changing the spelling.
        chunk.pop("chroma_collecton", None)
        processed_chunks.append(chunk)

    # Generate embeddings for all chunks
    processed_chunks = process_chunk_batch(processed_chunks)

    # Insert/update chunks in the 'chunks' collection
    chunks_collection.insert_many(processed_chunks, overwrite=True)

    # Verify that the chunks were updated in ArangoDB
    for chunk in processed_chunks:
        chunk_key = chunk["_key"]
        db_chunk = chunks_collection.get(chunk_key)
        # Report the key of the chunk that is missing, not the talk's key.
        assert db_chunk is not None, f"Chunk {chunk_key} not found in DB"
        assert "embedding" in db_chunk, "Chunk missing embedding in DB"
        assert isinstance(db_chunk["embedding"], list), "Embedding is not a list"
        print(f"Chunk {chunk_key} updated with embedding of length {len(db_chunk['embedding'])}")
|
|
|
# Allow running this integration test directly as a script, without executing
# it as a side effect whenever the module is merely imported (for example
# during test collection, which would otherwise run it twice).
if __name__ == "__main__":
    test_full_make_arango_embeddings_for_one_talk()