import math
from typing import Iterable, List
import os
import sys

# Set /home/lasse/riksdagen as working directory
os.chdir("/home/lasse/riksdagen")
# Add the project root to Python path to locate local modules
sys.path.append("/home/lasse/riksdagen")

from _chromadb.chroma_client import ChromaClient
from config import embedding_model, llm_base_url

# Default number of whitespace-separated words per chunk.
CHUNK_SIZE_WORDS = 600


def chunk_text(heading: str | None, body: str, chunk_size: int = CHUNK_SIZE_WORDS) -> Iterable[tuple[int, str]]:
    """Split *body* into consecutive chunks of at most *chunk_size* words.

    Words are obtained with ``body.split()`` and re-joined with single
    spaces, so any original whitespace (including newlines) inside *body*
    is normalised to single spaces in the output.

    Args:
        heading: Optional heading. When truthy, every chunk is prefixed with
            the heading followed by a blank line. ``None`` and ``""`` both
            mean "no heading".
        body: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Yields:
        ``(chunk_index, chunk_text)`` tuples with ``chunk_index`` starting
        at 0. A body containing no words yields exactly one ``(0, "")``
        tuple, without the heading prefix.

    Raises:
        ValueError: If *chunk_size* is not a positive integer. Because this
            is a generator, the error surfaces on first iteration.
    """
    # A zero step makes range() fail with an unrelated-looking message, and a
    # negative step produces an empty range that would silently drop the
    # whole body, so reject both explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = body.split()
    if not words:
        yield 0, ""
        return

    # The prefix does not depend on the chunk, so build it once.
    prefix = f"{heading}\n\n" if heading else ""
    for index in range(0, len(words), chunk_size):
        chunk_words = words[index : index + chunk_size]
        yield index // chunk_size, f"{prefix}{' '.join(chunk_words)}"


if __name__ == "__main__":
    chroma = ChromaClient()
    # Test the embedding function
    sample_text = """Öppnande av riksmötet Jag hemställer att Ers Majestät måtte förklara 1993/94 års riksmöte öppnat."""
    embedding = chroma.embed_text(sample_text)
    print(embedding)