You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

34 lines
1.1 KiB

import math
from typing import Iterable, List
import os
import sys
# Make /home/lasse/riksdagen the current working directory.
# NOTE(review): this absolute path is hard-coded, so the script presumably only
# runs unmodified on a machine where that directory exists — confirm before reuse.
os.chdir("/home/lasse/riksdagen")
# Append the same directory to sys.path so Python can locate local modules
# (presumably the `_chromadb` and `config` imports that follow — verify).
sys.path.append("/home/lasse/riksdagen")
from _chromadb.chroma_client import ChromaClient
from config import embedding_model, llm_base_url
CHUNK_SIZE_WORDS = 600
def chunk_text(heading: str | None, body: str, chunk_size: int = CHUNK_SIZE_WORDS) -> Iterable[tuple[int, str]]:
words = body.split()
if not words:
yield 0, ""
return
for index in range(0, len(words), chunk_size):
chunk_words = words[index : index + chunk_size]
prefix = f"{heading}\n\n" if heading else ""
yield index // chunk_size, f"{prefix}{' '.join(chunk_words)}"
if __name__ == "__main__":
    # Manual smoke test of the embedding function: embed a short sample
    # passage and print whatever embed_text returns.
    demo_text = (
        "Öppnande av riksmötet\n"
        "Jag hemställer att Ers Majestät måtte förklara 1993/94 års riksmöte öppnat."
    )
    print(ChromaClient().embed_text(demo_text))