You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
34 lines
1.1 KiB
34 lines
1.1 KiB
import math |
|
from typing import Iterable, List |
|
import os |
|
import sys |
|
# Absolute path of the project checkout. Kept in one place so the working
# directory and the import path can never drift apart.
_PROJECT_ROOT = "/home/lasse/riksdagen"

# Set the project root as working directory
os.chdir(_PROJECT_ROOT)

# Add the project root to Python path to locate local modules
sys.path.append(_PROJECT_ROOT)
|
|
|
from _chromadb.chroma_client import ChromaClient |
|
from config import embedding_model, llm_base_url |
|
|
|
CHUNK_SIZE_WORDS = 600 |
|
|
|
|
|
def chunk_text(heading: str | None, body: str, chunk_size: int = CHUNK_SIZE_WORDS) -> Iterable[tuple[int, str]]: |
|
words = body.split() |
|
if not words: |
|
yield 0, "" |
|
return |
|
for index in range(0, len(words), chunk_size): |
|
chunk_words = words[index : index + chunk_size] |
|
prefix = f"{heading}\n\n" if heading else "" |
|
yield index // chunk_size, f"{prefix}{' '.join(chunk_words)}" |
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: embed one short two-paragraph passage with the
    # project's Chroma client and print whatever embed_text() returns.
    client = ChromaClient()
    passage = """Öppnande av riksmötet

Jag hemställer att Ers Majestät måtte förklara 1993/94 års riksmöte öppnat."""
    print(client.embed_text(passage))
|
|
|