You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

334 lines
12 KiB

from pydantic import BaseModel, Field
from typing import Dict, List, Tuple, Optional, Any
class ArticleChunk(BaseModel):
    """
    Structured description of an article chunk.

    Attributes:
        summary (str): Summary text for the chunk.
        tags (List[str]): Tags attached to the chunk.
        references (Optional[List[str]]): References for the chunk, or None.
    """

    summary: str
    tags: List[str]
    references: Optional[List[str]]
class QueryResponse(BaseModel):
    """
    Holds a single generated query for document retrieval.

    Attributes:
        query (str): The generated query text; per the field description it is
            used to retrieve documents from a vector database (ChromaDB) and
            should be short and concise.
    """

    query: str = Field(
        description=(
            "The generated query that will be used to retrieve documents from a "
            "vector database (ChromaDB). Should be short and concise."
        ),
        example="capital of France",
    )
class ArticleMetadataResponse(BaseModel):
    """
    Structured metadata extracted from an article by an LLM.

    Attributes:
        published_date (Optional[str]): Publication date in YYYY-MM-DD format.
        title (str): Full title of the article.
        journal (Optional[str]): Journal/paper/outlet where the article was
            published.
    """

    published_date: Optional[str] = Field(
        description="The publication date of the article in YYYY-MM-DD format.",
    )
    title: str = Field(description="The full title of the article.")
    journal: Optional[str] = Field(
        description=(
            "The name of the journal/paper/outlet where the article was "
            "published."
        ),
    )
class PlanEvaluationResponse(BaseModel):
    """
    Outcome of evaluating one step of a plan.

    Attributes:
        reasoning (str): Short explanation of the reasoning behind the evaluation.
        complete (bool): Whether the information is sufficient to complete the
            step.
    """

    reasoning: str = Field(
        description="A short explanation of the reasoning behind the evaluation",
        example=(
            "Although some information is missing, the existing data is "
            "sufficient to complete the step."
        ),
    )
    complete: bool = Field(
        description=(
            "Indicates whether the information is sufficient to complete the step"
        ),
        example=False,
    )
class EvaluateFormat(BaseModel):
    """
    Evaluation format used to report whether information is sufficient.

    Attributes:
        explanation (str): Very short explanation of the sufficiency verdict.
        status (bool): Whether the information is sufficient to complete the step.
        additional_info (Optional[str]): What further information would be needed
            when the information is not sufficient.
    """

    explanation: str = Field(
        description=(
            "A very short explanation of whether the information is sufficient "
            "or not"
        ),
        example="The information is sufficient because...",
    )
    status: bool = Field(
        description="If the information is sufficient to complete the step or not.",
        example=True,
    )
    additional_info: Optional[str] = Field(
        description=(
            "If the information is not sufficient, what additional information "
            "would be needed"
        ),
        example="We need more information about...",
    )
class Plan(BaseModel):
    """
    A structured plan made of named steps.

    Attributes:
        steps (Dict[str, List[Tuple[str, str]]]): Maps each step name to a list of
            two-string tuples; the field's example uses (task label, description)
            pairs.
    """

    steps: Dict[str, List[Tuple[str, str]]] = Field(
        description=(
            "Structured plan represented as steps with their corresponding "
            "tasks or facts"
        ),
        example={
            "Step 1: Gather Existing Materials": [
                ("Task 1", "Description of task"),
                ("Task 2", "Description of task"),
            ],
            "Step 2: Extract Relevant Information": [
                ("Task 1", "Description of task"),
                ("Task 2", "Description of task"),
            ],
        },
    )
class ChunkMetadata(BaseModel):
    """
    Metadata associated with a document chunk.

    Every attribute has a default, so the model can be built with no arguments.

    Attributes:
        title (str): Title of the document chunk; defaults to "No title".
        journal (Optional[str]): Journal where the document was published.
        published_date (Optional[str]): Date of publication.
        user_notes (Optional[str]): User-provided notes.
        _id (Optional[str]): Presumably the identifier of the document in
            ArangoDB (see the review note on the attribute).
        additional_metadata (Dict[str, Any]): Any additional metadata fields;
            defaults to an empty dict.
        doi (Optional[str]): Digital Object Identifier for the document.
        link (Optional[str]): URL to access the document.
        authors (Optional[List[str]]): Authors of the document; defaults to an
            empty list.
        published_year (Optional[int]): Year of publication.
        abstract (Optional[str]): Abstract of the document.
        pages (Optional[str]): Page numbers of the document.
        chroma_id (Optional[str]): Unique identifier for the chunk in ChromaDB.
    """

    title: str = Field(default="No title", description="Title of the document chunk.")
    journal: Optional[str] = None
    published_date: Optional[str] = None
    user_notes: Optional[str] = None
    # NOTE(review): pydantic does not treat underscore-prefixed names as regular
    # model fields, so `_id` is likely not populated from input data nor included
    # in serialized output — confirm this is intended before relying on it.
    _id: Optional[str] = None
    additional_metadata: Dict[str, Any] = Field(default_factory=dict)
    doi: Optional[str] = None
    link: Optional[str] = None
    authors: Optional[List[str]] = Field(
        default_factory=list, description="List of authors of the document."
    )
    published_year: Optional[int] = Field(
        default=None, description="Year of publication."
    )
    abstract: Optional[str] = Field(
        default=None, description="Abstract of the document."
    )
    pages: Optional[str] = Field(
        default=None, description="Page numbers of the document."
    )
    chroma_id: Optional[str] = Field(
        default=None, description="Unique identifier for the chunk in ChromaDB."
    )
class DocumentChunk(BaseModel):
    """
    A chunk of document text paired with its metadata.

    Attributes:
        document (str): Text content of the chunk.
        metadata (ChunkMetadata): Metadata describing the chunk.
    """

    document: str
    metadata: ChunkMetadata
class UnifiedDataChunk(BaseModel):
    """
    A unified chunk of data from any source.

    Attributes:
        content (str): Main content of the chunk (e.g., text, note, or document).
        metadata (Optional[ChunkMetadata]): Metadata for the chunk, or None.
        source_type (str): Type of source (e.g., 'note', 'article', 'document').
    """

    content: str = Field(
        description="The main content of the chunk (e.g., text, note, or document).",
    )
    metadata: Optional[ChunkMetadata] = Field(
        description="Metadata associated with the chunk (e.g., title, source, date).",
    )
    source_type: str = Field(
        description="The type of source (e.g., 'note', 'article', 'document').",
    )
class UnifiedSearchResults(BaseModel):
    """
    Unified search results from any search tool.

    Attributes:
        chunks (List[UnifiedDataChunk]): Data chunks returned by the search.
        source_ids (List[str]): Unique source IDs for the chunks; defaults to an
            empty list.
    """

    chunks: List[UnifiedDataChunk] = Field(
        description="List of data chunks from the search.",
    )
    source_ids: List[str] = Field(
        default_factory=list,
        description="List of unique source IDs for the chunks.",
    )
class UnifiedToolResponse(BaseModel):
    """
    Represents a unified response accumulated from one or more tools.

    Attributes:
        search_results (Optional[UnifiedSearchResults]): The unified search
            results, or None if no search results have been added.
        text_results (Optional[list[str]]): Text results from tools (e.g., an
            analysis), or None if none have been added.
        tool_names (Optional[list[str]]): Names of the tools that contributed to
            the response, or None if none have been recorded.
    """

    search_results: Optional[UnifiedSearchResults] = Field(
        default=None,
        description="The unified search results, if the tools used is returning search results.",
    )
    text_results: Optional[list[str]] = Field(
        default=None,
        description="Text results from the tool, e.g., if the tool is an analysis.",
    )
    tool_names: Optional[list[str]] = Field(
        default=None, description="The name of the tool used to generate the response."
    )

    def extend_search_results(self, search_results: UnifiedSearchResults) -> None:
        """
        Merges additional search results into this response.

        The first batch is stored by reference (not copied); later batches are
        appended to that stored object's ``chunks`` and ``source_ids`` lists.

        Args:
            search_results (UnifiedSearchResults): The new search results to add.
        """
        if self.search_results is None:
            self.search_results = search_results
        else:
            self.search_results.chunks.extend(search_results.chunks)
            self.search_results.source_ids.extend(search_results.source_ids)

    def extend_text_results(self, text_result: str) -> None:
        """
        Appends a text result, creating the list on first use.

        Args:
            text_result (str): The new text result to add.
        """
        if self.text_results is None:
            self.text_results = [text_result]
        else:
            self.text_results.append(text_result)

    def extend_tool_name(self, tool_name: str) -> None:
        """
        Appends a tool name, creating the list on first use.

        Args:
            tool_name (str): The new tool name to add.
        """
        if self.tool_names is None:
            self.tool_names = [tool_name]
        else:
            self.tool_names.append(tool_name)

    @property
    def to_text(self) -> str:
        """
        Generates formatted text from search results or returns the text results.

        If search results with at least one chunk exist, each chunk is rendered
        with its content and a source line, and chunks are joined with
        ``---`` separators. Otherwise the text results are joined the same way.
        If neither is available, a fixed placeholder message is returned (no
        exception is raised).

        Returns:
            str: The formatted search results, the joined text results, or the
            placeholder message.
        """
        if self.search_results and self.search_results.chunks:
            formatted_chunks = []
            for number, chunk in enumerate(self.search_results.chunks, start=1):
                # A chunk may have no metadata. Fall back to an all-defaults
                # ChunkMetadata so the attribute access below cannot fail
                # (a plain dict fallback has no `.title` attribute).
                metadata = (
                    chunk.metadata if chunk.metadata is not None else ChunkMetadata()
                )
                source_info = f"Source: {metadata.title}"
                if metadata.journal:
                    source_info += f" - {metadata.journal}"
                if metadata.published_date:
                    source_info += f" ({metadata.published_date})"
                # Format the chunk with its content and source
                formatted_chunks.append(
                    f"### Chunk {number}\n{chunk.content}\n\n*{source_info}*\n"
                )
            return "\n---\n".join(formatted_chunks)
        if self.text_results:
            return "\n---\n".join(self.text_results)
        return "No search results or text results available."

    @property
    def get_chroma_ids(self) -> List[str]:
        """
        Returns the source IDs recorded on the search results.

        Returns:
            List[str]: ``search_results.source_ids`` when search results exist and
            the list is non-empty; otherwise an empty list.
        """
        if self.search_results and self.search_results.source_ids:
            return self.search_results.source_ids
        return []
class ChunkSearchResults(BaseModel):
    """
    Results of a search query across document collections.

    Attributes:
        chunks (List[UnifiedDataChunk]): Chunks returned by the search.
        chroma_ids (List[str]): Chroma IDs for the chunks; defaults to an empty
            list.
        arango_ids (List[str]): ArangoDB IDs for the related documents; defaults
            to an empty list.
    """

    chunks: List[UnifiedDataChunk] = Field(
        description=(
            "List of document chunks containing text, metadata, and relevance "
            "scores."
        ),
    )
    chroma_ids: List[str] = Field(
        default_factory=list,
        description="List of Chroma IDs for the chunks",
    )
    arango_ids: List[str] = Field(
        default_factory=list, description="List of ArangoDB IDs for the related documents"
    )