"""Pydantic models for LLM responses, document chunks, and tool results."""

from typing import Any, Dict, List, Optional, Tuple

from pydantic import BaseModel, Field


# Summary, tags, and optional references for a chunk of an article.
# Documented with comments rather than a docstring because pydantic copies a
# model's docstring into its JSON schema description.
class ArticleChunk(BaseModel):
    summary: str
    tags: List[str]
    references: Optional[List[str]]


class QueryResponse(BaseModel):
    """
    Represents a query generated for retrieving documents from a vector database.

    Attributes:
        query (str): The generated query text, short and concise.
    """

    query: str = Field(
        description="The generated query that will be used to retrieve documents from a vector database (ChromaDB). Should be short and concise.",
        example="capital of France",
    )


class ArticleMetadataResponse(BaseModel):
    """
    Represents structured metadata extracted from an article by an LLM.
    """

    published_date: Optional[str] = Field(
        description="The publication date of the article in YYYY-MM-DD format."
    )
    title: str = Field(
        description="The full title of the article."
    )
    journal: Optional[str] = Field(
        description="The name of the journal/paper/outlet where the article was published."
    )


class PlanEvaluationResponse(BaseModel):
    """
    Represents the evaluation of a plan's step.

    Attributes:
        reasoning (str): Explanation of the reasoning behind the evaluation.
        complete (bool): Indicates if the step has sufficient information to proceed.
    """

    reasoning: str = Field(
        description="A short explanation of the reasoning behind the evaluation",
        example="Although some information is missing, the existing data is sufficient to complete the step.",
    )
    complete: bool = Field(
        description="Indicates whether the information is sufficient to complete the step",
        example=False,
    )


class EvaluateFormat(BaseModel):
    """
    Represents the evaluation format for determining sufficiency of information.

    Attributes:
        explanation (str): Explanation of whether the information is sufficient.
        status (bool): Indicates sufficiency of the information.
        additional_info (Optional[str]): Additional information needed if insufficient.
    """

    explanation: str = Field(
        description="A very short explanation of whether the information is sufficient or not",
        example="The information is sufficient because...",
    )
    status: bool = Field(
        description="If the information is sufficient to complete the step or not.",
        example=True,
    )
    additional_info: Optional[str] = Field(
        description="If the information is not sufficient, what additional information would be needed",
        example="We need more information about...",
    )


class Plan(BaseModel):
    """
    Represents a structured plan with steps and corresponding tasks or facts.

    Attributes:
        steps (Dict[str, List[Tuple[str, str]]]): A dictionary where keys are step names
            and values are lists of tasks or facts.
    """

    steps: Dict[str, List[Tuple[str, str]]] = Field(
        description="Structured plan represented as steps with their corresponding tasks or facts",
        example={
            "Step 1: Gather Existing Materials": [
                ("Task 1", "Description of task"),
                ("Task 2", "Description of task"),
            ],
            "Step 2: Extract Relevant Information": [
                ("Task 1", "Description of task"),
                ("Task 2", "Description of task"),
            ],
        },
    )


class ChunkMetadata(BaseModel):
    """
    Metadata associated with a document chunk.

    Every attribute has a default, so ``ChunkMetadata()`` is a valid
    "empty" metadata object whose title is ``"No title"``.

    Attributes:
        title (str): Title of the document chunk.
        journal (Optional[str]): Journal where the document was published.
        published_date (Optional[str]): Date of publication.
        user_notes (Optional[str]): User-provided notes.
        _id (Optional[str]): Unique identifier for the document in ArangoDB.
        additional_metadata (Dict[str, Any]): Any additional metadata fields.
        doi (Optional[str]): Digital Object Identifier for the document.
        link (Optional[str]): URL to access the document.
        authors (Optional[List[str]]): List of authors of the document.
        published_year (Optional[int]): Year of publication.
        abstract (Optional[str]): Abstract of the document.
        pages (Optional[str]): Page numbers of the document.
        chroma_id (Optional[str]): Unique identifier for the chunk in ChromaDB.
    """

    title: str = Field(default="No title", description="Title of the document chunk.")
    journal: Optional[str] = None
    published_date: Optional[str] = None
    user_notes: Optional[str] = None
    # NOTE(review): pydantic does not treat underscore-prefixed names as
    # regular model fields, so `_id` is presumably not validated or serialized
    # like the other attributes -- confirm this is intended before relying on it.
    _id: Optional[str] = None
    additional_metadata: Dict[str, Any] = Field(default_factory=dict)
    doi: Optional[str] = None
    link: Optional[str] = None
    authors: Optional[List[str]] = Field(
        default_factory=list,
        description="List of authors of the document.",
    )
    published_year: Optional[int] = Field(
        default=None,
        description="Year of publication.",
    )
    abstract: Optional[str] = Field(
        default=None,
        description="Abstract of the document.",
    )
    pages: Optional[str] = Field(
        default=None,
        description="Page numbers of the document.",
    )
    chroma_id: Optional[str] = Field(
        default=None,
        description="Unique identifier for the chunk in ChromaDB.",
    )


class DocumentChunk(BaseModel):
    """
    Represents a chunk of text from a document with its metadata.

    Attributes:
        document (str): The text content of the chunk.
        metadata (ChunkMetadata): Metadata associated with the chunk.
    """

    document: str
    metadata: ChunkMetadata


class UnifiedDataChunk(BaseModel):
    """
    Represents a unified chunk of data from any source.

    Attributes:
        content (str): The main content of the chunk (e.g., text, note, or document).
        metadata (Optional[ChunkMetadata]): Metadata associated with the chunk.
        source_type (str): The type of source (e.g., 'note', 'article', 'document').
    """

    content: str = Field(
        description="The main content of the chunk (e.g., text, note, or document)."
    )
    metadata: Optional[ChunkMetadata] = Field(
        description="Metadata associated with the chunk (e.g., title, source, date).",
    )
    source_type: str = Field(
        description="The type of source (e.g., 'note', 'article', 'document')."
    )


class UnifiedSearchResults(BaseModel):
    """
    Represents unified search results from any search tool.

    Attributes:
        chunks (List[UnifiedDataChunk]): List of data chunks from the search.
        source_ids (List[str]): List of unique source IDs for the chunks.
    """

    chunks: List[UnifiedDataChunk] = Field(
        description="List of data chunks from the search."
    )
    source_ids: List[str] = Field(
        default_factory=list, description="List of unique source IDs for the chunks."
    )


class UnifiedToolResponse(BaseModel):
    """
    Represents a unified response from any tool.

    Attributes:
        search_results (Optional[UnifiedSearchResults]): The unified search results,
            if the tool used is returning search results.
        text_results (Optional[list[str]]): Text results from the tool, e.g., if the
            tool is an analysis.
        tool_names (Optional[list[str]]): The names of the tools used to generate
            the response.
    """

    search_results: Optional[UnifiedSearchResults] = Field(
        default=None,
        description="The unified search results, if the tools used is returning search results.",
    )
    text_results: Optional[list[str]] = Field(
        default=None,
        description="Text results from the tool, e.g., if the tool is an analysis.",
    )
    tool_names: Optional[list[str]] = Field(
        default=None, description="The name of the tool used to generate the response."
    )

    def extend_search_results(self, search_results: UnifiedSearchResults) -> None:
        """
        Merges additional search results into this response.

        Args:
            search_results (UnifiedSearchResults): The new search results whose
                chunks and source IDs are appended to the stored ones.
        """
        if self.search_results is None:
            # Store a fresh container with copied lists: keeping the caller's
            # object by reference would let later extensions mutate it.
            self.search_results = UnifiedSearchResults(
                chunks=list(search_results.chunks),
                source_ids=list(search_results.source_ids),
            )
        else:
            self.search_results.chunks.extend(search_results.chunks)
            self.search_results.source_ids.extend(search_results.source_ids)

    def extend_text_results(self, text_result: str) -> None:
        """
        Appends one text result, creating the list on first use.

        Args:
            text_result (str): The new text result to add.
        """
        if self.text_results is None:
            self.text_results = [text_result]
        else:
            self.text_results.append(text_result)

    def extend_tool_name(self, tool_name: str) -> None:
        """
        Appends one tool name, creating the list on first use.

        Args:
            tool_name (str): The new tool name to add.
        """
        if self.tool_names is None:
            self.tool_names = [tool_name]
        else:
            self.tool_names.append(tool_name)

    @property
    def to_text(self) -> str:
        """
        Generates formatted text from search results or returns the text results.

        If search results with at least one chunk exist, each chunk is rendered
        with its content and a source line built from its metadata. Otherwise
        the text results, if any, are joined with a separator.

        Returns:
            str: The formatted chunks, the joined text results, or a fixed
                placeholder message when neither is available.
        """
        if self.search_results and self.search_results.chunks:
            formatted_chunks = []
            for position, chunk in enumerate(self.search_results.chunks, start=1):
                # A chunk's metadata may be None; fall back to default metadata
                # (title "No title") instead of a dict, which has no attributes.
                metadata = chunk.metadata or ChunkMetadata()

                source_info = f"Source: {metadata.title}"
                if metadata.journal:
                    source_info += f" - {metadata.journal}"
                if metadata.published_date:
                    source_info += f" ({metadata.published_date})"

                formatted_chunks.append(
                    f"### Chunk {position}\n{chunk.content}\n\n*{source_info}*\n"
                )
            return "\n---\n".join(formatted_chunks)

        if self.text_results:
            return "\n---\n".join(self.text_results)

        return "No search results or text results available."

    @property
    def get_chroma_ids(self) -> List[str]:
        """
        Returns the source IDs stored on the search results.

        Returns:
            List[str]: ``search_results.source_ids``, or an empty list when there
                are no search results or no source IDs.
        """
        if self.search_results and self.search_results.source_ids:
            return self.search_results.source_ids
        return []


class ChunkSearchResults(BaseModel):
    """
    Represents the results of a search query across document collections.

    Attributes:
        chunks (List[UnifiedDataChunk]): List of chunks containing text and metadata.
        chroma_ids (List[str]): List of Chroma IDs for the chunks.
        arango_ids (List[str]): List of ArangoDB IDs for the related documents.
    """

    chunks: List[UnifiedDataChunk] = Field(
        description="List of document chunks containing text, metadata, and relevance scores."
    )
    chroma_ids: List[str] = Field(
        default_factory=list, description="List of Chroma IDs for the chunks"
    )
    arango_ids: List[str] = Field(
        default_factory=list,
        description="List of ArangoDB IDs for the related documents",
    )