You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
334 lines
12 KiB
334 lines
12 KiB
from typing import Dict, List, Tuple, Optional, Any

from pydantic import BaseModel, Field
|
|
|
class ArticleChunk(BaseModel):
    """Structured description of an article chunk.

    Attributes:
        summary: Summary text for the chunk.
        tags: Tags attached to the chunk.
        references: List of reference strings, or ``None``.
    """

    summary: str
    tags: List[str]
    # NOTE(review): no explicit default is declared, so whether this field may
    # be omitted depends on the installed pydantic major version - confirm.
    references: Optional[List[str]]
|
|
|
|
|
class QueryResponse(BaseModel):
    """Query generated for retrieving documents from a vector database.

    Attributes:
        query: The generated query text; meant to be short and concise.
    """

    query: str = Field(
        description="The generated query that will be used to retrieve documents from a vector database (ChromaDB). Should be short and concise.",
        example="capital of France",
    )
|
|
|
class ArticleMetadataResponse(BaseModel):
    """Structured metadata extracted from an article by an LLM.

    Attributes:
        published_date: Publication date as a string (described as
            YYYY-MM-DD; the format is not validated here), or ``None``.
        title: Full title of the article.
        journal: Name of the journal/paper/outlet, or ``None``.
    """

    published_date: Optional[str] = Field(
        description="The publication date of the article in YYYY-MM-DD format."
    )
    title: str = Field(description="The full title of the article.")
    journal: Optional[str] = Field(
        description="The name of the journal/paper/outlet where the article was published."
    )
|
|
|
|
|
class PlanEvaluationResponse(BaseModel):
    """Evaluation of a single step of a plan.

    Attributes:
        reasoning: Short explanation of the reasoning behind the evaluation.
        complete: Whether the information is sufficient to complete the step.
    """

    reasoning: str = Field(
        description="A short explanation of the reasoning behind the evaluation",
        example="Although some information is missing, the existing data is sufficient to complete the step.",
    )
    complete: bool = Field(
        description="Indicates whether the information is sufficient to complete the step",
        example=False,
    )
|
|
|
|
|
class EvaluateFormat(BaseModel):
    """Evaluation format for judging whether information is sufficient.

    Attributes:
        explanation: Very short explanation of the sufficiency verdict.
        status: ``True`` when the information is sufficient for the step.
        additional_info: What further information would be needed when the
            information is not sufficient; may be ``None``.
    """

    explanation: str = Field(
        description="A very short explanation of whether the information is sufficient or not",
        example="The information is sufficient because...",
    )
    status: bool = Field(
        description="If the information is sufficient to complete the step or not.",
        example=True,
    )
    additional_info: Optional[str] = Field(
        description="If the information is not sufficient, what additional information would be needed",
        example="We need more information about...",
    )
|
|
|
|
|
class Plan(BaseModel):
    """Structured plan made of named steps with their tasks or facts.

    Attributes:
        steps: Mapping from step name to a list of two-string tuples; in the
            declared example each tuple is a task label followed by its
            description.
    """

    steps: Dict[str, List[Tuple[str, str]]] = Field(
        description="Structured plan represented as steps with their corresponding tasks or facts",
        example={
            "Step 1: Gather Existing Materials": [
                ("Task 1", "Description of task"),
                ("Task 2", "Description of task"),
            ],
            "Step 2: Extract Relevant Information": [
                ("Task 1", "Description of task"),
                ("Task 2", "Description of task"),
            ],
        },
    )
|
|
|
|
|
class ChunkMetadata(BaseModel):
    """Metadata associated with a document chunk.

    Every attribute has a default, so the model can be built with no
    arguments.

    Attributes:
        title: Title of the document chunk (defaults to ``"No title"``).
        journal: Journal where the document was published.
        published_date: Date of publication, as a string.
        user_notes: User-provided notes.
        _id: Optional identifier declared with a leading underscore (an
            earlier version of this docstring called it ``arango_id``).
        additional_metadata: Any additional metadata fields (defaults to an
            empty dict).
        doi: Digital Object Identifier for the document.
        link: URL to access the document.
        authors: List of authors of the document (defaults to an empty list).
        published_year: Year of publication.
        abstract: Abstract of the document.
        pages: Page numbers of the document.
        chroma_id: Unique identifier for the chunk in ChromaDB.
    """

    title: str = Field(
        default="No title",
        description="Title of the document chunk.",
    )
    journal: Optional[str] = None
    published_date: Optional[str] = None
    user_notes: Optional[str] = None
    # NOTE(review): pydantic does not treat leading-underscore names as model
    # fields, so `_id` is presumably neither populated from input data nor
    # included when the model is dumped - confirm this is intended.
    _id: Optional[str] = None
    additional_metadata: Dict[str, Any] = Field(default_factory=dict)
    doi: Optional[str] = None
    link: Optional[str] = None
    authors: Optional[List[str]] = Field(
        default_factory=list,
        description="List of authors of the document.",
    )
    published_year: Optional[int] = Field(
        default=None, description="Year of publication."
    )
    abstract: Optional[str] = Field(
        default=None, description="Abstract of the document."
    )
    pages: Optional[str] = Field(
        default=None, description="Page numbers of the document."
    )
    chroma_id: Optional[str] = Field(
        default=None, description="Unique identifier for the chunk in ChromaDB."
    )
|
|
|
|
|
class DocumentChunk(BaseModel):
    """A chunk of document text paired with its metadata.

    Attributes:
        document: The text content of the chunk.
        metadata: The ``ChunkMetadata`` describing the chunk.
    """

    document: str
    metadata: ChunkMetadata
|
|
|
|
|
|
|
|
|
class UnifiedDataChunk(BaseModel):
    """A unified chunk of data from any source.

    Attributes:
        content: The main content of the chunk (e.g., text, note, or
            document).
        metadata: ``ChunkMetadata`` for the chunk, or ``None``.
        source_type: The type of source (e.g., 'note', 'article',
            'document').
    """

    content: str = Field(
        description="The main content of the chunk (e.g., text, note, or document)."
    )
    metadata: Optional[ChunkMetadata] = Field(
        description="Metadata associated with the chunk (e.g., title, source, date).",
    )
    source_type: str = Field(
        description="The type of source (e.g., 'note', 'article', 'document')."
    )
|
|
|
|
|
class UnifiedSearchResults(BaseModel):
    """Unified search results from any search tool.

    Attributes:
        chunks: The data chunks returned by the search.
        source_ids: Unique source IDs for the chunks (defaults to an empty
            list).
    """

    chunks: List[UnifiedDataChunk] = Field(
        description="List of data chunks from the search."
    )
    source_ids: List[str] = Field(
        default_factory=list,
        description="List of unique source IDs for the chunks.",
    )
|
|
|
|
|
class UnifiedToolResponse(BaseModel):
    """Unified response accumulated from one or more tools.

    Attributes:
        search_results: The unified search results, when a tool returned
            search results; otherwise ``None``.
        text_results: Text results from tools (e.g., an analysis), or
            ``None`` when nothing has been added.
        tool_names: Names of the tools that contributed to the response, or
            ``None`` when nothing has been added.
    """

    search_results: Optional[UnifiedSearchResults] = Field(
        default=None,
        description="The unified search results, if the tools used is returning search results.",
    )
    text_results: Optional[List[str]] = Field(
        default=None,
        description="Text results from the tool, e.g., if the tool is an analysis.",
    )
    tool_names: Optional[List[str]] = Field(
        default=None, description="The name of the tool used to generate the response."
    )

    def extend_search_results(self, search_results: UnifiedSearchResults) -> None:
        """Merge additional search results into this response.

        Args:
            search_results: The new search results whose chunks and source
                IDs are appended to the ones already held.
        """
        if self.search_results is None:
            # Keep our own lists instead of the caller's object; otherwise a
            # later extend would silently mutate the first argument passed in.
            self.search_results = UnifiedSearchResults(
                chunks=list(search_results.chunks),
                source_ids=list(search_results.source_ids),
            )
            return
        self.search_results.chunks.extend(search_results.chunks)
        self.search_results.source_ids.extend(search_results.source_ids)

    def extend_text_results(self, text_result: str) -> None:
        """Append one text result, creating the list on first use.

        Args:
            text_result: The text result to add.
        """
        if self.text_results is None:
            self.text_results = [text_result]
        else:
            self.text_results.append(text_result)

    def extend_tool_name(self, tool_name: str) -> None:
        """Append one tool name, creating the list on first use.

        Args:
            tool_name: The tool name to add.
        """
        if self.tool_names is None:
            self.tool_names = [tool_name]
        else:
            self.tool_names.append(tool_name)

    @property
    def to_text(self) -> str:
        """Render the response as text.

        When search results with at least one chunk exist, each chunk is
        formatted with a numbered heading, its content and a source line
        (title, plus journal and published date when set), and the chunks are
        joined with ``---`` separators. Otherwise the text results are joined
        the same way. When neither is available a fixed fallback message is
        returned; no exception is raised.

        Returns:
            str: The formatted search results, the joined text results, or
            the fallback message.
        """
        if self.search_results and self.search_results.chunks:
            formatted_chunks = []
            for index, chunk in enumerate(self.search_results.chunks, start=1):
                # A chunk may carry no metadata; fall back to an all-default
                # ChunkMetadata so the attribute reads below cannot fail.
                metadata = (
                    chunk.metadata if chunk.metadata is not None else ChunkMetadata()
                )

                source_info = f"Source: {metadata.title}"
                if metadata.journal:
                    source_info += f" - {metadata.journal}"
                if metadata.published_date:
                    source_info += f" ({metadata.published_date})"

                formatted_chunks.append(
                    f"### Chunk {index}\n{chunk.content}\n\n*{source_info}*\n"
                )

            return "\n---\n".join(formatted_chunks)

        if self.text_results:
            return "\n---\n".join(self.text_results)

        return "No search results or text results available."

    @property
    def get_chroma_ids(self) -> List[str]:
        """Return the source IDs recorded on the search results.

        Returns:
            List[str]: ``search_results.source_ids`` when search results
            exist and the list is non-empty (the stored list itself, not a
            copy); otherwise a new empty list.
        """
        if self.search_results and self.search_results.source_ids:
            return self.search_results.source_ids
        return []
|
|
|
class ChunkSearchResults(BaseModel):
    """Results of a search query across document collections.

    Attributes:
        chunks: The chunks returned by the search, as ``UnifiedDataChunk``
            objects.
        chroma_ids: Chroma IDs for the chunks (defaults to an empty list).
        arango_ids: ArangoDB IDs for the related documents (defaults to an
            empty list).
    """

    chunks: List[UnifiedDataChunk] = Field(
        description="List of document chunks containing text, metadata, and relevance scores."
    )
    chroma_ids: List[str] = Field(
        default_factory=list,
        description="List of Chroma IDs for the chunks",
    )
    arango_ids: List[str] = Field(
        default_factory=list,
        description="List of ArangoDB IDs for the related documents",
    )
|
|
|