# sci/models.py

from pydantic import BaseModel, Field
from typing import Dict, List, Tuple, Optional, Any

class ArticleChunk(BaseModel):
    # Data model holding a summary, a list of tags and nullable references.
    # Presumably describes one chunk of an article (per the class name) —
    # the producer/consumer is not visible in this file.
    summary: str
    tags: List[str]
    # NOTE(review): no default is declared. Under pydantic v2 an Optional
    # annotation without a default is required-but-nullable; under pydantic
    # v1 it would default to None — confirm which version is pinned.
    references: Optional[List[str]]


class QueryResponse(BaseModel):
    """
    Represents a query generated for retrieving documents from a vector database.

    Attributes:
        query (str): The generated query text, short and concise.
    """

    # Required string field (no default is declared). The description and
    # example are attached as field metadata only; nothing here validates the
    # length or content of the query.
    # NOTE(review): `example=` is not a named Field parameter in pydantic v2,
    # where extra keyword arguments are deprecated in favour of `examples=` /
    # `json_schema_extra` — confirm which pydantic version is pinned.
    query: str = Field(
        description="The generated query that will be used to retrieve documents from a vector database (ChromaDB). Should be short and concise.",
        example="capital of France",
    )

class ArticleMetadataResponse(BaseModel):
    """
    Represents structured metadata extracted from an article by an LLM.
    """
    # The YYYY-MM-DD format is only stated in the description; the annotation
    # is a plain (nullable) string, so this model does not enforce it.
    # NOTE(review): no default is declared for the Optional fields below.
    # Under pydantic v2 they are required-but-nullable; under pydantic v1
    # they would default to None — confirm which version is pinned.
    published_date: Optional[str] = Field(
        description="The publication date of the article in YYYY-MM-DD format."
    )
    # The only non-nullable field of this model.
    title: str = Field(
        description="The full title of the article."
    )
    journal: Optional[str] = Field(
        description="The name of the journal/paper/outlet where the article was published."
    )


class PlanEvaluationResponse(BaseModel):
    """
    Represents the evaluation of a plan's step.

    Attributes:
        reasoning (str): Explanation of the reasoning behind the evaluation.
        complete (bool): Indicates if the step has sufficient information to proceed.
    """

    # Both fields are declared without defaults, i.e. they must be supplied.
    # NOTE(review): `example=` is not a named Field parameter in pydantic v2,
    # where extra keyword arguments are deprecated in favour of `examples=` /
    # `json_schema_extra` — confirm which pydantic version is pinned.
    reasoning: str = Field(
        description="A short explanation of the reasoning behind the evaluation",
        example="Although some information is missing, the existing data is sufficient to complete the step.",
    )
    complete: bool = Field(
        description="Indicates whether the information is sufficient to complete the step",
        example=False,
    )


class EvaluateFormat(BaseModel):
    """
    Represents the evaluation format for determining sufficiency of information.

    Attributes:
        explanation (str): Explanation of whether the information is sufficient.
        status (bool): Indicates sufficiency of the information.
        additional_info (Optional[str]): Additional information needed if insufficient.
    """

    # NOTE(review): `example=` is not a named Field parameter in pydantic v2,
    # where extra keyword arguments are deprecated in favour of `examples=` /
    # `json_schema_extra` — confirm which pydantic version is pinned.
    explanation: str = Field(
        description="A very short explanation of whether the information is sufficient or not",
        example="The information is sufficient because...",
    )
    status: bool = Field(
        description="If the information is sufficient to complete the step or not.",
        example=True,
    )
    # NOTE(review): nullable but declared without a default. Under pydantic
    # v2 the key must still be present (None is accepted); under pydantic v1
    # it would default to None — confirm which version is pinned.
    additional_info: Optional[str] = Field(
        description="If the information is not sufficient, what additional information would be needed",
        example="We need more information about...",
    )


class Plan(BaseModel):
    """
    Represents a structured plan with steps and corresponding tasks or facts.

    Attributes:
        steps (Dict[str, List[Tuple[str, str]]]): A dictionary where keys are step names and values are lists of tasks or facts.
    """

    # Each list entry is a 2-tuple of strings; the example below uses the
    # shape (task label, task description). The field has no default, so a
    # plan must always provide `steps`.
    # NOTE(review): `example=` is not a named Field parameter in pydantic v2,
    # where extra keyword arguments are deprecated in favour of `examples=` /
    # `json_schema_extra` — confirm which pydantic version is pinned.
    steps: Dict[str, List[Tuple[str, str]]] = Field(
        description="Structured plan represented as steps with their corresponding tasks or facts",
        example={
            "Step 1: Gather Existing Materials": [
                ("Task 1", "Description of task"),
                ("Task 2", "Description of task"),
            ],
            "Step 2: Extract Relevant Information": [
                ("Task 1", "Description of task"),
                ("Task 2", "Description of task"),
            ],
        },
    )


class ChunkMetadata(BaseModel):
    """
    Metadata associated with a document chunk.

    Every attribute has a default, so an instance can be created without arguments.

    Attributes:
        title (str): Title of the document chunk. Defaults to "No title".
        journal (Optional[str]): Journal where the document was published.
        published_date (Optional[str]): Date of publication.
        user_notes (Optional[str]): User-provided notes.
        _id (Optional[str]): Unique identifier for the document in ArangoDB.
        additional_metadata (Dict[str, Any]): Any additional metadata fields. Defaults to an empty dict.
        doi (Optional[str]): Digital Object Identifier for the document.
        link (Optional[str]): URL to access the document.
        authors (Optional[List[str]]): List of authors of the document. Defaults to an empty list.
        published_year (Optional[int]): Year of publication.
        abstract (Optional[str]): Abstract of the document.
        pages (Optional[str]): Page numbers of the document.
        chroma_id (Optional[str]): Unique identifier for the chunk in ChromaDB.
    """

    title: str = Field(default="No title", description="Title of the document chunk.")
    journal: Optional[str] = None
    published_date: Optional[str] = None
    user_notes: Optional[str] = None
    # NOTE(review): pydantic does not treat names with a leading underscore
    # as regular model fields (v1 ignores them by default, v2 turns them into
    # private attributes), so `_id` is presumably neither populated from
    # input data nor included in dumps — confirm. If the ArangoDB `_id` key
    # is meant to be parsed, a public field with alias="_id" would be needed.
    _id: Optional[str] = None
    # default_factory gives every instance its own dict / list.
    additional_metadata: Dict[str, Any] = Field(default_factory=dict)
    doi: Optional[str] = None
    link: Optional[str] = None
    authors: Optional[List[str]] = Field(
        default_factory=list,
        description="List of authors of the document.",
    )
    published_year: Optional[int] = Field(
        default=None,
        description="Year of publication.",
    )
    abstract: Optional[str] = Field(
        default=None,
        description="Abstract of the document.",
    )
    pages: Optional[str] = Field(
        default=None,
        description="Page numbers of the document.",
    )
    chroma_id: Optional[str] = Field(
        default=None,
        description="Unique identifier for the chunk in ChromaDB.",
    )


class DocumentChunk(BaseModel):
    """
    Represents a chunk of text from a document with its metadata.

    Attributes:
        document (str): The text content of the chunk.
        metadata (ChunkMetadata): Metadata associated with the chunk.
    """

    # Both fields are declared without defaults, so both must be supplied.
    document: str
    metadata: ChunkMetadata


class UnifiedDataChunk(BaseModel):
    """
    Represents a unified chunk of data from any source.

    Attributes:
        content (str): The main content of the chunk (e.g., text, note, or document).
        metadata (Optional[ChunkMetadata]): Metadata associated with the chunk; may be None.
        source_type (str): The type of source (e.g., 'note', 'article', 'document').
    """

    content: str = Field(
        description="The main content of the chunk (e.g., text, note, or document)."
    )
    # NOTE(review): nullable but declared without a default. Under pydantic
    # v2 the key must still be passed (None is accepted); under pydantic v1
    # it would default to None — confirm which version is pinned.
    metadata: Optional[ChunkMetadata] = Field(
        description="Metadata associated with the chunk (e.g., title, source, date).",
    )
    # Free-form string: the values listed in the description are examples
    # only and are not enforced by the annotation.
    source_type: str = Field(
        description="The type of source (e.g., 'note', 'article', 'document')."
    )


class UnifiedSearchResults(BaseModel):
    """
    Represents unified search results from any search tool.

    Attributes:
        chunks (List[UnifiedDataChunk]): List of data chunks from the search.
        source_ids (List[str]): List of unique source IDs for the chunks.
    """

    # Required: no default is declared for the chunk list.
    chunks: List[UnifiedDataChunk] = Field(
        description="List of data chunks from the search."
    )
    # Defaults to a fresh empty list per instance. Uniqueness of the IDs is
    # not enforced by this model (plain List[str]); callers must deduplicate
    # if they rely on it.
    source_ids: List[str] = Field(
        default_factory=list, description="List of unique source IDs for the chunks."
    )


class UnifiedToolResponse(BaseModel):
    """
    Represents a unified response aggregated from one or more tools.

    Attributes:
        search_results (Optional[UnifiedSearchResults]): The unified search results, if the tools used return search results.
        text_results (Optional[list[str]]): Text results from the tools, e.g., if a tool is an analysis.
        tool_names (Optional[list[str]]): Names of the tools used to generate the response.
    """

    search_results: Optional[UnifiedSearchResults] = Field(
        default=None,
        description="The unified search results, if the tools used is returning search results.",
    )
    text_results: Optional[list[str]] = Field(
        default=None,
        description="Text results from the tool, e.g., if the tool is an analysis.",
    )
    tool_names: Optional[list[str]] = Field(
        default=None, description="The name of the tool used to generate the response."
    )

    def extend_search_results(self, search_results: UnifiedSearchResults) -> None:
        """
        Adds the chunks and source IDs of ``search_results`` to this response.

        Args:
            search_results (UnifiedSearchResults): The new search results to add.
        """
        if self.search_results is None:
            # Store a copy of the lists rather than the caller's object:
            # later calls extend the stored lists in place and must not
            # silently grow the results object the caller passed in first.
            self.search_results = UnifiedSearchResults(
                chunks=list(search_results.chunks),
                source_ids=list(search_results.source_ids),
            )
        else:
            self.search_results.chunks.extend(search_results.chunks)
            self.search_results.source_ids.extend(search_results.source_ids)

    def extend_text_results(self, text_result: str) -> None:
        """
        Appends one text result, creating the list on first use.

        Args:
            text_result (str): The new text result to append.
        """
        if self.text_results is None:
            self.text_results = [text_result]
        else:
            self.text_results.append(text_result)

    def extend_tool_name(self, tool_name: str) -> None:
        """
        Appends one tool name, creating the list on first use.

        Args:
            tool_name (str): The new tool name to append.
        """
        if self.tool_names is None:
            self.tool_names = [tool_name]
        else:
            self.tool_names.append(tool_name)

    @property
    def to_text(self) -> str:
        """
        Generates formatted text from search results or returns the text results.

        If search results with at least one chunk exist, each chunk is rendered
        with its content and a source line (title, plus journal and published
        date when set), and the chunks are joined with ``---`` separators.
        Otherwise the text results are joined with the same separator.

        Returns:
            str: The formatted text, or a fixed "nothing available" message
            when there are neither search result chunks nor text results.
        """
        if self.search_results and self.search_results.chunks:
            formatted_chunks = []
            for number, chunk in enumerate(self.search_results.chunks, start=1):
                # A chunk's metadata is Optional; fall back to default
                # metadata (title "No title") so attribute access below is
                # safe instead of failing on a missing metadata object.
                metadata = (
                    chunk.metadata if chunk.metadata is not None else ChunkMetadata()
                )

                source_info = f"Source: {metadata.title}"
                if metadata.journal:
                    source_info += f" - {metadata.journal}"
                if metadata.published_date:
                    source_info += f" ({metadata.published_date})"

                # Format the chunk with its content and source
                formatted_chunks.append(
                    f"### Chunk {number}\n{chunk.content}\n\n*{source_info}*\n"
                )

            return "\n---\n".join(formatted_chunks)
        if self.text_results:
            return '\n---\n'.join(self.text_results)
        return "No search results or text results available."

    @property
    def get_chroma_ids(self) -> List[str]:
        """
        Returns the source IDs stored on the search results.

        Returns:
            List[str]: ``search_results.source_ids`` (the stored list itself,
            not a copy), or an empty list when there are no search results
            or no source IDs.
        """
        if self.search_results and self.search_results.source_ids:
            return self.search_results.source_ids
        return []

class ChunkSearchResults(BaseModel):
    """
    Represents the results of a search query across document collections.

    Attributes:
        chunks (List[UnifiedDataChunk]): List of data chunks containing content and metadata.
        chroma_ids (List[str]): List of Chroma IDs for the chunks.
        arango_ids (List[str]): List of ArangoDB IDs for the related documents.
    """

    # Required: no default is declared for the chunk list.
    # NOTE(review): the description mentions relevance scores, but
    # UnifiedDataChunk as declared in this file has no score field — confirm
    # whether scores are expected inside the chunk metadata or the text is stale.
    chunks: List[UnifiedDataChunk] = Field(
        description="List of document chunks containing text, metadata, and relevance scores."
    )
    # Both ID lists default to a fresh empty list per instance.
    chroma_ids: List[str] = Field(
        default_factory=list, description="List of Chroma IDs for the chunks"
    )
    arango_ids: List[str] = Field(
        default_factory=list,
        description="List of ArangoDB IDs for the related documents",
    )