# sci/test_llm_server.py

import requests
import json
import time
from _arango import ArangoDB  # Import ArangoDB client to fetch results

def test_summarize_document():
    """
    Submit a sample article to the LLM server's ``summarise_document`` endpoint.

    The payload carries raw text and an empty ``chunks`` list. This function only sends
    the request; it does not poll for the result. Instead it returns the identifiers a
    caller needs to look the summarized document up afterwards.

    Returns:
        dict or None: ``{"db_name": ..., "doc_id": ...}`` when the server replies with
        HTTP 200; None when the request fails or another status code is returned.
    """
    print("Testing document summarization...")

    # Define server endpoint
    url = "http://localhost:8100/summarise_document"

    # Create a sample document
    sample_document = {
        "arango_doc": {
            "text": """
            The Impact of Climate Change on Coral Reefs

            Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable.
            Rising sea temperatures have led to increased coral bleaching events, where corals expel their symbiotic algae,
            leading to whitening and potential death. Studies show that even a 1-2°C increase in water temperature
            can trigger mass bleaching events. Additionally, ocean acidification caused by increased CO2 absorption
            makes it difficult for corals to build their calcium carbonate skeletons.

            Recent research by Johnson et al. (2023) suggests that if current trends continue, we may lose up to 90%
            of coral reefs by 2050. However, some corals have shown remarkable resilience. Certain species can adapt
            to higher temperatures through a process called adaptive bleaching, where they exchange their algal symbionts
            for more heat-tolerant varieties. Conservation efforts focused on cultivating these resilient species may
            provide hope for reef preservation.
            """,
            "chunks": []
        },
        "arango_db_name": "test_db",
        "arango_id": "articles/test_article",
        "is_sci": True
    }

    # Send request to server
    print("Sending document to server for summarization...")
    try:
        # requests has no default timeout; without one a stalled server would hang
        # this script forever.
        response = requests.post(url, json=sample_document, timeout=60)
    except requests.exceptions.RequestException as exc:
        # Treat transport failures like any other failed submission.
        print(f"Error: request to {url} failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    try:
        body = response.json()
    except ValueError:
        # A 200 with a non-JSON body is still an accepted request; show it raw.
        body = response.text
    print("Request accepted. Response:", body)

    # Save values for checking results later
    return {
        "db_name": sample_document["arango_db_name"],
        "doc_id": sample_document["arango_id"]
    }

def test_summarize_chunks():
    """
    Submit a pre-chunked sample document to the ``summarise_document`` endpoint.

    The payload has an empty ``text`` field and two chunks with page numbers, and is
    flagged as non-scientific (``is_sci`` is False). Only the request is sent here;
    the returned identifiers let a caller query the database for the results later.

    Returns:
        dict or None: ``{"db_name": ..., "doc_id": ...}`` when the server replies with
        HTTP 200; None when the request fails or another status code is returned.
    """
    print("\nTesting chunk summarization example...")

    # Sample document with chunks
    sample_document_with_chunks = {
        "arango_doc": {
            "text": "",
            "chunks": [
                {
                    "text": "Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable. Rising sea temperatures have led to increased coral bleaching events.",
                    "pages": [1]
                },
                {
                    "text": "Studies by Smith et al. [1] show that even a 1-2°C increase in water temperature can trigger mass bleaching events. Additionally, ocean acidification makes it difficult for corals to build their calcium carbonate skeletons.",
                    "pages": [1, 2]
                }
            ]
        },
        "arango_db_name": "test_db",
        "arango_id": "interviews/test_interview",
        "is_sci": False
    }

    url = "http://localhost:8100/summarise_document"
    print("Sending document with chunks for summarization...")
    try:
        # requests has no default timeout; without one a stalled server would hang
        # this script forever.
        response = requests.post(url, json=sample_document_with_chunks, timeout=60)
    except requests.exceptions.RequestException as exc:
        # Treat transport failures like any other failed submission.
        print(f"Error: request to {url} failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    try:
        body = response.json()
    except ValueError:
        # A 200 with a non-JSON body is still an accepted request; show it raw.
        body = response.text
    print("Request accepted. Response:", body)

    return {
        "db_name": sample_document_with_chunks["arango_db_name"],
        "doc_id": sample_document_with_chunks["arango_id"]
    }

def poll_for_results(doc_info, max_retries=10, wait_time=5):
    """
    Poll the ArangoDB database to check if the document has been summarized.

    A document counts as summarized once it contains a "summary" key. When found, the
    document summary is printed, followed by the first chunk's summary (and its tags,
    if present) when the first chunk has one.

    Args:
        doc_info (dict): Dictionary containing db_name and doc_id. A falsy value
            (e.g. None from a failed submission) makes the function return None
            immediately without touching the database.
        max_retries (int): Maximum number of polling attempts
        wait_time (int): Time to wait between polling attempts (seconds)

    Returns:
        dict or None: The document with summaries if available, None otherwise
    """
    if not doc_info:
        return None

    db_name = doc_info["db_name"]
    doc_id = doc_info["doc_id"]

    print(f"\nPolling for results in {db_name}/{doc_id}...")

    arango = ArangoDB(db_name=db_name)

    for attempt in range(max_retries):
        print(f"Attempt {attempt+1}/{max_retries}...")

        try:
            # Get the document from ArangoDB
            document = arango.get_document(doc_id)

            # Check if the document has been summarized
            if document and "summary" in document:
                print("✓ Document summary found!")
                print("-" * 50)
                print("Document Summary:")
                print("-" * 50)
                print(document["summary"]["text_sum"])
                print("-" * 50)

                # Check if chunks have been summarized
                first_chunk = None
                if "chunks" in document and document["chunks"]:
                    first_chunk = document["chunks"][0]

                if first_chunk is not None and "summary" in first_chunk:
                    print("✓ Chunk summaries found!")
                    print("-" * 50)
                    print("First Chunk Summary:")
                    print("-" * 50)
                    print(first_chunk["summary"])
                    print("-" * 50)
                    # Only print tags when the chunk actually has them; a missing key
                    # here must not be mistaken for "summary not ready yet".
                    if "tags" in first_chunk:
                        print("Tags:", first_chunk["tags"])

                return document

        except Exception as e:
            # Best-effort polling: report the problem and try again.
            print(f"Error checking document: {e}")

        # Wait before the next attempt, but not after the last one.
        if attempt < max_retries - 1:
            time.sleep(wait_time)

    print("❌ Summarization not completed after maximum retries.")
    return None

# Script entry point: check that the server is reachable, submit both sample
# documents, then poll the database for their summaries.
if __name__ == "__main__":
    print("LLM Server Test Script")
    print("=====================\n")

    # Test if server is running
    try:
        # Bounded wait so an unresponsive host cannot hang the script.
        requests.get("http://localhost:8100", timeout=10)
        print("Server is running at http://localhost:8100\n")
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        print("ERROR: Cannot connect to server at http://localhost:8100")
        print("Make sure the server is running before continuing.\n")
        # The bare exit() helper is injected by the site module for interactive use
        # and is not guaranteed to exist; raising SystemExit always works.
        raise SystemExit(1)

    # Run tests and store document info for polling
    doc1_info = test_summarize_document()
    time.sleep(2)  # Brief pause between tests
    doc2_info = test_summarize_chunks()

    print("\nWaiting for background tasks to complete...")
    print("This may take some time depending on LLM response speed.")

    # Poll for results (with longer wait time for the first document which needs to be chunked)
    poll_for_results(doc1_info, max_retries=20, wait_time=6)
    poll_for_results(doc2_info, max_retries=12, wait_time=5)

    print("\nTest script completed.")
    print("If you didn't see results, the background tasks might still be processing.")
    print("You can run this script again later to check, or query the database directly.")