from fastapi import FastAPI, BackgroundTasks, Request
from fastapi.responses import JSONResponse, HTMLResponse
import logging
from datetime import datetime
import json
import os
from typing import Dict, Any

from prompts import get_summary_prompt
from _llm import LLM
from _arango import ArangoDB
from models import ArticleChunk
from _chromadb import ChromaDB

app = FastAPI()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# In-memory cache of the most recently processed document, mirrored to disk
# so the latest result survives a process restart.
latest_result: Dict[str, Any] = {}
latest_result_file = os.path.join(
    os.path.dirname(__file__), "latest_summary_result.json"
)

# Load any previously saved result on startup (best-effort: a missing or
# corrupt file only produces a warning, never a crash).
try:
    if os.path.exists(latest_result_file):
        with open(latest_result_file, 'r') as f:
            latest_result = json.load(f)
        logger.info(f"Loaded previous result from {latest_result_file}")
except Exception as e:
    logger.warning(f"Could not load previous result: {e}")


def save_latest_result(result: Dict[str, Any]):
    """Update the in-memory ``latest_result`` and persist it to disk.

    Parameters
    ----------
    result : dict
        The processed document (summary, ``_id``, chunks) to remember.
        Internal underscore-prefixed keys other than ``_id`` are stripped
        from the on-disk copy.
    """
    global latest_result
    latest_result = result
    try:
        # Save sanitized version (remove internal fields, keep _id).
        result_to_save = {
            k: v for k, v in result.items() if not k.startswith('_') or k == '_id'
        }
        with open(latest_result_file, 'w') as f:
            json.dump(result_to_save, f, indent=2)
        logger.info(f"Saved latest result to {latest_result_file}")
    except Exception as e:
        # Persisting is best-effort; the in-memory copy is already updated.
        logger.error(f"Error saving latest result: {e}")


@app.get("/latest_result")
async def get_latest_result():
    """
    Get the latest summarized document result.

    Returns the most recently processed document summary and chunk
    information. If no document has been processed yet, returns a
    placeholder message.

    Returns
    -------
    dict
        The latest processed document with summaries.
    """
    if not latest_result:
        return {"message": "No documents have been processed yet"}
    return latest_result


@app.get("/view_results")
async def view_results():
    """
    View the latest summarization results in a more readable format.

    Returns
    -------
    dict
        A formatted representation of the latest summarized document:
        document id, timestamp, overall summary, model name, and a
        per-chunk breakdown (summary, tags, optional references).
    """
    if not latest_result:
        return {"message": "No documents have been processed yet"}

    # Extract the key information.
    formatted_result = {
        "document_id": latest_result.get("_id", "Unknown"),
        "timestamp": datetime.now().isoformat(),
        "summary": latest_result.get("summary", {}).get(
            "text_sum", "No summary available"
        ),
        "model": latest_result.get("summary", {})
        .get("meta", {})
        .get("model", "Unknown model"),
    }

    # Format chunks information if available.
    chunks = latest_result.get("chunks", [])
    if chunks:
        formatted_chunks = []
        for i, chunk in enumerate(chunks):
            chunk_data = {
                "chunk_number": i + 1,
                "summary": chunk.get("summary", "No summary available"),
                "tags": chunk.get("tags", []),
            }
            # Add references for scientific articles if available.
            if "references" in chunk:
                chunk_data["references"] = chunk.get("references", [])
            formatted_chunks.append(chunk_data)

        formatted_result["chunks"] = formatted_chunks
        formatted_result["chunk_count"] = len(chunks)

    return formatted_result


@app.get("/html_results", response_class=HTMLResponse)
async def html_results():
    """
    View the latest summarization results in a human-readable HTML format.

    NOTE(review): the HTML markup in this handler was reconstructed — the
    original tags were stripped from the source on file; the visible text
    content is preserved, but tag structure should be confirmed.
    """
    if not latest_result:
        return """
        <html>
        <head><title>No Results Available</title></head>
        <body>
        <h1>No Documents Have Been Processed Yet</h1>
        <p>Submit a document for summarization first.</p>
        </body>
        </html>
        """

    # Get the document ID and summary.
    doc_id = latest_result.get("_id", "Unknown")
    summary = latest_result.get("summary", {}).get("text_sum", "No summary available")
    model = (
        latest_result.get("summary", {}).get("meta", {}).get("model", "Unknown model")
    )

    # Format chunks.
    chunks_html = ""
    chunks = latest_result.get("chunks", [])
    for i, chunk in enumerate(chunks):
        chunk_summary = chunk.get("summary", "No summary available")
        tags = chunk.get("tags", [])
        tags_html = ", ".join(tags) if tags else "None"

        references_html = ""
        if "references" in chunk and chunk["references"]:
            refs = "".join(f"<li>{ref}</li>" for ref in chunk["references"])
            references_html = f"<h4>References:</h4><ul>{refs}</ul>"

        chunks_html += f"""
        <div class="chunk">
        <h3>Chunk {i + 1}</h3>
        <p>{chunk_summary}</p>
        <p><strong>Tags:</strong> {tags_html}</p>
        {references_html}
        </div>
        """

    html_content = f"""
    <html>
    <head><title>Document Summary: {doc_id}</title></head>
    <body>
    <h1>Document Summary</h1>
    <p><strong>Document ID:</strong> {doc_id}</p>
    <p><strong>Model:</strong> {model}</p>
    <p><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
    <h2>Summary</h2>
    <p>{summary}</p>
    <h2>Chunks ({len(chunks)})</h2>
    {chunks_html}
    </body>
    </html>
    """
    return html_content


@app.post("/summarise_document")
async def summarize_document(request: Request, background_tasks: BackgroundTasks):
    """
    Kick off background summarization of an ArangoDB document.

    Expects a JSON body with ``arango_db_name`` and an ``arango_doc``
    containing at least ``_id`` and ``_key``, and optionally ``text`` and
    ``chunks``. Returns immediately; the heavy work runs in a background
    task.
    """
    try:
        data = await request.json()
        logger.info(f"Received data: {data}")

        # Extract arango_id, checking both top-level field and inside arango_doc.
        arango_doc = data.get('arango_doc', {}) or {}
        arango_id = arango_doc.get('_id', '')

        arango_db_name = data.get('arango_db_name', '').strip()
        if not arango_db_name:
            return JSONResponse(
                status_code=400,
                content={"detail": "Missing required field: arango_db_name"},
            )

        logger.debug(f"arango_doc: {arango_doc}")

        # Prepare data for processing.
        data['text'] = arango_doc.get('text', '').strip()
        data['chunks'] = arango_doc.get('chunks', [])
        data['arango_db_name'] = arango_db_name
        data['arango_id'] = arango_id
        # A missing '_key' raises KeyError and is surfaced as a 500 below.
        data["arango_key"] = arango_doc['_key']
        data['is_sci'] = data.get('is_sci', False)

        background_tasks.add_task(summarise_document_task, data)
        return {"message": "Document summarization has started."}
    except Exception as e:
        logger.error(f"Error in summarize_document: {e}")
        return JSONResponse(
            status_code=500,
            content={"detail": f"An unexpected error occurred: {str(e)}"},
        )


def summarise_document_task(doc_data: dict):
    """Background task: summarize a document, update ArangoDB/ChromaDB.

    Parameters
    ----------
    doc_data : dict
        Prepared payload from the ``/summarise_document`` endpoint with keys
        ``arango_id``, ``arango_key``, ``arango_db_name``, ``text``,
        ``chunks`` and ``is_sci``.

    Errors are logged, never raised — this runs detached from any request.
    """
    try:
        # Get document ID and validate it — it should be "collection/key".
        _id = doc_data.get("arango_id", "")
        if not _id or '/' not in _id:
            logger.error(f"Invalid document ID format: {_id}")
            return

        text = doc_data.get("text", "")
        is_sci = doc_data.get("is_sci", False)

        # Get collection name from document ID.
        collection = _id.split('/')[0]

        # Set appropriate system message based on document type.
        if collection == 'interviews':
            system_message = "You are summarising interview transcripts. It is very important that you keep to what is written and do not add any of your own opinions or interpretations. Always answer in English."
        elif is_sci or collection == 'sci_articles':
            system_message = "You are summarising scientific articles. It is very important that you keep to what is written and do not add any of your own opinions or interpretations. Always answer in English."
        else:
            system_message = "You are summarising a document. It is very important that you keep to what is written and do not add any of your own opinions or interpretations. Always answer in English."

        # Initialize LLM and generate the whole-document summary.
        llm = LLM(system_message=system_message)
        prompt = get_summary_prompt(text, is_sci)
        response = llm.generate(query=prompt)
        summary = response.content

        # Create summary document.
        summary_doc = {
            "text_sum": summary,
            "meta": {
                "model": llm.model,
                "temperature": llm.options["temperature"] if text else 0,
            },
        }

        # Process chunks if they exist.
        chunks = doc_data.get("chunks", [])
        if chunks:
            doc_data["chunks"] = summarise_chunks(chunks, is_sci=is_sci)

        # Get database name and validate it.
        db_name = doc_data.get("arango_db_name")
        if not db_name:
            logger.error("Missing database name")
            return

        # Update document in ArangoDB.
        arango = ArangoDB(db_name=db_name)
        arango.db.update_document(
            {
                "summary": summary_doc,
                "_id": _id,
                "chunks": doc_data.get("chunks", []),
            },
            silent=True,
            check_rev=False,
        )

        # Update ChromaDB with the new summary.
        # FIX: the original read doc_data["_key"] (the endpoint stores the
        # key under "arango_key") and was missing the comma after
        # document_id=..., which was a syntax error.
        chroma = ChromaDB()
        if db_name == "sci_articles":
            chroma.add_document(
                collection="sci_articles_article_summaries",
                document_id=doc_data["arango_key"],
                text=summary_doc["text_sum"],
                metadata={
                    "model": summary_doc["meta"]["model"],
                    "date": datetime.now().strftime("%Y-%m-%d"),
                    "arango_id": _id,
                    "arango_db_name": db_name,
                },
            )

        # Save the latest result.
        save_latest_result(
            {
                "summary": summary_doc,
                "_id": _id,
                "chunks": doc_data.get("chunks", []),
            }
        )
        logger.info(f"Successfully processed document {_id}")
    except Exception as e:
        # Log error with document ID if available.
        doc_id = doc_data.get("arango_id", "unknown")
        logger.error(f'Error processing document ID: {doc_id}')
        logger.error(f"Error in summarise_document_task: {e}")


def summarise_chunks(chunks: list, is_sci=False):
    """
    Summarize chunks of text in a document using a language model.

    For each chunk in the document that doesn't already have a summary,
    this function:
    1. Generates a summary of the chunk text
    2. Creates tags for the chunk
    3. If is_sci=True, extracts scientific references from the chunk

    Parameters
    ----------
    chunks : list
        A list of dictionaries representing chunks of text from a document.
        Each chunk should have a "text" field containing the text to
        summarize.
    is_sci : bool, default=False
        If True, uses a scientific article summarization prompt and extracts
        references. If False, uses a general article summarization prompt.

    Returns
    -------
    list
        A list of updated chunks containing summaries, tags, and metadata.

    Notes
    -----
    - Chunks that already have a "summary" field are skipped.
    - The function uses an LLM instance with a system prompt tailored to the
      document type.
    - The structured response is validated against the ArticleChunk model.
    - A chunk that fails to process gets a placeholder summary and empty
      tags; the remaining chunks are still processed.
    """
    if is_sci:
        system_message = """You are a science assistant summarizing scientific articles. You will get an article chunk by chunk, and you have three tasks for each chunk:
1. Summarize the content of the chunk.
2. Tag the chunk with relevant tags.
3. Extract the scientific references from the chunk.
"""
    else:
        system_message = """You are a general assistant summarizing articles. You will get an article chunk by chunk, and you have two tasks for each chunk:
1. Summarize the content of the chunk.
2. Tag the chunk with relevant tags.
"""

    system_message += """\nPlease make use of the previous chunks you have already seen to understand the current chunk in context and make the summary stand for itself. But remember, *it is the current chunk you are summarizing* ONLY use the information in the chunks to make the summary, and do not add any information that is not in the chunks."""

    llm = LLM(system_message=system_message)
    new_chunks = []
    for chunk in chunks:
        # Skip chunks that were already summarized in a previous run.
        if "summary" in chunk:
            new_chunks.append(chunk)
            continue

        prompt = f"""Summarize the following text to make it stand on its own:\n
'''
{chunk['text']}
'''\n
Your tasks are:
1. Summarize the content of the chunk. Make sure to include all relevant details!
2. Tag the chunk with relevant tags.
"""
        if is_sci:
            prompt += "\n3. Extract the scientific references mentioned in this specific chunk. If there is a DOI reference, include that in the reference. Sometimes the reference is only a number in brackets, like [1], so make sure to include that as well (in brackets)."
        prompt += "\nONLY use the information in the chunks to make the summary, and do not add any information that is not in the chunks."

        try:
            response = llm.generate(prompt, format=ArticleChunk.model_json_schema())
            structured_response = ArticleChunk.model_validate_json(response.content)
            chunk["summary"] = structured_response.summary
            chunk["tags"] = [i.lower() for i in structured_response.tags]
            # Add references for scientific articles if they exist in the response.
            if (
                is_sci
                and hasattr(structured_response, 'references')
                and structured_response.references
            ):
                chunk["references"] = structured_response.references
            chunk["summary_meta"] = {
                "model": llm.model,
                "date": datetime.now().strftime("%Y-%m-%d"),
            }
        except Exception as e:
            logger.error(f"Error processing chunk: {e}")
            # Continue processing other chunks even if one fails.
            chunk["summary"] = "Error processing chunk"
            chunk["tags"] = []

        new_chunks.append(chunk)
    return new_chunks


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8100)