# electric_cars_project/enrich_sci_articles.py

from _arango import ArangoDB
from _llm import LLM
import tiktoken
from colorprinter.print_color import *

def make_summaries():
    """Summarise every document in ``sci_articles`` with an LLM and store the result.

    For each article the text is tokenised to size the model context window,
    a summary is generated, and the article is written back with a
    ``summary`` field holding the generated text (``text_sum``) plus the
    generation settings (``meta``).

    Failures are handled per article: any exception is printed, stored on the
    document as ``summary_error`` and the loop moves on to the next article.

    Relies on the module-level ``arango`` connection being bound before the
    call. Returns nothing; the effect is the database update.
    """
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("cl100k_base")

    def count_tokens(text):
        """Return the number of cl100k_base tokens in *text*."""
        return len(tokenizer.encode(text))

    articles = list(arango.db.aql.execute('''
    for doc in sci_articles
    return doc
    '''))

    for article in articles:
        try:
            # Done inside the try so that a document without "text" is
            # recorded as a failed article instead of aborting the whole run.
            num_tokens = count_tokens(article["text"])
            llm = LLM(
                system_message="You are summarising scientific articles. It is very important that you keep to what is written and do not add any of your own opinions or interpretations.",
                # Text size plus 3000 tokens of headroom, capped at 70000.
                num_ctx=min(num_tokens + 3000, 70000),
                temperature=0,
            )
            prompt = f'''
    Make a summary of the following text:
    """
    {article["text"]}
    """
    Write a detailed summary. Make sure to include information from all sections: introduction, methods, results, and conclusion.
    Everything about electric vehicles, and things related to electric cars, is very important.
    Write the summary as if you are writing for someone who is not familiar with the topic.
    Write it from the point of the view of the author of the text.
    '''

            article["summary"] = {
                "meta": {
                    "model": llm.llm_model,
                    "system_message": llm.system_message,
                    'num_ctx': llm.options['num_ctx'],
                    'temperature': llm.options['temperature'],
                },
                "text_sum": llm.generate(prompt),
            }
            print(article["summary"])
            # check_rev=False matches make_chunk_qa: the document was fetched
            # before a potentially long generation, so a revision check could
            # reject the write (and then the error write below as well).
            arango.db.collection("sci_articles").update(article, check_rev=False)
        except Exception as e:
            # Best-effort batch job: note the failure on the document and go on.
            print(e)
            article['summary_error'] = str(e)
            arango.db.collection("sci_articles").update(article, check_rev=False)
            continue


def make_chunk_qa(num_qa=5):
    """Generate question/answer pairs for every chunk of every article.

    For each document in ``sci_articles`` two LLM instances are created: one
    that writes ``num_qa`` semicolon-separated questions about a chunk, and
    one that answers them. The pairs are stored on the chunk as
    ``chunk["qa"]`` (a list of ``{"question": ..., "answer": ...}`` dicts) and
    the article is written back. Chunks that already have a ``qa`` key are
    skipped, so the job can be re-run.

    The article abstract (``metadata.abstract`` with its ``<jats:p>`` tags
    removed, or the stored summary when there is no abstract) is given to the
    question model as background.

    Failures are handled per article: the exception is printed, stored as
    ``qa_error`` on the document, and processing continues with the next one.

    Relies on the module-level ``arango`` connection being bound before the
    call.

    Args:
        num_qa: Number of questions requested per chunk.
    """
    articles = list(arango.db.aql.execute('''
    for doc in sci_articles
    return doc
    '''))

    # Example of the expected reply format with exactly num_qa placeholders,
    # so the format shown to the model agrees with the number requested.
    answer_format = ';'.join(f'question{i}' for i in range(1, num_qa + 1))

    for article in articles:
        try:
            if 'abstract' in article['metadata']:
                # Strip the complete opening tag; removing only '<jats:p'
                # would leave a stray '>' at the start of each paragraph.
                abstract = article['metadata']['abstract'].replace('<jats:p>', '').replace('</jats:p>', '')
            else:
                abstract = article['summary']['text_sum']

            question_machine = LLM(
                system_message= f'''You are creating questions based on scientific articles. You will be given one text snippet from the article at a time and you should create {num_qa} questions based on that snippet.
            To understand the article as a whole you can read this abstract:
            """
            {abstract}
            """
            The {num_qa} questions should be based on the text snippet and should be answerable by the text, but you can check the conversation history to make them more relevant for the context.
            Don't write general questions like "what is the text about?", but rather questions that reflect the facts in the text.
            The questions will be used in a CSV file so it's important that you answer on the format: "{answer_format}".
            Always make {num_qa} questions to every text!
            ''',
                num_ctx=20000,
                temperature=0.2,
            )

            answer_machine = LLM(
                system_message=f'''You are answering questions about a text snippet from a scientific article. You will be given one question and one text snippet at a time and you should answer the questions based on that snippet.
            The answers should be based on the text snippet, but you can check the conversation history to make them more relevant for the context.
            Answer ONLY with the answer to the question, not a reasoning where you explain why you think that is the answer.
            Make the answers long enough to be informative and contain relevant information, but not too long.
            ''',
                num_ctx=20000,
                temperature=0.2,
            )

            for chunk in article["chunks"]:
                if 'qa' in chunk:
                    continue
                question_prompt = f'''
                """
                {chunk['text']}
                """
                Remember:
                - If there is something in the text about electric cars, please include that in the question.
                - Don't write general questinos like "what is the text about?" or "what is the main point of the text?", but rather questions that can be answered by the text. The questions will be used to query a vector database.
                - Answer on the format: "{answer_format}" as the questions will be used in a CSV file. Answer ONLY with the questions, not anything else!
                '''
                raw_questions = question_machine.generate(question_prompt).split(';')
                # Drop surrounding whitespace and empty fragments (e.g. from a
                # trailing ';') so no blank question is sent or stored.
                questions = [q.strip() for q in raw_questions if q.strip()]

                qa_pairs = []
                for position, question in enumerate(questions):
                    print_blue(question)
                    if position == 0:
                        # The first question of a chunk carries the snippet.
                        answer_prompt = f'''
                        Answer the following question based on the text snippet below: {question}
                        """
                        {chunk['text']}
                        """
                        Remember:
                        - The answer should be based on the text.
                        - If there is something in the text about electric cars, please include that in the answer.
                        - Answer ONLY with the answer, nothing else.
                        '''
                    else:
                        # Follow-up questions are sent bare; presumably the LLM
                        # wrapper keeps the conversation history (the system
                        # message refers to it) - TODO confirm.
                        answer_prompt = question

                    # Send the built prompt, not the bare question, so the
                    # snippet actually reaches the answering model.
                    answer = answer_machine.generate(answer_prompt)
                    print_green(answer)
                    qa_pairs.append({
                        "question": question,
                        "answer": answer,
                    })

                # Assign only once the chunk is complete: a chunk that failed
                # midway must not carry a 'qa' key, or later runs would skip it.
                chunk["qa"] = qa_pairs

            arango.db.collection("sci_articles").update(article, check_rev=False)
        except Exception as e:
            # Best-effort batch job: note the failure on the document and go on.
            print(e)
            article['qa_error'] = str(e)
            arango.db.collection("sci_articles").update(article, check_rev=False)
            continue


if __name__ == "__main__":
    # Bind the module-level name `arango` that make_summaries() and
    # make_chunk_qa() read as a global; it is only created when this file is
    # run as a script, so it must be assigned before either function is called.
    arango = ArangoDB()
    # Only the question/answer step runs from this entry point;
    # make_summaries() is not invoked here.
    make_chunk_qa()