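# NOTE: the lines below are web-page residue captured together with the file
# (repository UI text and a file-size banner); they are not part of the program.
# You can not select more than 25 topics
# Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# 139 lines
# 6.2 KiB
# 139 lines
# 6.2 KiB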
import re

import tiktoken

from _arango import ArangoDB
from _llm import LLM
# Star import kept last, as in the original, so the names it provides
# (print_blue, print_green, ...) keep the same precedence.
from colorprinter.print_color import *
|
|
|
def make_summaries():
    """Summarise every document in ``sci_articles`` with an LLM and store it.

    For each article the text is tokenised with tiktoken's ``cl100k_base``
    encoding to size the model context, a summary is generated, and the
    article is written back with a ``summary`` field holding the generation
    settings (``meta``) and the summary itself (``text_sum``).

    Any failure while handling an article (including a missing ``text``
    field) is printed and stored on that article as ``summary_error``;
    processing then continues with the next article.

    Relies on the module-level name ``arango`` being bound before the call.
    """
    # Extra context requested on top of the article's own token count, and the
    # upper bound on the context size passed to the LLM.
    context_margin = 3000
    max_context = 70000

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("cl100k_base")

    def count_tokens(text):
        """Return the number of ``cl100k_base`` tokens in *text*."""
        return len(tokenizer.encode(text))

    collection = arango.db.collection("sci_articles")
    articles = list(arango.db.aql.execute('''
        for doc in sci_articles
        return doc
        '''))

    for article in articles:
        try:
            # Token counting and LLM construction live inside the try so that a
            # malformed article is recorded as an error instead of aborting
            # the whole run.
            text = article["text"]
            num_tokens = count_tokens(text)
            llm = LLM(
                system_message="You are summarising scientific articles. It is very important that you keep to what is written and do not add any of your own opinions or interpretations.",
                # Same value as "num_tokens + 3000 if num_tokens < 67000 else 70000".
                num_ctx=min(num_tokens + context_margin, max_context),
                temperature=0,
            )
            prompt = (
                "\n"
                "Make a summary of the following text:\n"
                '"""\n'
                f"{text}\n"
                '"""\n'
                "Write a detailed summary. Make sure to include information from all sections: introduction, methods, results, and conclusion.\n"
                "Everything about electric vehicles, and things related to electric cars, is very important.\n"
                "Write the summary as if you are writing for someone who is not familiar with the topic.\n"
                "Write it from the point of the view of the author of the text.\n"
            )

            article["summary"] = {
                "meta": {
                    "model": llm.llm_model,
                    "system_message": llm.system_message,
                    'num_ctx': llm.options['num_ctx'],
                    'temperature': llm.options['temperature'],
                },
                "text_sum": llm.generate(prompt),
            }
            print(article["summary"])
            collection.update(article)
        except Exception as e:
            # Best-effort batch job: record the failure on the document and
            # move on to the next article.
            print(e)
            article['summary_error'] = str(e)
            collection.update(article)
            continue
|
|
|
|
|
def _strip_jats_tags(abstract):
    """Return *abstract* with every ``<jats:...>`` / ``</jats:...>`` tag removed."""
    # Tags are replaced by a space so adjacent paragraphs do not run together.
    return re.sub(r"</?jats:[^>]*>", " ", abstract).strip()


def _question_format(num_qa):
    """Return the ``question1;question2;...`` answer template for *num_qa* questions."""
    return ";".join(f"question{i}" for i in range(1, num_qa + 1))


def _parse_questions(raw):
    """Split a ``;``-separated model reply into stripped, non-empty questions."""
    return [part.strip() for part in raw.split(";") if part.strip()]


def _generate_chunk_qa(chunk_text, question_machine, answer_machine, question_format):
    """Return a list of ``{"question", "answer"}`` dicts generated for one chunk."""
    question_prompt = (
        "\n"
        '"""\n'
        f"{chunk_text}\n"
        '"""\n'
        "Remember:\n"
        "- If there is something in the text about electric cars, please include that in the question.\n"
        "- Don't write general questinos like \"what is the text about?\" or \"what is the main point of the text?\", but rather questions that can be answered by the text. The questions will be used to query a vector database.\n"
        f"- Answer on the format: \"{question_format}\" as the questions will be used in a CSV file. Answer ONLY with the questions, not anything else!\n"
    )
    questions = _parse_questions(question_machine.generate(question_prompt))

    qa_pairs = []
    for position, question in enumerate(questions):
        print_blue(question)
        if position == 0:
            # Only the first question of a chunk carries the snippet itself.
            # NOTE(review): follow-up questions are sent bare, which presumably
            # relies on the LLM wrapper keeping conversation history - confirm.
            answer_prompt = (
                "\n"
                f"Answer the following question based on the text snippet below: {question}\n"
                '"""\n'
                f"{chunk_text}\n"
                '"""\n'
                "Remember:\n"
                "- The answer should be based on the text.\n"
                "- If there is something in the text about electric cars, please include that in the answer.\n"
                "- Answer ONLY with the answer, nothing else.\n"
            )
        else:
            answer_prompt = question

        # Send the prompt that was built above; the original passed the bare
        # question here, so the snippet never reached the answering model.
        answer = answer_machine.generate(answer_prompt)
        print_green(answer)
        qa_pairs.append({"question": question, "answer": answer})
    return qa_pairs


def make_chunk_qa(num_qa=5):
    """Generate question/answer pairs for every chunk of every article.

    For each document in ``sci_articles`` two LLM instances are created: one
    that writes *num_qa* questions per chunk (given the article abstract, or
    the stored summary when there is no abstract, as background) and one that
    answers them.  Chunks that already have a ``qa`` field are skipped.  The
    pairs are stored as ``chunk["qa"]``, a list of ``{"question", "answer"}``
    dicts, and the article is written back.

    Any failure while handling an article is printed and stored on it as
    ``qa_error``; processing then continues with the next article.  A chunk
    only receives its ``qa`` field once all of its pairs were generated, so a
    chunk that failed half-way is retried on the next run.

    Relies on the module-level name ``arango`` being bound before the call.

    Args:
        num_qa: Number of questions requested per chunk.
    """
    collection = arango.db.collection("sci_articles")
    articles = list(arango.db.aql.execute('''
        for doc in sci_articles
        return doc
        '''))
    question_format = _question_format(num_qa)

    for article in articles:
        try:
            if 'abstract' in article['metadata']:
                abstract = _strip_jats_tags(article['metadata']['abstract'])
            else:
                abstract = article['summary']['text_sum']

            question_machine = LLM(
                system_message=(
                    f"You are creating questions based on scientific articles. You will be given one text snippet from the article at a time and you should create {num_qa} questions based on that snippet.\n"
                    "To understand the article as a whole you can read this abstract:\n"
                    '"""\n'
                    f"{abstract}\n"
                    '"""\n'
                    f"The {num_qa} questions should be based on the text snippet and should be answerable by the text, but you can check the conversation history to make them more relevant for the context.\n"
                    "Don't write general questions like \"what is the text about?\", but rather questions that reflect the facts in the text.\n"
                    f"The questions will be used in a CSV file so it's important that you answer on the format: \"{question_format}\".\n"
                    f"Always make {num_qa} questions to every text!\n"
                ),
                num_ctx=20000,
                temperature=0.2,
            )

            answer_machine = LLM(
                system_message=(
                    "You are answering questions about a text snippet from a scientific article. You will be given one question and one text snippet at a time and you should answer the questions based on that snippet.\n"
                    "The answers should be based on the text snippet, but you can check the conversation history to make them more relevant for the context.\n"
                    "Answer ONLY with the answer to the question, not a reasoning where you explain why you think that is the answer.\n"
                    "Make the answers long enough to be informative and contain relevant information, but not too long.\n"
                ),
                num_ctx=20000,
                temperature=0.2,
            )

            for chunk in article["chunks"]:
                if 'qa' in chunk:
                    continue
                chunk["qa"] = _generate_chunk_qa(
                    chunk['text'], question_machine, answer_machine, question_format
                )
                # NOTE(review): saved after each chunk so finished work survives
                # an interrupted run; confirm this is the intended save point.
                # check_rev=False because the same in-memory document is written
                # repeatedly and its `_rev` is presumably stale after the first
                # write - verify against python-arango.
                collection.update(article, check_rev=False)
        except Exception as e:
            # Best-effort batch job: record the failure on the document and
            # move on to the next article.
            print(e)
            article['qa_error'] = str(e)
            collection.update(article, check_rev=False)
            continue
|
|
|
|
|
|
|
if __name__ == "__main__":
    # make_summaries() and make_chunk_qa() look up the module-level name
    # `arango`, so it has to be bound here before either of them is called.
    arango = ArangoDB()
    make_chunk_qa()