This commit adds code to download review references from a CSV file and process them. The references are read from a file called 'review_references.csv', and each reference is checked for a DOI. The user is prompted to confirm whether a DOI was found, and the reference, together with the confirmation, is written to a new file called 'review_references.txt'. This code will be useful for managing review references in the future.
main
parent
08e17d13a5
commit
911b8c33b0
15 changed files with 581 additions and 2 deletions
@ -0,0 +1,14 @@ |
||||
import chromadb |
||||
import os |
||||
import pymupdf4llm |
||||
from semantic_text_splitter import MarkdownSplitter |
||||
from _arango import ArangoDB |
||||
from pprint import pprint |
||||
|
||||
class ChromaDB:
    """Holds the persistent Chroma client, a Markdown splitter and the article collection."""

    def __init__(self):
        """Open the on-disk store in ``chroma_db`` and prepare splitter and collection handles."""
        chunk_limit = 2200  # capacity argument handed to MarkdownSplitter
        client = chromadb.PersistentClient("chroma_db")
        self.db = client
        self.ts = MarkdownSplitter(chunk_limit)
        self.sci_articles = client.get_or_create_collection("sci_articles")
||||
|
||||
@ -0,0 +1,26 @@ |
||||
from ollama import Client |
||||
import os |
||||
import env_manager |
||||
env_manager.set_env() |
||||
|
||||
class LLM:
    """Small conversational wrapper around an Ollama chat client.

    The model name and the server address are read from the environment
    variables ``LLM_MODEL``, ``LLM_URL`` and ``LLM_PORT``.
    """

    def __init__(self, system_message=None, num_ctx=2000, temperature=0, chat=True) -> None:
        """Configure the client and start an empty conversation.

        Args:
            system_message: Optional system prompt placed first in every conversation.
            num_ctx: Value sent to the server as the ``num_ctx`` option.
            temperature: Value sent to the server as the ``temperature`` option.
            chat: When true, the message history is kept between calls to
                ``generate``; when false, it is reset after every answer.
        """
        self.llm_model = os.getenv("LLM_MODEL")
        self.system_message = system_message
        self.options = {"temperature": temperature, "num_ctx": num_ctx}
        self.messages = self._initial_messages()
        self.chat = chat
        self.ollama = Client(host=f'{os.getenv("LLM_URL")}:{os.getenv("LLM_PORT")}')

    def _initial_messages(self) -> list:
        """Return the history a fresh conversation starts with."""
        # Without a system prompt, start empty rather than sending a system
        # message whose content is None.
        if self.system_message is None:
            return []
        return [{"role": "system", "content": self.system_message}]

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the model and return the text of its answer.

        The prompt and the answer are appended to ``self.messages``. If the
        request raises, the prompt is removed again so that a failed call does
        not leak into the next one, and the exception is re-raised.
        """
        self.messages.append({"role": "user", "content": prompt})

        try:
            result = self.ollama.chat(
                model=self.llm_model, messages=self.messages, options=self.options
            )
        except Exception:
            self.messages.pop()
            raise

        answer = result["message"]["content"]
        self.messages.append({"role": "assistant", "content": answer})
        if not self.chat:
            # Single-shot mode: forget this exchange.
            self.messages = self._initial_messages()

        return answer
||||
|
||||
@ -0,0 +1,23 @@ |
||||
from _llm import LLM |
||||
from _chromadb import ChromaDB |
||||
|
||||
chromadb = ChromaDB()
llm = LLM(temperature=0.1)


def build_prompt(user_input, documents):
    """Wrap the retrieved snippets and the question into one grounded prompt.

    Args:
        user_input: The question typed by the user.
        documents: Snippet strings returned by the vector store.
    """
    chunks_string = "\n".join(doc for doc in documents if doc)
    return f'''{user_input}
Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information.

"""
{chunks_string}
"""

{user_input}

'''


# Interactive question/answer loop; stop it with Ctrl-C.
# NOTE(review): the wrapper exposes the "sci_articles" collection - confirm the
# ingestion step writes its chunks to a collection of the same name.
while True:
    user_input = input("Enter a prompt: ")
    chunks = chromadb.sci_articles.query(query_texts=[user_input])
    # "documents" holds one list of plain strings per query text, so the
    # snippets for the single query are at index 0 and are not dicts.
    documents = chunks["documents"][0]
    response = llm.generate(build_prompt(user_input, documents))
    print(response)
    print()
||||
@ -0,0 +1,15 @@ |
||||
import csv |
||||
|
||||
# Preprocess the CSV file to ensure consistent field counts
input_file = 'speeches.csv'
output_file = 'cleaned_speeches.csv'


def clean_csv(input_file, output_file, expected_fields=22):
    """Copy only the rows that have exactly ``expected_fields`` fields.

    Both files are semicolon-delimited with double-quote quoting.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the cleaned CSV file to write (overwritten).
        expected_fields: Number of fields a row must have to be kept.

    Returns:
        A ``(kept, dropped)`` tuple of row counts.
    """
    kept = dropped = 0
    # newline='' on both files lets the csv module handle line endings itself,
    # which matters for quoted fields that contain embedded newlines.
    with open(input_file, 'r', encoding='utf-8', newline='') as infile, \
            open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile, delimiter=';', quotechar='"')
        writer = csv.writer(outfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        for row in reader:
            if len(row) == expected_fields:
                writer.writerow(row)
                kept += 1
            else:
                dropped += 1
    return kept, dropped


if __name__ == "__main__":
    kept, dropped = clean_csv(input_file, output_file)
    # Report discarded rows so data loss does not go unnoticed.
    print(f"CSV file has been cleaned and saved as '{output_file}' ({kept} rows kept, {dropped} dropped)")
||||
@ -0,0 +1,149 @@ |
||||
import re |
||||
import chromadb |
||||
import os |
||||
import pymupdf4llm |
||||
from semantic_text_splitter import MarkdownSplitter |
||||
from _arango import ArangoDB |
||||
from pprint import pprint |
||||
import crossref_commons.retrieval as crossref |
||||
import ebooklib |
||||
from ebooklib import epub |
||||
import nltk |
||||
from bs4 import BeautifulSoup |
||||
|
||||
# from epub_conversion.utils import open_book, convert_epub_to_lines |
||||
|
||||
|
||||
def _first(values):
    """Return the first item of ``values``, or None when it is missing or empty."""
    return values[0] if values else None


def _author_name(author):
    """Build a display name from one author entry, or None if it has no name fields."""
    personal = " ".join(
        part for part in (author.get("given"), author.get("family")) if part
    )
    # Fall back to a single "name" field for entries without given/family parts.
    return personal or author.get("name")


def _publication_date(work):
    """Return the preferred publication date joined with hyphens, or None."""
    # Preference order: print date, then online date, then the generic issued date.
    for field in ("published-print", "published-online", "issued"):
        date_parts = _first((work.get(field) or {}).get("date-parts")) or []
        parts = [str(part) for part in date_parts if part is not None]
        if parts:
            return "-".join(parts)
    return None


def get_crossref(doi):
    """Fetch the Crossref record for ``doi`` and flatten it into a metadata dict.

    Missing fields become None (or an empty list for ``authors``), so one
    absent field no longer discards the whole record.

    Args:
        doi: The DOI to look up.

    Returns:
        A dict with the keys doi, title, authors, abstract, journal, volume,
        issue, pages, published_date, url_doi, link and language, or None when
        the lookup fails (the error is printed).
    """
    try:
        work = crossref.get_publication_as_json(doi)

        author_names = (_author_name(author) for author in work.get("author") or [])
        first_link = _first(work.get("link")) or {}

        metadata = {
            "doi": work.get("DOI", None),
            "title": _first(work.get("title")),
            "authors": [name for name in author_names if name],
            "abstract": work.get("abstract", None),
            "journal": _first(work.get("container-title")),
            "volume": work.get("volume", None),
            "issue": work.get("issue", None),
            "pages": work.get("page", None),
            "published_date": _publication_date(work),
            "url_doi": work.get("URL", None),
            "link": first_link.get("URL"),
            "language": work.get("language", None),
        }
        return metadata
    except Exception as e:
        # Best-effort lookup: callers receive None instead of an exception.
        print(f"Error retrieving metadata for DOI {doi}: {e}")
        return None
||||
|
||||
|
||||
# --- Module-level storage handles used by add_pdfs (`arango`, `col`, `ts`) ---
arango = ArangoDB()
# WARNING: destructive - empties the "sci_articles" collection on every run.
arango.db.collection("sci_articles").truncate()  #!

# Chroma store: ensure the collection exists, drop it, then recreate it empty.
db = chromadb.PersistentClient("chroma_db")
db.get_or_create_collection("articles")
db.delete_collection("articles")  #!
col = db.get_or_create_collection("articles")

# Splitter that cuts the Markdown text into chunks.
max_characters = 2200
ts = MarkdownSplitter(max_characters)
||||
|
||||
|
||||
_PAGE_MARKER = re.compile(r"@(\d+)@")


def _merge_short_chunks(chunks, max_characters):
    """Append chunks shorter than a third of the limit to the chunk before them."""
    merged = []
    for chunk in chunks:
        if (
            merged
            and len(chunk) < int(max_characters / 3)  # TODO Are those values good?
            and len(merged[-1]) < int(max_characters * 1.5)
        ):
            merged[-1] += chunk
        else:
            merged.append(chunk)
    return merged


def add_pdfs(path_folder):
    """Convert every PDF in ``path_folder`` to Markdown chunks and store them.

    The DOI is derived from the file name (``.pdf`` suffix removed, ``_``
    replaced by ``/``). Articles whose key already exists in the
    "sci_articles" collection are skipped. For the others, the chunks go to
    the Chroma collection ``col`` and one document per article is written to
    ArangoDB.

    Args:
        path_folder: Directory that contains the PDF files.
    """
    pdf_in_folder = [name for name in os.listdir(path_folder) if name.endswith(".pdf")]

    for pdf in pdf_in_folder:
        # Cut the suffix explicitly: str.strip(".pdf") removes any of the
        # characters ".", "p", "d", "f" from BOTH ends and corrupts names.
        stem = pdf[: -len(".pdf")]
        # NOTE(review): underscores that belong to the DOI itself are also
        # turned into slashes here - confirm this matches how files are named.
        doi = stem.replace("_", "/")
        key = arango.fix_key(doi)

        if arango.db.collection("sci_articles").get(key):
            print(f"Article {doi} already in database")
            continue

        # Query Crossref only for articles that are actually ingested.
        crossref_info = get_crossref(doi)
        pdf_path = os.path.join(path_folder, pdf)
        md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)

        # Each page is followed by an "@<page>@" marker so page numbers can be
        # recovered after the text has been split.
        md_text = "".join(
            f"{page['text']}\n@{page['metadata']['page']}@\n" for page in md_pages
        )

        # Merge chunks that are too short
        chunks = _merge_short_chunks(ts.chunks(md_text), max_characters)

        ids = []
        documents = []
        metadatas = []
        arango_chunks = []
        last_page = 1
        for i, chunk in enumerate(chunks):
            page_numbers = [int(number) for number in _PAGE_MARKER.findall(chunk)]
            if page_numbers:
                last_page = page_numbers[-1]
            else:
                # No marker in this chunk: reuse the most recent page seen.
                page_numbers = [last_page]

            ids.append(f"{key}_{i}")
            metadatas.append(
                {
                    "doi": stem,
                    "file": pdf_path,
                    "chunk_nr": i,
                    "pages": ",".join(str(number) for number in page_numbers),
                }
            )
            chunk = _PAGE_MARKER.sub("", chunk)
            documents.append(chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers})

        if ids:  # nothing to add when the PDF produced no text
            col.add(ids=ids, documents=documents, metadatas=metadatas)

        arango_document = {
            "_key": key,
            "doi": doi,
            "file": pdf_path,
            "chunks": arango_chunks,
            "text": md_text,
            "metadata": crossref_info,
        }
        arango.db.collection("sci_articles").insert(
            arango_document, overwrite=True, overwrite_mode="update"
        )
        print(f"Inserted article {doi} into database")
||||
|
||||
|
||||
# Ingest every PDF found in the article folder.
path_folder = "sci_articles"
add_pdfs(path_folder=path_folder)
||||
@ -0,0 +1,19 @@ |
||||
import dropbox |
||||
|
||||
import os

# The access token comes from the environment so it never lives in source control.
# SECURITY: a literal token used to be hard-coded on this line; treat it as
# leaked and revoke it in the Dropbox app console.
ACCESS_TOKEN = os.environ.get("DROPBOX_ACCESS_TOKEN")
if not ACCESS_TOKEN:
    raise SystemExit("Set the DROPBOX_ACCESS_TOKEN environment variable to a Dropbox access token.")

# Initialize a Dropbox client
dbx = dropbox.Dropbox(ACCESS_TOKEN)

# Define the folder path
folder_path = '/Filinlämningar/Electric Cars'

# List all entries in the folder, following pagination until the listing is complete
try:
    result = dbx.files_list_folder(folder_path)
    print(f"Files in {folder_path}:")
    while True:
        for entry in result.entries:
            print(entry.name)
        if not result.has_more:
            break
        result = dbx.files_list_folder_continue(result.cursor)
except dropbox.exceptions.ApiError as err:
    print(f"Failed to list folder contents: {err}")
||||
@ -0,0 +1,13 @@ |
||||
import pyperclip |
||||
|
||||
# Walk through the references one at a time and record, for each, whether the
# user managed to find a DOI.
with open('review_references.csv', 'r') as f:
    references = f.readlines()

with open('review_references.txt', 'w') as f2:
    for ref in references:
        print(ref)
        # Copy ref to clipboard so it can be pasted straight into a search field
        pyperclip.copy(ref.strip())
        found = input("Found DOI? (y/n): ")
        f2.write(f"{ref.strip()}: {found}\n")
        # Flush after every answer so an interrupted session keeps its progress.
        f2.flush()
||||
|
||||
|
||||
|
||||
@ -0,0 +1,26 @@ |
||||
"""An example program that uses the elsapy module""" |
||||
|
||||
from elsapy.elsclient import ElsClient |
||||
from elsapy.elsprofile import ElsAuthor, ElsAffil |
||||
from elsapy.elsdoc import FullDoc, AbsDoc |
||||
from elsapy.elssearch import ElsSearch |
||||
import json |
||||
|
||||
## Load configuration
# The context manager closes the file even when the JSON cannot be parsed.
with open("config.json") as con_file:
    config = json.load(con_file)

## Initialize client
client = ElsClient(config['apikey'])
||||
|
||||
def get_doc(doi):
    """Read the full-text document for ``doi`` with the module-level client.

    On a successful read the title is printed, the document is written out
    via ``write()`` and ``doi`` is returned; otherwise None is returned.
    """
    ## ScienceDirect (full-text) document example using DOI
    document = FullDoc(doi = doi)
    fetched = document.read(client)
    if not fetched:
        return None
    print ("doi_doc.title: ", document.title)
    document.write()
    return doi
||||
|
||||
@ -0,0 +1,55 @@ |
||||
import pandas as pd |
||||
from all_arguments import arguments as arguments_dict # Arguments dictionary with sentiment information |
||||
|
||||
# Step 1: Load the semicolon-separated speeches file and preview it
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')
print(df.head())

# Step 2: Keep the identifying columns plus the argument columns
# NOTE(review): columns[5:] starts at index 5, i.e. the sixth column - confirm
# this is where the argument columns begin.
arguments = df.columns[5:]
selected_columns = ['_key', 'name', *arguments]
df_arguments = df.loc[:, selected_columns]

# Step 3: Coerce the argument columns to integers (non-numeric cells become 0)
numeric_arguments = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce')
df_arguments.loc[:, arguments] = numeric_arguments.fillna(0).astype(int)
||||
|
||||
# Step 4: Net sentiment per row
def calculate_sentiment_score(row):
    """Return (#positive arguments used) minus (#negative arguments used) for ``row``.

    An argument counts as used when its cell is greater than zero; arguments
    without a known sentiment count as neutral and do not change the score.
    """
    total = 0
    for arg in arguments:
        if not row[arg] > 0:
            continue
        sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral')
        # Booleans act as 0/1: +1 for positive, -1 for negative, 0 otherwise.
        total += (sentiment == 'positive') - (sentiment == 'negative')
    return total
||||
|
||||
sentiment_scores = df_arguments.apply(calculate_sentiment_score, axis=1)
df_arguments['sentiment_score'] = sentiment_scores

# Step 5: Pick the three highest- and three lowest-scoring rows
top_3_positive = df_arguments.nlargest(3, 'sentiment_score')
top_3_negative = df_arguments.nsmallest(3, 'sentiment_score')
||||
|
||||
# Step 6: Collect the arguments used in each selected row
def extract_arguments(df):
    """Map each row's ``name`` to the list of argument columns whose value is > 0.

    If a name occurs in several rows, the last row wins.
    """
    return {
        row['name']: [arg for arg in arguments if row[arg] > 0]
        for _, row in df.iterrows()
    }
||||
|
||||
positive_arguments = extract_arguments(top_3_positive)
negative_arguments = extract_arguments(top_3_negative)

# Print both rankings with the same loop
for heading, used_by_name in (
    ("Top 3 Positive Politicians and their Arguments:", positive_arguments),
    ("\nTop 3 Negative Politicians and their Arguments:", negative_arguments),
):
    print(heading)
    for name, args in used_by_name.items():
        print(f"{name}: {args}")
||||
@ -0,0 +1,108 @@ |
||||
import pyperclip |
||||
from pprint import pprint |
||||
import requests |
||||
import crossref_commons.retrieval |
||||
from time import sleep |
||||
from bs4 import BeautifulSoup |
||||
import dl_elsy |
||||
|
||||
def download_file(doi, url):
    """Download ``url`` and save it under a file name derived from ``doi``.

    PDF responses are stored unchanged as ``<doi>.pdf``. ``text/*`` responses
    are parsed as HTML and their text is stored as ``<doi>.md``. Slashes in
    the file name are replaced by underscores.

    Args:
        doi: DOI of the article; used for the file name and in messages.
        url: Address to download.

    Returns:
        The name of the written file, or None when the request failed or the
        content type is not supported.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Check if the request was successful
    except requests.exceptions.RequestException as e:
        print(f"Failed to download file for DOI: {doi}. Error: {e}")
        return None

    # The header may be absent or carry parameters ("application/pdf; charset=..."),
    # so compare only the bare media type.
    content_type = response.headers.get('Content-Type', '')
    media_type = content_type.split(';', 1)[0].strip().lower()

    if media_type == 'application/pdf':
        file_extension = 'pdf'
        payload = response.content
    elif media_type.startswith('text/'):
        file_extension = 'md'
        soup = BeautifulSoup(response.content, 'html.parser')
        # Store the extracted text; previously a debug print + exit() ended the
        # whole program here and the file was never written.
        payload = soup.get_text().encode('utf-8')
    else:
        print(f"Unsupported content type: {content_type} for DOI: {doi}")
        return None

    file_name = f"{doi}.{file_extension}".replace('/', '_')
    with open(file_name, 'wb') as f:
        f.write(payload)
    print(f"Downloaded {file_extension.upper()} for DOI: {doi}")
    return file_name
||||
|
||||
def _save_url(url, file_name):
    """Download ``url`` into ``file_name``; return True on success, False otherwise."""
    try:
        reply = requests.get(url, timeout=30)
        reply.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return False
    with open(file_name, 'wb') as f:
        f.write(reply.content)
    return True


def _download_mdpi(doi, article_url):
    """Fetch the PDF and EPUB of an MDPI article page; return True if both were saved."""
    try:
        page = requests.get(article_url, timeout=30)
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to open {article_url}: {e}")
        return False

    soup = BeautifulSoup(page.content, 'html.parser')
    pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
    if pdf_link_html is None or not pdf_link_html.get('href'):
        # The page layout differs from what is expected; skip instead of crashing.
        print(f"No PDF link found on {article_url}")
        return False

    pdf_ok = _save_url('https://www.mdpi.com' + pdf_link_html['href'], f'{doi}.pdf'.replace('/', '_'))
    sleep(1)  # be polite to the server between requests
    epub_ok = _save_url(article_url + '/epub', f'{doi}.epub'.replace('/', '_'))
    sleep(1)
    if pdf_ok and epub_ok:
        print(f'Downloaded PDF and EPUB for {doi}')
    return pdf_ok and epub_ok


def get_article_info(doi):
    """Look ``doi`` up in DOAJ and try to download the article from its links.

    MDPI links are downloaded as PDF and EPUB. A ScienceDirect link hands
    over to ``dl_elsy.get_doc``, whose result is returned. Any other link is
    printed and the function waits for Enter so the user can handle it by hand.

    Args:
        doi: DOI of the article.

    Returns:
        ``doi`` when the lookup succeeded and all links were processed, the
        result of ``dl_elsy.get_doc`` for ScienceDirect articles, or None when
        the DOAJ request did not return HTTP 200.
    """
    # NOTE(review): the DOI is placed in the URL path unescaped - confirm the
    # DOAJ search endpoint accepts it in this form.
    url = f'https://doaj.org/api/search/articles/{doi}'
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")
        return None

    data = response.json()
    for result in data.get('results', []):
        for link in result.get('bibjson', {}).get('link', []):
            link_url = link.get('url', '')
            if 'mdpi.com' in link_url:
                _download_mdpi(doi, link_url)
            elif 'sciencedirect.com' in link_url:
                sleep(1)  # pause before the request (was unreachable after the return)
                return dl_elsy.get_doc(doi)
            else:
                print(link_url)
                input()
    # NOTE(review): the original indentation was lost; `return doi` is assumed
    # to run once all links are processed - confirm against the real file.
    return doi
||||
|
||||
# Read DOIs from file
with open('review_references.csv', 'r') as f:
    references = f.readlines()

# Load the DOIs handled in earlier runs. The progress file is no longer opened
# in 'w' mode first, because that emptied it before it could be read.
try:
    with open('review_references.txt') as f2:
        ref_done = {line.strip() for line in f2 if line.strip()}
except FileNotFoundError:
    ref_done = set()

# Process each DOI: ScienceDirect pass
for ref in references:
    doi = ref.strip()
    if not doi:
        continue
    print('###', ref.upper())
    if doi in ref_done:
        # Already downloaded earlier; skip before spending a Crossref request.
        continue
    try:
        cr = crossref_commons.retrieval.get_publication_as_json(doi)
    except ValueError:
        print(f"Error fetching metadata for DOI: {doi}")
        continue
    if 'sciencedirect.com' not in str(cr):
        continue

    sleep(1)
    r = dl_elsy.get_doc(doi)
    if r:
        with open('review_references.txt', 'a+') as f2:
            f2.write(f'{r}\n')
        ref_done.add(doi)

# NOTE(review): the script stops here, so the DOAJ pass below never runs.
# Remove this exit() to enable it.
exit()
for ref in references:
    doi = ref.strip()
    with open('review_references.txt', 'a') as f2:
        r = get_article_info(doi)
        if r:
            # One DOI per line, matching the format written above.
            f2.write(f'{r}\n')
||||
@ -0,0 +1,47 @@ |
||||
import pandas as pd |
||||
import matplotlib.pyplot as plt |
||||
import seaborn as sns |
||||
from all_arguments import arguments as arguments_dict # Arguments dictionary with sentiment information |
||||
|
||||
|
||||
# Step 1: Read the CSV file
df = pd.read_csv('speeches.csv', delimiter=';')

# Step 2: Extract relevant columns
# NOTE(review): columns[5:] starts at index 5, i.e. the sixth column - confirm
# this is where the argument columns begin.
arguments = df.columns[5:]
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of df.
df_arguments = df[['_key', 'name'] + list(arguments)].copy()

# Step 3: Create a binary matrix for arguments
# Convert the argument columns to integers (non-numeric cells become 0)
df_arguments[arguments] = df_arguments[arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Step 4: Sum the arguments for each politician
df_sum = df_arguments.groupby('name')[arguments].sum().reset_index()

# Step 5: Plot the data
plt.figure(figsize=(12, 8))
sns.heatmap(df_sum.set_index('name'), annot=True, cmap='coolwarm', cbar=True)
plt.title('Arguments Used by Politicians')
plt.xlabel('Arguments')
plt.ylabel('Politicians')

# Step 6: Color the x-axis labels based on sentiment
sentiment_colors = {'positive': 'green', 'negative': 'red'}
ax = plt.gca()
for label in ax.get_xticklabels():
    argument = label.get_text()
    sentiment = arguments_dict.get(argument, {}).get('sentiment', 'neutral')
    # Anything that is neither positive nor negative is drawn in black.
    label.set_color(sentiment_colors.get(sentiment, 'black'))

plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save the plot instead of showing it
plt.savefig('arguments_used_by_politicians.png')
plt.close()
||||
@ -0,0 +1,80 @@ |
||||
import pandas as pd |
||||
import plotly.express as px |
||||
import plotly.graph_objects as go |
||||
from sklearn.decomposition import PCA |
||||
from sklearn.cluster import KMeans |
||||
from sklearn.preprocessing import StandardScaler |
||||
import streamlit as st |
||||
from all_arguments import arguments as arguments_dict # Arguments dictionary with sentiment information |
||||
|
||||
# Step 1: Load the semicolon-separated speeches file
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')

# Step 2: Keep the identifying columns plus the argument columns
# NOTE(review): columns[5:] starts at index 5, i.e. the sixth column - confirm
# this is where the argument columns begin.
arguments = df.columns[5:]
selected_columns = ['_key', 'name', *arguments]
df_arguments = df.loc[:, selected_columns]

# Step 3: Coerce the argument columns to integers (non-numeric cells become 0)
numeric_arguments = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce')
df_arguments.loc[:, arguments] = numeric_arguments.fillna(0).astype(int)
||||
|
||||
# Step 4: Net sentiment per row
def calculate_sentiment_score(row):
    """Return (#positive arguments used) minus (#negative arguments used) for ``row``.

    An argument counts as used when its cell is greater than zero; arguments
    without a known sentiment count as neutral and do not change the score.
    """
    total = 0
    for arg in arguments:
        if not row[arg] > 0:
            continue
        sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral')
        # Booleans act as 0/1: +1 for positive, -1 for negative, 0 otherwise.
        total += (sentiment == 'positive') - (sentiment == 'negative')
    return total
||||
|
||||
df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)

# # Step 5: Standardize the data (currently disabled)
# scaler = StandardScaler()
# df_arguments[arguments] = scaler.fit_transform(df_arguments[arguments])

# Step 6: Dimensionality reduction using PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_arguments[arguments])
df_arguments['pca1'] = pca_result[:, 0]
df_arguments['pca2'] = pca_result[:, 1]

# Step 7: Examine loadings (contribution of every argument to each component)
loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=arguments)

# Step 8: Perform clustering
# A fixed seed keeps the cluster labels stable across Streamlit reruns.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)  # Adjust the number of clusters as needed
df_arguments['cluster'] = kmeans.fit_predict(pca_result)
# String labels make Plotly treat clusters as categories, not a continuous scale.
df_arguments['cluster_label'] = df_arguments['cluster'].astype(str)

# Streamlit app
st.title('Politicians Grouped by Arguments Used and Sentiment Score')

# Step 9: Plot the data with clusters using Plotly
fig = px.scatter(
    df_arguments,
    x='pca1',
    y='pca2',
    color='cluster_label',
    title='Politicians Grouped by Arguments Used and Clusters',
    labels={'pca1': 'PCA Component 1', 'pca2': 'PCA Component 2', 'cluster_label': 'Cluster'},
    hover_data={'name': True},
)
st.plotly_chart(fig)

# Step 10: Visualize original arguments using Plotly
fig = go.Figure()

# Add one line from the origin to each argument's loading
for argument in arguments:
    fig.add_trace(go.Scatter(
        x=[0, loadings.loc[argument, 'PCA1']],
        y=[0, loadings.loc[argument, 'PCA2']],
        mode='lines+text',
        text=[None, argument],
        textposition='top center',
        line=dict(color='red', width=2),
        showlegend=False,
    ))

# Add scatter plot for politicians, coloured by sentiment score
fig.add_trace(go.Scatter(
    x=df_arguments['pca1'],
    y=df_arguments['pca2'],
    mode='markers',
    marker=dict(color=df_arguments['sentiment_score'], colorscale='Viridis', size=10),
    text=df_arguments['name'],
    hoverinfo='text',
))

fig.update_layout(title='PCA Biplot of Politicians and Arguments',
                  xaxis_title='PCA Component 1',
                  yaxis_title='PCA Component 2',
                  showlegend=False)

st.plotly_chart(fig)
||||
Loading…
Reference in new issue