You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

66 lines
2.2 KiB

import re
from arango import ArangoClient
from dotenv import load_dotenv
import os
import env_manager
load_dotenv() # Install with pip install python-dotenv
class ArangoDB:
    """Small wrapper that holds an ArangoDB client and a database handle."""

    def __init__(self, user=None, password=None, db_name=None):
        """
        Connect to an ArangoDB database.

        The host is always read from the ARANGO_HOST environment variable.
        Any argument that is omitted (or otherwise falsy) falls back to the
        matching environment variable.

        Args:
            user (str): Username for authentication; defaults to ARANGO_USER.
            password (str): Password for authentication; defaults to
                ARANGO_PASSWORD.
            db_name (str): Name of the database; defaults to ARANGO_DB.
        """
        host = os.getenv("ARANGO_HOST")
        user = user or os.getenv("ARANGO_USER")
        password = password or os.getenv("ARANGO_PASSWORD")
        db_name = db_name or os.getenv("ARANGO_DB")

        self.client = ArangoClient(hosts=host)
        self.db = self.client.db(db_name, username=user, password=password)

    def fix_key(self, _key):
        """
        Return a copy of ``_key`` with every disallowed character replaced.

        Characters that are kept: ASCII letters, digits, and any of
        ``_ - . @ ( ) + = ; $ ! * ' % :``. Every other character becomes an
        underscore.

        NOTE(review): presumably meant to mirror ArangoDB's document-key
        character rules -- confirm the allowed set against the server docs.

        Args:
            _key (str): The key to be sanitized.

        Returns:
            str: The sanitized key.
        """
        disallowed = r"[^A-Za-z0-9_\-\.@()+=;$!*\'%:]"
        return re.sub(disallowed, "_", _key)
if __name__ == "__main__":
    # Maintenance script: remove every "<...>" span (brackets included) from
    # the 'abstract' field under 'metadata' of each document in the
    # 'sci_articles' collection of the 'base' database.
    arango = ArangoDB(db_name='base')

    # Resolve the collection handle and compile the pattern once, rather than
    # on every loop iteration.
    collection = arango.db.collection('sci_articles')
    tag_pattern = re.compile(r'<[^>]*>')

    for article in collection.all():
        metadata = article.get('metadata')
        # Skip documents with missing/empty metadata, and metadata that is not
        # a mapping (subscripting it with 'abstract' would raise).
        if not metadata or not isinstance(metadata, dict):
            continue

        abstract = metadata.get('abstract')
        if not isinstance(abstract, str):
            continue

        cleaned = tag_pattern.sub('', abstract)
        # Nothing was stripped: avoid a pointless write and a misleading
        # "Updated" message.
        if cleaned == abstract:
            continue

        metadata['abstract'] = cleaned
        collection.update_match(
            filters={'_key': article['_key']},
            body={'metadata': metadata},
            merge=True
        )
        print(f"Updated abstract for {article['_key']}")