import os
import re
from typing import Optional

from arango import ArangoClient
from dotenv import load_dotenv  # Install with: pip install python-dotenv

# NOTE(review): env_manager is not referenced anywhere in this file; the import
# is kept in case it is needed for import-time side effects -- TODO confirm.
import env_manager

# Load variables from a .env file into the process environment so the
# os.getenv() lookups in ArangoDB.__init__ can see them.
load_dotenv()

# Any character NOT in this set is replaced by "_" in ArangoDB.fix_key.
_DISALLOWED_KEY_CHARS = re.compile(r"[^A-Za-z0-9_\-\.@()+=;$!*\'%:]")

# A "<", any run of characters other than ">", then the closing ">".
_ANGLE_BRACKET_SPAN = re.compile(r'<[^>]*>')


class ArangoDB:
    """Thin wrapper that opens a python-arango database handle.

    Attributes are set in ``__init__``:
        client: The ``ArangoClient`` built from the ``ARANGO_HOST`` variable.
        db: The database handle returned by ``client.db(...)``.
    """

    def __init__(
        self,
        user: Optional[str] = None,
        password: Optional[str] = None,
        db_name: Optional[str] = None,
    ):
        """
        Initializes an instance of the ArangoDB class.

        Any argument that is missing or falsy (``None`` or an empty string)
        is read from the environment instead. The host is always read from
        ``ARANGO_HOST``.

        Args:
            user (str): The username for authentication. Falls back to
                ``ARANGO_USER``.
            password (str): The password for authentication. Falls back to
                ``ARANGO_PASSWORD``.
            db_name (str): The name of the database. Falls back to
                ``ARANGO_DB``.
        """
        host = os.getenv("ARANGO_HOST")
        user = user or os.getenv("ARANGO_USER")
        password = password or os.getenv("ARANGO_PASSWORD")
        db_name = db_name or os.getenv("ARANGO_DB")

        self.client = ArangoClient(hosts=host)
        self.db = self.client.db(db_name, username=user, password=password)

    def fix_key(self, _key):
        """
        Sanitize a given key by replacing every disallowed character with an
        underscore.

        Characters that are kept: ASCII letters and digits, underscore,
        hyphen, dot, at symbol, parentheses, plus, equals, semicolon, dollar
        sign, exclamation mark, asterisk, single quote, percent, and colon.

        Args:
            _key (str): The key to be sanitized.

        Returns:
            str: The sanitized key with disallowed characters replaced by
            underscores.
        """
        return _DISALLOWED_KEY_CHARS.sub("_", _key)


def main():
    """Strip ``<...>`` spans from the abstracts stored in ``sci_articles``.

    Connects to the ``base`` database, walks every document of the
    ``sci_articles`` collection, and removes text enclosed in angle brackets
    (brackets included) from ``metadata['abstract']`` when it is a string.
    Documents whose abstract contains no such span are left untouched, so no
    write is issued and no "Updated" line is printed for them.
    """
    arango = ArangoDB(db_name='base')
    # Fetch the collection handle once instead of on every iteration.
    collection = arango.db.collection('sci_articles')

    for article in collection.all():
        metadata = article.get('metadata')
        # Skip missing/empty metadata, and metadata that is not a mapping
        # (indexing it with 'abstract' would raise a TypeError).
        if not metadata or not isinstance(metadata, dict):
            continue

        abstract = metadata.get('abstract')
        if not isinstance(abstract, str):
            continue

        # Remove text within <> brackets and the brackets themselves
        cleaned = _ANGLE_BRACKET_SPAN.sub('', abstract)
        if cleaned == abstract:
            # Nothing was stripped: avoid a pointless database write.
            continue

        metadata['abstract'] = cleaned
        collection.update_match(
            filters={'_key': article['_key']},
            body={'metadata': metadata},
            merge=True
        )
        print(f"Updated abstract for {article['_key']}")


if __name__ == "__main__":
    main()