You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
66 lines
2.2 KiB
66 lines
2.2 KiB
import re |
|
from arango import ArangoClient |
|
from dotenv import load_dotenv |
|
import os |
|
import env_manager |
|
|
|
# Load variables from a .env file into the process environment before any
# os.getenv() lookup runs (the ARANGO_* settings are read that way).
# Requires the python-dotenv package: pip install python-dotenv
load_dotenv()
|
|
|
|
|
class ArangoDB:
    """Small convenience wrapper around a python-arango database handle.

    Connection settings that are not passed to the constructor are read from
    the environment variables ``ARANGO_HOST``, ``ARANGO_USER``,
    ``ARANGO_PASSWORD`` and ``ARANGO_DB``.

    Attributes:
        client: The ``ArangoClient`` created for the configured host.
        db: The database handle returned by ``client.db(...)``.
    """

    # Matches every character that fix_key() replaces. Compiled once when the
    # class is created instead of being looked up on each call.
    _INVALID_KEY_CHARS = re.compile(r"[^A-Za-z0-9_\-\.@()+=;$!*\'%:]")

    def __init__(self, user=None, password=None, db_name=None) -> None:
        """
        Create the client and open the database handle.

        Any argument that is falsy (``None`` or an empty string) falls back
        to its environment variable. The host is always taken from
        ``ARANGO_HOST``.

        Args:
            user (str): Username for authentication (fallback: ``ARANGO_USER``).
            password (str): Password for authentication
                (fallback: ``ARANGO_PASSWORD``).
            db_name (str): Name of the database (fallback: ``ARANGO_DB``).

        Raises:
            ValueError: If ``ARANGO_HOST`` is not set or is empty.
        """
        host = os.getenv("ARANGO_HOST")
        if not host:
            # Fail early with a readable message rather than letting the
            # client choke on hosts=None.
            raise ValueError(
                "ARANGO_HOST is not set; cannot create an ArangoDB client."
            )

        user = user or os.getenv("ARANGO_USER")
        password = password or os.getenv("ARANGO_PASSWORD")
        db_name = db_name or os.getenv("ARANGO_DB")

        self.client = ArangoClient(hosts=host)
        self.db = self.client.db(db_name, username=user, password=password)

    def fix_key(self, _key: str) -> str:
        """
        Sanitize a string so it only contains the characters kept as key
        characters.

        Kept as-is: ASCII letters, digits, underscore, hyphen, dot, at symbol,
        parentheses, plus, equals, semicolon, dollar sign, exclamation mark,
        asterisk, single quote, percent and colon. Every other character
        (whitespace, slashes, commas, non-ASCII letters, ...) is replaced by
        a single underscore, one underscore per replaced character.

        Args:
            _key (str): The key to be sanitized.

        Returns:
            str: The sanitized key with disallowed characters replaced by
            underscores. The length of the string is unchanged.
        """
        return self._INVALID_KEY_CHARS.sub("_", _key)
|
|
|
|
|
if __name__ == "__main__":
    # One-off clean-up: remove everything enclosed in angle brackets (the
    # brackets included) from the abstract of each document in the
    # 'sci_articles' collection of the 'base' database.
    arango = ArangoDB(db_name='base')
    collection = arango.db.collection('sci_articles')
    tag_pattern = re.compile(r'<[^>]*>')

    for article in collection.all():
        metadata = article.get('metadata')
        # Skip documents without metadata or without an abstract entry.
        if not metadata or 'abstract' not in metadata:
            continue

        abstract = metadata['abstract']
        # Only plain-string abstracts can be cleaned.
        if not isinstance(abstract, str):
            continue

        cleaned = tag_pattern.sub('', abstract)
        if cleaned == abstract:
            # Nothing was stripped, so skip the write and the "Updated" message.
            continue

        metadata['abstract'] = cleaned
        collection.update_match(
            filters={'_key': article['_key']},
            body={'metadata': metadata},
            merge=True,
        )
        print(f"Updated abstract for {article['_key']}")
|
|
|
|
|
|