You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
108 lines
3.8 KiB
108 lines
3.8 KiB
# utils.py |
|
import re |
|
|
|
def fix_key(_key: str) -> str:
    """
    Sanitize a key by replacing every character that is not alphanumeric,
    underscore, hyphen, dot, at sign, parenthesis, plus, equals, semicolon,
    dollar sign, exclamation mark, asterisk, single quote, percent, or colon
    with an underscore.

    Args:
        _key (str): The key to be sanitized.

    Returns:
        str: The sanitized key with disallowed characters replaced by
        underscores.
    """
    # Negated character class: everything outside the allow-list becomes "_".
    return re.sub(r"[^A-Za-z0-9_\-\.@()+=;$!*\'%:]", "_", _key)
|
|
|
|
|
|
|
|
|
def is_reference_chunk(text: str) -> bool:
    """
    Determine if a text chunk consists PREDOMINANTLY of references or end
    matter (acknowledgements, appendices, declarations, reference lists).

    Conservative approach: only returns True for chunks that are clearly
    mostly references/end matter.

    Args:
        text (str): Text chunk to analyze.

    Returns:
        bool: True if the chunk appears to be mostly references/end matter.
    """
    # Split text into non-empty, stripped lines for line-based heuristics.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    if not lines:
        return False

    # First, check for unambiguous reference chunks: many DOIs or Elsevier
    # refhub links mean the chunk is almost certainly primarily references.
    doi_matches = len(re.findall(r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+", text))
    refhub_matches = len(re.findall(r'http://refhub\.elsevier\.com/\S+', text))
    if doi_matches >= 15 or refhub_matches >= 10:
        return True

    # Common end-matter section headers (Markdown-bold or a bare
    # "References" line).
    end_matter_patterns = [
        r"\*\*Credit author statement\*\*",
        r"\*\*Declaration of competing interest\*\*",
        r"\*\*Acknowledgment\*\*",
        r"\*\*Acknowledgement\*\*",
        r"\*\*Appendix\b.*\*\*",
        r"\*\*References\*\*",
        r"^References[\s]*$"
    ]

    # Collect the start offset of every end-matter header occurrence.
    end_matter_positions = [
        match.start()
        for pattern in end_matter_patterns
        for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
    ]

    if end_matter_positions:
        # Fraction of the chunk that precedes the earliest end-matter header.
        first_end_matter = min(end_matter_positions)
        substantive_ratio = first_end_matter / len(text)

        # Filter only when less than 10% of the chunk is substantive content
        # before the end matter — i.e. the chunk is predominantly end matter.
        # (Conservative: any significant leading content keeps the chunk.)
        return substantive_ratio < 0.10

    # Numbered reference list: if more than half the lines start with a
    # citation marker like "[12]", treat the chunk as a reference list.
    citation_line_count = sum(
        1 for line in lines if re.match(r'^\s*\[\d+\]', line)
    )
    if citation_line_count > len(lines) / 2:
        return True

    # Abbreviation list: lines shaped like "ABBR Expansion". Only filter if
    # more than 70% of the lines fit that shape.
    abbreviation_lines = sum(
        1 for line in lines if re.match(r'^[A-Z0-9]{2,}\s+[A-Z][a-z]+', line)
    )
    if abbreviation_lines > len(lines) * 0.7:
        return True

    # Conservative default: only filter chunks that are clearly references.
    return False
|
|
|
|