# utils.py
import re


def fix_key(_key: str) -> str:
    """
    Sanitize a given key by replacing all characters that are not alphanumeric,
    underscore, hyphen, dot, at symbol, parentheses, plus, equals, semicolon,
    dollar sign, exclamation mark, asterisk, single quote, percent, or colon
    with an underscore.

    Args:
        _key (str): The key to be sanitized.

    Returns:
        str: The sanitized key with disallowed characters replaced by
            underscores.
    """
    return re.sub(r"[^A-Za-z0-9_\-\.@()+=;$!*\'%:]", "_", _key)


def is_reference_chunk(text: str) -> bool:
    """
    Determine if a text chunk consists PREDOMINANTLY of references or end matter.

    Conservative approach: only returns True for chunks that are clearly
    mostly references.

    Args:
        text (str): Text chunk to analyze

    Returns:
        bool: True if the chunk appears to be mostly references/end matter
    """
    # Split text into non-empty, stripped lines for the line-based heuristics.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    if not lines:
        return False

    # First, check for unambiguous reference chunks (many DOIs or reference
    # links). If there are many DOIs or refhub links, the chunk is almost
    # certainly primarily references.
    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
    doi_matches = len(re.findall(doi_pattern, text))
    refhub_matches = len(re.findall(r'http://refhub\.elsevier\.com/\S+', text))
    if doi_matches >= 15 or refhub_matches >= 10:
        return True

    # Common end-matter section headers (presumably as emitted by a
    # paper-to-markdown conversion — bold "**...**" headings).
    end_matter_patterns = [
        r"\*\*Credit author statement\*\*",
        r"\*\*Declaration of competing interest\*\*",
        r"\*\*Acknowledgment\*\*",
        r"\*\*Acknowledgement\*\*",
        r"\*\*Appendix\b.*\*\*",
        r"\*\*References\*\*",
        r"^References[\s]*$",
    ]

    # Character offsets at which any end-matter header begins.
    end_matter_positions = [
        match.start()
        for pattern in end_matter_patterns
        for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
    ]

    if end_matter_positions:
        # Everything before the earliest end-matter header counts as
        # substantive content. len(text) > 0 is guaranteed here because
        # `lines` is non-empty.
        first_end_matter = min(end_matter_positions)
        substantive_ratio = first_end_matter / len(text)
        # Conservative: filter only when less than 10% of the chunk is
        # substantive content, i.e. the chunk is predominantly end matter.
        # (A previous comment said 30%, but the threshold has always been 0.10.)
        return substantive_ratio < 0.10

    # NOTE(review): a citation-density score based on "year;volume:pages"
    # matches was previously computed here but never read — removed as dead
    # code; behavior is unchanged.

    # Reference list: more than half the lines start with "[n]" markers.
    lines_starting_with_citation = sum(
        1 for line in lines if re.match(r'^\s*\[\d+\]', line)
    )
    if lines_starting_with_citation > len(lines) / 2:
        return True

    # Abbreviation list: more than 70% of lines look like "ABC Term".
    abbreviation_lines = sum(
        1 for line in lines if re.match(r'^[A-Z0-9]{2,}\s+[A-Z][a-z]+', line)
    )
    if abbreviation_lines > len(lines) * 0.7:
        return True

    # Conservative default: not clearly mostly references.
    return False