# utils.py
import re


def fix_key(_key: str) -> str:
    """
    Sanitize a given key by replacing all characters that are not alphanumeric,
    underscore, hyphen, dot, at symbol, parentheses, plus, equals, semicolon,
    dollar sign, exclamation mark, asterisk, single quote, percent, or colon
    with an underscore.

    Args:
        _key (str): The key to be sanitized.

    Returns:
        str: The sanitized key with disallowed characters replaced by underscores.
    """
    return re.sub(r"[^A-Za-z0-9_\-\.@()+=;$!*\'%:]", "_", _key)
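
# Illustrative calls for fix_key (the keys below are hypothetical examples, not
# values taken from any real pipeline; shown as comments rather than executed):
#   fix_key("cohort study/2021 final.pdf")  -> "cohort_study_2021_final.pdf"
#   fix_key("trial+v2(draft)=ok")           -> "trial+v2(draft)=ok"  (all characters allowed)
#   fix_key("name<with>bad|chars")          -> "name_with_bad_chars"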


def is_reference_chunk(text: str) -> bool:
    """
    Determine if a text chunk consists PREDOMINANTLY of references or end matter.

    Conservative approach: only returns True for chunks that are clearly mostly
    references.

    Args:
        text (str): Text chunk to analyze.

    Returns:
        bool: True if the chunk appears to be mostly references/end matter.
    """
    # Split text into lines for analysis
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    if not lines:
        return False

    # First, check for unambiguous reference chunks (many DOIs or reference links)
    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
    doi_matches = len(re.findall(doi_pattern, text))
    refhub_matches = len(re.findall(r'http://refhub\.elsevier\.com/\S+', text))

    # If there are many DOIs or refhub links, it's almost certainly primarily references
    if doi_matches >= 15 or refhub_matches >= 10:
        return True

    # Find positions of common end matter section headers
    end_matter_patterns = [
        r"\*\*Credit author statement\*\*",
        r"\*\*Declaration of competing interest\*\*",
        r"\*\*Acknowledgment\*\*",
        r"\*\*Acknowledgement\*\*",
        r"\*\*Appendix\b.*\*\*",
        r"\*\*References\*\*",
        r"^References[\s]*$",
    ]

    # Try to identify where end matter begins
    end_matter_positions = []
    for pattern in end_matter_patterns:
        matches = list(re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE))
        for match in matches:
            end_matter_positions.append(match.start())

    # If we found end matter sections
    if end_matter_positions:
        # Find the earliest end matter position
        first_end_matter = min(end_matter_positions)
        # Calculate ratio of substantive content
        substantive_ratio = first_end_matter / len(text)
        # If less than 10% of the chunk is substantive content, filter it.
        # This is conservative - only filter if the chunk is predominantly end matter
        if substantive_ratio < 0.10:
            return True
        else:
            # There's significant substantive content before end matter
            return False
    # Count reference indicators (currently informational only; this counter is
    # not used in the final decision below)
    reference_indicators = 0

    # Citation patterns with year, volume, pages (e.g. "2020;12:345-356")
    citation_patterns = len(re.findall(r'\d{4};\d+:\d+[-–]\d+', text))
    reference_indicators += citation_patterns * 2
    # Check for lines starting with citation numbers
    lines_starting_with_citation = 0
    for line in lines:
        if re.match(r'^\s*\[\d+\]', line):
            lines_starting_with_citation += 1

    # If more than half the lines start with reference numbers, it's a reference list
    if lines_starting_with_citation > len(lines) / 2:
        return True

    # Check for abbreviation list (only if it makes up most of the chunk)
    abbreviation_lines = 0
    for line in lines:
        if re.match(r'^[A-Z0-9]{2,}\s+[A-Z][a-z]+', line):
            abbreviation_lines += 1

    # If more than 70% of lines are abbreviations, it's an abbreviation list
    if abbreviation_lines > len(lines) * 0.7:
        return True

    # Conservative approach: only filter if it's clearly mostly references
    return False
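

# Minimal smoke test for both helpers. The sample strings are made up for
# illustration and are not taken from any real document; expected results are
# noted in the trailing comments.
if __name__ == "__main__":
    _refs = "\n".join(
        f"[{i}] Author A, Author B. Some title. Journal. 2020;{i}:10-20."
        for i in range(1, 6)
    )
    _body = (
        "We recruited 120 participants across three sites.\n"
        "Mean follow-up was 18 months, and adherence exceeded 90%."
    )
    print(fix_key("my report/final v2.pdf"))   # -> my_report_final_v2.pdf
    print(is_reference_chunk(_refs))           # -> True (most lines start with [n])
    print(is_reference_chunk(_body))           # -> False (ordinary prose)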