You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
108 lines
3.8 KiB
108 lines
3.8 KiB
# utils.py |
|
import re |
|
|
|
def fix_key(_key: str) -> str:
    """
    Sanitize a key by replacing every character that is not alphanumeric,
    underscore, hyphen, dot, at sign, parenthesis, plus, equals, semicolon,
    dollar sign, exclamation mark, asterisk, single quote, percent, or colon
    with an underscore.

    Args:
        _key (str): The key to be sanitized.

    Returns:
        str: The sanitized key with disallowed characters replaced by
        underscores.
    """
    # Negated character class: everything outside the allow-list becomes "_".
    return re.sub(r"[^A-Za-z0-9_\-\.@()+=;$!*\'%:]", "_", _key)
|
|
|
|
|
|
|
|
|
def is_reference_chunk(text: str) -> bool:
    """
    Determine if a text chunk consists PREDOMINANTLY of references or end
    matter (acknowledgements, appendices, declarations, reference lists).

    Conservative approach: only returns True for chunks that are clearly
    mostly references/end matter.

    Args:
        text (str): Text chunk to analyze.

    Returns:
        bool: True if the chunk appears to be mostly references/end matter.
    """
    # Split text into non-empty, stripped lines for line-based heuristics.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    if not lines:
        return False

    # First, check for unambiguous reference chunks: many DOIs or Elsevier
    # refhub links mean the chunk is almost certainly primarily references.
    doi_matches = len(re.findall(r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+", text))
    refhub_matches = len(re.findall(r'http://refhub\.elsevier\.com/\S+', text))
    if doi_matches >= 15 or refhub_matches >= 10:
        return True

    # Common end-matter section headers (Markdown-bold or a bare
    # "References" line).
    end_matter_patterns = [
        r"\*\*Credit author statement\*\*",
        r"\*\*Declaration of competing interest\*\*",
        r"\*\*Acknowledgment\*\*",
        r"\*\*Acknowledgement\*\*",
        r"\*\*Appendix\b.*\*\*",
        r"\*\*References\*\*",
        r"^References[\s]*$"
    ]

    # Collect the start offset of every end-matter header occurrence.
    end_matter_positions = [
        match.start()
        for pattern in end_matter_patterns
        for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
    ]

    if end_matter_positions:
        # Fraction of the chunk that precedes the earliest end-matter header.
        first_end_matter = min(end_matter_positions)
        substantive_ratio = first_end_matter / len(text)

        # Filter only when less than 10% of the chunk is substantive content
        # before the end matter — i.e. the chunk is predominantly end matter.
        # (Conservative: any significant leading content keeps the chunk.)
        return substantive_ratio < 0.10

    # Numbered reference list: if more than half the lines start with a
    # citation marker like "[12]", treat the chunk as a reference list.
    citation_line_count = sum(
        1 for line in lines if re.match(r'^\s*\[\d+\]', line)
    )
    if citation_line_count > len(lines) / 2:
        return True

    # Abbreviation list: lines shaped like "ABBR Expansion". Only filter if
    # more than 70% of the lines fit that shape.
    abbreviation_lines = sum(
        1 for line in lines if re.match(r'^[A-Z0-9]{2,}\s+[A-Z][a-z]+', line)
    )
    if abbreviation_lines > len(lines) * 0.7:
        return True

    # Conservative default: only filter chunks that are clearly references.
    return False
|
|
|
|