import re
from dataclasses import dataclass
from typing import List, Tuple


class TextChunker:
    """
    A smart text chunker that analyzes text structure and automatically
    determines the best splitting strategy based on detected patterns.
    Always splits on sentence boundaries.
    """

    def __init__(self, chunk_limit: int = 500, chunk_overlap: int = 0):
        """
        Initialize the smart chunker.

        Args:
            chunk_limit: Target maximum characters per chunk (may be exceeded
                to preserve sentences).
            chunk_overlap: Number of characters to overlap between chunks.
                NOTE(review): stored but not used by any method in this file.
        """
        self.chunk_limit = chunk_limit
        self.chunk_overlap = chunk_overlap

    @dataclass
    class SeparatorInfo:
        """Information about a detected separator in the text."""

        pattern: str  # regex used to split on this separator
        count: int  # number of occurrences found in the text
        priority: int  # lower number = higher priority (tried first)
        description: str  # human-readable summary for verbose output
        keep_separator: bool = True

    def _detect_separators(self, text: str) -> List["TextChunker.SeparatorInfo"]:
        """
        Analyze the text and detect available separators with their priority.

        Returns a list of separators ordered by priority (best to worst).
        """
        separators = []

        # Markdown headers (# Header, ## Header, etc.)
        md_headers = re.findall(r"^#{1,6}\s+.+$", text, re.MULTILINE)
        if md_headers:
            separators.append(
                self.SeparatorInfo(
                    pattern=r"\n(?=#{1,6}\s+)",
                    count=len(md_headers),
                    priority=1,
                    description=f"Markdown headers ({len(md_headers)} found)",
                )
            )

        # HTML headers (<h1>...</h1> through <h6>...</h6>)
        html_headers = re.findall(
            r"<h[1-6][^>]*>.*?</h[1-6]>", text, re.IGNORECASE | re.DOTALL
        )
        if html_headers:
            separators.append(
                self.SeparatorInfo(
                    pattern=r"\n(?=<h[1-6])",
                    count=len(html_headers),
                    priority=2,
                    description=f"HTML headers ({len(html_headers)} found)",
                )
            )

        # HTML structural containers (<div ...> / <section ...>)
        html_divs = re.findall(r"<(?:div|section)[^>]*>", text, re.IGNORECASE)
        if html_divs:
            separators.append(
                self.SeparatorInfo(
                    pattern=r"\n(?=<(?:div|section))",
                    count=len(html_divs),
                    priority=3,
                    description=f"HTML divs/sections ({len(html_divs)} found)",
                )
            )

        # Horizontal rules (---, ***, ___)
        hr_count = len(re.findall(r"^(?:---+|\*\*\*+|___+)\s*$", text, re.MULTILINE))
        if hr_count:
            separators.append(
                self.SeparatorInfo(
                    pattern=r"\n(?:---+|\*\*\*+|___+)\s*\n",
                    count=hr_count,
                    priority=4,
                    description=f"Horizontal rules ({hr_count} found)",
                )
            )

        # Bullet points or numbered lists
        list_items = re.findall(r"^[\s]*(?:[-*+]|\d+\.)\s+", text, re.MULTILINE)
        if list_items:
            # Group runs of consecutive list items; only useful if >1 group
            list_groups = len(
                re.findall(r"(?:^[\s]*(?:[-*+]|\d+\.)\s+.*\n)+", text, re.MULTILINE)
            )
            if list_groups > 1:
                separators.append(
                    self.SeparatorInfo(
                        pattern=r"\n(?=[\s]*(?:[-*+]|\d+\.)\s+)",
                        count=list_groups,
                        priority=5,
                        description=f"List groups ({list_groups} found)",
                    )
                )

        # Double newlines (paragraphs)
        double_newlines = text.count("\n\n")
        if double_newlines > 0:
            separators.append(
                self.SeparatorInfo(
                    pattern=r"\n\n",
                    count=double_newlines,
                    priority=6,
                    description=f"Paragraphs ({double_newlines} found)",
                )
            )

        # Single newlines
        single_newlines = text.count("\n") - (double_newlines * 2)
        if single_newlines > 0:
            separators.append(
                self.SeparatorInfo(
                    pattern=r"\n",
                    count=single_newlines,
                    priority=7,
                    description=f"Lines ({single_newlines} found)",
                )
            )

        # Sentence endings (punctuation followed by whitespace)
        sentences = re.findall(r"[.!?]+[\s\n]+", text)
        if sentences:
            separators.append(
                self.SeparatorInfo(
                    pattern=r"(?<=[.!?])\s+",
                    count=len(sentences),
                    priority=8,
                    description=f"Sentences ({len(sentences)} found)",
                )
            )

        # Sort by priority (lower number = higher priority)
        separators.sort(key=lambda x: x.priority)
        return separators

    def _split_by_separator(self, text: str, separator_pattern: str) -> List[str]:
        """Split text by a separator pattern, preserving the separator."""
        if not text:
            return []

        # Split while keeping the separator (capture group retains it)
        parts = re.split(f"({separator_pattern})", text)

        # Reconstruct pieces with their trailing separators attached
        result = []
        current = ""
        for part in parts:
            if part:
                current += part
                # If we just appended a separator, close out this piece
                if re.match(separator_pattern, part):
                    if current.strip():
                        result.append(current)
                    current = ""

        # Add any remaining text
        if current.strip():
            result.append(current)

        # If no splits occurred, return the original text
        if not result:
            result = [text]
        return result

    def _split_by_sentences(self, text: str) -> List[str]:
        """
        Split text into complete sentences, ensuring no mid-sentence breaks.

        Returns chunks that respect sentence boundaries and tries to balance
        chunk sizes.
        """
        # Sentence boundary: ., ! or ? followed by whitespace
        sentence_pattern = r"(?<=[.!?])\s+"
        sentences = re.split(sentence_pattern, text)
        if not sentences:
            return [text]

        # Filter out empty sentences
        sentences = [s.strip() for s in sentences if s.strip()]
        if not sentences:
            return [text]

        # If all sentences fit in one chunk, return as is
        total_length = (
            sum(len(s) for s in sentences) + len(sentences) - 1
        )  # account for joining spaces
        if total_length <= self.chunk_limit:
            return [" ".join(sentences)]

        # Build chunks greedily first
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            # If adding this sentence would exceed the limit, start a new chunk
            if (
                current_chunk
                and len(current_chunk) + len(sentence) + 1 > self.chunk_limit
            ):
                chunks.append(current_chunk)
                current_chunk = sentence
            else:
                if current_chunk:
                    current_chunk += " " + sentence
                else:
                    current_chunk = sentence

        # Add final chunk
        if current_chunk:
            chunks.append(current_chunk)

        # Balance: if the last chunk is under 50% of chunk_limit, redistribute
        if len(chunks) >= 2:
            last_chunk_size = len(chunks[-1])
            if last_chunk_size < self.chunk_limit * 0.5:
                chunks = self._balance_sentence_chunks(sentences)

        return chunks if chunks else [text]

    def _balance_sentence_chunks(self, sentences: List[str]) -> List[str]:
        """
        Distribute sentences across chunks to minimize size variance.

        Uses a greedy approach that looks ahead to avoid tiny final chunks.
        """
        if not sentences:
            return []

        total_length = sum(len(s) for s in sentences) + len(sentences) - 1

        # Estimate number of chunks needed (ceiling division)
        estimated_chunks = max(
            1, (total_length + self.chunk_limit - 1) // self.chunk_limit
        )
        target_size = total_length / estimated_chunks

        chunks = []
        current_chunk = ""
        remaining_sentences = len(sentences)

        for i, sentence in enumerate(sentences):
            remaining_sentences -= 1

            if not current_chunk:
                current_chunk = sentence
            else:
                # Length of text still to be processed after this sentence
                remaining_text_length = sum(len(s) for s in sentences[i + 1 :])
                if remaining_sentences > 0:
                    remaining_text_length += remaining_sentences  # joining spaces

                current_length = len(current_chunk)
                new_length = current_length + len(sentence) + 1

                # Start a new chunk if:
                # 1. Adding would exceed limit AND current chunk is >= 70% of target
                # 2. OR we're near target size and plenty of text remains
                if (
                    new_length > self.chunk_limit
                    and current_length >= target_size * 0.7
                ):
                    chunks.append(current_chunk)
                    current_chunk = sentence
                elif (
                    current_length >= target_size * 0.9
                    and remaining_text_length > target_size * 0.5
                ):
                    chunks.append(current_chunk)
                    current_chunk = sentence
                else:
                    current_chunk += " " + sentence

        if current_chunk:
            chunks.append(current_chunk)
        return chunks

    def _merge_small_chunks(self, chunks: List[str]) -> List[str]:
        """
        Merge chunks that are smaller than the limit to optimize chunk sizes.

        Ensures the last chunk is not much smaller than the chunk_limit by
        merging it with the previous chunk if needed.
        """
        if not chunks:
            return []

        merged = []
        current = chunks[0]
        for next_chunk in chunks[1:]:
            # If combining won't exceed limit, merge them
            if len(current) + len(next_chunk) <= self.chunk_limit:
                current += next_chunk
            else:
                merged.append(current)
                current = next_chunk

        # Add the last chunk
        merged.append(current)

        # If the last chunk is much smaller than chunk_limit, fold it into the
        # previous one (unless there's only one chunk)
        if len(merged) >= 2 and len(merged[-1]) < self.chunk_limit * 0.5:
            merged[-2] += merged[-1]
            merged.pop(-1)
        return merged

    def _recursive_split(
        self, text: str, separators: List[SeparatorInfo], separator_idx: int = 0
    ) -> List[str]:
        """
        Recursively split text using available separators until chunks fit.

        Always falls back to sentence-aware splitting to avoid mid-sentence
        breaks.
        """
        # Base case: if text fits, return it
        if len(text) <= self.chunk_limit:
            return [text]

        # If we've exhausted all separators, use sentence-aware splitting
        if separator_idx >= len(separators):
            return self._split_by_sentences(text)

        # Try current separator
        separator = separators[separator_idx]
        splits = self._split_by_separator(text, separator.pattern)

        # If no split occurred or only one piece, try next separator
        if len(splits) <= 1:
            return self._recursive_split(text, separators, separator_idx + 1)

        # Process each split; recurse on pieces that are still too large
        result = []
        for split in splits:
            if len(split) <= self.chunk_limit:
                result.append(split)
            else:
                sub_chunks = self._recursive_split(split, separators, separator_idx + 1)
                result.extend(sub_chunks)

        # Merge small consecutive chunks
        result = self._merge_small_chunks(result)
        return result

    def chunk(self, text: str, verbose: bool = False, headings: str = "") -> List[str]:
        """
        Chunk the text using automatically detected separators.

        Always splits on complete sentences.

        Args:
            text: The text to chunk
            verbose: If True, print information about detected separators
            headings: Optional headings/context to prepend to each chunk (string)

        Returns:
            List of text chunks, each optionally prefixed with the provided
            headings
        """
        if not text:
            return []

        # Detect available separators
        separators = self._detect_separators(text)

        if verbose:
            print(f"Detected {len(separators)} separator types:")
            for sep in separators:
                print(f"  - {sep.description} (priority {sep.priority})")
            print()

        # If no separators found, use sentence-aware splitting
        if not separators:
            if verbose:
                print("No natural separators found, splitting by sentences")
            chunks = self._split_by_sentences(text)
        else:
            # Recursively split using detected separators
            chunks = self._recursive_split(text, separators)

        # Clean up chunks
        chunks = [chunk.strip() for chunk in chunks if chunk.strip()]

        # Add headings to each chunk if provided
        if headings and headings.strip():
            # Ensure headings end with newlines for proper formatting
            formatted_headings = headings.strip()
            if not formatted_headings.endswith("\n"):
                formatted_headings += "\n\n"
            else:
                formatted_headings += "\n"
            # Prepend headings (as a markdown-style '#' header) to each chunk
            chunks = [f"#{formatted_headings}" + chunk for chunk in chunks]

        if verbose:
            print(f"Created {len(chunks)} chunks")
            if headings:
                print(f"Added headings to each chunk: '{headings.strip()}'")
            print(f"Chunk sizes: {[len(c) for c in chunks]}")

        return chunks


def detect_sql_syntax(query: str) -> dict:
    """
    Detects if a query contains SQL syntax instead of AQL.

    Args:
        query: The query string to check

    Returns:
        dict with keys:
        - is_sql: bool, True if SQL patterns detected
        - issues: list of detected SQL patterns
        - suggestion: str, helpful message for the LLM
    """
    query_upper = query.upper()
    issues = []

    # Common SQL patterns that don't exist in AQL.
    # NOTE: each regex appears exactly once (previous revision had duplicates
    # that produced two messages per match).
    sql_patterns = [
        (r"\bINNER\s+JOIN\b", "Found 'INNER JOIN' - use nested FOR loops instead"),
        (r"\bIS\s+NULL\b", "Found 'IS NULL' - SQL null test"),
        (r"\bHAVING\b", "Found 'HAVING' - use FILTER after COLLECT instead"),
        (r"\bINSTR\s*\(", "Found 'INSTR' - Oracle string position function"),
        (r"\bORDER\s+BY\b", "Found 'ORDER BY' - use 'SORT' instead"),
        (r"\bPOSITION\s*\(", "Found 'POSITION' - SQL POSITION function"),
        (
            r"\bCASE\b[\s\S]{0,200}\bWHEN\b",
            "Found 'CASE ... WHEN' - SQL conditional expression",
        ),
        (r"\bSTRING_AGG\s*\(", "Found 'STRING_AGG(' - Postgres aggregate"),
        (r"\bRIGHT\s+JOIN\b", "Found 'RIGHT JOIN' - use nested FOR loops instead"),
        (
            r"\bSUBSTRING\s*\(",
            "Found 'SUBSTRING' - SQL substring function (AQL uses SUBSTRING() but with diff. semantics; beware false positives)",
        ),
        (r"\bOVER\s*\(", "Found 'OVER(' - SQL window clause"),
        (r"\bWHERE\b", "Found 'WHERE' - use 'FILTER' instead"),
        (r"\bREGEXP_LIKE\s*\(", "Found 'REGEXP_LIKE' - SQL regex function, not in AQL"),
        (r"\bPATINDEX\s*\(", "Found 'PATINDEX' - T-SQL pattern search"),
        (
            r"\bJOIN\s+\w+\s+ON\b",
            "Found 'JOIN ... ON' - use nested FOR loops with FILTER instead",
        ),
        (r"\bSTRPOS\s*\(", "Found 'STRPOS' - Postgres string position function"),
        (r"\bPARTITION\s+BY\b", "Found 'PARTITION BY' - window function partitioning"),
        (r"\bCAST\s*\([^)]*\s+AS\s+\w+\)", "Found 'CAST(... AS type)' - SQL cast"),
        (r"\bLIKE\b", "Found 'LIKE' - SQL pattern match"),
        (r"\bILIKE\b", "Found 'ILIKE' - Postgres case-insensitive LIKE"),
        (r"\bGROUP\s+BY\b", "Found 'GROUP BY' - use 'COLLECT' instead"),
        (
            r"\bMIN\s*\(\s*\w+\.\w+\s*\)",
            "Found 'MIN(table.column)' - use 'RETURN MIN(doc.field)' or aggregate in COLLECT instead",
        ),
        (r"\bCOUNT\s*\(", "Found 'COUNT(' - SQL aggregate"),
        (
            r"\bREGEXP_REPLACE\s*\(",
            "Found 'REGEXP_REPLACE' - SQL regex function, not in AQL",
        ),
        (r"\bMIN\s*\(", "Found 'MIN(' - SQL aggregate"),
        (
            r"\bOFFSET\s+\d+",
            "Found 'OFFSET' alone - in AQL use 'LIMIT offset, count' format",
        ),
        (
            r"\bAVG\s*\(\s*\w+\.\w+\s*\)",
            "Found 'AVG(table.column)' - use 'RETURN AVG(doc.field)' or aggregate in COLLECT instead",
        ),
        (
            r"\bAS\s+\w+\s+FROM\b",
            "Found table alias with 'AS' - AQL doesn't use AS for collections",
        ),
        (r"\bUNION\b", "Found 'UNION' - SQL set union"),
        (
            r"\bWITH\s+\w+\s+AS\s*\(",
            "Found CTE 'WITH name AS (' - common table expression",
        ),
        (r"\bGROUP_CONCAT\s*\(", "Found 'GROUP_CONCAT(' - MySQL aggregate"),
        (r"\bMAX\s*\(", "Found 'MAX(' - SQL aggregate"),
        (r"\bTOP\s+\d+\b", "Found 'TOP N' - SQL Server style (pagination)"),
        (
            r"\bREGEXP_INSTR\s*\(",
            "Found 'REGEXP_INSTR' - SQL regex function, not in AQL",
        ),
        (r"\bROW_NUMBER\s*\(", "Found 'ROW_NUMBER(' - SQL window function"),
        (r"\bLEFT\s+JOIN\b", "Found 'LEFT JOIN' - use nested FOR loops instead"),
        (r"\bJOIN\b", "Found 'JOIN' - use nested FOR loops in AQL"),
        (
            r"\bLENGTH\s*\(",
            "Found 'LENGTH' - SQL string length (AQL uses LENGTH() but semantics differ: counts array elements too)",
        ),
        (r"\bSELECT\s+", "Found 'SELECT' - use 'FOR ... IN ... RETURN' instead"),
        (
            r"\bCOUNT\s*\(\s*\*\s*\)",
            "Found 'COUNT(*)' - use 'COLLECT WITH COUNT INTO var' instead",
        ),
        (
            r"\bSUM\s*\(\s*\w+\.\w+\s*\)",
            "Found 'SUM(table.column)' - use 'RETURN SUM(doc.field)' or aggregate in COLLECT instead",
        ),
        (r"\bOFFSET\b", "Found 'OFFSET' - SQL-style pagination (watch variants)"),
        (
            r"\bSELECT\b[\s\S]{0,400}\bFROM\b",
            "Found 'SELECT ... FROM' - SQL-style query (use 'FOR ... IN ... RETURN')",
        ),
        (r"\bDISTINCT\b", "Found 'DISTINCT' - SQL DISTINCT (AQL uses COLLECT/UNIQUE)"),
        (
            r"\bEXISTS\s*\(\s*SELECT\b",
            "Found 'EXISTS (SELECT ...)' - SQL subquery existence check",
        ),
        (r"\(\s*SELECT\b", "Found '(SELECT ...)' - SQL subquery (nested select)"),
        (r"\bON\s+", "Found 'ON' (JOIN condition) - SQL join condition indicator"),
        (r"\bSUM\s*\(", "Found 'SUM(' - SQL aggregate"),
        (r"\bAVG\s*\(", "Found 'AVG(' - SQL aggregate"),
        (
            r"\bFROM\s+\w+\s+WHERE\b",
            "Found 'FROM ... WHERE' - use 'FOR ... IN ... FILTER' instead",
        ),
        (
            r"\bMAX\s*\(\s*\w+\.\w+\s*\)",
            "Found 'MAX(table.column)' - use 'RETURN MAX(doc.field)' or aggregate in COLLECT instead",
        ),
        (r"\bCONVERT\s*\([^)]*\)", "Found 'CONVERT(...)' - SQL convert/cast"),
        (r"\bBETWEEN\b", "Found 'BETWEEN' - SQL range operator"),
        (
            r"\bREGEXP_SUBSTR\s*\(",
            "Found 'REGEXP_SUBSTR' - SQL regex function, not in AQL",
        ),
        (r"\bCHARINDEX\s*\(", "Found 'CHARINDEX' - T-SQL string search"),
        (
            r"\bREGEXP_COUNT\s*\(",
            "Found 'REGEXP_COUNT' - SQL regex function, not in AQL",
        ),
        (r"\bUNION\s+ALL\b", "Found 'UNION ALL' - SQL set union"),
        (r"\bIS\s+NOT\s+NULL\b", "Found 'IS NOT NULL' - SQL null test"),
    ]

    for pattern, message in sql_patterns:
        if re.search(pattern, query_upper):
            issues.append(message)

    # Special case: SELECT without a FOR loop (common typo). Only add the
    # extra note when no SELECT-related message is already present; the old
    # exact-string membership test could never match the suffixed messages.
    if re.search(r"\bSELECT\b", query_upper) and not re.search(
        r"\bFOR\s+\w+\s+IN\b", query_upper
    ):
        if not any(i.startswith("Found 'SELECT'") for i in issues):
            issues.append(
                "Query starts with SELECT but has no FOR loop - this is SQL, not AQL"
            )

    is_sql = len(issues) > 0
    suggestion = ""
    if is_sql:
        suggestion = (
            "ERROR: This query uses SQL syntax, not AQL! "
            "AQL (ArangoDB Query Language) syntax:\n"
            "- Start with: FOR doc IN collection\n"
            "- Filter with: FILTER doc.field == value\n"
            "- End with: RETURN doc (or specific fields)\n"
            "- For joins: use nested FOR loops\n"
            "- For grouping: use COLLECT\n\n"
            f"Detected issues:\n" + "\n".join(f"- {issue}" for issue in issues)
        )

    return {"is_sql": is_sql, "issues": issues, "suggestion": suggestion}


def _norm_whitespace(s: str) -> str:
    """Collapse every whitespace run to a single space and trim the ends."""
    return re.sub(r'\s+', ' ', s).strip()


def _extract_clause(sql: str, name: str, terminators: List[str]) -> Tuple[str, str]:
    """Extract clause `name` (like 'where') returning (content, remainder)."""
    pattern = rf'(?i)\b{name}\b\s*(.+)'
    m = re.search(pattern, sql)
    if not m:
        return '', sql
    rest = m.group(1)
    # Cut the clause at the first terminator token, if any
    min_pos = len(rest)
    for t in terminators:
        t_re = re.search(rf'(?i)\b{t}\b', rest)
        if t_re:
            min_pos = min(min_pos, t_re.start())
    return rest[:min_pos].strip(), rest[min_pos:].strip()


# Keywords that must never be mistaken for a table alias after FROM/JOIN
_SQL_KEYWORDS = r'(?:where|join|group|order|limit|on)'


def sql_to_aql(sql: str) -> str:
    """
    Best-effort translation of a simple SQL SELECT statement into AQL.

    Supports: FROM with optional alias, simple JOIN ... ON equality, WHERE,
    GROUP BY (single key with COUNT(*)), ORDER BY, LIMIT/OFFSET, and a
    heuristic mapping of LIKE '%term%' on `anforandetext` to a *_search view.

    Raises:
        ValueError: if the SELECT or FROM clause cannot be parsed.
    """
    s = _norm_whitespace(sql).rstrip(';')

    # SELECT clause
    m = re.search(r'(?i)\bselect\b\s+(.+?)\s+\bfrom\b\s', s)
    if not m:
        raise ValueError("Cannot parse SELECT clause")
    select_part = m.group(1).strip()

    # FROM clause (capture table and optional alias). The negative lookahead
    # prevents a following keyword (WHERE, JOIN, ...) being read as an alias.
    m = re.search(
        rf'(?i)\bfrom\b\s+([^\s,]+)(?:\s+(?!{_SQL_KEYWORDS}\b)([a-zA-Z_]\w*))?', s
    )
    if not m:
        raise ValueError("Cannot parse FROM clause")
    from_table = m.group(1)
    from_alias = m.group(2) if m.group(2) else from_table

    # Find joins (simple INNER JOIN / JOIN). The ON capture takes the whole
    # `lhs = rhs` equality, not just the left operand.
    joins = []
    for jm in re.finditer(
        r'(?i)\bjoin\b\s+(\S+)(?:\s+([a-zA-Z_]\w*))?\s+\bon\b\s+(\S+\s*=\s*\S+)', s
    ):
        j_table, j_alias, j_on = jm.group(1), (jm.group(2) or jm.group(1)), jm.group(3)
        joins.append((j_table, j_alias, j_on))

    # WHERE
    where_part, _ = _extract_clause(s, 'where', ['group by', 'order by', 'limit'])

    # GROUP BY
    group_by = ''
    m = re.search(r'(?i)\bgroup\s+by\b\s+(.+?)(?:\s+\border\s+by\b|\s+\blimit\b|$)', s)
    if m:
        group_by = m.group(1).strip()

    # ORDER BY
    order_by = ''
    m = re.search(r'(?i)\border\s+by\b\s+(.+?)(?:\s+\blimit\b|$)', s)
    if m:
        order_by = m.group(1).strip()

    # LIMIT / OFFSET (both `LIMIT o, c` and `LIMIT c [OFFSET o]` forms)
    offset = None
    limit = None
    m = re.search(r'(?i)\blimit\b\s+(\d+)\s*,\s*(\d+)', s)
    if m:
        offset = int(m.group(1))
        limit = int(m.group(2))
    else:
        m = re.search(r'(?i)\blimit\b\s+(\d+)', s)
        if m:
            limit = int(m.group(1))
        m = re.search(r'(?i)\boffset\b\s+(\d+)', s)
        if m:
            offset = int(m.group(1))

    # Heuristic: LIKE '%term%' on anforandetext maps to the *_search view
    # with SEARCH + TOKENS instead of a FILTER
    use_view_search = False
    search_term = None
    like_m = re.search(r"(?i)(anforandetext)\s+like\s+'%([^%']+)%'", s)
    if like_m:
        use_view_search = True
        search_term = like_m.group(2)
    else:
        # Also check generic LIKE on any column that looks like the text field
        like_m = re.search(r"(?i)([a-zA-Z0-9_\.]+)\s+like\s+'%([^%']+)%'", s)
        if like_m and 'anforandetext' in like_m.group(1).lower():
            use_view_search = True
            search_term = like_m.group(2)

    # Start building AQL
    aql_lines = []
    if use_view_search:
        # talks -> talks_search (guard against a table already named *_search)
        aql_lines.append(
            f"FOR {from_alias} IN {from_table}_search".replace(
                '_search_search', '_search'
            )
        )
    else:
        aql_lines.append(f"FOR {from_alias} IN {from_table}")

    # Add join FOR loops
    for j_table, j_alias, j_on in joins:
        aql_lines.append(f"  FOR {j_alias} IN {j_table}")

    # Convert ON conditions and WHERE into FILTERs
    filters = []
    for _, j_alias, j_on in joins:
        # e.g. p._key = t.intressent_id -> p._key == t.intressent_id
        filters.append(j_on.replace('=', '==').strip())

    if where_part:
        cond = where_part
        cond = re.sub(r'(?i)\s+and\s+', ' AND ', cond)
        cond = re.sub(r'(?i)\s+or\s+', ' OR ', cond)
        cond = cond.replace('<>', '!=')
        # Turn bare SQL '=' into '==' without corrupting >=, <=, != or ==
        cond = re.sub(r'(?<![<>!=])=(?!=)', '==', cond)
        # LIKE was handled by the view-search heuristic; drop raw LIKE tests
        cond = re.sub(r"(?i)\s+like\s+'%[^']+%'", '', cond)
        filters.append(cond.strip())

    for f in filters:
        if f:
            aql_lines.append(f"  FILTER {f}")

    # If use_view_search, add SEARCH line
    if use_view_search and search_term:
        aql_lines.append(
            f"  SEARCH ANALYZER({from_alias}.anforandetext IN TOKENS(\"{search_term}\", \"text_sv\"), \"text_sv\")"
        )

    # GROUP BY -> COLLECT (simple support: single key with COUNT(*)).
    # Handled before ORDER BY so we never emit a pre-COLLECT SORT on an
    # aggregate like COUNT(*); LIMIT is applied here too before returning.
    if group_by:
        group_cols = [c.strip() for c in group_by.split(',')]
        if len(group_cols) == 1 and re.search(r'(?i)count\(\s*\*\s*\)', select_part):
            key = group_cols[0]
            aql_lines.append(f"  COLLECT key = {key} WITH COUNT INTO cnt")
            aql_lines.append("  SORT cnt DESC")
            if limit is not None:
                if offset is None:
                    aql_lines.append(f"  LIMIT {limit}")
                else:
                    aql_lines.append(f"  LIMIT {offset}, {limit}")
            aql_lines.append("  RETURN { key, count: cnt }")
            return "\n".join(aql_lines)

    # SORT / ORDER BY conversion
    if order_by:
        order_expr = order_by.replace(' desc', ' DESC').replace(' asc', ' ASC')
        aql_lines.append(f"  SORT {order_expr}")

    # LIMIT/OFFSET
    if limit is not None:
        if offset is None:
            aql_lines.append(f"  LIMIT {limit}")
        else:
            aql_lines.append(f"  LIMIT {offset}, {limit}")

    # Build the RETURN clause
    if re.search(r'(?i)^count\s*\(\s*\*\s*\)\s*$', select_part.strip()):
        aql_lines.append("  COLLECT WITH COUNT INTO c")
        aql_lines.append("  RETURN c")
    else:
        # Map columns: return them as-is (user may need to adapt aliases);
        # build a keyed object when multiple columns were selected
        cols = [c.strip() for c in select_part.split(',')]
        if len(cols) == 1:
            aql_lines.append(f"  RETURN {cols[0]}")
        else:
            ret_items = []
            for c in cols:
                # e.g. "p.fodd_ar" -> key "p_fodd_ar"
                key = re.sub(r'[^a-zA-Z0-9_]', '_', c)
                ret_items.append(f'"{key}": {c}')
            ret_map = "{ " + ", ".join(ret_items) + " }"
            aql_lines.append(f"  RETURN {ret_map}")

    return "\n".join(aql_lines)


# ---- small CLI for quick tests ----
if __name__ == "__main__":
    examples = [
        "SELECT COUNT(*) FROM talks WHERE anforandetext LIKE '%korallrev%';",
        "SELECT t._id, p.fodd_ar FROM talks t JOIN people p ON p._key = t.intressent_id WHERE t.year = 2016;",
        "SELECT parti, COUNT(*) FROM talks WHERE dok_datum >= '2016-01-01' AND dok_datum <= '2016-12-31' GROUP BY parti ORDER BY COUNT(*) DESC;",
    ]
    for sql in examples:
        print("SQL:", sql)
        try:
            print("AQL:\n", sql_to_aql(sql))
        except Exception as e:  # demo CLI: report the failure and continue
            print("Error:", e)
        print("-" * 60)