From 00ff35dc9133a3442b0957801837566e6f4995af Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Fri, 26 Jun 2026 10:44:21 -0700 Subject: [PATCH] Fix LookML filter-expression to SQL conversion The LookML adapter treated Looker filter values as opaque strings, so anything beyond simple comparisons produced wrong or invalid SQL: - date/numeric ranges and intervals ([1,10], "5 to 10") became string equality or malformed IN lists (matched zero rows) - "not X" / negation -> literal "= 'not X'" - EMPTY -> omitted the NULL case - single quotes were not escaped (broken SQL / injection risk) Rewrite _convert_lookml_filter_to_sql to implement the representable parts of Looker's filter expression language (numeric ranges, interval brackets, NOT/negation, EMPTY = NULL OR '', wildcard NOT LIKE, mixed lists) and escape quoted string literals. Untranslated date/interval expressions now log a warning instead of silently emitting a zero-row equality. --- sidemantic/adapters/lookml.py | 289 +++++++++++++++-------- tests/adapters/lookml/test_edge_cases.py | 74 ++++++ 2 files changed, 266 insertions(+), 97 deletions(-) diff --git a/sidemantic/adapters/lookml.py b/sidemantic/adapters/lookml.py index c2607b79..7832196b 100644 --- a/sidemantic/adapters/lookml.py +++ b/sidemantic/adapters/lookml.py @@ -1,5 +1,6 @@ """LookML adapter for importing Looker semantic models.""" +import logging import re from pathlib import Path @@ -10,6 +11,8 @@ from sidemantic.core.relationship import Relationship from sidemantic.core.semantic_graph import SemanticGraph +logger = logging.getLogger(__name__) + def _import_lkml(): """Lazily import lkml, raising a clear error if not installed.""" @@ -193,107 +196,199 @@ def replace_ref(match: re.Match) -> str: return resolved - def _convert_lookml_filter_to_sql(self, field: str, value: str) -> str: - """Convert a LookML filter value to SQL condition. - - Handles LookML filter syntax: - - "value" -> field = 'value' - - "val1,val2,val3" -> field IN ('val1', 'val2', 'val3') - - "-value" -> field != 'value' (negation) - - "-val1,-val2" -> field NOT IN ('val1', 'val2') - - "yes"/"no" -> field = true/false (for yesno dimensions) - - ">100", ">=50", "<10", "<=5", "!=0" -> numeric comparisons - - "%pattern%" -> field LIKE '%pattern%' (wildcards) - - "NULL" -> field IS NULL - - "-NULL" -> field IS NOT NULL - - "EMPTY" -> field = '' - - "-EMPTY" -> field != '' - - Args: - field: The field name - value: The LookML filter value + @staticmethod + def _filter_is_number(s: str) -> bool: + """True if ``s`` is a plain numeric literal (incl. signed / decimal).""" + try: + float(s) + return True + except ValueError: + return False + + # Tokens that mark a value as a LookML date/interval filter expression + # (e.g. "last 7 days", "3 months ago", "this year"). These are not yet + # translated to SQL; we warn rather than silently string-comparing them. + _DATE_FILTER_RE = re.compile( + r"(?i)\b(ago|day|days|week|weeks|month|months|year|years|quarter|quarters|" + r"hour|hours|minute|minutes|second|seconds|today|yesterday|tomorrow|now|fiscal|" + r"week|month|year)\b" + ) - Returns: - SQL condition string + def _convert_lookml_filter_to_sql(self, field: str, value: str) -> str: + """Convert a LookML filter value to a SQL condition. + + Implements the representable parts of Looker's filter expression + language (https://cloud.google.com/looker/docs/filter-expressions): + + - ``value`` -> ``field = 'value'`` (single quotes escaped) + - ``a,b,c`` -> ``field IN ('a','b','c')`` (numeric: unquoted) + - ``-value`` / ``not value`` -> ``field <> 'value'`` + - ``-a,-b`` -> ``field NOT IN ('a','b')`` + - ``-%pat%`` / ``not %pat%`` -> ``field NOT LIKE '%pat%'`` + - ``yes`` / ``no`` -> ``field = true`` / ``field = false`` + - ``>100``, ``>=5``, ``<=5``, ``<10``, ``!=0``, ``<>0`` -> comparisons + - ``5 to 10`` -> ``field >= 5 AND field <= 10`` (open: ``5 to`` / ``to 10``) + - ``[1,10]`` ``(1,10)`` ``[1,10)`` ``(1,10]`` -> inclusive/exclusive ranges + - ``%pat%`` / ``_at`` -> ``field LIKE '%pat%'`` + - ``NULL`` / ``-NULL`` -> ``IS NULL`` / ``IS NOT NULL`` + - ``EMPTY`` -> ``(field IS NULL OR field = '')`` (and the negation) + - ``before X`` / ``after X`` -> ``field < X`` / ``field > X`` + - mixed comma lists (operators/wildcards) -> includes OR'd, excludes AND'd + + Date/interval expressions (``last 7 days``, ``3 months ago`` ...) are not + yet translated: they emit a literal equality but log a warning so the + silent zero-row match is at least surfaced. """ - # Handle NULL special values - if value.upper() == "NULL": - return f"{{model}}.{field} IS NULL" - if value.upper() == "-NULL": - return f"{{model}}.{field} IS NOT NULL" - - # Handle EMPTY special values - if value.upper() == "EMPTY": - return f"{{model}}.{field} = ''" - if value.upper() == "-EMPTY": - return f"{{model}}.{field} != ''" - - # Handle yes/no boolean values - if value.lower() == "yes": - return f"{{model}}.{field} = true" - if value.lower() == "no": - return f"{{model}}.{field} = false" - - # Check if this is a comma-separated list of values (OR condition) - # But be careful: ">100,<200" is two comparison operators, not a list - if "," in value: - parts = [p.strip() for p in value.split(",")] - - # Check if all parts are negations (NOT IN) - if all(p.startswith("-") for p in parts): - # Remove the - prefix from each - clean_parts = [p[1:] for p in parts] - # Check if they're all simple strings (not operators) - if all(not re.match(r"^(>=|<=|!=|<>|>|<)", p) for p in clean_parts): - quoted = ", ".join(f"'{p}'" for p in clean_parts) - return f"{{model}}.{field} NOT IN ({quoted})" - - # Check if all parts are simple values (no operators) -> IN clause - if all(not p.startswith("-") and not re.match(r"^(>=|<=|!=|<>|>|<)", p) for p in parts): - # Check if all parts are numeric - if all(p.replace(".", "").replace("-", "").isdigit() for p in parts): - # Numeric IN clause (no quotes) - return f"{{model}}.{field} IN ({', '.join(parts)})" - else: - # String IN clause (with quotes) - quoted = ", ".join(f"'{p}'" for p in parts) - return f"{{model}}.{field} IN ({quoted})" - - # Mixed operators - this is actually multiple filter conditions - # LookML doesn't really support this in a single filter value - # Fall through to single value handling (will be slightly wrong but safer) - - # Handle negation prefix for single values - if value.startswith("-") and not re.match(r"^-(>=|<=|!=|<>|>|<|\d)", value): - negated_value = value[1:] - if negated_value.replace(".", "").replace("-", "").isdigit(): - return f"{{model}}.{field} != {negated_value}" - else: - return f"{{model}}.{field} != '{negated_value}'" - - # Handle comparison operators: ">1000", "<=100", ">=5", "<10", "!=0" - if match := re.match(r"^(>=|<=|!=|<>|>|<)(.+)$", value): - operator, operand = match.groups() - operand = operand.strip() - # Normalize <> to != - if operator == "<>": - operator = "!=" - # Check if operand is numeric - if operand.replace(".", "").replace("-", "").isdigit(): - return f"{{model}}.{field} {operator} {operand}" - else: - return f"{{model}}.{field} {operator} '{operand}'" - - # Handle wildcard patterns (LIKE) - if "%" in value or "_" in value: - return f"{{model}}.{field} LIKE '{value}'" + col = f"{{model}}.{field}" + + def q(s: str) -> str: + return "'" + s.replace("'", "''") + "'" + + is_number = self._filter_is_number + + def single(v: str) -> str: + """Convert one (non-list) LookML filter token to a SQL condition.""" + v = v.strip() + up = v.upper() + if up == "NULL": + return f"{col} IS NULL" + if up == "-NULL": + return f"{col} IS NOT NULL" + if up == "EMPTY": + return f"({col} IS NULL OR {col} = '')" + if up == "-EMPTY": + return f"({col} IS NOT NULL AND {col} <> '')" + if v.lower() == "yes": + return f"{col} = true" + if v.lower() == "no": + return f"{col} = false" + + # "not X" / "NOT X" negation + nm = re.match(r"(?i)^not\s+(.+)$", v) + neg = nm.group(1).strip() if nm else None + # single-value "-X" negation (but not a negative number / operator) + if neg is None and v.startswith("-") and len(v) > 1 and not re.match(r"^-(>=|<=|!=|<>|>|<|\d|\.)", v): + neg = v[1:] + if neg is not None: + if "%" in neg or "_" in neg: + return f"{col} NOT LIKE {q(neg)}" + if is_number(neg): + return f"{col} != {neg}" + return f"{col} != {q(neg)}" + + # before / after (date or numeric bound). Per Looker, "before" + # is exclusive and "after" is inclusive (on or after the boundary). + bm = re.match(r"(?i)^(before|after)\s+(.+)$", v) + if bm: + op = "<" if bm.group(1).lower() == "before" else ">=" + operand = bm.group(2).strip() + rhs = operand if is_number(operand) else q(operand) + return f"{col} {op} {rhs}" + + # comparison operators ">=", "<=", "!=", "<>", ">", "<" + cm = re.match(r"^(>=|<=|!=|<>|>|<)\s*(.+)$", v) + if cm: + operator, operand = cm.group(1), cm.group(2).strip() + if operator == "<>": + operator = "!=" + rhs = operand if is_number(operand) else q(operand) + return f"{col} {operator} {rhs}" + + # wildcard LIKE + if "%" in v or "_" in v: + return f"{col} LIKE {q(v)}" + + # numeric equality (incl. negative numbers) + if is_number(v): + return f"{col} = {v}" + + # date/interval expression we cannot translate yet -> warn instead of + # silently emitting a string equality that matches zero rows. + if self._DATE_FILTER_RE.search(v): + logger.warning( + "LookML date/interval filter %r on field %r is not translated to SQL; " + "emitting a literal equality (will not match as Looker intends).", + v, + field, + ) - # Handle numeric values - if value.replace(".", "").replace("-", "").isdigit(): - return f"{{model}}.{field} = {value}" + return f"{col} = {q(v)}" + + value = (value or "").strip() + + # Numeric interval: [a,b] (a,b) [a,b) (a,b] + im = re.match(r"^([\[\(])\s*(-?\d*\.?\d*)\s*,\s*(-?\d*\.?\d*)\s*([\]\)])$", value) + if im: + lb, lo, hi, rb = im.groups() + conds = [] + if lo != "": + conds.append(f"{col} >{'=' if lb == '[' else ''} {lo}") + if hi != "": + conds.append(f"{col} <{'=' if rb == ']' else ''} {hi}") + if conds: + return conds[0] if len(conds) == 1 else "(" + " AND ".join(conds) + ")" + + # Numeric range: "a to b", "a to", "to b" + rm = re.match(r"(?i)^(-?\d*\.?\d*)\s*to\s*(-?\d*\.?\d*)$", value) + if rm: + lo, hi = rm.group(1), rm.group(2) + conds = [] + if lo != "": + conds.append(f"{col} >= {lo}") + if hi != "": + conds.append(f"{col} <= {hi}") + if conds: + return conds[0] if len(conds) == 1 else "(" + " AND ".join(conds) + ")" + + # Comma-separated list + if "," in value: + parts = [p.strip() for p in value.split(",") if p.strip() != ""] + + def is_plain(p: str) -> bool: + return ( + not p.startswith("-") + and not re.match(r"^(>=|<=|!=|<>|>|<)", p) + and not re.match(r"(?i)^not\s", p) + and "%" not in p + and "_" not in p + ) - # Default: string equality - return f"{{model}}.{field} = '{value}'" + def is_neg_plain(p: str) -> bool: + return p.startswith("-") and "%" not in p[1:] and "_" not in p[1:] and not re.match(r"^-(\d|\.)", p) + + # A single leading "NOT" negates the whole list: "NOT 1, 2, 3" -> NOT IN (1, 2, 3). + lead_not = re.match(r"(?i)^not\s+(.+)$", value) + if lead_not: + neg_parts = [p.strip() for p in lead_not.group(1).split(",") if p.strip() != ""] + if neg_parts and all(is_plain(p) for p in neg_parts): + if all(is_number(p) for p in neg_parts): + return f"{col} NOT IN ({', '.join(neg_parts)})" + return f"{col} NOT IN ({', '.join(q(p) for p in neg_parts)})" + + if parts and all(is_plain(p) for p in parts): + if all(is_number(p) for p in parts): + return f"{col} IN ({', '.join(parts)})" + return f"{col} IN ({', '.join(q(p) for p in parts)})" + + if parts and all(is_neg_plain(p) for p in parts): + clean = [p[1:] for p in parts] + if all(is_number(p) for p in clean): + return f"{col} NOT IN ({', '.join(clean)})" + return f"{col} NOT IN ({', '.join(q(p) for p in clean)})" + + # Mixed list: OR the includes together, AND the exclusions. + includes, excludes = [], [] + for p in parts: + cond = single(p) + is_exclude = (p.startswith("-") and not re.match(r"^-(\d|\.)", p)) or bool(re.match(r"(?i)^not\s", p)) + (excludes if is_exclude else includes).append(cond) + clauses = [] + if includes: + clauses.append("(" + " OR ".join(includes) + ")" if len(includes) > 1 else includes[0]) + clauses.extend(excludes) + return "(" + " AND ".join(clauses) + ")" if len(clauses) > 1 else clauses[0] + + return single(value) def _parse_view(self, view_def: dict) -> Model | None: """Parse LookML view into Sidemantic model. diff --git a/tests/adapters/lookml/test_edge_cases.py b/tests/adapters/lookml/test_edge_cases.py index 3a13a82b..84e0cb2c 100644 --- a/tests/adapters/lookml/test_edge_cases.py +++ b/tests/adapters/lookml/test_edge_cases.py @@ -2028,5 +2028,79 @@ def test_lookml_multiple_refinements_merged(): assert base.get_metric("count") is not None +def test_lookml_filter_grammar_conversion(): + """Lock in the corrected Looker filter-expression -> SQL conversion. + + Regression coverage for the audit findings: date/numeric ranges, NOT/negation, + EMPTY's NULL case, wildcard NOT LIKE, mixed lists, and single-quote escaping. + """ + adapter = LookMLAdapter() + conv = adapter._convert_lookml_filter_to_sql + f = "{model}.f" + + # Single-quote escaping (was a SQL-injection / breakage bug) + assert conv("f", "O'Brien") == f"{f} = 'O''Brien'" + + # EMPTY must include the NULL case + assert conv("f", "EMPTY") == f"({f} IS NULL OR {f} = '')" + assert conv("f", "-EMPTY") == f"({f} IS NOT NULL AND {f} <> '')" + + # NULL passthrough + assert conv("f", "NULL") == f"{f} IS NULL" + assert conv("f", "-NULL") == f"{f} IS NOT NULL" + + # Numeric ranges and interval syntax (were string-equality / garbage IN) + assert conv("f", "5 to 10") == f"({f} >= 5 AND {f} <= 10)" + assert conv("f", "5 to") == f"{f} >= 5" + assert conv("f", "to 10") == f"{f} <= 10" + assert conv("f", "[1,10]") == f"({f} >= 1 AND {f} <= 10)" + assert conv("f", "(1,10)") == f"({f} > 1 AND {f} < 10)" + assert conv("f", "[1,10)") == f"({f} >= 1 AND {f} < 10)" + assert conv("f", "(1,10]") == f"({f} > 1 AND {f} <= 10)" + + # NOT / negation (were treated as literal string equality) + assert conv("f", "NOT 5") == f"{f} != 5" + assert conv("f", "not foo") == f"{f} != 'foo'" + assert conv("f", "-Completed") == f"{f} != 'Completed'" + + # before / after bounds (Looker: before exclusive, after inclusive) + assert conv("f", "before 2020-01-01") == f"{f} < '2020-01-01'" + assert conv("f", "after 2020-01-01") == f"{f} >= '2020-01-01'" + + # Comparisons + assert conv("f", ">100") == f"{f} > 100" + assert conv("f", "<=5") == f"{f} <= 5" + assert conv("f", "<>0") == f"{f} != 0" + + # Wildcards, incl. negated -> NOT LIKE (was != '%foo%') + assert conv("f", "%foo%") == f"{f} LIKE '%foo%'" + assert conv("f", "-%foo%") == f"{f} NOT LIKE '%foo%'" + + # Lists + assert conv("f", "a,b") == f"{f} IN ('a', 'b')" + assert conv("f", "1,5,9") == f"{f} IN (1, 5, 9)" + assert conv("f", "-a,-b") == f"{f} NOT IN ('a', 'b')" + # A single leading NOT negates the whole list + assert conv("f", "NOT 66, 99, 4") == f"{f} NOT IN (66, 99, 4)" + # Mixed-operator list is no longer silently mangled (valid SQL) + assert conv("f", ">1,<5") == f"({f} > 1 OR {f} < 5)" + assert conv("f", "%a%,%b%") == f"({f} LIKE '%a%' OR {f} LIKE '%b%')" + + # yes/no + assert conv("f", "yes") == f"{f} = true" + assert conv("f", "no") == f"{f} = false" + + +def test_lookml_filter_date_expression_warns(caplog): + """Untranslated date/interval filters should warn, not silently string-equal.""" + import logging + + adapter = LookMLAdapter() + with caplog.at_level(logging.WARNING): + result = adapter._convert_lookml_filter_to_sql("created_date", "last 7 days") + assert "last 7 days" in result # value preserved + assert any("not translated" in rec.getMessage() for rec in caplog.records) + + if __name__ == "__main__": pytest.main([__file__, "-v"])