From 00ff35dc9133a3442b0957801837566e6f4995af Mon Sep 17 00:00:00 2001
From: Nico Ritschel <ritschel@gmail.com>
Date: Fri, 26 Jun 2026 10:44:21 -0700
Subject: [PATCH] Fix LookML filter-expression to SQL conversion

The LookML adapter treated Looker filter values as opaque strings, so
anything beyond simple comparisons produced wrong or invalid SQL:
- date/numeric ranges and intervals ([1,10], "5 to 10") became string
  equality or malformed IN lists (matched zero rows)
- "not X" / negation -> literal "= 'not X'"
- EMPTY -> omitted the NULL case
- single quotes were not escaped (broken SQL / injection risk)

Rewrite _convert_lookml_filter_to_sql to implement the representable parts
of Looker's filter expression language (numeric ranges, interval brackets,
NOT/negation, EMPTY = NULL OR '', wildcard NOT LIKE, mixed lists) and
escape quoted string literals. Untranslated date/interval expressions now
log a warning instead of silently emitting a zero-row equality.
---
 sidemantic/adapters/lookml.py            | 289 +++++++++++++++--------
 tests/adapters/lookml/test_edge_cases.py |  74 ++++++
 2 files changed, 266 insertions(+), 97 deletions(-)

diff --git a/sidemantic/adapters/lookml.py b/sidemantic/adapters/lookml.py
index c2607b79..7832196b 100644
--- a/sidemantic/adapters/lookml.py
+++ b/sidemantic/adapters/lookml.py
@@ -1,5 +1,6 @@
 """LookML adapter for importing Looker semantic models."""
 
+import logging
 import re
 from pathlib import Path
 
@@ -10,6 +11,8 @@
 from sidemantic.core.relationship import Relationship
 from sidemantic.core.semantic_graph import SemanticGraph
 
+logger = logging.getLogger(__name__)
+
 
 def _import_lkml():
     """Lazily import lkml, raising a clear error if not installed."""
@@ -193,107 +196,199 @@ def replace_ref(match: re.Match) -> str:
 
         return resolved
 
-    def _convert_lookml_filter_to_sql(self, field: str, value: str) -> str:
-        """Convert a LookML filter value to SQL condition.
-
-        Handles LookML filter syntax:
-        - "value" -> field = 'value'
-        - "val1,val2,val3" -> field IN ('val1', 'val2', 'val3')
-        - "-value" -> field != 'value' (negation)
-        - "-val1,-val2" -> field NOT IN ('val1', 'val2')
-        - "yes"/"no" -> field = true/false (for yesno dimensions)
-        - ">100", ">=50", "<10", "<=5", "!=0" -> numeric comparisons
-        - "%pattern%" -> field LIKE '%pattern%' (wildcards)
-        - "NULL" -> field IS NULL
-        - "-NULL" -> field IS NOT NULL
-        - "EMPTY" -> field = ''
-        - "-EMPTY" -> field != ''
-
-        Args:
-            field: The field name
-            value: The LookML filter value
+    @staticmethod
+    def _filter_is_number(s: str) -> bool:
+        """True if ``s`` is a plain numeric literal (incl. signed / decimal)."""
+        try:
+            float(s)
+            return True
+        except ValueError:
+            return False
+
+    # Tokens that mark a value as a LookML date/interval filter expression
+    # (e.g. "last 7 days", "3 months ago", "this year"). These are not yet
+    # translated to SQL; we warn rather than silently string-comparing them.
+    _DATE_FILTER_RE = re.compile(
+        r"(?i)\b(ago|day|days|week|weeks|month|months|year|years|quarter|quarters|"
+        r"hour|hours|minute|minutes|second|seconds|today|yesterday|tomorrow|now|fiscal|"
+        r"week|month|year)\b"
+    )
 
-        Returns:
-            SQL condition string
+    def _convert_lookml_filter_to_sql(self, field: str, value: str) -> str:
+        """Convert a LookML filter value to a SQL condition.
+
+        Implements the representable parts of Looker's filter expression
+        language (https://cloud.google.com/looker/docs/filter-expressions):
+
+        - ``value`` -> ``field = 'value'`` (single quotes escaped)
+        - ``a,b,c`` -> ``field IN ('a','b','c')`` (numeric: unquoted)
+        - ``-value`` / ``not value`` -> ``field <> 'value'``
+        - ``-a,-b`` -> ``field NOT IN ('a','b')``
+        - ``-%pat%`` / ``not %pat%`` -> ``field NOT LIKE '%pat%'``
+        - ``yes`` / ``no`` -> ``field = true`` / ``field = false``
+        - ``>100``, ``>=5``, ``<=5``, ``<10``, ``!=0``, ``<>0`` -> comparisons
+        - ``5 to 10`` -> ``field >= 5 AND field <= 10`` (open: ``5 to`` / ``to 10``)
+        - ``[1,10]`` ``(1,10)`` ``[1,10)`` ``(1,10]`` -> inclusive/exclusive ranges
+        - ``%pat%`` / ``_at`` -> ``field LIKE '%pat%'``
+        - ``NULL`` / ``-NULL`` -> ``IS NULL`` / ``IS NOT NULL``
+        - ``EMPTY`` -> ``(field IS NULL OR field = '')`` (and the negation)
+        - ``before X`` / ``after X`` -> ``field < X`` / ``field > X``
+        - mixed comma lists (operators/wildcards) -> includes OR'd, excludes AND'd
+
+        Date/interval expressions (``last 7 days``, ``3 months ago`` ...) are not
+        yet translated: they emit a literal equality but log a warning so the
+        silent zero-row match is at least surfaced.
         """
-        # Handle NULL special values
-        if value.upper() == "NULL":
-            return f"{{model}}.{field} IS NULL"
-        if value.upper() == "-NULL":
-            return f"{{model}}.{field} IS NOT NULL"
-
-        # Handle EMPTY special values
-        if value.upper() == "EMPTY":
-            return f"{{model}}.{field} = ''"
-        if value.upper() == "-EMPTY":
-            return f"{{model}}.{field} != ''"
-
-        # Handle yes/no boolean values
-        if value.lower() == "yes":
-            return f"{{model}}.{field} = true"
-        if value.lower() == "no":
-            return f"{{model}}.{field} = false"
-
-        # Check if this is a comma-separated list of values (OR condition)
-        # But be careful: ">100,<200" is two comparison operators, not a list
-        if "," in value:
-            parts = [p.strip() for p in value.split(",")]
-
-            # Check if all parts are negations (NOT IN)
-            if all(p.startswith("-") for p in parts):
-                # Remove the - prefix from each
-                clean_parts = [p[1:] for p in parts]
-                # Check if they're all simple strings (not operators)
-                if all(not re.match(r"^(>=|<=|!=|<>|>|<)", p) for p in clean_parts):
-                    quoted = ", ".join(f"'{p}'" for p in clean_parts)
-                    return f"{{model}}.{field} NOT IN ({quoted})"
-
-            # Check if all parts are simple values (no operators) -> IN clause
-            if all(not p.startswith("-") and not re.match(r"^(>=|<=|!=|<>|>|<)", p) for p in parts):
-                # Check if all parts are numeric
-                if all(p.replace(".", "").replace("-", "").isdigit() for p in parts):
-                    # Numeric IN clause (no quotes)
-                    return f"{{model}}.{field} IN ({', '.join(parts)})"
-                else:
-                    # String IN clause (with quotes)
-                    quoted = ", ".join(f"'{p}'" for p in parts)
-                    return f"{{model}}.{field} IN ({quoted})"
-
-            # Mixed operators - this is actually multiple filter conditions
-            # LookML doesn't really support this in a single filter value
-            # Fall through to single value handling (will be slightly wrong but safer)
-
-        # Handle negation prefix for single values
-        if value.startswith("-") and not re.match(r"^-(>=|<=|!=|<>|>|<|\d)", value):
-            negated_value = value[1:]
-            if negated_value.replace(".", "").replace("-", "").isdigit():
-                return f"{{model}}.{field} != {negated_value}"
-            else:
-                return f"{{model}}.{field} != '{negated_value}'"
-
-        # Handle comparison operators: ">1000", "<=100", ">=5", "<10", "!=0"
-        if match := re.match(r"^(>=|<=|!=|<>|>|<)(.+)$", value):
-            operator, operand = match.groups()
-            operand = operand.strip()
-            # Normalize <> to !=
-            if operator == "<>":
-                operator = "!="
-            # Check if operand is numeric
-            if operand.replace(".", "").replace("-", "").isdigit():
-                return f"{{model}}.{field} {operator} {operand}"
-            else:
-                return f"{{model}}.{field} {operator} '{operand}'"
-
-        # Handle wildcard patterns (LIKE)
-        if "%" in value or "_" in value:
-            return f"{{model}}.{field} LIKE '{value}'"
+        col = f"{{model}}.{field}"
+
+        def q(s: str) -> str:
+            return "'" + s.replace("'", "''") + "'"
+
+        is_number = self._filter_is_number
+
+        def single(v: str) -> str:
+            """Convert one (non-list) LookML filter token to a SQL condition."""
+            v = v.strip()
+            up = v.upper()
+            if up == "NULL":
+                return f"{col} IS NULL"
+            if up == "-NULL":
+                return f"{col} IS NOT NULL"
+            if up == "EMPTY":
+                return f"({col} IS NULL OR {col} = '')"
+            if up == "-EMPTY":
+                return f"({col} IS NOT NULL AND {col} <> '')"
+            if v.lower() == "yes":
+                return f"{col} = true"
+            if v.lower() == "no":
+                return f"{col} = false"
+
+            # "not X" / "NOT X" negation
+            nm = re.match(r"(?i)^not\s+(.+)$", v)
+            neg = nm.group(1).strip() if nm else None
+            # single-value "-X" negation (but not a negative number / operator)
+            if neg is None and v.startswith("-") and len(v) > 1 and not re.match(r"^-(>=|<=|!=|<>|>|<|\d|\.)", v):
+                neg = v[1:]
+            if neg is not None:
+                if "%" in neg or "_" in neg:
+                    return f"{col} NOT LIKE {q(neg)}"
+                if is_number(neg):
+                    return f"{col} != {neg}"
+                return f"{col} != {q(neg)}"
+
+            # before / after <value> (date or numeric bound). Per Looker, "before"
+            # is exclusive and "after" is inclusive (on or after the boundary).
+            bm = re.match(r"(?i)^(before|after)\s+(.+)$", v)
+            if bm:
+                op = "<" if bm.group(1).lower() == "before" else ">="
+                operand = bm.group(2).strip()
+                rhs = operand if is_number(operand) else q(operand)
+                return f"{col} {op} {rhs}"
+
+            # comparison operators ">=", "<=", "!=", "<>", ">", "<"
+            cm = re.match(r"^(>=|<=|!=|<>|>|<)\s*(.+)$", v)
+            if cm:
+                operator, operand = cm.group(1), cm.group(2).strip()
+                if operator == "<>":
+                    operator = "!="
+                rhs = operand if is_number(operand) else q(operand)
+                return f"{col} {operator} {rhs}"
+
+            # wildcard LIKE
+            if "%" in v or "_" in v:
+                return f"{col} LIKE {q(v)}"
+
+            # numeric equality (incl. negative numbers)
+            if is_number(v):
+                return f"{col} = {v}"
+
+            # date/interval expression we cannot translate yet -> warn instead of
+            # silently emitting a string equality that matches zero rows.
+            if self._DATE_FILTER_RE.search(v):
+                logger.warning(
+                    "LookML date/interval filter %r on field %r is not translated to SQL; "
+                    "emitting a literal equality (will not match as Looker intends).",
+                    v,
+                    field,
+                )
 
-        # Handle numeric values
-        if value.replace(".", "").replace("-", "").isdigit():
-            return f"{{model}}.{field} = {value}"
+            return f"{col} = {q(v)}"
+
+        value = (value or "").strip()
+
+        # Numeric interval: [a,b] (a,b) [a,b) (a,b]
+        im = re.match(r"^([\[\(])\s*(-?\d*\.?\d*)\s*,\s*(-?\d*\.?\d*)\s*([\]\)])$", value)
+        if im:
+            lb, lo, hi, rb = im.groups()
+            conds = []
+            if lo != "":
+                conds.append(f"{col} >{'=' if lb == '[' else ''} {lo}")
+            if hi != "":
+                conds.append(f"{col} <{'=' if rb == ']' else ''} {hi}")
+            if conds:
+                return conds[0] if len(conds) == 1 else "(" + " AND ".join(conds) + ")"
+
+        # Numeric range: "a to b", "a to", "to b"
+        rm = re.match(r"(?i)^(-?\d*\.?\d*)\s*to\s*(-?\d*\.?\d*)$", value)
+        if rm:
+            lo, hi = rm.group(1), rm.group(2)
+            conds = []
+            if lo != "":
+                conds.append(f"{col} >= {lo}")
+            if hi != "":
+                conds.append(f"{col} <= {hi}")
+            if conds:
+                return conds[0] if len(conds) == 1 else "(" + " AND ".join(conds) + ")"
+
+        # Comma-separated list
+        if "," in value:
+            parts = [p.strip() for p in value.split(",") if p.strip() != ""]
+
+            def is_plain(p: str) -> bool:
+                return (
+                    not p.startswith("-")
+                    and not re.match(r"^(>=|<=|!=|<>|>|<)", p)
+                    and not re.match(r"(?i)^not\s", p)
+                    and "%" not in p
+                    and "_" not in p
+                )
 
-        # Default: string equality
-        return f"{{model}}.{field} = '{value}'"
+            def is_neg_plain(p: str) -> bool:
+                return p.startswith("-") and "%" not in p[1:] and "_" not in p[1:] and not re.match(r"^-(\d|\.)", p)
+
+            # A single leading "NOT" negates the whole list: "NOT 1, 2, 3" -> NOT IN (1, 2, 3).
+            lead_not = re.match(r"(?i)^not\s+(.+)$", value)
+            if lead_not:
+                neg_parts = [p.strip() for p in lead_not.group(1).split(",") if p.strip() != ""]
+                if neg_parts and all(is_plain(p) for p in neg_parts):
+                    if all(is_number(p) for p in neg_parts):
+                        return f"{col} NOT IN ({', '.join(neg_parts)})"
+                    return f"{col} NOT IN ({', '.join(q(p) for p in neg_parts)})"
+
+            if parts and all(is_plain(p) for p in parts):
+                if all(is_number(p) for p in parts):
+                    return f"{col} IN ({', '.join(parts)})"
+                return f"{col} IN ({', '.join(q(p) for p in parts)})"
+
+            if parts and all(is_neg_plain(p) for p in parts):
+                clean = [p[1:] for p in parts]
+                if all(is_number(p) for p in clean):
+                    return f"{col} NOT IN ({', '.join(clean)})"
+                return f"{col} NOT IN ({', '.join(q(p) for p in clean)})"
+
+            # Mixed list: OR the includes together, AND the exclusions.
+            includes, excludes = [], []
+            for p in parts:
+                cond = single(p)
+                is_exclude = (p.startswith("-") and not re.match(r"^-(\d|\.)", p)) or bool(re.match(r"(?i)^not\s", p))
+                (excludes if is_exclude else includes).append(cond)
+            clauses = []
+            if includes:
+                clauses.append("(" + " OR ".join(includes) + ")" if len(includes) > 1 else includes[0])
+            clauses.extend(excludes)
+            return "(" + " AND ".join(clauses) + ")" if len(clauses) > 1 else clauses[0]
+
+        return single(value)
 
     def _parse_view(self, view_def: dict) -> Model | None:
         """Parse LookML view into Sidemantic model.
diff --git a/tests/adapters/lookml/test_edge_cases.py b/tests/adapters/lookml/test_edge_cases.py
index 3a13a82b..84e0cb2c 100644
--- a/tests/adapters/lookml/test_edge_cases.py
+++ b/tests/adapters/lookml/test_edge_cases.py
@@ -2028,5 +2028,79 @@ def test_lookml_multiple_refinements_merged():
         assert base.get_metric("count") is not None
 
 
+def test_lookml_filter_grammar_conversion():
+    """Lock in the corrected Looker filter-expression -> SQL conversion.
+
+    Regression coverage for the audit findings: date/numeric ranges, NOT/negation,
+    EMPTY's NULL case, wildcard NOT LIKE, mixed lists, and single-quote escaping.
+    """
+    adapter = LookMLAdapter()
+    conv = adapter._convert_lookml_filter_to_sql
+    f = "{model}.f"
+
+    # Single-quote escaping (was a SQL-injection / breakage bug)
+    assert conv("f", "O'Brien") == f"{f} = 'O''Brien'"
+
+    # EMPTY must include the NULL case
+    assert conv("f", "EMPTY") == f"({f} IS NULL OR {f} = '')"
+    assert conv("f", "-EMPTY") == f"({f} IS NOT NULL AND {f} <> '')"
+
+    # NULL passthrough
+    assert conv("f", "NULL") == f"{f} IS NULL"
+    assert conv("f", "-NULL") == f"{f} IS NOT NULL"
+
+    # Numeric ranges and interval syntax (were string-equality / garbage IN)
+    assert conv("f", "5 to 10") == f"({f} >= 5 AND {f} <= 10)"
+    assert conv("f", "5 to") == f"{f} >= 5"
+    assert conv("f", "to 10") == f"{f} <= 10"
+    assert conv("f", "[1,10]") == f"({f} >= 1 AND {f} <= 10)"
+    assert conv("f", "(1,10)") == f"({f} > 1 AND {f} < 10)"
+    assert conv("f", "[1,10)") == f"({f} >= 1 AND {f} < 10)"
+    assert conv("f", "(1,10]") == f"({f} > 1 AND {f} <= 10)"
+
+    # NOT / negation (were treated as literal string equality)
+    assert conv("f", "NOT 5") == f"{f} != 5"
+    assert conv("f", "not foo") == f"{f} != 'foo'"
+    assert conv("f", "-Completed") == f"{f} != 'Completed'"
+
+    # before / after bounds (Looker: before exclusive, after inclusive)
+    assert conv("f", "before 2020-01-01") == f"{f} < '2020-01-01'"
+    assert conv("f", "after 2020-01-01") == f"{f} >= '2020-01-01'"
+
+    # Comparisons
+    assert conv("f", ">100") == f"{f} > 100"
+    assert conv("f", "<=5") == f"{f} <= 5"
+    assert conv("f", "<>0") == f"{f} != 0"
+
+    # Wildcards, incl. negated -> NOT LIKE (was != '%foo%')
+    assert conv("f", "%foo%") == f"{f} LIKE '%foo%'"
+    assert conv("f", "-%foo%") == f"{f} NOT LIKE '%foo%'"
+
+    # Lists
+    assert conv("f", "a,b") == f"{f} IN ('a', 'b')"
+    assert conv("f", "1,5,9") == f"{f} IN (1, 5, 9)"
+    assert conv("f", "-a,-b") == f"{f} NOT IN ('a', 'b')"
+    # A single leading NOT negates the whole list
+    assert conv("f", "NOT 66, 99, 4") == f"{f} NOT IN (66, 99, 4)"
+    # Mixed-operator list is no longer silently mangled (valid SQL)
+    assert conv("f", ">1,<5") == f"({f} > 1 OR {f} < 5)"
+    assert conv("f", "%a%,%b%") == f"({f} LIKE '%a%' OR {f} LIKE '%b%')"
+
+    # yes/no
+    assert conv("f", "yes") == f"{f} = true"
+    assert conv("f", "no") == f"{f} = false"
+
+
+def test_lookml_filter_date_expression_warns(caplog):
+    """Untranslated date/interval filters should warn, not silently string-equal."""
+    import logging
+
+    adapter = LookMLAdapter()
+    with caplog.at_level(logging.WARNING):
+        result = adapter._convert_lookml_filter_to_sql("created_date", "last 7 days")
+    assert "last 7 days" in result  # value preserved
+    assert any("not translated" in rec.getMessage() for rec in caplog.records)
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])