Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
289 changes: 192 additions & 97 deletions sidemantic/adapters/lookml.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""LookML adapter for importing Looker semantic models."""

import logging
import re
from pathlib import Path

Expand All @@ -10,6 +11,8 @@
from sidemantic.core.relationship import Relationship
from sidemantic.core.semantic_graph import SemanticGraph

logger = logging.getLogger(__name__)


def _import_lkml():
"""Lazily import lkml, raising a clear error if not installed."""
Expand Down Expand Up @@ -193,107 +196,199 @@ def replace_ref(match: re.Match) -> str:

return resolved

def _convert_lookml_filter_to_sql(self, field: str, value: str) -> str:
"""Convert a LookML filter value to SQL condition.

Handles LookML filter syntax:
- "value" -> field = 'value'
- "val1,val2,val3" -> field IN ('val1', 'val2', 'val3')
- "-value" -> field != 'value' (negation)
- "-val1,-val2" -> field NOT IN ('val1', 'val2')
- "yes"/"no" -> field = true/false (for yesno dimensions)
- ">100", ">=50", "<10", "<=5", "!=0" -> numeric comparisons
- "%pattern%" -> field LIKE '%pattern%' (wildcards)
- "NULL" -> field IS NULL
- "-NULL" -> field IS NOT NULL
- "EMPTY" -> field = ''
- "-EMPTY" -> field != ''

Args:
field: The field name
value: The LookML filter value
@staticmethod
def _filter_is_number(s: str) -> bool:
    """Return True if ``s`` is a plain numeric literal that is safe to emit unquoted.

    Accepted forms are optionally signed integers and decimals (``5``,
    ``-3.2``, ``.5``, ``5.``), optionally followed by an exponent (``1e5``).
    Surrounding whitespace is ignored.

    ``float()`` is deliberately not used for this check: it also accepts
    ``nan``, ``inf``/``infinity``, digit-group underscores (``1_000``) and
    non-ASCII digits. None of those are numeric literals in SQL, so they must
    be treated as strings (and therefore quoted) by the caller.

    Args:
        s: The candidate filter token.

    Returns:
        True when ``s`` matches the numeric grammar above, False otherwise.
    """
    # Explicit [0-9] classes keep the match ASCII-only (``\d`` would also
    # match other Unicode decimal digits on str patterns).
    numeric_literal = r"[+-]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][+-]?[0-9]+)?"
    return re.fullmatch(numeric_literal, s.strip()) is not None

# Heuristic detector for LookML date/interval filter expressions such as
# "last 7 days", "3 months ago" or "this year". These expressions are not yet
# translated to SQL; a match is used to warn rather than silently comparing
# the expression as a plain string.
# Matching is case-insensitive and whole-word only, so "birthday" or "daily"
# do not count as a date token.
_DATE_FILTER_RE = re.compile(
    r"(?i)\b(ago|days?|weeks?|months?|years?|quarters?|hours?|minutes?|seconds?|"
    r"today|yesterday|tomorrow|now|fiscal)\b"
)

Returns:
SQL condition string
def _convert_lookml_filter_to_sql(self, field: str, value: str) -> str:
"""Convert a LookML filter value to a SQL condition.

Implements the representable parts of Looker's filter expression
language (https://cloud.google.com/looker/docs/filter-expressions):

- ``value`` -> ``field = 'value'`` (single quotes escaped)
- ``a,b,c`` -> ``field IN ('a','b','c')`` (numeric: unquoted)
- ``-value`` / ``not value`` -> ``field != 'value'``
- ``-a,-b`` -> ``field NOT IN ('a','b')``
- ``-%pat%`` / ``not %pat%`` -> ``field NOT LIKE '%pat%'``
- ``yes`` / ``no`` -> ``field = true`` / ``field = false``
- ``>100``, ``>=5``, ``<=5``, ``<10``, ``!=0``, ``<>0`` -> comparisons
- ``5 to 10`` -> ``field >= 5 AND field <= 10`` (open: ``5 to`` / ``to 10``)
- ``[1,10]`` ``(1,10)`` ``[1,10)`` ``(1,10]`` -> inclusive/exclusive ranges
- ``%pat%`` / ``_at`` -> ``field LIKE '%pat%'``
- ``NULL`` / ``-NULL`` -> ``IS NULL`` / ``IS NOT NULL``
- ``EMPTY`` -> ``(field IS NULL OR field = '')`` (and the negation)
- ``before X`` / ``after X`` -> ``field < X`` / ``field >= X`` (``after`` is inclusive)
- mixed comma lists (operators/wildcards) -> includes OR'd, excludes AND'd

Date/interval expressions (``last 7 days``, ``3 months ago`` ...) are not
yet translated: they emit a literal equality but log a warning so the
silent zero-row match is at least surfaced.
"""
# Handle NULL special values
if value.upper() == "NULL":
return f"{{model}}.{field} IS NULL"
if value.upper() == "-NULL":
return f"{{model}}.{field} IS NOT NULL"

# Handle EMPTY special values
if value.upper() == "EMPTY":
return f"{{model}}.{field} = ''"
if value.upper() == "-EMPTY":
return f"{{model}}.{field} != ''"

# Handle yes/no boolean values
if value.lower() == "yes":
return f"{{model}}.{field} = true"
if value.lower() == "no":
return f"{{model}}.{field} = false"

# Check if this is a comma-separated list of values (OR condition)
# But be careful: ">100,<200" is two comparison operators, not a list
if "," in value:
parts = [p.strip() for p in value.split(",")]

# Check if all parts are negations (NOT IN)
if all(p.startswith("-") for p in parts):
# Remove the - prefix from each
clean_parts = [p[1:] for p in parts]
# Check if they're all simple strings (not operators)
if all(not re.match(r"^(>=|<=|!=|<>|>|<)", p) for p in clean_parts):
quoted = ", ".join(f"'{p}'" for p in clean_parts)
return f"{{model}}.{field} NOT IN ({quoted})"

# Check if all parts are simple values (no operators) -> IN clause
if all(not p.startswith("-") and not re.match(r"^(>=|<=|!=|<>|>|<)", p) for p in parts):
# Check if all parts are numeric
if all(p.replace(".", "").replace("-", "").isdigit() for p in parts):
# Numeric IN clause (no quotes)
return f"{{model}}.{field} IN ({', '.join(parts)})"
else:
# String IN clause (with quotes)
quoted = ", ".join(f"'{p}'" for p in parts)
return f"{{model}}.{field} IN ({quoted})"

# Mixed operators - this is actually multiple filter conditions
# LookML doesn't really support this in a single filter value
# Fall through to single value handling (will be slightly wrong but safer)

# Handle negation prefix for single values
if value.startswith("-") and not re.match(r"^-(>=|<=|!=|<>|>|<|\d)", value):
negated_value = value[1:]
if negated_value.replace(".", "").replace("-", "").isdigit():
return f"{{model}}.{field} != {negated_value}"
else:
return f"{{model}}.{field} != '{negated_value}'"

# Handle comparison operators: ">1000", "<=100", ">=5", "<10", "!=0"
if match := re.match(r"^(>=|<=|!=|<>|>|<)(.+)$", value):
operator, operand = match.groups()
operand = operand.strip()
# Normalize <> to !=
if operator == "<>":
operator = "!="
# Check if operand is numeric
if operand.replace(".", "").replace("-", "").isdigit():
return f"{{model}}.{field} {operator} {operand}"
else:
return f"{{model}}.{field} {operator} '{operand}'"

# Handle wildcard patterns (LIKE)
if "%" in value or "_" in value:
return f"{{model}}.{field} LIKE '{value}'"
col = f"{{model}}.{field}"

def q(s: str) -> str:
    """Render ``s`` as a single-quoted SQL string literal, doubling embedded single quotes."""
    escaped = s.replace("'", "''")
    return "'" + escaped + "'"

is_number = self._filter_is_number

def single(v: str) -> str:
"""Convert one (non-list) LookML filter token to a SQL condition."""
v = v.strip()
up = v.upper()
if up == "NULL":
return f"{col} IS NULL"
if up == "-NULL":
return f"{col} IS NOT NULL"
if up == "EMPTY":
return f"({col} IS NULL OR {col} = '')"
if up == "-EMPTY":
return f"({col} IS NOT NULL AND {col} <> '')"
if v.lower() == "yes":
return f"{col} = true"
if v.lower() == "no":
return f"{col} = false"

# "not X" / "NOT X" negation
nm = re.match(r"(?i)^not\s+(.+)$", v)
neg = nm.group(1).strip() if nm else None
# single-value "-X" negation (but not a negative number / operator)
if neg is None and v.startswith("-") and len(v) > 1 and not re.match(r"^-(>=|<=|!=|<>|>|<|\d|\.)", v):
neg = v[1:]
if neg is not None:
if "%" in neg or "_" in neg:
return f"{col} NOT LIKE {q(neg)}"
if is_number(neg):
return f"{col} != {neg}"
return f"{col} != {q(neg)}"

# before / after <value> (date or numeric bound). Per Looker, "before"
# is exclusive and "after" is inclusive (on or after the boundary).
bm = re.match(r"(?i)^(before|after)\s+(.+)$", v)
if bm:
op = "<" if bm.group(1).lower() == "before" else ">="
operand = bm.group(2).strip()
rhs = operand if is_number(operand) else q(operand)
return f"{col} {op} {rhs}"
Comment on lines +283 to +286

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Guard relative before/after date filters

When a LookML date filter uses documented relative bounds such as before 3 days ago or after Monday, this branch treats the operand as a literal SQL value before the date-expression warning can run, producing conditions like {model}.created_date < '3 days ago'. On dialects that don't parse those English phrases as dates, imported measures either fail or count the wrong rows; detect relative operands and leave them on the warning/fallback path instead of translating them as absolute bounds.

Useful? React with 👍 / 👎.


# comparison operators ">=", "<=", "!=", "<>", ">", "<"
cm = re.match(r"^(>=|<=|!=|<>|>|<)\s*(.+)$", v)
if cm:
operator, operand = cm.group(1), cm.group(2).strip()
Comment on lines +289 to +291

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Parse numeric AND ranges before single comparisons

When a numeric filter uses Looker's documented AND range syntax inside one condition, for example >1 AND <100, NOT 2, this regex captures 1 AND <100 as the operand for a single > comparison and quotes it because it isn't a number. The mixed-list path then emits a number-to-string comparison instead of field > 1 AND field < 100, so imported filtered measures fail or return nonsense for that supported syntax; split/parse AND subconditions before applying the single-comparison fallback.

Useful? React with 👍 / 👎.

if operator == "<>":
operator = "!="
rhs = operand if is_number(operand) else q(operand)
return f"{col} {operator} {rhs}"

# wildcard LIKE
if "%" in v or "_" in v:
return f"{col} LIKE {q(v)}"

# numeric equality (incl. negative numbers)
if is_number(v):
return f"{col} = {v}"

# date/interval expression we cannot translate yet -> warn instead of
# silently emitting a string equality that matches zero rows.
if self._DATE_FILTER_RE.search(v):
logger.warning(
"LookML date/interval filter %r on field %r is not translated to SQL; "
"emitting a literal equality (will not match as Looker intends).",
v,
field,
)

# Handle numeric values
if value.replace(".", "").replace("-", "").isdigit():
return f"{{model}}.{field} = {value}"
return f"{col} = {q(v)}"

value = (value or "").strip()

# Numeric interval: [a,b] (a,b) [a,b) (a,b]
im = re.match(r"^([\[\(])\s*(-?\d*\.?\d*)\s*,\s*(-?\d*\.?\d*)\s*([\]\)])$", value)
if im:
lb, lo, hi, rb = im.groups()
conds = []
if lo != "":
conds.append(f"{col} >{'=' if lb == '[' else ''} {lo}")
if hi != "":
conds.append(f"{col} <{'=' if rb == ']' else ''} {hi}")
if conds:
return conds[0] if len(conds) == 1 else "(" + " AND ".join(conds) + ")"

# Numeric range: "a to b", "a to", "to b"
rm = re.match(r"(?i)^(-?\d*\.?\d*)\s*to\s*(-?\d*\.?\d*)$", value)
if rm:
lo, hi = rm.group(1), rm.group(2)
conds = []
if lo != "":
conds.append(f"{col} >= {lo}")
if hi != "":
conds.append(f"{col} <= {hi}")
if conds:
return conds[0] if len(conds) == 1 else "(" + " AND ".join(conds) + ")"

# Comma-separated list
if "," in value:
parts = [p.strip() for p in value.split(",") if p.strip() != ""]

def is_plain(p: str) -> bool:
    """Return True when list item ``p`` is a bare value with no filter syntax.

    An item is not plain if it starts with ``-``, starts with a comparison
    operator, starts with ``not`` followed by whitespace (any case), or
    contains a ``%`` or ``_`` wildcard character.
    """
    if p.startswith("-"):
        return False
    if re.match(r"^(>=|<=|!=|<>|>|<)", p) or re.match(r"(?i)^not\s", p):
        return False
    return "%" not in p and "_" not in p

# Default: string equality
return f"{{model}}.{field} = '{value}'"
def is_neg_plain(p: str) -> bool:
    """Return True when ``p`` is a ``-``-prefixed bare value.

    The part after the leading ``-`` must contain no ``%`` or ``_`` wildcard,
    and the item must not look like a negative number (``-`` followed by a
    digit or a dot).
    """
    if not p.startswith("-"):
        return False
    remainder = p[1:]
    if "%" in remainder or "_" in remainder:
        return False
    return re.match(r"^-(\d|\.)", p) is None

# A single leading "NOT" negates the whole list: "NOT 1, 2, 3" -> NOT IN (1, 2, 3).
lead_not = re.match(r"(?i)^not\s+(.+)$", value)
if lead_not:
neg_parts = [p.strip() for p in lead_not.group(1).split(",") if p.strip() != ""]
if neg_parts and all(is_plain(p) for p in neg_parts):
if all(is_number(p) for p in neg_parts):
return f"{col} NOT IN ({', '.join(neg_parts)})"
return f"{col} NOT IN ({', '.join(q(p) for p in neg_parts)})"

if parts and all(is_plain(p) for p in parts):
if all(is_number(p) for p in parts):
return f"{col} IN ({', '.join(parts)})"
return f"{col} IN ({', '.join(q(p) for p in parts)})"

if parts and all(is_neg_plain(p) for p in parts):
clean = [p[1:] for p in parts]
if all(is_number(p) for p in clean):
return f"{col} NOT IN ({', '.join(clean)})"
return f"{col} NOT IN ({', '.join(q(p) for p in clean)})"

# Mixed list: OR the includes together, AND the exclusions.
includes, excludes = [], []
for p in parts:
cond = single(p)
is_exclude = (p.startswith("-") and not re.match(r"^-(\d|\.)", p)) or bool(re.match(r"(?i)^not\s", p))
(excludes if is_exclude else includes).append(cond)
Comment on lines +381 to +384

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Apply leading NOT to the whole numeric list

Looker's numeric filter syntax treats NOT 66, 99, 4 as excluding all three values (https://docs.cloud.google.com/looker/docs/filter-expressions#number), but this mixed-list path only marks the first NOT 66 token as an exclusion and ORs the remaining plain values as inclusions, producing (f = 99 OR f = 4) AND f != 66. Measures imported from that documented syntax will count only 99/4 instead of everything except 66/99/4; detect a leading NOT with no other NOTs and emit a negated list/range for all parts.

Useful? React with 👍 / 👎.

clauses = []
if includes:
clauses.append("(" + " OR ".join(includes) + ")" if len(includes) > 1 else includes[0])
clauses.extend(excludes)
return "(" + " AND ".join(clauses) + ")" if len(clauses) > 1 else clauses[0]

return single(value)

def _parse_view(self, view_def: dict) -> Model | None:
"""Parse LookML view into Sidemantic model.
Expand Down
74 changes: 74 additions & 0 deletions tests/adapters/lookml/test_edge_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -2028,5 +2028,79 @@ def test_lookml_multiple_refinements_merged():
assert base.get_metric("count") is not None


def test_lookml_filter_grammar_conversion():
    """Pin the SQL emitted for each supported Looker filter-expression form.

    Each entry pairs a raw LookML filter value with the exact SQL condition
    expected for field ``f``. Covered forms: quote escaping, EMPTY/NULL,
    numeric ranges and intervals, negation, before/after bounds, comparisons,
    wildcards, lists (plain, negated, mixed) and yes/no.
    """
    convert = LookMLAdapter()._convert_lookml_filter_to_sql
    col = "{model}.f"

    expectations = [
        # Embedded single quotes are escaped by doubling
        ("O'Brien", f"{col} = 'O''Brien'"),
        # EMPTY covers NULL as well as the empty string
        ("EMPTY", f"({col} IS NULL OR {col} = '')"),
        ("-EMPTY", f"({col} IS NOT NULL AND {col} <> '')"),
        # NULL passthrough
        ("NULL", f"{col} IS NULL"),
        ("-NULL", f"{col} IS NOT NULL"),
        # "a to b" ranges, including open-ended ones
        ("5 to 10", f"({col} >= 5 AND {col} <= 10)"),
        ("5 to", f"{col} >= 5"),
        ("to 10", f"{col} <= 10"),
        # Bracket intervals: square = inclusive, round = exclusive
        ("[1,10]", f"({col} >= 1 AND {col} <= 10)"),
        ("(1,10)", f"({col} > 1 AND {col} < 10)"),
        ("[1,10)", f"({col} >= 1 AND {col} < 10)"),
        ("(1,10]", f"({col} > 1 AND {col} <= 10)"),
        # NOT / leading-dash negation
        ("NOT 5", f"{col} != 5"),
        ("not foo", f"{col} != 'foo'"),
        ("-Completed", f"{col} != 'Completed'"),
        # before is exclusive, after is inclusive
        ("before 2020-01-01", f"{col} < '2020-01-01'"),
        ("after 2020-01-01", f"{col} >= '2020-01-01'"),
        # Comparison operators
        (">100", f"{col} > 100"),
        ("<=5", f"{col} <= 5"),
        ("<>0", f"{col} != 0"),
        # Wildcards, including the negated form
        ("%foo%", f"{col} LIKE '%foo%'"),
        ("-%foo%", f"{col} NOT LIKE '%foo%'"),
        # Comma-separated lists
        ("a,b", f"{col} IN ('a', 'b')"),
        ("1,5,9", f"{col} IN (1, 5, 9)"),
        ("-a,-b", f"{col} NOT IN ('a', 'b')"),
        # A single leading NOT negates the whole list
        ("NOT 66, 99, 4", f"{col} NOT IN (66, 99, 4)"),
        # Lists of operators / wildcards are OR'd together
        (">1,<5", f"({col} > 1 OR {col} < 5)"),
        ("%a%,%b%", f"({col} LIKE '%a%' OR {col} LIKE '%b%')"),
        # yes/no booleans
        ("yes", f"{col} = true"),
        ("no", f"{col} = false"),
    ]

    for raw_value, expected_sql in expectations:
        assert convert("f", raw_value) == expected_sql, raw_value


def test_lookml_filter_date_expression_warns(caplog):
    """A date/interval filter that is not translated must log a warning."""
    import logging

    convert = LookMLAdapter()._convert_lookml_filter_to_sql
    with caplog.at_level(logging.WARNING):
        sql = convert("created_date", "last 7 days")

    # The raw expression is still carried into the emitted condition.
    assert "last 7 days" in sql
    messages = [record.getMessage() for record in caplog.records]
    assert any("not translated" in message for message in messages)


if __name__ == "__main__":
pytest.main([__file__, "-v"])
Loading