Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 35 additions & 3 deletions graphify/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2897,6 +2897,13 @@ def main() -> None:
_raw = _json.loads(gp.read_text(encoding="utf-8"))
if "links" not in _raw and "edges" in _raw:
_raw = dict(_raw, links=_raw["edges"])
try:
from graphify.export import check_staleness
_stale_warning = check_staleness(_raw, gp)
if _stale_warning:
print(_stale_warning, file=sys.stderr)
except Exception:
pass # staleness check is diagnostic-only, never blocks a query
try:
G = json_graph.node_link_graph(_raw, edges="links")
except TypeError:
Expand Down Expand Up @@ -3666,7 +3673,7 @@ def main() -> None:
json.dumps(analysis, indent=2, ensure_ascii=False),
encoding="utf-8",
)
to_json(G, communities, str(out / "graph.json"), community_labels=labels)
to_json(G, communities, str(out / "graph.json"), community_labels=labels, indexed_repo_root=watch_path)
labels_path.write_text(json.dumps({str(k): v for k, v in labels.items()}, ensure_ascii=False), encoding="utf-8")
# Membership signatures beside the labels so a later cluster-only can detect
# which communities changed and avoid reusing a stale label (see reuse above).
Expand Down Expand Up @@ -3860,7 +3867,7 @@ def _load_graph(p: str):
except TypeError:
return _jg.node_link_graph(data), data
try:
G_cur, _ = _load_graph(_current_path)
G_cur, _data_cur = _load_graph(_current_path)
G_oth, _ = _load_graph(_other_path)
except Exception as exc:
print(f"[graphify merge-driver] error loading graphs: {exc}", file=sys.stderr)
Expand All @@ -3877,6 +3884,29 @@ def _load_graph(p: str):
out_data = _jg.node_link_data(merged, edges="links")
except TypeError:
out_data = _jg.node_link_data(merged)
# Stamp before writing (d1692f4's chokepoint) so a merged graph.json
# still carries a fresh generated_at/built_at_commit for check_staleness.
# There's no single "indexed root" argument here — git invokes merge
# drivers with (base, current, other) as throwaway temp file paths, not
# the real graphify-out/graph.json location, so _current_path's parent
# dir can't be used the way _infer_merge_root normally would. Instead,
# resolve the actual repo root git is merging in: git runs merge
# drivers with cwd set to the top of the work tree, so `rev-parse
# --show-toplevel` from here IS the indexed repo root. Fall back to
# whatever root the current side was already stamped with, then to cwd.
import subprocess as _sp
try:
_mr = _sp.run(
["git", "rev-parse", "--show-toplevel"],
capture_output=True, text=True, timeout=3,
)
_merge_repo_root = _mr.stdout.strip() if _mr.returncode == 0 and _mr.stdout.strip() else None
except Exception:
_merge_repo_root = None
if not _merge_repo_root:
_merge_repo_root = _data_cur.get("indexed_repo_root") or str(Path.cwd())
from graphify.export import stamp_graph_metadata as _stamp_graph_metadata
_stamp_graph_metadata(out_data, indexed_repo_root=_merge_repo_root)
Path(_current_path).write_text(json.dumps(out_data, indent=2), encoding="utf-8")
sys.exit(0)

Expand Down Expand Up @@ -4907,6 +4937,8 @@ def _progress(idx: int, total: int, _result: dict) -> None:
_node_sf.get(_e.get("source")) or _node_sf.get(_e.get("target")) or ""
)
_backup(graphify_out)
from graphify.export import stamp_graph_metadata as _stamp_graph_metadata
_stamp_graph_metadata(merged, indexed_repo_root=target)
graph_json_path.write_text(
json.dumps(merged, indent=2), encoding="utf-8"
)
Expand Down Expand Up @@ -4991,7 +5023,7 @@ def _progress(idx: int, total: int, _result: dict) -> None:

from graphify.export import backup_if_protected as _backup
_backup(graphify_out)
_to_json(G, communities, str(graph_json_path), force=True)
_to_json(G, communities, str(graph_json_path), force=True, indexed_repo_root=target)
stages.mark("export")
if merged.get("output_tokens", 0) > 0:
(graphify_out / ".graphify_semantic_marker").write_text(
Expand Down
86 changes: 81 additions & 5 deletions graphify/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import shutil
import sys
from collections import Counter
from datetime import date
from datetime import date, datetime, timezone
from pathlib import Path
import networkx as nx
from networkx.readwrite import json_graph
Expand Down Expand Up @@ -482,7 +482,85 @@ def _git_head() -> str | None:
return None


def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, force: bool = False, built_at_commit: str | None = None, community_labels: dict[int, str] | None = None) -> bool:
def check_staleness(raw: dict, graph_path: Path) -> str | None:
"""Return a one-line stale-graph warning if graph.json is out of date, else None.

"Out of date" means the on-disk graph.json predates the last git commit of
the repo it indexes. Missing/unreadable stamps (graphs built by an older
graphify version) also warn, telling the caller to regenerate. Never
raises — any failure resolving the repo root or running git is treated as
"can't tell", not an error, so this is safe to call outside a git repo.
"""
generated_at = raw.get("generated_at")
if not generated_at:
return (
f"[graphify] warning: {graph_path} has no generation timestamp "
"(built by an older graphify version) - no stamp, regenerate with `graphify .`"
)
try:
stamp = datetime.fromisoformat(generated_at)
except (TypeError, ValueError):
return (
f"[graphify] warning: {graph_path} has an unreadable generation "
"timestamp - no stamp, regenerate with `graphify .`"
)

import subprocess as _sp
# Prefer the indexed repo root recorded IN the graph at write time (stamp_graph_metadata) —
# correct even when the graph was written to --out <elsewhere>. Only legacy graphs
# (written before this field existed) fall back to inferring the root from the
# graph file's own location, which is wrong for an out-of-tree --out path.
repo_root = raw.get("indexed_repo_root")
if not repo_root:
from graphify.build import _infer_merge_root
repo_root = _infer_merge_root(graph_path)
if repo_root is None:
return None # can't resolve the indexed root - nothing to compare against
try:
r = _sp.run(
["git", "-C", str(repo_root), "log", "-1", "--format=%cI"],
capture_output=True, text=True, timeout=5,
)
except Exception:
return None # git not available - nothing to compare against
if r.returncode != 0 or not r.stdout.strip():
return None # not a git repo (or no commits yet)
try:
last_commit = datetime.fromisoformat(r.stdout.strip())
except ValueError:
return None

if last_commit > stamp:
return (
f"[graphify] warning: graph.json was generated {generated_at} but "
f"the indexed repo's last commit is {r.stdout.strip()} - graph is "
"stale, run `graphify .` (or `graphify update`) to refresh"
)
return None


def stamp_graph_metadata(data: dict, *, indexed_repo_root: "str | os.PathLike | None" = None, built_at_commit: str | None = None) -> None:
"""Stamp graph.json metadata that ``check_staleness`` reads: generation
timestamp, build commit, and the indexed repo's root.

This is the ONE chokepoint every graph.json writer must call — clustered
(``to_json``) and raw ``--no-cluster`` writers alike — so staleness detection
works regardless of which code path produced the file or where ``--out`` put
it (#1618-followup: a prior fix stamped only ``to_json``, leaving --no-cluster
writes and --out-elsewhere graphs unstamped/unresolvable).
"""
commit = built_at_commit if built_at_commit is not None else _git_head()
if commit:
data["built_at_commit"] = commit
data["generated_at"] = datetime.now(timezone.utc).isoformat()
if indexed_repo_root is not None:
try:
data["indexed_repo_root"] = str(Path(indexed_repo_root).resolve())
except OSError:
data["indexed_repo_root"] = str(indexed_repo_root)


def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, force: bool = False, built_at_commit: str | None = None, community_labels: dict[int, str] | None = None, indexed_repo_root: "str | os.PathLike | None" = None) -> bool:
# Safety check: refuse to silently shrink an existing graph (#479)
existing_path = Path(output_path)
if not force and existing_path.exists():
Expand Down Expand Up @@ -534,9 +612,7 @@ def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *,
link["source"] = true_src
link["target"] = true_tgt
data["hyperedges"] = getattr(G, "graph", {}).get("hyperedges", [])
commit = built_at_commit if built_at_commit is not None else _git_head()
if commit:
data["built_at_commit"] = commit
stamp_graph_metadata(data, indexed_repo_root=indexed_repo_root, built_at_commit=built_at_commit)
with open(output_path, "w", encoding="utf-8") as f: # nosec
json.dump(data, f, indent=2)
return True
Expand Down
11 changes: 10 additions & 1 deletion graphify/watch.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,12 @@ def _node_community_map(graph_data: dict) -> dict[str, int]:

def _canonical_graph_for_compare(graph_data: dict) -> dict:
canonical = dict(graph_data)
# Build-time metadata stamped by stamp_graph_metadata() varies every run
# (timestamp always changes; commit/root can too) without reflecting an
# actual graph-content change, so it must never affect a same-graph verdict.
canonical.pop("built_at_commit", None)
canonical.pop("generated_at", None)
canonical.pop("indexed_repo_root", None)
for key in ("nodes", "links", "edges", "hyperedges"):
if key in canonical and isinstance(canonical[key], list):
canonical[key] = sorted(
Expand All @@ -285,6 +290,8 @@ def _canonical_graph_for_compare(graph_data: dict) -> dict:
def _canonical_topology_for_compare(graph_data: dict) -> dict:
canonical = dict(graph_data)
canonical.pop("built_at_commit", None)
canonical.pop("generated_at", None)
canonical.pop("indexed_repo_root", None)

nodes = canonical.get("nodes")
if isinstance(nodes, list):
Expand Down Expand Up @@ -713,11 +720,13 @@ def _edge_evicted(e: dict) -> bool:
# without it, --no-cluster + repeated `update` accumulate duplicates and edge
# counts diverge across build modes (#1317).
from graphify.build import dedupe_edges as _dedupe_edges, dedupe_nodes as _dedupe_nodes
from graphify.export import stamp_graph_metadata as _stamp_graph_metadata
candidate_graph_data = {
**{k: v for k, v in result.items() if k not in ("edges", "nodes")},
"nodes": _dedupe_nodes(result.get("nodes", [])),
"links": _dedupe_edges(result.get("edges", [])),
}
_stamp_graph_metadata(candidate_graph_data, indexed_repo_root=project_root, built_at_commit=commit)
candidate_graph_text = _json_text(candidate_graph_data)
same_graph = False
if existing_graph.exists():
Expand Down Expand Up @@ -817,7 +826,7 @@ def _edge_evicted(e: dict) -> bool:
report_path = out / "GRAPH_REPORT.md"
labels_json = json.dumps({str(k): v for k, v in sorted(labels.items())}, ensure_ascii=False, indent=2) + "\n"
graph_tmp = out / ".graph.tmp.json"
json_written = to_json(G, communities, str(graph_tmp), force=True, built_at_commit=commit)
json_written = to_json(G, communities, str(graph_tmp), force=True, built_at_commit=commit, indexed_repo_root=project_root)
if not json_written:
return False
candidate_graph_data = json.loads(graph_tmp.read_text(encoding="utf-8"))
Expand Down
123 changes: 123 additions & 0 deletions tests/test_export.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import math
import os
import re
import tempfile
from pathlib import Path
Expand Down Expand Up @@ -603,3 +604,125 @@ def test_backup_env_disable(tmp_path, monkeypatch):
(tmp_path / "graph.json").write_text('{"nodes":[],"links":[]}')
(tmp_path / ".graphify_semantic_marker").write_text("{}")
assert backup_if_protected(tmp_path) is None


# --- staleness chokepoint: every graph.json writer must stamp via
# stamp_graph_metadata, and check_staleness must use the recorded indexed
# repo root rather than inferring it from the graph file's own location ---


def test_to_json_stamps_generated_at_and_indexed_repo_root(tmp_path):
    """The clustered writer (to_json) stamps generated_at and stores the
    resolved form of the indexed_repo_root the caller passes in."""
    indexed_root = tmp_path / "somewhere-else"
    indexed_root.mkdir()
    graph_file = tmp_path / "graph.json"

    graph = make_graph()
    to_json(graph, cluster(graph), str(graph_file), indexed_repo_root=indexed_root)

    written = json.loads(graph_file.read_text())
    assert "generated_at" in written
    assert written["indexed_repo_root"] == str(indexed_root.resolve())


def test_to_json_omits_indexed_repo_root_when_not_given(tmp_path):
    """When no indexed_repo_root is passed, to_json still writes generated_at
    but leaves the root key out entirely rather than inventing one."""
    graph_file = tmp_path / "graph.json"
    graph = make_graph()
    to_json(graph, cluster(graph), str(graph_file))

    written = json.loads(graph_file.read_text())
    assert "generated_at" in written
    assert "indexed_repo_root" not in written


def _init_repo_with_one_commit(repo_dir: Path) -> None:
import subprocess
repo_dir.mkdir(parents=True, exist_ok=True)
subprocess.run(["git", "init", "-q", str(repo_dir)], check=True)
subprocess.run(["git", "-C", str(repo_dir), "config", "user.email", "t@example.com"], check=True)
subprocess.run(["git", "-C", str(repo_dir), "config", "user.name", "Test"], check=True)
(repo_dir / "a.py").write_text("def a():\n return 1\n", encoding="utf-8")
subprocess.run(["git", "-C", str(repo_dir), "add", "a.py"], check=True)
subprocess.run(["git", "-C", str(repo_dir), "commit", "-q", "-m", "init"], check=True)


def _last_commit_iso(repo_dir: Path) -> str:
import subprocess
r = subprocess.run(
["git", "-C", str(repo_dir), "log", "-1", "--format=%cI"],
capture_output=True, text=True, check=True,
)
return r.stdout.strip()


def _advance_repo(repo_dir: Path) -> None:
    """Add a second commit dated five seconds after the current HEAD commit.

    The author and committer dates are pinned through the environment, so the
    new commit's date does not depend on how quickly the test runs relative
    to git's one-second timestamp resolution.
    """
    import subprocess
    from datetime import datetime, timedelta, timezone

    head_time = datetime.fromisoformat(_last_commit_iso(repo_dir))
    pinned = (head_time + timedelta(seconds=5)).isoformat()
    commit_env = {**os.environ, "GIT_AUTHOR_DATE": pinned, "GIT_COMMITTER_DATE": pinned}

    git = ["git", "-C", str(repo_dir)]
    (repo_dir / "a.py").write_text("def a():\n return 2\n", encoding="utf-8")
    subprocess.run([*git, "add", "a.py"], check=True)
    subprocess.run([*git, "commit", "-q", "-m", "advance"], check=True, env=commit_env)


def test_check_staleness_uses_recorded_root_for_out_elsewhere_graph(tmp_path):
    """#2 repro: a graph.json written via `--out <elsewhere>` sits in a
    directory unrelated to the indexed repo, yet must still warn once that
    repo gains a newer commit, because check_staleness reads the root stored
    in the graph instead of deriving it from the graph file's location."""
    from datetime import datetime, timezone
    from graphify.export import check_staleness

    indexed_repo = tmp_path / "repo"
    _init_repo_with_one_commit(indexed_repo)

    # The graph file lives outside the repo; only the recorded root links
    # the two. No git repository is created anywhere under `elsewhere`.
    out_dir = tmp_path / "elsewhere" / "nested"
    out_dir.mkdir(parents=True)
    graph_file = out_dir / "graph.json"

    payload = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "indexed_repo_root": str(indexed_repo.resolve()),
    }
    graph_file.write_text(json.dumps(payload))

    # Freshly stamped and the repo has not moved: no warning expected.
    assert check_staleness(payload, graph_file) is None

    _advance_repo(indexed_repo)
    stale_msg = check_staleness(payload, graph_file)
    assert stale_msg is not None
    assert "stale" in stale_msg


def test_check_staleness_legacy_graph_without_recorded_root_cannot_detect_out_elsewhere(tmp_path):
    """Pins the remaining limitation for LEGACY graphs (no indexed_repo_root):
    the root can only be inferred from where the graph file sits, and for an
    --out-elsewhere graph that is not a git repo, so the check abstains
    (returns None) instead of warning."""
    from datetime import datetime, timezone
    from graphify.export import check_staleness

    indexed_repo = tmp_path / "repo"
    _init_repo_with_one_commit(indexed_repo)

    out_dir = tmp_path / "elsewhere" / "nested"
    out_dir.mkdir(parents=True)
    graph_file = out_dir / "graph.json"

    # Legacy shape: a timestamp but no recorded root.
    payload = {"generated_at": datetime.now(timezone.utc).isoformat()}
    graph_file.write_text(json.dumps(payload))

    _advance_repo(indexed_repo)
    # The repo is now ahead of the stamp, but nothing ties this graph file to
    # it, so the expected verdict is "can't tell", not "stale".
    assert check_staleness(payload, graph_file) is None
Loading