From edcd229a3a4821c1acbdfd3049dde2593fd9493d Mon Sep 17 00:00:00 2001 From: Arkaprava De Date: Sat, 16 May 2026 22:08:09 +0000 Subject: [PATCH] feat: add hyp ssh command for direct terminal SSH into HyperPod spaces Add 'hyp ssh hyp-space --name <space-name>' command that establishes an interactive SSH session to a HyperPod Space via AWS Systems Manager (SSM) Session Manager. This extends the existing vscode-remote/kiro-remote space access mechanism to support direct terminal SSH without requiring any IDE. The command: 1. Verifies the space exists and is in Available status 2. Creates a WorkspaceConnection CRD with 'ssh-remote' connection type 3. Parses the returned connection URL for the SSM managed instance target 4. Invokes 'aws ssm start-session' as an interactive subprocess Also adds: - ssh() SDK method on HPSpace class - Unit tests for URL parsing and CLI command - Design document X-AI-Tool: Kiro (MeshClaw) X-AI-Prompt: Implement hyp ssh command for direct terminal SSH into HyperPod notebook spaces via SSM --- doc/design/hyp_ssh_design.md | 127 ++++++++++++++ src/sagemaker/hyperpod/cli/commands/ssh.py | 166 ++++++++++++++++++ src/sagemaker/hyperpod/cli/hyp_cli.py | 9 + .../hyperpod/space/hyperpod_space.py | 35 ++++ test/unit_tests/cli/test_ssh.py | 94 ++++++++++ 5 files changed, 431 insertions(+) create mode 100644 doc/design/hyp_ssh_design.md create mode 100644 src/sagemaker/hyperpod/cli/commands/ssh.py create mode 100644 test/unit_tests/cli/test_ssh.py diff --git a/doc/design/hyp_ssh_design.md b/doc/design/hyp_ssh_design.md new file mode 100644 index 00000000..93496c21 --- /dev/null +++ b/doc/design/hyp_ssh_design.md @@ -0,0 +1,127 @@ +# Design: `hyp ssh` — Direct Terminal SSH into HyperPod Spaces + +## Overview + +Add a `hyp ssh hyp-space --name <space-name>` command that establishes an interactive SSH session to a HyperPod Space (notebook workspace) via AWS Systems Manager (SSM) Session Manager. 
+ +Today, the CLI supports `vscode-remote` and `kiro-remote` connection types which open IDE-specific protocol URLs. This feature extends that to provide a direct terminal SSH experience without requiring any IDE. + +## User Experience + +```bash +# SSH into a workspace +hyp ssh hyp-space --name my-workspace + +# With explicit namespace +hyp ssh hyp-space --name my-workspace --namespace team-ns + +# With region override +hyp ssh hyp-space --name my-workspace --region us-west-2 +``` + +Output: +``` +Connecting to space 'my-workspace'... +Starting SSH session to 'mi-0abc123def456'... +Use 'exit' or Ctrl+D to end the session. + +root@workspace-pod:~# +``` + +## Architecture + +``` +┌─────────────┐ ┌──────────────────┐ ┌─────────────────────┐ +│ hyp ssh │────▶│ WorkspaceConnection│────▶│ SSM Managed Instance│ +│ (CLI) │ │ CRD (ssh-remote) │ │ (Space Pod) │ +└─────────────┘ └──────────────────┘ └─────────────────────┘ + │ │ │ + │ 1. create CRD │ 2. returns URL │ + │─────────────────────▶ │ + │ │ │ + │ 3. parse target │ │ + │◀────────────────────│ │ + │ │ + │ 4. aws ssm start-session --target <target-id> │ + │──────────────────────────────────────────────▶│ + │ │ + │ 5. Interactive terminal session │ + │◀─────────────────────────────────────────────▶│ +``` + +### Flow + +1. CLI calls `HPSpace.get(name)` to verify the space exists and is Available +2. CLI calls `space.create_space_access(connection_type="ssh-remote")` which creates a `WorkspaceConnection` CRD +3. The SageMaker Spaces operator provisions/returns the SSM managed instance target +4. CLI parses the `workspaceConnectionUrl` from the CRD status to extract the SSM target ID +5. CLI invokes `aws ssm start-session --target <target-id> --region <region>` as a subprocess +6. 
User gets an interactive terminal session + +### Connection URL Format + +The `workspaceConnectionUrl` returned by the operator for `ssh-remote` type: + +``` +ssm://<target-id>?documentName=AWS-StartSSHSession&portNumber=22 +``` + +Or alternatively: +``` +https://<endpoint>/session?target=<target-id>&documentName=AWS-StartSSHSession +``` + +## Implementation + +### New Files + +| File | Purpose | +|------|---------| +| `src/sagemaker/hyperpod/cli/commands/ssh.py` | CLI command + SSM session logic | +| `test/unit_tests/cli/test_ssh.py` | Unit tests | + +### Modified Files + +| File | Change | +|------|--------| +| `src/sagemaker/hyperpod/cli/hyp_cli.py` | Register `ssh` command group | +| `src/sagemaker/hyperpod/space/hyperpod_space.py` | Add `ssh()` SDK method | + +### Key Design Decisions + +1. **Reuse `create_space_access` with `ssh-remote` type** — No new CRD or API needed. The existing `WorkspaceConnection` CRD supports arbitrary `{ide}-remote` patterns. We use `ssh-remote` as the connection type. + +2. **SSM as transport** — HyperPod doesn't expose SSH ports. The Spaces operator registers an SSM Advanced On-Premises Instance for each space. This is the same mechanism used by `vscode-remote`. + +3. **Subprocess for session** — We use `subprocess.run()` with stdin/stdout/stderr passthrough for a fully interactive terminal. This is the same pattern used by `aws ssm start-session` directly. + +4. **No SSH key management** — SSM handles authentication via IAM. No SSH keys needed. 
+ +## Prerequisites + +- AWS Session Manager Plugin installed locally +- Valid AWS credentials with `ssm:StartSession` permission +- Space must be in `Available` status (running) +- SageMaker Spaces Add-on installed on the cluster + +## SDK Usage + +```python +from sagemaker.hyperpod.space.hyperpod_space import HPSpace + +space = HPSpace.get(name="my-workspace") +ssh_info = space.ssh() +# Returns: {"SpaceConnectionType": "ssh-remote", "SpaceConnectionUrl": "ssm://mi-..."} +``` + +## Testing + +- Unit tests mock the K8s API and subprocess calls +- Integration tests require a live HyperPod cluster with Spaces add-on +- Manual testing: `hyp ssh hyp-space --name <space-name> --debug` shows the SSM command + +## Future Extensions + +- `hyp ssh hyp-space --name ws --command "nvidia-smi"` — run a single command +- `hyp scp` — file transfer via SSM +- SSH config generation for `~/.ssh/config` ProxyCommand integration diff --git a/src/sagemaker/hyperpod/cli/commands/ssh.py b/src/sagemaker/hyperpod/cli/commands/ssh.py new file mode 100644 index 00000000..7502e099 --- /dev/null +++ b/src/sagemaker/hyperpod/cli/commands/ssh.py @@ -0,0 +1,166 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. 
+
+import click
+import json
+import signal
+import subprocess
+import sys
+from urllib.parse import urlparse, parse_qs
+
+from sagemaker.hyperpod.space.hyperpod_space import HPSpace
+from sagemaker.hyperpod.common.telemetry.telemetry_logging import _hyperpod_telemetry_emitter
+from sagemaker.hyperpod.common.telemetry.constants import Feature
+from sagemaker.hyperpod.common.cli_decorators import handle_cli_exceptions
+from sagemaker.hyperpod.common.utils import _resolve_region as resolve_region
+
+
+@click.command("hyp-space")
+@click.option("--name", required=True, help="Name of the space to SSH into")
+@click.option("--namespace", "-n", default="default", help="Kubernetes namespace")
+@click.option("--region", default=None, help="AWS region (defaults to configured region)")
+@click.option("--debug", is_flag=True, default=False, help="Enable debug mode")
+@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "ssh_space")
+@handle_cli_exceptions()
+def space_ssh(name, namespace, region, debug):
+    """SSH into a running HyperPod space via SSM Session Manager.
+
+    Establishes an interactive SSH session to the specified workspace
+    using AWS Systems Manager Session Manager as the transport layer.
+    This provides secure, auditable access without requiring inbound
+    firewall rules or bastion hosts.
+
+    Prerequisites:
+        - AWS Session Manager Plugin installed
+        - Space must be in 'Available' status
+        - Valid AWS credentials with ssm:StartSession permission
+
+    Example:
+        hyp ssh hyp-space --name my-workspace
+        hyp ssh hyp-space --name my-workspace --namespace team-ns
+    """
+    region = region or resolve_region()
+
+    # A space that reports no status yet is not Available either, so the
+    # availability check must always run instead of being skipped.
+    space = HPSpace.get(name=name, namespace=namespace)
+    conditions = (space.status or {}).get("conditions", [])
+    is_available = any(
+        c.get("type") == "Available" and c.get("status") == "True"
+        for c in conditions
+    )
+    if not is_available:
+        raise click.ClickException(
+            f"Space '{name}' is not in Available status. "
+            f"Start it with: hyp start hyp-space --name {name}"
+        )
+
+    # Create a space access to get the SSM connection details
+    click.echo(f"Connecting to space '{name}'...")
+    access_info = space.create_space_access(connection_type="ssh-remote")
+    connection_url = access_info.get("SpaceConnectionUrl", "")
+
+    if debug:
+        click.echo(f"Connection URL: {connection_url}")
+
+    # Parse the connection URL to extract SSM target details
+    ssm_target, ssm_params = _parse_ssh_connection_url(connection_url)
+
+    if not ssm_target:
+        raise click.ClickException(
+            f"Could not determine SSM target from connection URL. "
+            f"URL received: {connection_url}"
+        )
+
+    # Start SSM session
+    _start_ssm_session(ssm_target, ssm_params, region, debug)
+
+
+def _parse_ssh_connection_url(connection_url: str) -> tuple:
+    """Parse the space connection URL to extract SSM target and parameters.
+
+    The connection URL format from the WorkspaceConnection CRD contains
+    the SSM managed instance ID and optional document/parameters for
+    establishing the SSH session. An empty or None URL yields (None, {}).
+
+    Returns:
+        tuple: (target_id, parameters_dict)
+    """
+    if not connection_url:
+        return None, {}
+
+    parsed = urlparse(connection_url)
+    params = parse_qs(parsed.query)
+
+    # The URL may contain the target directly or in query params
+    # Format: ssm://<target-id>?documentName=...&parameters=...
+    # Or: https://<endpoint>/ssm?target=<target-id>&...
+    target = None
+    ssm_params = {}
+
+    if parsed.scheme == "ssm":
+        target = parsed.hostname or parsed.path.strip("/")
+    elif "target" in params:
+        target = params["target"][0]
+    elif "instanceId" in params:
+        target = params["instanceId"][0]
+    else:
+        # Fallback: treat the path as the target
+        target = parsed.path.strip("/") if parsed.path else None
+
+    if "documentName" in params:
+        ssm_params["documentName"] = params["documentName"][0]
+    if "portNumber" in params:
+        ssm_params["portNumber"] = params["portNumber"][0]
+
+    return target, ssm_params
+
+
+def _start_ssm_session(target: str, params: dict, region: str, debug: bool = False):
+    """Start an interactive SSM session to the workspace.
+
+    Runs ``aws ssm start-session`` attached to the current terminal and
+    raises click.ClickException when the AWS CLI is missing or the session
+    exits non-zero. SIGINT is ignored in this process while the session
+    runs, so Ctrl+C is left to the session instead of terminating it.
+    """
+    cmd = ["aws", "ssm", "start-session", "--target", target, "--region", region]
+
+    if params.get("documentName"):
+        cmd.extend(["--document-name", params["documentName"]])
+
+    if params.get("portNumber"):
+        cmd.extend([
+            "--parameters",
+            json.dumps({"portNumber": [params["portNumber"]]})
+        ])
+
+    if debug:
+        click.echo(f"Running: {' '.join(cmd)}")
+
+    click.echo(f"Starting SSH session to '{target}'...")
+    click.echo("Use 'exit' or Ctrl+D to end the session.\n")
+
+    # Ignore Ctrl+C here: a KeyboardInterrupt would make subprocess.run() kill the child.
+    previous_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
+    try:
+        result = subprocess.run(cmd, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr)
+    except FileNotFoundError:
+        raise click.ClickException(
+            "AWS CLI not found. Install it from: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html"
+        )
+    finally:
+        signal.signal(signal.SIGINT, previous_handler)
+
+    if result.returncode == 127:
+        raise click.ClickException(
+            "AWS Session Manager Plugin not found. Install it from: "
+            "https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html"
+        )
+    if result.returncode != 0:
+        raise click.ClickException(
+            f"SSM session exited with code {result.returncode}. "
+            f"Ensure you have valid AWS credentials and ssm:StartSession permissions."
+        )
diff --git a/src/sagemaker/hyperpod/cli/hyp_cli.py b/src/sagemaker/hyperpod/cli/hyp_cli.py
index 4b2107c0..c9766693 100644
--- a/src/sagemaker/hyperpod/cli/hyp_cli.py
+++ b/src/sagemaker/hyperpod/cli/hyp_cli.py
@@ -54,6 +54,7 @@
     space_template_update,
 )
 from sagemaker.hyperpod.cli.commands.space_access import space_access_create
+from sagemaker.hyperpod.cli.commands.ssh import space_ssh
 
 from sagemaker.hyperpod.cli.commands.init import (
     init,
@@ -165,6 +166,12 @@ def stop():
     pass
 
 
+@cli.group(cls=CLICommand)
+def ssh():
+    """SSH into space pods via SSM Session Manager."""
+    pass
+
+
 @cli.group(cls=CLICommand)
 def portforward():
     """Port forward for space resources."""
@@ -274,6 +281,8 @@ def exec():
 
 portforward.add_command(space_portforward)
 
+ssh.add_command(space_ssh)
+
 get_operator_logs.add_command(pytorch_get_operator_logs)
 recipe_get_operator_logs_cmd = copy.copy(pytorch_get_operator_logs)
 recipe_get_operator_logs_cmd.help = "Get operator logs for HyperPod recipe jobs."
diff --git a/src/sagemaker/hyperpod/space/hyperpod_space.py b/src/sagemaker/hyperpod/space/hyperpod_space.py
index c19ccb4f..cf60548f 100644
--- a/src/sagemaker/hyperpod/space/hyperpod_space.py
+++ b/src/sagemaker/hyperpod/space/hyperpod_space.py
@@ -986,3 +986,38 @@ def portforward_space(self, local_port: str, remote_port: str = DEFAULT_SPACE_PO
             logger.debug("Stopping space port forward...")
         finally:
             pf.stop()
+
+    @_hyperpod_telemetry_emitter(Feature.HYPERPOD, "ssh_space")
+    def ssh(self, region: Optional[str] = None) -> Dict[str, str]:
+        """Create an SSH connection access for this space.
+
+        Creates a space access resource with 'ssh-remote' connection type
+        and returns the connection details needed to establish an SSM session.
+
+        **Parameters:**
+
+        .. list-table::
+           :header-rows: 1
+           :widths: 20 20 60
+
+           * - Parameter
+             - Type
+             - Description
+           * - region
+             - str, optional
+             - Accepted for API compatibility; currently not used by this method.
+
+        **Returns:**
+
+        Dict[str, str]: Dictionary containing 'SpaceConnectionType' and 'SpaceConnectionUrl' keys
+
+        .. dropdown:: Usage Examples
+           :open:
+
+           .. code-block:: python
+
+              >>> space = HPSpace.get("my-space")
+              >>> ssh_info = space.ssh()
+              >>> print(f"SSH URL: {ssh_info['SpaceConnectionUrl']}")
+        """
+        return self.create_space_access(connection_type="ssh-remote")
diff --git a/test/unit_tests/cli/test_ssh.py b/test/unit_tests/cli/test_ssh.py
new file mode 100644
index 00000000..5e31c2a8
--- /dev/null
+++ b/test/unit_tests/cli/test_ssh.py
@@ -0,0 +1,94 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License.
+
+import unittest
+from unittest.mock import patch, Mock
+from click.testing import CliRunner
+
+from sagemaker.hyperpod.cli.commands.ssh import (
+    space_ssh,
+    _parse_ssh_connection_url,
+)
+
+
+class TestParseSSHConnectionUrl(unittest.TestCase):
+    """Test URL parsing for various connection URL formats."""
+
+    def test_ssm_scheme_url(self):
+        url = "ssm://mi-0abc123def456?documentName=AWS-StartSSHSession&portNumber=22"
+        target, params = _parse_ssh_connection_url(url)
+        self.assertEqual(target, "mi-0abc123def456")
+        self.assertEqual(params["documentName"], "AWS-StartSSHSession")
+        self.assertEqual(params["portNumber"], "22")
+
+    def test_https_with_target_param(self):
+        url = "https://ssm.us-west-2.amazonaws.com/session?target=mi-0abc123&documentName=AWS-StartSSHSession"
+        target, params = _parse_ssh_connection_url(url)
+        self.assertEqual(target, "mi-0abc123")
+        self.assertEqual(params["documentName"], "AWS-StartSSHSession")
+
+    def test_https_with_instance_id_param(self):
+        url = "https://endpoint.example.com/connect?instanceId=mi-0abc123"
+        target, params = _parse_ssh_connection_url(url)
+        self.assertEqual(target, "mi-0abc123")
+
+    def test_empty_url(self):
+        target, params = _parse_ssh_connection_url("")
+        self.assertIsNone(target)
+        self.assertEqual(params, {})
+
+    def test_none_url(self):
+        target, params = _parse_ssh_connection_url(None)
+        self.assertIsNone(target)
+        self.assertEqual(params, {})
+
+
+class TestSpaceSSHCommand(unittest.TestCase):
+    """Test the CLI command integration."""
+
+    @patch("sagemaker.hyperpod.cli.commands.ssh._start_ssm_session")
+    @patch("sagemaker.hyperpod.cli.commands.ssh._parse_ssh_connection_url")
+    @patch("sagemaker.hyperpod.cli.commands.ssh.HPSpace")
+    @patch("sagemaker.hyperpod.cli.commands.ssh.resolve_region")
+    def test_ssh_success(self, mock_resolve_region, mock_hp_space, mock_parse, mock_start):
+        mock_resolve_region.return_value = "us-west-2"
+
+        mock_space = Mock()
+        mock_space.status = {"conditions": [{"type": "Available", "status": "True"}]}
+        mock_space.create_space_access.return_value = {
+            "SpaceConnectionType": "ssh-remote",
+            "SpaceConnectionUrl": "ssm://mi-0abc123?documentName=AWS-StartSSHSession"
+        }
+        mock_hp_space.get.return_value = mock_space
+
+        mock_parse.return_value = ("mi-0abc123", {"documentName": "AWS-StartSSHSession"})
+
+        runner = CliRunner()
+        result = runner.invoke(space_ssh, ["--name", "my-space"])
+
+        self.assertEqual(result.exit_code, 0, result.output)
+        mock_hp_space.get.assert_called_once_with(name="my-space", namespace="default")
+        mock_space.create_space_access.assert_called_once_with(connection_type="ssh-remote")
+        mock_start.assert_called_once_with(
+            "mi-0abc123", {"documentName": "AWS-StartSSHSession"}, "us-west-2", False
+        )
+
+    @patch("sagemaker.hyperpod.cli.commands.ssh.HPSpace")
+    @patch("sagemaker.hyperpod.cli.commands.ssh.resolve_region")
+    def test_ssh_space_not_available(self, mock_resolve_region, mock_hp_space):
+        mock_resolve_region.return_value = "us-west-2"
+        not_available = {"conditions": [{"type": "Available", "status": "False"}]}
+        # A space that reports no status at all must be rejected as well.
+        for status in (not_available, None):
+            with self.subTest(status=status):
+                mock_space = Mock(status=status)
+                mock_hp_space.get.return_value = mock_space
+                result = CliRunner().invoke(space_ssh, ["--name", "my-space"])
+                self.assertIn("not in Available status", result.output)
+                mock_space.create_space_access.assert_not_called()
+
+
+if __name__ == "__main__":
+    unittest.main()