Skip to content

Commit c00f5a7

Browse files
authored
Merge pull request #36775 from dims/dims/fix-lambda-e2e-script
Fix lambda e2e scripts and Prow job config
2 parents b75f82c + 27614b2 commit c00f5a7

File tree

3 files changed: +34 additions, -22 deletions

config/jobs/kubernetes/sig-cloud-provider/lambda/lambda.yaml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ presubmits:
2727
annotations:
2828
testgrid-dashboards: sig-node-gpu
2929
testgrid-tab-name: pull-lambda-device-plugin-gpu
30-
description: "Runs [Feature:GPUDevicePlugin] e2e tests on a Lambda Cloud A100 GPU instance"
30+
description: "Runs [Feature:GPUDevicePlugin] e2e tests on a Lambda Cloud GPU instance"
3131
decorate: true
3232
decoration_config:
3333
timeout: 2h
@@ -42,6 +42,9 @@ presubmits:
4242
- image: us-central1-docker.pkg.dev/k8s-staging-test-infra/images/kubekins-e2e:v20260316-e86cefa561-master
4343
command:
4444
- runner.sh
45+
env:
46+
- name: GPU_TYPE
47+
value: ""
4548
args:
4649
- bash
4750
- -c
@@ -64,7 +67,7 @@ periodics:
6467
annotations:
6568
testgrid-dashboards: sig-node-gpu
6669
testgrid-tab-name: ci-lambda-device-plugin-gpu
67-
description: "Runs [Feature:GPUDevicePlugin] e2e tests on a Lambda Cloud A100 GPU instance"
70+
description: "Runs [Feature:GPUDevicePlugin] e2e tests on a Lambda Cloud GPU instance"
6871
decorate: true
6972
decoration_config:
7073
timeout: 2h
@@ -83,6 +86,9 @@ periodics:
8386
- image: us-central1-docker.pkg.dev/k8s-staging-test-infra/images/kubekins-e2e:v20260316-e86cefa561-master
8487
command:
8588
- runner.sh
89+
env:
90+
- name: GPU_TYPE
91+
value: ""
8692
args:
8793
- bash
8894
- -c

experiment/lambda/e2e-test.sh

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/bin/bash
1+
#!/usr/bin/env bash
22
# Copyright The Kubernetes Authors.
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,48 +17,54 @@
1717
#
1818
# Must be run from a kubernetes source checkout directory.
1919
# Requires: LAMBDA_API_KEY_FILE, JOB_NAME, BUILD_ID, ARTIFACTS env vars.
20-
# Optional: GPU_TYPE (default: gpu_1x_a100_sxm4)
20+
# Optional: GPU_TYPE (default: gpu_1x_a10, set empty to accept any available)
2121
set -o errexit
22+
set -o nounset
2223
set -o pipefail
2324

2425
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
25-
SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR"
26-
GPU_TYPE="${GPU_TYPE:-gpu_1x_a10}"
27-
SSH_KEY_NAME="prow-${JOB_NAME}-${BUILD_ID}"
26+
SSH_OPTS=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR)
27+
GPU_TYPE="${GPU_TYPE-gpu_1x_a10}"
28+
GPU_ARGS=()
29+
if [ -n "${GPU_TYPE}" ]; then
30+
GPU_ARGS=(--gpu "${GPU_TYPE}")
31+
fi
32+
SSH_KEY_NAME=$(echo -n "prow-${JOB_NAME}-${BUILD_ID}" | sha256sum | cut -c1-64)
33+
SSH_DIR=$(mktemp -d /tmp/lambda-ssh.XXXXXX)
34+
SSH_KEY="${SSH_DIR}/key"
2835

2936
# --- Install lambdactl ---
3037
GOPROXY=direct go install github.com/dims/lambdactl@latest
3138

3239
# --- Generate ephemeral SSH key ---
33-
rm -f /tmp/lambda-ssh /tmp/lambda-ssh.pub
34-
ssh-keygen -t ed25519 -f /tmp/lambda-ssh -N "" -q
35-
SSH_KEY_ID=$(lambdactl --json ssh-keys add "${SSH_KEY_NAME}" /tmp/lambda-ssh.pub | jq -r '.id')
40+
ssh-keygen -t ed25519 -f "${SSH_KEY}" -N "" -q
41+
SSH_KEY_ID=$(lambdactl --json ssh-keys add "${SSH_KEY_NAME}" "${SSH_KEY}.pub" | jq -r '.id')
3642

3743
cleanup() {
3844
echo "Cleaning up..."
3945
[ -n "${INSTANCE_ID:-}" ] && lambdactl stop "${INSTANCE_ID}" --yes 2>/dev/null || true
4046
[ -n "${SSH_KEY_ID:-}" ] && lambdactl ssh-keys rm "${SSH_KEY_ID}" 2>/dev/null || true
47+
rm -rf "${SSH_DIR}"
4148
}
4249
trap cleanup EXIT
4350

44-
# --- Launch instance with retries ---
45-
LAUNCH_OUTPUT=$(lambdactl --json start \
46-
--gpu "${GPU_TYPE}" \
51+
# --- Launch instance (poll until capacity is available) ---
52+
LAUNCH_OUTPUT=$(lambdactl --json watch \
53+
"${GPU_ARGS[@]}" \
4754
--ssh "${SSH_KEY_NAME}" \
48-
--name "prow-${BUILD_ID}" \
49-
--retries 4 \
50-
--retry-delay 60 \
55+
--name "${SSH_KEY_NAME}" \
56+
--interval 30 \
57+
--timeout 900 \
5158
--wait-ssh)
5259
INSTANCE_IP=$(echo "${LAUNCH_OUTPUT}" | jq -r '.ip')
5360
INSTANCE_ID=$(echo "${LAUNCH_OUTPUT}" | jq -r '.id')
5461

55-
remote() { ssh ${SSH_OPTS} -i /tmp/lambda-ssh "ubuntu@${INSTANCE_IP}" "$@"; }
56-
rsync_to() { rsync -e "ssh ${SSH_OPTS} -i /tmp/lambda-ssh" "$@"; }
62+
remote() { ssh "${SSH_OPTS[@]}" -i "${SSH_KEY}" "ubuntu@${INSTANCE_IP}" "$@"; }
63+
rsync_to() { rsync -e "ssh ${SSH_OPTS[*]} -i ${SSH_KEY}" "$@"; }
5764

5865
# --- Build k8s binaries ---
59-
git fetch --tags --depth 1 origin 2>/dev/null || true
60-
KUBE_GIT_VERSION=$(git describe --tags --match='v*' 2>/dev/null || echo "v1.35.0")
61-
make KUBE_GIT_VERSION="${KUBE_GIT_VERSION}" \
66+
git fetch --tags --depth 100 origin 2>/dev/null || true
67+
make \
6268
WHAT="cmd/kubeadm cmd/kubelet cmd/kubectl test/e2e/e2e.test vendor/github.com/onsi/ginkgo/v2/ginkgo"
6369

6470
# --- Transfer binaries to Lambda instance ---

experiment/lambda/setup-cluster.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ sudo systemctl enable kubelet
111111
sudo kubeadm init \
112112
--pod-network-cidr=10.88.0.0/16 \
113113
--cri-socket=unix:///run/containerd/containerd.sock \
114-
--ignore-preflight-errors=NumCPU,Mem,FileContent--proc-sys-net-bridge-bridge-nf-call-iptables,SystemVerification
114+
--ignore-preflight-errors=NumCPU,Mem,FileContent--proc-sys-net-bridge-bridge-nf-call-iptables,SystemVerification,KubeletVersion
115115

116116
# Configure kubectl
117117
mkdir -p "$HOME/.kube"

0 commit comments

Comments
 (0)