Skip to content

Commit 219d1c5

Browse files
RHOAIENG-32532: Add kueue integration and update tests
1 parent ef75f78 commit 219d1c5

23 files changed

+1303
-1135
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ on:
55
pull_request:
66
branches:
77
- main
8-
- 'release-*'
8+
- "release-*"
99
- ray-jobs-feature
1010
paths-ignore:
11-
- 'docs/**'
12-
- '**.adoc'
13-
- '**.md'
14-
- 'LICENSE'
11+
- "docs/**"
12+
- "**.adoc"
13+
- "**.md"
14+
- "LICENSE"
1515

1616
concurrency:
1717
group: ${{ github.head_ref }}-${{ github.workflow }}
@@ -33,9 +33,9 @@ jobs:
3333
- name: Checkout common repo code
3434
uses: actions/checkout@v4
3535
with:
36-
repository: 'project-codeflare/codeflare-common'
37-
ref: 'main'
38-
path: 'common'
36+
repository: "project-codeflare/codeflare-common"
37+
ref: "main"
38+
path: "common"
3939

4040
- name: Checkout CodeFlare operator repository
4141
uses: actions/checkout@v4
@@ -46,7 +46,7 @@ jobs:
4646
- name: Set Go
4747
uses: actions/setup-go@v5
4848
with:
49-
go-version-file: './codeflare-operator/go.mod'
49+
go-version-file: "./codeflare-operator/go.mod"
5050
cache-dependency-path: "./codeflare-operator/go.sum"
5151

5252
- name: Set up gotestfmt
@@ -76,7 +76,7 @@ jobs:
7676
run: |
7777
cd codeflare-operator
7878
echo Setting up CodeFlare stack
79-
make setup-e2e
79+
make setup-e2e KUEUE_VERSION=v0.13.4 KUBERAY_VERSION=v1.4.0
8080
echo Deploying CodeFlare operator
8181
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
8282
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
@@ -95,6 +95,10 @@ jobs:
9595
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
9696
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
9797
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
98+
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs
99+
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
100+
kubectl create clusterrole rayjob-status-reader --verb=get,list,patch,update --resource=rayjobs/status
101+
kubectl create clusterrolebinding sdk-user-rayjob-status-reader --clusterrole=rayjob-status-reader --user=sdk-user
98102
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
99103
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
100104
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
@@ -122,7 +126,7 @@ jobs:
122126
pip install poetry
123127
poetry install --with test,docs
124128
echo "Running e2e tests..."
125-
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
129+
poetry run pytest -v -s ./tests/e2e/ -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
126130
env:
127131
GRPC_DNS_RESOLVER: "native"
128132

@@ -146,7 +150,13 @@ jobs:
146150
if: always() && steps.deploy.outcome == 'success'
147151
run: |
148152
echo "Printing KubeRay operator logs"
149-
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
153+
kubectl logs -n default --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
154+
155+
- name: Print Kueue controller logs
156+
if: always() && steps.deploy.outcome == 'success'
157+
run: |
158+
echo "Printing Kueue controller logs"
159+
kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kueue.log
150160
151161
- name: Export all KinD pod logs
152162
uses: ./common/github-actions/kind-export-logs
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
# rayjob e2e tests workflow for CodeFlare-SDK
2+
name: rayjob-e2e
3+
4+
on:
5+
pull_request:
6+
branches:
7+
- main
8+
- "release-*"
9+
- ray-jobs-feature
10+
paths-ignore:
11+
- "docs/**"
12+
- "**.adoc"
13+
- "**.md"
14+
- "LICENSE"
15+
16+
concurrency:
17+
group: ${{ github.head_ref }}-${{ github.workflow }}
18+
cancel-in-progress: true
19+
20+
env:
21+
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
22+
23+
jobs:
24+
kubernetes-rayjob:
25+
runs-on: gpu-t4-4-core
26+
27+
steps:
28+
- name: Checkout code
29+
uses: actions/checkout@v4
30+
with:
31+
submodules: recursive
32+
33+
- name: Checkout common repo code
34+
uses: actions/checkout@v4
35+
with:
36+
repository: "project-codeflare/codeflare-common"
37+
ref: "main"
38+
path: "common"
39+
40+
- name: Checkout CodeFlare operator repository
41+
uses: actions/checkout@v4
42+
with:
43+
repository: project-codeflare/codeflare-operator
44+
path: codeflare-operator
45+
46+
- name: Set Go
47+
uses: actions/setup-go@v5
48+
with:
49+
go-version-file: "./codeflare-operator/go.mod"
50+
cache-dependency-path: "./codeflare-operator/go.sum"
51+
52+
- name: Set up gotestfmt
53+
uses: gotesttools/gotestfmt-action@v2
54+
with:
55+
token: ${{ secrets.GITHUB_TOKEN }}
56+
57+
- name: Set up specific Python version
58+
uses: actions/setup-python@v5
59+
with:
60+
python-version: "3.11"
61+
cache: "pip" # caching pip dependencies
62+
63+
- name: Setup NVidia GPU environment for KinD
64+
uses: ./common/github-actions/nvidia-gpu-setup
65+
66+
- name: Setup and start KinD cluster
67+
uses: ./common/github-actions/kind
68+
with:
69+
worker-nodes: 1
70+
71+
- name: Install NVidia GPU operator for KinD
72+
uses: ./common/github-actions/nvidia-gpu-operator
73+
74+
- name: Deploy CodeFlare stack
75+
id: deploy
76+
run: |
77+
cd codeflare-operator
78+
echo Setting up CodeFlare stack
79+
make setup-e2e KUEUE_VERSION=v0.13.4 KUBERAY_VERSION=v1.4.0
80+
echo Deploying CodeFlare operator
81+
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
82+
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
83+
cd ..
84+
85+
- name: Add user to KinD
86+
uses: ./common/github-actions/kind-add-user
87+
with:
88+
user-name: sdk-user
89+
90+
- name: Configure RBAC for sdk user with limited permissions
91+
run: |
92+
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
93+
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
94+
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
95+
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
96+
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
97+
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
98+
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs
99+
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
100+
kubectl create clusterrole rayjob-status-reader --verb=get,list,patch,update --resource=rayjobs/status
101+
kubectl create clusterrolebinding sdk-user-rayjob-status-reader --clusterrole=rayjob-status-reader --user=sdk-user
102+
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
103+
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
104+
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
105+
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
106+
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
107+
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
108+
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
109+
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
110+
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
111+
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
112+
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
113+
kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
114+
kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
115+
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
116+
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
117+
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
118+
kubectl config use-context sdk-user
119+
120+
- name: Run RayJob E2E tests
121+
run: |
122+
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
123+
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
124+
125+
set -euo pipefail
126+
pip install poetry
127+
poetry install --with test,docs
128+
echo "Running RayJob e2e tests..."
129+
poetry run pytest -v -s ./tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log 2>&1
130+
131+
- name: Switch to kind-cluster context to print logs
132+
if: always() && steps.deploy.outcome == 'success'
133+
run: kubectl config use-context kind-cluster
134+
135+
- name: Print Pytest output log
136+
if: always() && steps.deploy.outcome == 'success'
137+
run: |
138+
echo "Printing Pytest output logs"
139+
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log
140+
141+
- name: Print CodeFlare operator logs
142+
if: always() && steps.deploy.outcome == 'success'
143+
run: |
144+
echo "Printing CodeFlare operator logs"
145+
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
146+
147+
- name: Print KubeRay operator logs
148+
if: always() && steps.deploy.outcome == 'success'
149+
run: |
150+
echo "Printing KubeRay operator logs"
151+
kubectl logs -n default --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
152+
153+
- name: Print Kueue controller logs
154+
if: always() && steps.deploy.outcome == 'success'
155+
run: |
156+
echo "Printing Kueue controller logs"
157+
kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kueue.log
158+
159+
- name: Export all KinD pod logs
160+
uses: ./common/github-actions/kind-export-logs
161+
if: always() && steps.deploy.outcome == 'success'
162+
with:
163+
output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
164+
165+
- name: Upload logs
166+
uses: actions/upload-artifact@v4
167+
if: always() && steps.deploy.outcome == 'success'
168+
with:
169+
name: logs
170+
retention-days: 10
171+
path: |
172+
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log

src/codeflare_sdk/common/utils/k8s_utils.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,10 @@
77
from ..kubernetes_cluster import config_check, _kube_api_error_handling
88

99

10-
def get_current_namespace():
10+
def get_current_namespace(): # pragma: no cover
1111
"""
1212
Retrieves the current Kubernetes namespace.
1313
14-
This function attempts to detect the current namespace by:
15-
1. First checking if running inside a pod (reading from service account namespace file)
16-
2. Falling back to reading from the current kubeconfig context
17-
1814
Returns:
1915
str:
2016
The current namespace or None if not found.

src/codeflare_sdk/ray/cluster/build_ray_cluster.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,6 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
133133
"enableIngress": False,
134134
"rayStartParams": {
135135
"dashboard-host": "0.0.0.0",
136-
"dashboard-port": "8265",
137136
"block": "true",
138137
"num-gpus": str(head_gpu_count),
139138
"resources": head_resources,

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,10 @@ def apply(self, force=False):
208208
self._throw_for_no_raycluster()
209209
namespace = self.config.namespace
210210
name = self.config.name
211+
212+
# Regenerate resource_yaml to reflect any configuration changes
213+
self.resource_yaml = self.create_resource()
214+
211215
try:
212216
self.config_check()
213217
api_instance = client.CustomObjectsApi(get_api_client())
@@ -387,16 +391,25 @@ def is_dashboard_ready(self) -> bool:
387391
bool:
388392
True if the dashboard is ready, False otherwise.
389393
"""
394+
395+
dashboard_uri = self.cluster_dashboard_uri()
396+
if dashboard_uri is None:
397+
return False
398+
390399
try:
391400
response = requests.get(
392-
self.cluster_dashboard_uri(),
401+
dashboard_uri,
393402
headers=self._client_headers,
394403
timeout=5,
395404
verify=self._client_verify_tls,
396405
)
397406
except requests.exceptions.SSLError: # pragma no cover
398407
# SSL exception occurs when oauth ingress has been created but cluster is not up
399408
return False
409+
except Exception: # pragma no cover
410+
# Any other exception (connection errors, timeouts, etc.)
411+
return False
412+
400413
if response.status_code == 200:
401414
return True
402415
else:
@@ -504,6 +517,8 @@ def cluster_dashboard_uri(self) -> str:
504517
):
505518
protocol = "https" if route["spec"].get("tls") else "http"
506519
return f"{protocol}://{route['spec']['host']}"
520+
# No route found for this cluster
521+
return "Dashboard not available yet, have you run cluster.up()?"
507522
else:
508523
try:
509524
api_instance = client.NetworkingV1Api(get_api_client())
@@ -522,7 +537,8 @@ def cluster_dashboard_uri(self) -> str:
522537
protocol = "http"
523538
elif "route.openshift.io/termination" in annotations:
524539
protocol = "https"
525-
return f"{protocol}://{ingress.spec.rules[0].host}"
540+
return f"{protocol}://{ingress.spec.rules[0].host}"
541+
526542
return "Dashboard not available yet, have you run cluster.up()?"
527543

528544
def list_jobs(self) -> List:
@@ -783,6 +799,7 @@ def remove_autogenerated_fields(resource):
783799
del resource[key]
784800
else:
785801
remove_autogenerated_fields(resource[key])
802+
786803
elif isinstance(resource, list):
787804
for item in resource:
788805
remove_autogenerated_fields(item)

src/codeflare_sdk/ray/cluster/test_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2022-2025 IBM, Red Hat
1+
# Copyright 2024 IBM, Red Hat
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.

0 commit comments

Comments
 (0)