Skip to content

Commit 762494b

Browse files
Merge pull request #645 from NVIDIA/master
20.08.1 Release, pull a bunch of bugfixes for HA K8S, CentOS, etc. into 20.08
2 parents 6e716ab + 478f529 commit 762494b

23 files changed

+186
-109
lines changed

.jenkins-scripts/get-k8s-debug.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash
2+
set -x
3+
source .jenkins-scripts/jenkins-common.sh
4+
5+
# Ensure working directory is root
6+
cd "${ROOT_DIR}"
7+
8+
export KF_DIR=${ROOT_DIR}/config/kubeflow
9+
export KFCTL=${ROOT_DIR}/config/kfctl
10+
11+
# Get some basic info about all nodes
12+
kubectl describe nodes
13+
kubectl get nodes
14+
15+
# Get some basic info about all running pods
16+
kubectl get pods -A
17+
kubectl get daemonsets -A
18+
19+
# Get helm status (requires helm install)
20+
helm list

.jenkins-scripts/test-dashboard.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ source ./scripts/k8s_deploy_dashboard_user.sh
1313
timeout=120
1414
time=0
1515
while [ ${time} -lt ${timeout} ]; do
16-
curl -ks --raw -L "${dashboard_url}" && \
16+
curl -ks --raw -L "${dashboard_url}" | grep "Kubernetes Dashboard" && \
1717
echo "Dashboard URLs are all responding" && exit 0
1818
let time=$time+15
1919
sleep 15

.jenkins-scripts/test-kubeflow-pipeline.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,28 +10,28 @@
1010
def test_kubeflow_op():
1111
op = kfp.dsl.ContainerOp(
1212
name='kubeflow-test-op',
13-
image='nvcr.io/nvidia/rapidsai/rapidsai:cuda10.1-runtime-centos7',
14-
command=["/bin/bash", "-cx"],
13+
image='busybox',
14+
command=["/bin/sh", "-cx"],
1515
arguments=["echo 'Container started!'"],
1616
file_outputs={}
1717
)
1818
kfp.compiler.Compiler().compile(test_kubeflow_op, 'kubeflow-test.yml')
1919

2020
# Connect to Kubeflow and create job, this simply runs RAPIDS and prints out a message
2121
while True:
22+
time.sleep(30) # Occasionally Kubeflow fails to respond even when all deployments are up. I don't know why, sometimes it is a 403, sometimes a 500, and sometimes it works. So we will just wait and re-try until the test/script times out.
2223
try:
2324
print("Submitting Kubeflow pipeline")
2425
run_result = kfp.Client(host=None).create_run_from_pipeline_package('kubeflow-test.yml', arguments={})
2526
break # This means it worked!
2627
except kfp_server_api.rest.ApiException as e:
2728
print("Hit an error, waiting and trying again: {}".format(e))
28-
time.sleep(30) # Occassionally Kubeflow fails to respond even when all deployments are up. I don't know why, sometimes it is a 403, sometimes a 500, and sometimes it works. So we will just wait and re-try until the test/script times out.
2929

30-
for i in range(70): # The test .sh times out after 600 seconds. So we run a little longer than that. This accounts mostly for NGC download time.
30+
for i in range(70): # The test eventually times out. So we run a little longer than that. This accounts mostly for NGC download time.
3131
print("Polling for pipeline status: {} - {}".format(run_result, i))
32-
status = kfp.Client(host=None).get_run(run_result.run_id).run.status
33-
if status == "Succeeded":
32+
run = kfp.Client(host=None).get_run(run_result.run_id).run
33+
if run.status == "Succeeded":
3434
print("SUCCESS: Kubeflow launched a container successfully")
3535
break
36-
print("Got {}, waiting some more...".format(status))
36+
print("Got {}, waiting some more... {}".format(run.status, run))
3737
time.sleep(10) # Wait 10 seconds and poll

.jenkins-scripts/test-kubeflow-pipeline.sh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,17 @@ source .jenkins-scripts/jenkins-common.sh
55
# Ensure working directory is root
66
cd "${ROOT_DIR}"
77

8+
export KUBEFLOW_DEPLOYMENTS="profiles-deployment centraldashboard ml-pipeline minio mysql metadata-db" # TODO: We will only poll for these, because other services currently fail to come up in Jenkins due to low disk space
9+
810
# Install the optional kfp package
911
sudo pip3 install kfp
1012

11-
# Wait for the kubeflow pipeline service to be ready, and then wait another 30 seconds for other random Kubeflow initialization
12-
# Don't wait for katib or a few other things that take longer to initialize
13-
export KUBEFLOW_DEPLOYMENTS="profiles-deployment centraldashboard ml-pipeline minio mysql metadata-db"
13+
# Wait for the kubeflow pipeline service to be ready
1414
./scripts/k8s_deploy_kubeflow.sh -w
1515

1616
kubectl get pods -n kubeflow # Do this for debug purposes
1717

1818
# Run the Kubeflow pipeline test, this will build a pipeline that launches an NGC container
1919
# For some reason the initial pipeline creation hangs sometime (and doesn't timeout or error out or provide any logging) so we run this twice until success or timeout
20-
timeout 600 python3 .jenkins-scripts/test-kubeflow-pipeline.py || timeout 600 python3 .jenkins-scripts/test-kubeflow-pipeline.py
20+
python3 .jenkins-scripts/test-kubeflow-pipeline.py
21+
kubectl get pods -n kubeflow # Do this for debug purposes

.jenkins-scripts/test-kubeflow.sh

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,37 @@
11
#!/bin/bash
22
set -x
33
source .jenkins-scripts/jenkins-common.sh
4+
cp /var/lib/jenkins/kustomize ${ROOT_DIR}/config # kustomize is saved off on the Jenkins server because the kustomize servers often rate-limit causing failed downloads
45

56
# Ensure working directory is root
67
cd "${ROOT_DIR}"
78

89
export KF_DIR=${ROOT_DIR}/config/kubeflow
910
export KFCTL=${ROOT_DIR}/config/kfctl
11+
export KUBEFLOW_DEPLOYMENTS="profiles-deployment centraldashboard ml-pipeline minio mysql metadata-db" # TODO: We will only poll for these, because other services currently fail to come up in Jenkins due to low disk space
12+
13+
# Deploy Kubeflow with Dex
14+
source ./scripts/k8s_deploy_kubeflow.sh -x
15+
16+
# The deployment script exports the http endpoints, verify it returns a 200
17+
# It typically takes ~5 minutes for all pods and services to start, so we poll
18+
timeout=600
19+
time=0
20+
while [ ${time} -lt ${timeout} ]; do
21+
curl -s --raw -L "${kf_url}" && \
22+
echo "Kubeflow homepage is up" && break
23+
let time=$time+15
24+
sleep 15
25+
done
26+
curl -s --raw -L "${kf_url}" || exit 1 # If Kubeflow didn't come up in 600 seconds, fail
27+
28+
# Wait for it to come up and view pods
29+
./scripts/k8s_deploy_kubeflow.sh -w
30+
kubectl get pods -n kubeflow
31+
32+
# Delete Kubeflow and view namespaces
33+
./scripts/k8s_deploy_kubeflow.sh -d
34+
kubectl get ns
1035

1136
# Deploy Kubeflow
1237
source ./scripts/k8s_deploy_kubeflow.sh
@@ -17,11 +42,8 @@ timeout=600
1742
time=0
1843
while [ ${time} -lt ${timeout} ]; do
1944
curl -s --raw -L "${kf_url}" && \
20-
echo "Kubeflow is homepage is up " && exit 0
45+
echo "Kubeflow homepage is up" && exit 0 # Rather than poll here, we wait for the later kubeflow-pipeline test to poll and proceed to save testing time; kubeflow will continue coming up as monitoring and k8s dashboard tests run
2146
let time=$time+15
2247
sleep 15
2348
done
24-
25-
# Kubeflow deployment failure
26-
echo "Kubeflow did not come up in time"
27-
exit 1
49+
curl -s --raw -L "${kf_url}" || exit 1 # If Kubeflow didn't come up in 600 seconds, fail

.jenkins-scripts/test-monitoring.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@ while [ ${time} -lt ${timeout} ]; do
1616
curl -s --raw -L "${prometheus_url}" && \
1717
curl -s --raw -L "${grafana_url}" && \
1818
curl -s --raw -L "${alertmanager_url}" && \
19-
echo "Monitoring URLs are all responding" && exit 0
19+
echo "Monitoring URLs are all responding" && break
2020
let time=$time+15
2121
sleep 15
2222
done
2323

24+
# Delete Monitoring
25+
source ./scripts/k8s_deploy_monitoring.sh -d && exit 0
26+
2427
# Monitoring deployment failure
2528
echo "Monitoring did not come up in time"
2629
exit 1

config.example/group_vars/k8s-cluster.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,12 @@ deepops_gpu_operator_enabled: false
2626
# Addons deployed in kube-system namespaces are handled.
2727
#podsecuritypolicy_enabled: false
2828

29-
# kubespray v2.12.2 deploys dashboard 1.10.1 which is no longer supported in k8s 1.16
30-
# https://github.com/kubernetes/dashboard/issues/4401#issuecomment-540476478
29+
# Pin the version of kubespray dashboard https://github.com/kubernetes/dashboard/releases/tag/v2.0.3
3130
dashboard_enabled: true
32-
dashboard_image_tag: "v2.0.0-rc5"
31+
dashboard_image_tag: "v2.0.3"
3332
dashboard_image_repo: "kubernetesui/dashboard"
33+
dashboard_metrics_scraper_tag: "v1.0.4"
34+
dashboard_metrics_scraper_repo: "kubernetesui/metrics-scraper"
3435

3536
# kubespray v2.13.1 deploys helm v3.1.2
3637
helm_version: "v3.1.2"

config.example/group_vars/slurm-cluster.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,4 +131,4 @@ allow_user_set_gpu_clocks: no
131131
################################################################################
132132
slurm_install_enroot: true
133133
slurm_install_pyxis: true
134-
slurm_pyxis_version: 0.8.0
134+
slurm_pyxis_version: 0.8.1

jenkins/Jenkinsfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ pipeline {
3636
bash -x ./.jenkins-scripts/test-cluster-up.sh
3737
'''
3838

39+
echo "Get K8S Cluster Status"
40+
sh '''
41+
bash -x ./.jenkins-scripts/get-k8s-debug.sh
42+
'''
43+
3944
echo "Verify we can run a GPU job"
4045
sh '''
4146
timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh

jenkins/Jenkinsfile-multi-nightly

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ pipeline {
3636
bash -x ./.jenkins-scripts/test-cluster-up.sh
3737
'''
3838

39+
echo "Get K8S Cluster Status"
40+
sh '''
41+
bash -x ./.jenkins-scripts/get-k8s-debug.sh
42+
'''
43+
3944
echo "Verify we can run a GPU job"
4045
sh '''
4146
timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
@@ -53,7 +58,7 @@ pipeline {
5358

5459
echo "Test Kubeflow installation"
5560
sh '''
56-
timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
61+
timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
5762
'''
5863

5964
echo "Test Monitoring installation"
@@ -124,6 +129,11 @@ pipeline {
124129
bash -x ./.jenkins-scripts/test-cluster-up.sh
125130
'''
126131

132+
echo "Get K8S Cluster Status"
133+
sh '''
134+
bash -x ./.jenkins-scripts/get-k8s-debug.sh
135+
'''
136+
127137
echo "Verify we can run a GPU job"
128138
sh '''
129139
timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
@@ -141,7 +151,7 @@ pipeline {
141151

142152
echo "Test Kubeflow installation"
143153
sh '''
144-
timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
154+
timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
145155
'''
146156

147157
echo "Test Monitoring installation"

jenkins/Jenkinsfile-nightly

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ pipeline {
3636
bash -x ./.jenkins-scripts/test-cluster-up.sh
3737
'''
3838

39+
echo "Get K8S Cluster Status"
40+
sh '''
41+
bash -x ./.jenkins-scripts/get-k8s-debug.sh
42+
'''
43+
3944
echo "Verify we can run a GPU job"
4045
sh '''
4146
timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
@@ -53,7 +58,7 @@ pipeline {
5358

5459
echo "Test Kubeflow installation"
5560
sh '''
56-
timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
61+
timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
5762
'''
5863

5964
echo "Test Monitoring installation"
@@ -124,6 +129,11 @@ pipeline {
124129
bash -x ./.jenkins-scripts/test-cluster-up.sh
125130
'''
126131

132+
echo "Get K8S Cluster Status"
133+
sh '''
134+
bash -x ./.jenkins-scripts/get-k8s-debug.sh
135+
'''
136+
127137
echo "Verify we can run a GPU job"
128138
sh '''
129139
timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
@@ -141,7 +151,7 @@ pipeline {
141151

142152
echo "Test Kubeflow installation"
143153
sh '''
144-
timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
154+
timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
145155
'''
146156

147157
echo "Test Monitoring installation"

playbooks/k8s-cluster.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,4 +200,15 @@
200200
tags:
201201
- local
202202

203+
# Remove taint from kube-master nodes.
204+
# This keeps backwards compatibility and allows a few services (monitoring/etc.) to run properly.
205+
- hosts: kube-master
206+
gather_facts: false
207+
vars:
208+
ansible_become: no
209+
tasks:
210+
- name: kubeadm | Remove taint for master with node role
211+
command: "{{ artifacts_dir }}/kubectl --kubeconfig {{ artifacts_dir }}/admin.conf taint node {{ inventory_hostname }} node-role.kubernetes.io/master:NoSchedule-"
212+
delegate_to: localhost
213+
failed_when: false # Taint will not be present if kube-master also under kube-node
203214

playbooks/nvidia-gpu-operator.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
- nvidia-gpu-operator
1515

1616
# GPU operator
17-
- hosts: kube-master
17+
- hosts: kube-master[0]
1818
become: yes
1919
tasks:
2020
- name: Install helm chart for GPU operator

playbooks/nvidia-k8s-gpu-device-plugin.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
---
2-
- hosts: kube-master
2+
- hosts: kube-master[0]
33
become: true
44
tasks:
55
- name: install k8s GPU plugin

playbooks/slurm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
authorized_key:
1313
user: root
1414
state: present
15-
key: "{{ lookup('file', lookup('env','HOME') + '/.ssh/id_rsa.pub') }}"
15+
key: "{{ lookup('file', ansible_ssh_private_key_file | default(lookup('env','HOME') + '/.ssh/id_rsa') + '.pub') }}"
1616

1717
# Build slurm first on all nodes
1818
- hosts: slurm-cluster

roles/lmod/defaults/main.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
2+
lmod_rhel_epel_repo_baseurl: "https://download.fedoraproject.org/pub/epel/$releasever/$basearch/"
3+
lmod_rhel_epel_repo_gpgkey: "https://epel.mirror.constant.com//RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}"
4+
5+
# include some reasonable defaults for module paths
6+
sm_prefix: "/sw"
7+
sm_module_root: "{{ sm_prefix }}/modules"
8+
sm_module_path: "{{ sm_module_root }}/all"
9+
sm_software_path: "{{ sm_prefix }}/software"

0 commit comments

Comments
 (0)