---
# train.yaml
# SkyPilot task definition for training a QuakeFlow model (see the `run` section,
# which launches train.py through torchrun using the SKYPILOT_* variables).
# Task identity: display name, directory synced to the cluster, and node count.
name: 'quakeflow'
# The current directory is used as the working directory of the task.
workdir: '.'
# Single-node job; the run script still derives the node count at runtime.
num_nodes: 1
# Hardware request for each node of the task.
resources:
  # Pin the job to a single GCP zone.
  cloud: gcp
  region: us-west1
  zone: us-west1-b
  # instance_type: n2-highmem-16

  # Four P100 GPUs per node.
  accelerators: P100:4
  # cpus: 16+
  # Trailing "+" requests at least this many vCPUs.
  cpus: 64+

  # Boot disk size and performance tier.
  disk_size: 300
  disk_tier: high

  # Run on spot (preemptible) capacity.
  use_spot: true
  # spot_recovery: FAILOVER
  # image_id: docker:zhuwq0/quakeflow:latest
# Environment variables made available to the `setup` and `run` scripts.
envs:
  # NOTE(review): JOB and ROOT_PATH are not referenced by the setup/run scripts
  # in this file; presumably consumed elsewhere - confirm before removing.
  JOB: quakeflow
  # Lower bound for torchrun's --nproc_per_node (used when the node has fewer
  # GPUs than this value). Quoted so it is always passed as a string.
  NCPU: "1"
  ROOT_PATH: /data
  # Model passed to train.py via --model; also names the checkpoint
  # sub-directory and the W&B project in the run script.
  MODEL_NAME: phasenet_plus
  # SECRET - do not commit a real key here. Supply it at launch time, e.g.
  #   sky launch --env WANDB_API_KEY train.yaml
  # (reads the value from the local environment). The key that was previously
  # committed in this file must be treated as leaked and revoked in W&B.
  WANDB_API_KEY: null
# Remote path -> source mappings made available on the cluster.
file_mounts:
  # --- Disabled alternatives, kept for reference ---
  # /data:
  #   # source: s3://scedc-pds/
  #   # source: s3://ncedc-pds/
  #   source: gs://quakeflow_dataset/
  #   # source: gs://quakeflow_share/
  #   # source: gs://das_arcata/
  #   mode: MOUNT
  # /dataset:
  #   source: gs://quakeflow_dataset
  #   mode: MOUNT
  # /checkpoint:
  #   name: quakeflow_model # Name of the bucket
  #   store: gcs
  #   mode: MOUNT

  # Checkpoint bucket, mounted; the run script writes under /checkpoint/$MODEL_NAME.
  /checkpoint:
    source: gs://quakeflow_model
    mode: MOUNT

  # Training data is copied to local disk rather than mounted.
  /dataset/waveform_ps_h5:
    source: gs://quakeflow_dataset/NC/waveform_ps_h5
    mode: COPY
  /dataset/train.h5:
    source: gs://quakeflow_dataset/NC/waveform_ps_train.h5
    mode: COPY
  /dataset/test.h5:
    source: gs://quakeflow_dataset/NC/waveform_ps_test.h5
    mode: COPY

  # Local credentials uploaded to the same paths on the cluster.
  # NOTE(review): this copies a private SSH key onto a cloud VM - confirm it is
  # actually required by the job.
  ~/.ssh/id_rsa.pub: ~/.ssh/id_rsa.pub
  ~/.ssh/id_rsa: ~/.ssh/id_rsa
  ~/.config/rclone/rclone.conf: ~/.config/rclone/rclone.conf
# Provisioning script: installs the Python dependencies used for training.
setup: |
  echo "Begin setup."

  # Append the W&B key export to ~/.bashrc so later shells pick it up.
  echo export WANDB_API_KEY=$WANDB_API_KEY >> ~/.bashrc

  # sudo apt install rclone

  # Cloud filesystem access, seismology/projection libraries, then data/logging tools.
  pip install fsspec gcsfs
  pip install obspy pyproj
  pip install h5py tqdm wandb

  # PyTorch is pinned to 2.1.0 and pulled from the CUDA 12.1 wheel index.
  # pip install torch torchvision torchaudio
  pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121
# Job script: launches train.py through torchrun.
run: |
  # Node count = number of lines in SKYPILOT_NODE_IPS; its first line is used
  # as the torchrun master address.
  node_count=$(echo "$SKYPILOT_NODE_IPS" | wc -l)
  head_ip=$(echo "$SKYPILOT_NODE_IPS" | head -n1)

  # Processes per node: the GPU count when it exceeds NCPU, otherwise NCPU.
  if [[ ${SKYPILOT_NUM_GPUS_PER_NODE} -gt $NCPU ]]; then
    procs_per_node=${SKYPILOT_NUM_GPUS_PER_NODE}
  else
    procs_per_node=$NCPU
  fi

  # Only the rank-0 node lists the dataset directory.
  if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then
    ls -al /dataset
  fi

  torchrun \
    --nproc_per_node=${procs_per_node} \
    --node_rank=${SKYPILOT_NODE_RANK} \
    --nnodes=$node_count \
    --master_addr=$head_ip \
    --master_port=8008 \
    train.py \
    --model $MODEL_NAME \
    --batch-size=256 \
    --hdf5-file /dataset/train.h5 \
    --test-hdf5-file /dataset/test.h5 \
    --workers 12 \
    --stack-event \
    --flip-polarity \
    --drop-channel \
    --output /checkpoint/$MODEL_NAME \
    --wandb \
    --wandb-project $MODEL_NAME \
    --resume True