From 3a7fd2b46dd8b71622269c0549e860e481ffc39c Mon Sep 17 00:00:00 2001 From: Yiyu Ni Date: Thu, 16 May 2024 21:35:18 -0700 Subject: [PATCH 1/5] update tutorials for aws batch Signed-off-by: Yiyu Ni --- tutorials/cloud/README.md | 29 ++ tutorials/cloud/aws-batch.md | 63 ---- tutorials/cloud/aws-ec2.md | 100 ------- tutorials/cloud/checklist.md | 75 ----- tutorials/cloud/compute_environment.yaml | 10 +- tutorials/cloud/config.yaml | 40 +++ tutorials/cloud/job.yaml | 6 +- tutorials/cloud/job_cc.yaml | 8 +- tutorials/cloud/job_definition.yaml | 8 +- tutorials/cloud/job_queue.yaml | 6 +- tutorials/cloud/job_stack.yaml | 8 +- tutorials/cloud/noisepy_aws_batch.ipynb | 348 +++++++++++++++++++++++ 12 files changed, 440 insertions(+), 261 deletions(-) create mode 100644 tutorials/cloud/README.md delete mode 100644 tutorials/cloud/aws-batch.md delete mode 100644 tutorials/cloud/aws-ec2.md delete mode 100644 tutorials/cloud/checklist.md create mode 100644 tutorials/cloud/config.yaml create mode 100644 tutorials/cloud/noisepy_aws_batch.ipynb diff --git a/tutorials/cloud/README.md b/tutorials/cloud/README.md new file mode 100644 index 00000000..16f391ca --- /dev/null +++ b/tutorials/cloud/README.md @@ -0,0 +1,29 @@ +# Running NoisePy with AWS + +## EC2 and Jupyter Lab +Please refer to [SCOPED HPS Book](https://seisscoped.org/HPS-book/chapters/cloud/AWS_101.html) for full detailed instruction on launching an AWS EC2 instance and/or running the notebooks within a containerized environment. + +## Submit Batch Job +For large job load, please refer to the [notebook tutorial](./noisepy_aws_batch.ipynb) for more instruction. + +## Command Line Interface +You may create or edit the [config.yml](../config.yml) file with appropriate parameters. The cross-correlation function is written to the `ccf_path`. 

```bash
noisepy cross_correlate --format numpy --raw_data_path s3://scedc-pds/continuous_waveforms/ \
--xml_path s3://scedc-pds/FDSNstationXML/CI/ \
--ccf_path s3:/// \
--stations=SBC,RIO,DEV \
--start=2022-02-02 \
--end=2022-02-03
```

This toy problem gathers all the cross-correlations calculated and stacks them into the NumPy format on the S3 bucket, specified by the `stack_path`.

```bash
noisepy stack \
--format numpy \
--ccf_path s3:/// \
--stack_path s3:/// \
```

diff --git a/tutorials/cloud/aws-batch.md b/tutorials/cloud/aws-batch.md deleted file mode 100644 index 05501f1e..00000000 --- a/tutorials/cloud/aws-batch.md +++ /dev/null @@ -1,63 +0,0 @@ -# Running NoisePy with AWS Batch Service (Advanced) - -## Pre-requisites -* You are not required to run this on a AWS EC2 instance, but you would need [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html) as well as the [jq tool](https://jqlang.github.io/jq/download/) installed. - -* AWS Batch requires a special IAM role to be created for running the jobs. This can be done from the IAM console. See [instructions](./checklist.md) to create the role. - -* Be sure to go to the S3 bucket where you'll be writing the results of the jobs and [modify the permissions](./checklist.md) accordingly. - -## Create a Compute Environment
You'll need two pieces of information to create the compute environment. The list of subnets in your VPC and the default security group ID. 
You can use the following commands to retrieve them: - -``` -aws ec2 describe-subnets | jq ".Subnets[] | .SubnetId" -``` -``` -aws ec2 describe-security-groups --filters "Name=group-name,Values=default" | jq ".SecurityGroups[0].GroupId" -``` - -Use this values to update the missing fields in `compute_environment.yaml` and the run: - -``` -aws batch create-compute-environment --no-cli-pager --cli-input-yaml file://compute_environment.yaml -``` - -Make a note of the compute environment ARN to use in the next step. - -### Create a Job queue -Add the compute environment and a name to `job_queue.yaml` and then run: - -``` -aws batch create-job-queue --no-cli-pager --cli-input-yaml file://job_queue.yaml -``` - -### Create a Job Definition -Update the `jobRoleArn` and `executionRoleArn` fields in the `job_definition.yaml` file with the ARN of the role created in the first step. Add a name for the `jobDefinition`. Finally, run: - -``` -aws batch register-job-definition --no-cli-pager --cli-input-yaml file://job_definition.yaml -``` - -### Submit a Cross-Correlation job -Update `job_cc.yaml` with the names of your `jobQueue` and `jobDefinition` created in the last steps. Then update the S3 bucket paths -to the locations you want to use for the output and your `config.yaml` file. - -``` -aws batch submit-job --no-cli-pager --cli-input-yaml file://job_cc.yaml --job-name "" -``` - -### Submit a Stacking job -Update `job_stack.yaml` with the names of your `jobQueue` and `jobDefinition` created in the last steps. Then update the S3 bucket paths -to the locations you want to use for your input CCFs (e.g. the output of the previous CC run), and the stack output. By default, NoisePy will look for a config -file in the `--ccf_path` location to use the same configuration for stacking that was used for cross-correlation. 
- -``` -aws batch submit-job --no-cli-pager --cli-input-yaml file://job_stack.yaml --job-name "" -``` - -### Multi-node (array) jobs -See comment above `arrayProperties` in `job_cc.yaml` and `job_stack.yaml` for instructions on how to process in parallel across multiple nodes. - -## Plotting Results -See chapter TBD to read and plot results. diff --git a/tutorials/cloud/aws-ec2.md b/tutorials/cloud/aws-ec2.md deleted file mode 100644 index b309dc9a..00000000 --- a/tutorials/cloud/aws-ec2.md +++ /dev/null @@ -1,100 +0,0 @@ -# Running NoisePy with AWS EC2 Service - -## Pre-requisites -See our [checklist](./checklist.md) - -## Setup the Virtual Machine -### Create an EC2 Instance -- Log into your AWS account and go into the EC2 Dashboard -- Click on Launch Instance -- Application and OS images: - - Select the AWS Linux -- Instance type: - - t2.micro (free) or bigger machines (RAM recommended TBD) -- Key pair for SSH: - - Create a new Key pair (RSA) -- Network settings: - - You can use most defaults but we recomment `Allow SSH traffic from` to `My IP` - - In order to access Jupyter notebook on the instance, click `Allow HTTPS traffic from the internet`. -- Advanced details: - - If applicable, select `IAM instance profile` to the appropriate role for EC2 service. See [IAM Role](./checklist.md) for reference. - -More information about getting on the cloud in the [SCOPED HPS Book](https://seisscoped.org/HPS/softhardware/AWS_101.html). - -### SSH Into Your Instance -Make your private key file only readable by you (assuming it's named/downloaded to `~/Downloads/ec2.pem`). Go to your instance's summary page and copy the `Public IPv4 DNS` in the format of `XXXXX.us-west-2.compute.amazonaws.com`. -``` -cd ~/Downloads -chmod 400 ec2.pem -ssh -i ec2.pem ec2-user@ -``` - -### Install NoisePy -This tutorial focuses on small, toy problems to be ran on notebooks or simple CLI. We include jupyter notebook instructions to explore the data and results. 
Options are available to install NoisePy for different purposes. - -You may save your environment using AWS AMI. Then subsequent launcing of instances can re-use your environment. - -```bash -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -chmod +x Miniconda3-latest-Linux-x86_64.sh -./Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda3 -./miniconda3/bin/conda init bash -bash -``` - -#### Through Pip -```bash -conda create -y -n noisepy python==3.10 -conda activate noisepy -pip install ipykernel jupyter noisepy-seis -``` - -#### Through Git -Download the entire development version of NoisePy repository from GitHub. The directory includes source codes and all tutorials. -```bash -sudo yum install -y git -git clone https://github.com/noisepy/NoisePy -cd NoisePy -pip install . -``` - -#### Through Docker -```bash -sudo yum install -y git docker -sudo systemctl start docker -sudo docker pull ghcr.io/noisepy/noisepy:latest -``` - -```bash -sudo docker run -v ~/tmp:/tmp cross_correlate --path /tmp -``` - -## Running Cross-correlation as a Toy Problem -Below we use stations from SCEDC public data archive on the AWS S3 to run a tiny cross-correlation workflow using NoisePy. The continuous waveform is publicized at `s3://scedc-pds/continuous_waveforms/` with associated StationXML file at `s3://scedc-pds/FDSNstationXML/`. - -### Exploration Using Jupyter Notebooks -We recommend starting off with a notebook to explore simple jobs and the desirable configuration (e.g., noise pre-processing). Refer to the [SCOPED HPS Book](https://seisscoped.org/HPS/softhardware/AWS_101.html) to open a Jupyter notebook. - -### Exploration Using CLI -You may create or edit the [config.yml](../config.yml) file with appropriate parameters. The cross-correlation function is written to the `ccf_path`. 
- -```bash -noisepy cross_correlate --format numpy --raw_data_path s3://scedc-pds/continuous_waveforms/ \ ---xml_path s3://scedc-pds/FDSNstationXML/CI/ \ ---ccf_path s3:/// \ ---stations=SBC,RIO,DEV \ ---start=2022-02-02 \ ---end=2022-02-03 -``` - -This toy problem gathers the all the cross-correlations calculated and stack them into the NumPy format on the S3 bucket, specificed by the `stack_path`. - -```bash -noisepy stack \ ---format numpy \ ---ccf_path s3:/// \ ---stack_path s3:/// \ -``` - -## Plotting Results -See chapter TBD to read and plot results. diff --git a/tutorials/cloud/checklist.md b/tutorials/cloud/checklist.md deleted file mode 100644 index 2342704f..00000000 --- a/tutorials/cloud/checklist.md +++ /dev/null @@ -1,75 +0,0 @@ -# General Checklist Running NoisePy on the AWS -This will be a frequently referred chapter for running any (AWS) Cloud-native codes. Check through each of the items below and make sure you have them configured right. - -## AWS Account -* ``: A 12-digit number uniquely identify your account. - -Make sure you have an account on AWS idenfitied by a 12-digit number. AWS requires particular credentials to connect. - -## IAM Role and Permission -* ``: A virtual identity that has specific permissions. The role ARN is in the format of `arn:aws:iam:::role/`. - -AWS batch requires an IAM role to be created for running the jobs. This can be done from the IAM console on the AWS web console. Depending on the type of service to use, separate roles may be created. 
- -* **EC2 service** generally uses the following configuration: - - Trusted Entity Type: AWS Service - - Use Case: EC2 - - Permission Policies, search and add: - - AmazonEC2FullAccess - - AmazonS3FullAccess - -* **Batch service** generally uses the following configuration: - - Trusted Entity Type: AWS Service - - Use Case: Elastic Container Service - - Elastic Container Service Task - - Permission Policies, search and add: - - AmazonECSTaskExecutionRolePolicy - - AmazonS3FullAccess - - Once the role is created, one more permission is needed: - - Go to: Permissions tab --> Add Permissions --> Create inline policy - - Search for "batch" - - Click on **Batch** - - Select Read / Describe Jobs - - Click Next - - Add a policy name, e.g. "Describe_Batch_Jobs" - - Click Create Policy - -## S3 Object Storage and Policy -* ``: A dedicated container on S3 with specific permissions. - -NoisePy uses S3 Cloudstore to store the cross correlations and stacked data. For this step, it is important that your **user/role** and the **bucket** have the appropriate permissions for users to read/write into the bucket. - -The following statement in the JSON format is called **policy**. It explicitly defined which operation is allowed/denied by which user/role. In the case below, all operation are allowed (specified by the `s3:*` argument in the `Action` field) by services under your account with attached role (speicified by the `"arn:aws:iam:::role/"` argument) on any file/resources in the bucket (speified by `"arn:aws:s3:::/*"`). -```json -{ - "Version": "2012-10-17", - "Id": "Policy1674832359797", - "Statement": [ - { - "Sid": "Stmt1674832357905", - "Effect": "Allow", - "Principal": { - "AWS": "arn:aws:iam:::role/" - }, - "Action": "s3:*", - "Resource": "arn:aws:s3:::/*" - } - ] -} -``` -## AWS Commmand Line Interface (CLI) -In order to check whether the user can read/write in the bucket, we recommend testing from local. 
The AWS CLI is required (install [here](https://aws.amazon.com/cli/)). This tool is already installed if you are on a EC2 instance running Amazon Linux. - -```bash -# list the bucket -aws s3 ls s3:// - -# add a temporary file -aws s3 cp temp s3:// - -# remove a temporary file -aws s3 rm s3:///temp -``` - -If this step works, and if your role and user account are attached to the bucket policy, the rest of the AWS NoisePy tutorial should work. diff --git a/tutorials/cloud/compute_environment.yaml b/tutorials/cloud/compute_environment.yaml index 5e4c2a00..aa5355ad 100644 --- a/tutorials/cloud/compute_environment.yaml +++ b/tutorials/cloud/compute_environment.yaml @@ -1,10 +1,10 @@ -computeEnvironmentName: '' # [REQUIRED] The name for your compute environment. +computeEnvironmentName: '' # [REQUIRED] Specify a name for your compute environment. type: MANAGED state: ENABLED computeResources: # Details about the compute resources managed by the compute environment. type: FARGATE maxvCpus: 256 # [REQUIRED] The maximum number of Amazon EC2 vCPUs that a compute environment can reach. - subnets: # [REQUIRED] The VPC subnets where the compute resources are launched. - - '' - securityGroupIds: # [REQUIRED] The Amazon EC2 security groups that are associated with instances launched in the compute environment. - - '' + subnets: + - '' # [REQUIRED] The VPC subnets where the compute resources are launched. + securityGroupIds: + - '' # [REQUIRED] The Amazon EC2 security groups that are associated with instances launched in the compute environment. 
diff --git a/tutorials/cloud/config.yaml b/tutorials/cloud/config.yaml new file mode 100644 index 00000000..c65c2785 --- /dev/null +++ b/tutorials/cloud/config.yaml @@ -0,0 +1,40 @@ +acorr_only: false +cc_len: 1800 +cc_method: xcorr +channels: [BHE, BHN, BHZ] +client_url_key: SCEDC +correction: false +correction_csv: null +down_list: false +start_date: '2004-01-01T00:00:00Z' +end_date: '2004-01-03T00:00:00Z' +freq_norm: rma +freqmax: 2.0 +freqmin: 0.05 +inc_hours: 24 +keep_substack: false +lamax: 36.0 +lamin: 31.0 +lomax: -115.0 +lomin: -122.0 +max_over_std: 10 +maxlag: 200 +ncomp: 3 +net_list: [CI] +respdir: null +rm_resp: inv +rm_resp_out: VEL +rotation: true +samp_freq: 20.0 +single_freq: true +smooth_N: 10 +smoothspect_N: 10 +stack_method: linear +stations: ["*"] +stationxml: false +step: 450.0 +storage_options: {} +substack: false +substack_len: 1800 +time_norm: no +xcorr_only: true diff --git a/tutorials/cloud/job.yaml b/tutorials/cloud/job.yaml index 423a60fa..d7441ec6 100644 --- a/tutorials/cloud/job.yaml +++ b/tutorials/cloud/job.yaml @@ -1,5 +1,5 @@ -jobName: '' -jobQueue: '' +jobName: '' # [REQUIRED] Specify a name for the job. +jobQueue: '' # [REQUIRED] The job queue into which the job is submitted. jobDefinition: '' # [REQUIRED] The job definition used by this job. containerOverrides: # An object with various properties that override the defaults for the job definition that specify the name of a container in the specified job definition and the overrides it should receive. command: # The command to send to the container that overrides the default command from the Docker image or the job definition. 
@@ -7,7 +7,7 @@ containerOverrides: # An object with various properties that override the defaul - --format=numpy - --raw_data_path=s3://scedc-pds/continuous_waveforms/ - --xml_path=s3://scedc-pds/FDSNstationXML/CI/ - - --ccf_path=s3:/// + - --ccf_path=s3://// - --net_list=CI - --stations=* - --start=2022-02-02 diff --git a/tutorials/cloud/job_cc.yaml b/tutorials/cloud/job_cc.yaml index a05db14b..f3d61243 100644 --- a/tutorials/cloud/job_cc.yaml +++ b/tutorials/cloud/job_cc.yaml @@ -1,5 +1,5 @@ -jobName: 'noisepy-cross-correlate' -jobQueue: '' +jobName: '' # [REQUIRED] Specify a name for the cross-correlation job. +jobQueue: '' # [REQUIRED] The job queue into which the job is submitted. jobDefinition: '' # [REQUIRED] The job definition used by this job. # Uncomment to run a job across multiple nodes. The days in the time range will be split across the nodes. # arrayProperties: @@ -12,7 +12,7 @@ containerOverrides: # An object with various properties that override the defaul - cross_correlate - --raw_data_path=s3://scedc-pds/continuous_waveforms/ - --xml_path=s3://scedc-pds/FDSNstationXML/CI/ - - --ccf_path=s3:/// - - --config=s3:////config.yaml + - --ccf_path=s3://// + - --config=s3:////config.yaml timeout: attemptDurationSeconds: 36000 # 10 hrs diff --git a/tutorials/cloud/job_definition.yaml b/tutorials/cloud/job_definition.yaml index 14fe0fac..cf5728ff 100644 --- a/tutorials/cloud/job_definition.yaml +++ b/tutorials/cloud/job_definition.yaml @@ -1,4 +1,4 @@ -jobDefinitionName: '' # [REQUIRED] The name of the job definition to register. +jobDefinitionName: '' # [REQUIRED] Specify a name for the job definition to register. type: container platformCapabilities: - FARGATE @@ -6,8 +6,8 @@ containerProperties: image: 'ghcr.io/noisepy/noisepy' command: - '--help' - jobRoleArn: '' - executionRoleArn: '' + jobRoleArn: '' # [REQUIRED] The Amazon Resource Name (ARN) of the IAM role that the container can assume for AWS permissions. 
+ executionRoleArn: '' # [REQUIRED] The Amazon Resource Name (ARN) of the IAM role that the Amazon ECS container agent and the Docker daemon can assume. resourceRequirements: # The type and amount of resources to assign to a container. - value: '16' type: VCPU @@ -16,7 +16,7 @@ containerProperties: networkConfiguration: # The network configuration for jobs that are running on Fargate resources. assignPublicIp: ENABLED # Indicates whether the job has a public IP address. Valid values are: ENABLED, DISABLED. ephemeralStorage: # The amount of ephemeral storage to allocate for the task. - sizeInGiB: 21 # [REQUIRED] The total amount, in GiB, of ephemeral storage to set for the task. + sizeInGiB: 21 # The total amount, in GiB, of ephemeral storage to set for the task. retryStrategy: # The retry strategy to use for failed jobs that are submitted with this job definition. attempts: 1 # The number of times to move a job to the RUNNABLE status. propagateTags: true # Specifies whether to propagate the tags from the job or job definition to the corresponding Amazon ECS task. diff --git a/tutorials/cloud/job_queue.yaml b/tutorials/cloud/job_queue.yaml index 822c9f73..4591ac27 100644 --- a/tutorials/cloud/job_queue.yaml +++ b/tutorials/cloud/job_queue.yaml @@ -1,6 +1,6 @@ -jobQueueName: '' # [REQUIRED] The name of the job queue. +jobQueueName: '' # [REQUIRED] Specify a name for the job queue. state: ENABLED priority: 0 -computeEnvironmentOrder: # [REQUIRED] The set of compute environments mapped to a job queue and their order relative to each other. -- order: 0 # [REQUIRED] The order of the compute environment. +computeEnvironmentOrder: # The set of compute environments mapped to a job queue and their order relative to each other. +- order: 0 # The order of the compute environment. computeEnvironment: '' # [REQUIRED] The Amazon Resource Name (ARN) of the compute environment. 
diff --git a/tutorials/cloud/job_stack.yaml b/tutorials/cloud/job_stack.yaml index 458e86fc..6b4d5ec3 100644 --- a/tutorials/cloud/job_stack.yaml +++ b/tutorials/cloud/job_stack.yaml @@ -1,5 +1,5 @@ -jobName: 'noisepy-stack' -jobQueue: '' +jobName: '' # [REQUIRED] Specify a name for the stacking job. +jobQueue: '' # [REQUIRED] The job queue into which the job is submitted. jobDefinition: '' # [REQUIRED] The job definition used by this job. # Uncomment to run a job across multiple nodes. The station pairs to be stacked will be split across the nodes. # arrayProperties: @@ -10,7 +10,7 @@ containerOverrides: # An object with various properties that override the defaul type: MEMORY command: # The command to send to the container that overrides the default command from the Docker image or the job definition. - stack - - --ccf_path=s3:/// - - --stack_path=s3:/// + - --ccf_path=s3://// + - --stack_path=s3://// timeout: attemptDurationSeconds: 7200 # 2 hrs diff --git a/tutorials/cloud/noisepy_aws_batch.ipynb b/tutorials/cloud/noisepy_aws_batch.ipynb new file mode 100644 index 00000000..70d6003f --- /dev/null +++ b/tutorials/cloud/noisepy_aws_batch.ipynb @@ -0,0 +1,348 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f4b0af4c", + "metadata": {}, + "source": [ + "# NoisePy tutorial: AWS Batch\n", + "\n", + "Here's a tutorial on using Amazon EC2 Batch with Fargate Spot and containers to perform a job that involves writing to and reading from AWS S3." + ] + }, + { + "cell_type": "markdown", + "id": "8c0b78dd", + "metadata": {}, + "source": [ + "## 1. Checklist and prerequisites\n", + "\n", + "### 1.1 Tools\n", + "You are not required to run this on a AWS EC2 instance, but two tools are required for this tutorail: AWS Command Line Tool (CLI) and JQ. Note that the code cell below only works for x86_64 CentOS where you have sudo permission. 
You can find installation instructions for other OS below.\n", + "\n", + "* AWS CLI: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html\n", + "* jq: https://jqlang.github.io/jq/download/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1287aed4", + "metadata": {}, + "outputs": [], + "source": [ + "# Install AWS CLI (Command line interface)\n", + "# This tool may already be installed if you are on a EC2 instance running Amazon Linux\n", + "\n", + "! curl \"https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip\" -o \"awscliv2.zip\"\n", + "! unzip awscliv2.zip\n", + "! sudo ./aws/install" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bce934d", + "metadata": {}, + "outputs": [], + "source": [ + "# You may check the correct installation of CLI with the following command, \n", + "# which lists the files in SCEDC public bucket.\n", + "\n", + "! aws s3 ls s3://scedc-pds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "803cd99b-bad3-4003-9f60-1e04fa6dce44", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Install jq\n", + "\n", + "! sudo yum install -y jq" + ] + }, + { + "cell_type": "markdown", + "id": "139d9595", + "metadata": {}, + "source": [ + "### 1.2 AWS Account\n", + "\n", + "The account ID is a 12-digit number uniquely identify your account. You can find it on your AWS web console.\n", + "\n", + "⚠️ Save the workshop `` here: `REPLACE_ME`" + ] + }, + { + "cell_type": "markdown", + "id": "10479daa", + "metadata": {}, + "source": [ + "### 1.3 Role\n", + "\n", + "AWS role is a virtual identity that has specific permissions where its ID (called `ARN`) is in the format of `arn:aws:iam:::role/`. AWS batch requires a role to be created for running the jobs. This can be done from the IAM panel on the AWS web console. Depending on the type of service to use, separate roles may be created. 
A specific role is required for **AWS Batch Service**.\n", + "- Trusted Entity Type: AWS Service\n", + "- Use Case: Elastic Container Service\n", + " - Elastic Container Service Task\n", + "- Permission Policies, search and add:\n", + " - AmazonECSTaskExecutionRolePolicy\n", + " - AmazonS3FullAccess\n", + "\n", + "Once the role is created, one more permission is needed:\n", + "- Go to: Permissions tab --> Add Permissions --> Create inline policy\n", + "- Search for \"batch\"\n", + "- Click on **Batch**\n", + "- Select Read / Describe Jobs\n", + "- Click Next\n", + "- Add a policy name, e.g. \"Describe_Batch_Jobs\"\n", + "- Click Create Policy\n", + "\n", + "⚠️ Workshop participants please use `arn:aws:iam:::role/NoisePyBatchRole`" + ] + }, + { + "cell_type": "markdown", + "id": "a9ba5c9f", + "metadata": {}, + "source": [ + "### 1.4 S3 Storage\n", + "\n", + "NoisePy uses S3 cloud store to store the cross correlations and stacked data. For this step, it is important that your **role** and the **bucket** have the appropriate permissions for users to read/write into the bucket.\n", + "\n", + "The following statement in the JSON format is called a **policy**. It explicitly defined which operation is allowed/denied by which user/role. 
The following bucket policy defines that \n", + "* all operations (`\"s3:*\"`) are allowed by your account with attached role (`\"arn:aws:iam:::role/\"`) on any file in the bucket (`\"arn:aws:s3:::/*\"`).\n", + "* anyone is allowed to read the data within the bucket (`\"s3:GetObject\"`,`\"s3:GetObjectVersion\"`)\n", + "* anyone is allowed to list the file within the bucket (`\"s3:ListBucket\"`)\n", + "\n", + "```json\n", + "{\n", + " \"Version\": \"2012-10-17\",\n", + " \"Id\": \"Policy1674832359797\",\n", + " \"Statement\": [\n", + " {\n", + " \"Sid\": \"Stmt1674832357905\",\n", + " \"Effect\": \"Allow\",\n", + " \"Principal\": {\n", + " \"AWS\": \"arn:aws:iam:::role/\"\n", + " },\n", + " \"Action\": \"s3:*\",\n", + " \"Resource\": \"arn:aws:s3:::/*\"\n", + " },\n", + " {\n", + "\t\t\t\"Effect\": \"Allow\",\n", + "\t\t\t\"Principal\": {\n", + "\t\t\t\t\"AWS\": \"*\"\n", + "\t\t\t},\n", + "\t\t\t\"Action\": [\n", + "\t\t\t\t\"s3:GetObject\",\n", + "\t\t\t\t\"s3:GetObjectVersion\"\n", + "\t\t\t],\n", + "\t\t\t\"Resource\": \"arn:aws:s3:::/*\"\n", + "\t\t},\n", + "\t\t{\n", + "\t\t\t\"Effect\": \"Allow\",\n", + "\t\t\t\"Principal\": {\n", + "\t\t\t\t\"AWS\": \"*\"\n", + "\t\t\t},\n", + "\t\t\t\"Action\": \"s3:ListBucket\",\n", + "\t\t\t\"Resource\": \"arn:aws:s3:::\"\n", + "\t\t}\n", + " ]\n", + "}\n", + "```\n", + "\n", + "⚠️ Save your `` name here: `REPLACE_ME`" + ] + }, + { + "cell_type": "markdown", + "id": "6a0745af", + "metadata": {}, + "source": [ + "## 2. Setup Batch Jobs\n", + "\n", + "### 2.1 Compute Environment\n", + "You'll need two pieces of information to create the compute environment. The list of subnets in your VPC and the default security group ID. You can use the following commands to retrieve them. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbd41df3", + "metadata": {}, + "outputs": [], + "source": [ + "! 
aws ec2 describe-subnets | jq \".Subnets[] | .SubnetId\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab280446", + "metadata": {}, + "outputs": [], + "source": [ + "! aws ec2 describe-security-groups --filters \"Name=group-name,Values=default\" | jq \".SecurityGroups[0].GroupId\"" + ] + }, + { + "cell_type": "markdown", + "id": "6e126e65", + "metadata": {}, + "source": [ + "Use these values to update the missing fields `subnets` and `securityGroupIds` in [compute_environment.yaml](./compute_environment.yaml) and run the code afterwards. If you have multiple subnets, choose one of them.\n", + "\n", + "For HPS-book reader, the file is also available [here](https://github.com/noisepy/NoisePy/blob/main/tutorials/cloud/compute_environment.yaml) on GitHub." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af330592", + "metadata": {}, + "outputs": [], + "source": [ + "! aws batch create-compute-environment --no-cli-pager --cli-input-yaml file://compute_environment.yaml" + ] + }, + { + "cell_type": "markdown", + "id": "68826bd9", + "metadata": {}, + "source": [ + "### 2.2 Create a Job Queue\n", + "Add the `computeEnvironment` and the `jobQueueName` in [job_queue.yaml](./job_queue.yaml) and then run the following command. \n", + "\n", + "For HPS-book reader, the file is also available [here](https://github.com/noisepy/NoisePy/blob/main/tutorials/cloud/job_queue.yaml) on GitHub." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "230c7189", + "metadata": {}, + "outputs": [], + "source": [ + "! aws batch create-job-queue --no-cli-pager --cli-input-yaml file://job_queue.yaml " + ] + }, + { + "cell_type": "markdown", + "id": "9750529a", + "metadata": {}, + "source": [ + "### 2.3 Create a Job Definition\n", + "Update the `jobRoleArn` and `executionRoleArn` fields in the [job_definition.yaml](./job_definition.yaml) file with the ARN of the role created in the first step (they should be the same in this case). 
Add a name for the `jobDefinition` and run the code below.\n", + "\n", + "For HPS-book reader, the file is also available [here](https://github.com/noisepy/NoisePy/blob/main/tutorials/cloud/job_definition.yaml)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5a019f8", + "metadata": {}, + "outputs": [], + "source": [ + "! aws batch register-job-definition --no-cli-pager --cli-input-yaml file://job_definition.yaml" + ] + }, + { + "cell_type": "markdown", + "id": "49cb7db6", + "metadata": {}, + "source": [ + "## 3. Submit the Job\n", + "### 3.1 Cross-correlation Configuration\n", + "Update [config.yaml](./config.yaml) for NoisePy configuration. Then copy the file to S3 so that the batch job can access it after launching. Replace the `` with the bucket we just used, as well as an intermediate `` to separate your runs from others. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fcff491", + "metadata": {}, + "outputs": [], + "source": [ + "! aws s3 cp ./config.yaml s3:////config.yaml" + ] + }, + { + "cell_type": "markdown", + "id": "26bdbf2d", + "metadata": {}, + "source": [ + "### 3.2 Run Cross-correlation\n", + "Update [job_cc.yaml](./job_cc.yaml) with the names of your `jobQueue` and `jobDefinition` created in the last steps. Also give your job a name in `jobName`. Then update the S3 bucket paths to the locations you want to use for the output and your `config.yaml` file.\n", + "\n", + "For HPS-book reader, the file is also available [here](https://github.com/noisepy/NoisePy/blob/main/tutorials/cloud/job_cc.yaml)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "185a869f", + "metadata": {}, + "outputs": [], + "source": [ + "! 
aws batch submit-job --no-cli-pager --cli-input-yaml file://job_cc.yaml" + ] + }, + { + "cell_type": "markdown", + "id": "1427c920", + "metadata": {}, + "source": [ + "### 3.3 Run Stacking\n", + "Update [job_stack.yaml](./job_stack.yaml) with the names of your `jobQueue` and `jobDefinition` created in the last steps. Also give your job a name in `jobName`. Then update the S3 bucket paths to the locations you want to use for your input CCFs (e.g. the output of the previous CC run), and the stack output. By default, NoisePy will look for a config file in the `--ccf_path` location to use the same configuration for stacking that was used for cross-correlation.\n", + "\n", + "For HPS-book reader, the file is also available [here](https://github.com/noisepy/NoisePy/blob/main/tutorials/cloud/job_stack.yaml)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "608c0af5", + "metadata": {}, + "outputs": [], + "source": [ + "! aws batch submit-job --no-cli-pager --cli-input-yaml file://job_stack.yaml" + ] + }, + { + "cell_type": "markdown", + "id": "57a053a1", + "metadata": {}, + "source": [ + "### 4. Visualization\n", + "You can use [plot_stacks.ipynb](../plot_stacks.ipynb) for cross-correlation visualization after all jobs return." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".envs", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a0623eaafecac570db596ae86b683d0c7e341921 Mon Sep 17 00:00:00 2001 From: Yiyu Ni Date: Thu, 16 May 2024 21:38:02 -0700 Subject: [PATCH 2/5] pre-commit Signed-off-by: Yiyu Ni --- tutorials/cloud/README.md | 1 - tutorials/cloud/compute_environment.yaml | 4 ++-- tutorials/cloud/noisepy_aws_batch.ipynb | 4 +--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tutorials/cloud/README.md b/tutorials/cloud/README.md index 16f391ca..acea8ca2 100644 --- a/tutorials/cloud/README.md +++ b/tutorials/cloud/README.md @@ -26,4 +26,3 @@ noisepy stack \ --ccf_path s3:/// \ --stack_path s3:/// \ ``` - diff --git a/tutorials/cloud/compute_environment.yaml b/tutorials/cloud/compute_environment.yaml index aa5355ad..7053134a 100644 --- a/tutorials/cloud/compute_environment.yaml +++ b/tutorials/cloud/compute_environment.yaml @@ -4,7 +4,7 @@ state: ENABLED computeResources: # Details about the compute resources managed by the compute environment. type: FARGATE maxvCpus: 256 # [REQUIRED] The maximum number of Amazon EC2 vCPUs that a compute environment can reach. - subnets: + subnets: - '' # [REQUIRED] The VPC subnets where the compute resources are launched. - securityGroupIds: + securityGroupIds: - '' # [REQUIRED] The Amazon EC2 security groups that are associated with instances launched in the compute environment. 
diff --git a/tutorials/cloud/noisepy_aws_batch.ipynb b/tutorials/cloud/noisepy_aws_batch.ipynb index 70d6003f..7c4246d3 100644 --- a/tutorials/cloud/noisepy_aws_batch.ipynb +++ b/tutorials/cloud/noisepy_aws_batch.ipynb @@ -56,9 +56,7 @@ "cell_type": "code", "execution_count": null, "id": "803cd99b-bad3-4003-9f60-1e04fa6dce44", - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "# Install jq\n", From e9de369dd67408bc443da88745e03124bc266933 Mon Sep 17 00:00:00 2001 From: Yiyu Ni Date: Thu, 16 May 2024 22:05:29 -0700 Subject: [PATCH 3/5] skip running aws batch notebook Signed-off-by: Yiyu Ni --- tutorials/_config.yml | 2 ++ tutorials/_toc.yml | 4 +--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tutorials/_config.yml b/tutorials/_config.yml index ac50e557..84b94055 100644 --- a/tutorials/_config.yml +++ b/tutorials/_config.yml @@ -9,6 +9,8 @@ logo: ../docs_old/figures/logo.png # See https://jupyterbook.org/content/execute.html execute: execute_notebooks: force + exclude_patterns: + - noisepy_aws_batch.ipynb timeout: 360 only_build_toc_files: true diff --git a/tutorials/_toc.yml b/tutorials/_toc.yml index 2eec0804..77093053 100644 --- a/tutorials/_toc.yml +++ b/tutorials/_toc.yml @@ -10,6 +10,4 @@ chapters: - file: noisepy_ncedc_tutorial.ipynb - file: noisepy_compositestore_tutorial.ipynb - file: CLI.md -- file: cloud/checklist.md -- file: cloud/aws-ec2.md -- file: cloud/aws-batch.md +- file: cloud/noisepy_aws_batch.ipynb From cb7842f0b65f30ee94438affea62a63529eebcb6 Mon Sep 17 00:00:00 2001 From: Yiyu Ni Date: Thu, 16 May 2024 22:07:11 -0700 Subject: [PATCH 4/5] update tutorials Signed-off-by: Yiyu Ni --- tutorials/_toc.yml | 2 +- tutorials/{noise_configuration.md => noisepy_configuration.md} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tutorials/{noise_configuration.md => noisepy_configuration.md} (100%) diff --git a/tutorials/_toc.yml b/tutorials/_toc.yml index 77093053..d621fd29 100644 --- 
a/tutorials/_toc.yml +++ b/tutorials/_toc.yml @@ -2,7 +2,7 @@ # Learn more at https://jupyterbook.org/customize/toc.html format: jb-book -root: noise_configuration.md +root: noisepy_configuration.md chapters: - file: get_started.ipynb - file: noisepy_datastore.ipynb diff --git a/tutorials/noise_configuration.md b/tutorials/noisepy_configuration.md similarity index 100% rename from tutorials/noise_configuration.md rename to tutorials/noisepy_configuration.md From 2c53bd95759e5005fb7c5dd1bf1696361919b2ca Mon Sep 17 00:00:00 2001 From: Yiyu Ni Date: Thu, 16 May 2024 22:21:49 -0700 Subject: [PATCH 5/5] update tutorials Signed-off-by: Yiyu Ni --- tutorials/cloud/noisepy_aws_batch.ipynb | 2 +- tutorials/noisepy_compositestore_tutorial.ipynb | 2 +- tutorials/noisepy_datastore.ipynb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tutorials/cloud/noisepy_aws_batch.ipynb b/tutorials/cloud/noisepy_aws_batch.ipynb index 7c4246d3..20b0d8b9 100644 --- a/tutorials/cloud/noisepy_aws_batch.ipynb +++ b/tutorials/cloud/noisepy_aws_batch.ipynb @@ -318,7 +318,7 @@ "metadata": {}, "source": [ "### 4. Visualization\n", - "You can use [plot_stacks.ipynb](../plot_stacks.ipynb) for cross-correlation visualization after all jobs return." + "You can use the plot_stacks tutorial for cross-correlation visualization after all jobs return." ] } ], diff --git a/tutorials/noisepy_compositestore_tutorial.ipynb b/tutorials/noisepy_compositestore_tutorial.ipynb index 49ec908c..14bf67c6 100644 --- a/tutorials/noisepy_compositestore_tutorial.ipynb +++ b/tutorials/noisepy_compositestore_tutorial.ipynb @@ -6,7 +6,7 @@ "id": "PIA2IaqUOeOA" }, "source": [ - "# NoisePy Composite DatastoreTutorial\n", + "# NoisePy Composite Datastore Tutorial\n", "\n", "Noisepy is a python software package to process ambient seismic noise cross correlations. This tutorial aims to introduce the use of noisepy for a toy problem on a composite data store.
It can be ran locally or on the cloud.\n", "\n", diff --git a/tutorials/noisepy_datastore.ipynb b/tutorials/noisepy_datastore.ipynb index a77aabdb..94b91a80 100644 --- a/tutorials/noisepy_datastore.ipynb +++ b/tutorials/noisepy_datastore.ipynb @@ -6,7 +6,7 @@ "id": "PIA2IaqUOeOA" }, "source": [ - "# DataStore \n", + "# NoisePy DataStore Tutorial\n", "Introduction to the NoisePy DataStore class." ] },