Workflow and configuration for Trino 420
jcamachor committed Feb 21, 2024
1 parent 7911f79 commit e36491c
Showing 27 changed files with 698 additions and 61 deletions.
29 changes: 29 additions & 0 deletions run/trino-420/azure-pipelines/README.md
@@ -0,0 +1,29 @@
<!--
{% comment %}
Copyright (c) Microsoft Corporation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
{% endcomment %}
-->

# Azure Pipelines Deployment for LST-Bench on Trino 420
This directory contains the tooling needed to run LST-Bench on Trino 420 with different LSTs using Azure Pipelines. The included tooling consists of:
- `run-lst-bench.yml`:
  An Azure Pipelines definition that deploys Trino 420 with various LSTs and executes LST-Bench.
- `sh/`:
  A directory containing shell scripts and engine configuration files that support deploying Trino with different LSTs and running the experiments.
- `config/`:
  A directory with the LST-Bench configuration files needed to execute the experiments included in the results.

## Prerequisites
TODO
9 changes: 9 additions & 0 deletions run/trino-420/azure-pipelines/config/connections_config.yaml
@@ -0,0 +1,9 @@
# Description: Connections Configuration
---
version: 1
connections:
- id: trino_0
  driver: io.trino.jdbc.TrinoDriver
  url: jdbc:trino://${TRINO_MASTER_HOST}:8080
  username: admin
  password: ''
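
LST-Bench fills `${TRINO_MASTER_HOST}` in at run time (the pipeline below exports it from the head node's IP). Before launching a run, it can be worth confirming that the coordinator answers on that endpoint; the check below is an illustrative sketch, not part of the committed tooling, and the address is a made-up example:

```bash
# Hypothetical smoke test for the JDBC endpoint declared above.
export TRINO_MASTER_HOST=10.0.0.4   # example address of the head node
curl -s "http://${TRINO_MASTER_HOST}:8080/v1/info" | grep -o '"starting":[a-z]*'
# Expect '"starting":false' once the coordinator is fully up.
```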
30 changes: 30 additions & 0 deletions run/trino-420/azure-pipelines/config/experiment_config-cow-delta.yaml
@@ -0,0 +1,30 @@
# Description: Experiment Configuration
---
version: 1
id: "${EXP_NAME}"
repetitions: 1
# Metadata accepts any key-value that we want to register together with the experiment run.
metadata:
  system: trino
  system_version: 420
  table_format: delta
  table_format_version: undefined
  scale_factor: "${EXP_SCALE_FACTOR}"
  mode: cow
  machine: "${EXP_MACHINE}"
  cluster_size: "${EXP_CLUSTER_SIZE}"
# The following parameter values will be used to replace the variables in the workload statements.
parameter_values:
  external_catalog: hive
  external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}"
  external_table_format: textfile
  external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/"
  external_options_suffix: ''
  external_tblproperties_suffix: ", textfile_field_separator=',', null_format='', skip_header_line_count=1"
  catalog: delta
  database: "delta_${EXP_NAME}"
  table_format: delta
  data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/delta/sf_${EXP_SCALE_FACTOR}/'
  options_suffix: ''
  tblproperties_suffix: ''
  partition_spec_keyword: 'partitioned_by'
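
To see how these parameter values land in a workload statement, here is a hypothetical expansion for one table via the Trino CLI (table, column, and path names are made up; `partitioned_by` is the partitioning table property of Trino's Delta Lake connector, matching `partition_spec_keyword` above):

```bash
# Illustrative only: what a templated CREATE TABLE may expand to for the Delta run.
trino --server "http://${TRINO_MASTER_HOST}:8080" --execute "
  CREATE TABLE delta.delta_wp1_longevity.store_sales (
    ss_item_sk BIGINT,
    ss_quantity INTEGER,
    ss_sold_date_sk BIGINT
  ) WITH (
    location = 'abfss://<container>@<account>.dfs.core.windows.net/tpc-ds/run/delta/sf_100/store_sales',
    partitioned_by = ARRAY['ss_sold_date_sk']
  )"
```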
30 changes: 30 additions & 0 deletions run/trino-420/azure-pipelines/config/experiment_config-mor-iceberg.yaml
@@ -0,0 +1,30 @@
# Description: Experiment Configuration
---
version: 1
id: "${EXP_NAME}"
repetitions: 1
# Metadata accepts any key-value that we want to register together with the experiment run.
metadata:
  system: trino
  system_version: 420
  table_format: iceberg
  table_format_version: undefined
  scale_factor: "${EXP_SCALE_FACTOR}"
  mode: mor
  machine: "${EXP_MACHINE}"
  cluster_size: "${EXP_CLUSTER_SIZE}"
# The following parameter values will be used to replace the variables in the workload statements.
parameter_values:
  external_catalog: hive
  external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}"
  external_table_format: textfile
  external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/"
  external_options_suffix: ''
  external_tblproperties_suffix: ", textfile_field_separator=',', null_format='', skip_header_line_count=1"
  catalog: iceberg
  database: "iceberg_${EXP_NAME}"
  table_format: iceberg
  data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/iceberg/sf_${EXP_SCALE_FACTOR}/'
  options_suffix: ''
  tblproperties_suffix: ''
  partition_spec_keyword: 'partitioning'
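
This file mirrors the Delta configuration; the substantive difference is `partition_spec_keyword`, because Trino's Iceberg connector spells the partitioning table property `partitioning` rather than `partitioned_by`. A minimal illustrative counterpart to the Delta sketch above (names again hypothetical):

```bash
trino --server "http://${TRINO_MASTER_HOST}:8080" --execute "
  CREATE TABLE iceberg.iceberg_wp1_longevity.store_sales (
    ss_item_sk BIGINT,
    ss_sold_date_sk BIGINT
  ) WITH (partitioning = ARRAY['ss_sold_date_sk'])"
```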
20 changes: 20 additions & 0 deletions run/trino-420/azure-pipelines/config/setup_experiment_config.yaml
@@ -0,0 +1,20 @@
# Description: Experiment Configuration
---
version: 1
id: setup_experiment
repetitions: 1
# Metadata accepts any key-value that we want to register together with the experiment run.
metadata:
  system: trino
  system_version: 420
  scale_factor: "${EXP_SCALE_FACTOR}"
  machine: "${EXP_MACHINE}"
  cluster_size: "${EXP_CLUSTER_SIZE}"
# The following parameter values will be used to replace the variables in the workload statements.
parameter_values:
  external_catalog: hive
  external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}"
  external_table_format: textfile
  external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/"
  external_options_suffix: ''
  external_tblproperties_suffix: ", textfile_field_separator=',', null_format='', skip_header_line_count=1"
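
Concretely, these parameters drive Hive-connector DDL over the raw TPC-DS CSV files; `external_tblproperties_suffix` carries the `textfile_field_separator`, `null_format`, and `skip_header_line_count` table properties of Trino's Hive connector. A sketch of the kind of statement this expands to (column list abridged, paths hypothetical):

```bash
trino --server "http://${TRINO_MASTER_HOST}:8080" --execute "
  CREATE TABLE hive.external_tpcds_sf_100.call_center (
    cc_call_center_sk BIGINT,
    cc_name VARCHAR
  ) WITH (
    format = 'TEXTFILE',
    external_location = 'abfss://<container>@<account>.dfs.core.windows.net/tpc-ds/csv/sf_100/call_center',
    textfile_field_separator = ',',
    null_format = '',
    skip_header_line_count = 1
  )"
```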
13 changes: 13 additions & 0 deletions run/trino-420/azure-pipelines/config/telemetry_config.yaml
@@ -0,0 +1,13 @@
# Description: Telemetry Configuration
---
version: 1
connection:
  id: duckdb_0
  driver: org.duckdb.DuckDBDriver
  url: jdbc:duckdb:./telemetry-trino-420
execute_ddl: true
ddl_file: 'src/main/resources/scripts/logging/duckdb/ddl.sql'
insert_file: 'src/main/resources/scripts/logging/duckdb/insert.sql'
# The following parameter values will be used to replace the variables in the logging statements.
parameter_values:
  data_path: ''
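
Since telemetry is written to a local DuckDB database (`./telemetry-trino-420`, per the JDBC URL above), results can be inspected after a run with the DuckDB CLI. A sketch, assuming the event table and columns created by the referenced `ddl.sql` (check that file for the authoritative schema):

```bash
# Assumes ddl.sql creates an experiment_telemetry table with these columns.
duckdb ./telemetry-trino-420 \
  "SELECT event_type, event_status, COUNT(*) AS n
   FROM experiment_telemetry
   GROUP BY event_type, event_status"
```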
249 changes: 249 additions & 0 deletions run/trino-420/azure-pipelines/run-lst-bench.yml
@@ -0,0 +1,249 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

trigger: none

parameters:
- name: lsts
  type: object
  default:
  - table_format: "delta"
    mode: "cow"
  - table_format: "iceberg"
    mode: "mor"
- name: workloads
  type: object
  default:
  - "wp1_longevity"
  - "wp2_resilience"
  - "wp3_rw_concurrency"
- name: exp_scale_factor
  type: number
  default: 100
- name: exp_machine
  type: string
  default: "Standard_E8s_v5"
- name: exp_cluster_size
  type: number
  default: 8

variables:
  MAVEN_CACHE_FOLDER: $(Pipeline.Workspace)/.m2/repository
  MAVEN_OPTS: '-ntp -B -Dmaven.repo.local=$(MAVEN_CACHE_FOLDER)'
  EXP_SCALE_FACTOR: ${{ parameters.exp_scale_factor }}
  EXP_MACHINE: ${{ parameters.exp_machine }}
  EXP_CLUSTER_SIZE: ${{ parameters.exp_cluster_size }}

stages:
# Build LST-Bench and create artifact to deploy to target VM
- stage: build
  jobs:
  - job: Build
    pool:
      vmImage: 'ubuntu-latest'
    steps:
    - task: Cache@2
      displayName: Cache Maven local repo
      inputs:
        key: 'maven | "$(Agent.OS)" | **/pom.xml'
        restoreKeys: |
          maven | "$(Agent.OS)"
          maven
        path: $(MAVEN_CACHE_FOLDER)
    - task: Maven@4
      inputs:
        mavenPomFile: 'pom.xml'
        options: $(MAVEN_OPTS)
        javaHomeOption: 'JDKVersion'
        jdkVersionOption: '1.11'
        publishJUnitResults: false
        goals: 'package -DskipTests -Ptrino-jdbc'
    - task: CopyFiles@2
      displayName: 'Copy Artifacts to: $(TargetFolder)'
      inputs:
        SourceFolder: '$(Build.SourcesDirectory)'
        TargetFolder: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/'
    - task: PublishPipelineArtifact@1
      inputs:
        targetPath: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/'
        artifact: lst-bench-0.1-SNAPSHOT

# Set up engine and deploy LST-Bench
- stage: deploy
  jobs:
  - deployment: EngineDeploy
    displayName: 'Deploying engine'
    workspace:
      clean: all
    environment:
      name: 'lst-bench-github'
      resourceType: VirtualMachine
      resourceName: 'lst-bench-head'
    strategy:
      runOnce:
        deploy:
          steps:
          - bash: |
              echo 'Deploy engine'
              mkdir -p ~/trino-420
              cp $(Pipeline.Workspace)/lst-bench-0.1-SNAPSHOT/run/trino-420/azure-pipelines/sh/* ~/trino-420/
              cd ~/trino-420
              chmod +x ./*
              trino_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p')
              ./init.sh 'true' "${trino_head_node}" "$(data_storage_account)" "$(data_storage_account_shared_key)"
              ./hms.sh "$(hms_jdbc_driver)" "$(hms_jdbc_url)" "$(hms_jdbc_user)" "$(hms_jdbc_password)" "$(hms_storage_account)" "$(hms_storage_account_shared_key)" "$(hms_storage_account_container)"
              ./dist-setup.sh
              ./dist-exec.sh trino-420 init.sh 'false' "${trino_head_node}" "$(data_storage_account)" "$(data_storage_account_shared_key)"
  - deployment: ClientDeploy
    displayName: 'Deploying LST-Bench client'
    workspace:
      clean: all
    environment:
      name: 'lst-bench-github'
      resourceType: VirtualMachine
      resourceName: 'lst-bench-client'
    strategy:
      runOnce:
        deploy:
          steps:
          - bash: |
              echo 'Deploy LST-Bench client'
              sudo apt install -y openjdk-11-jdk
              mkdir -p ~/lst-bench-0.1-SNAPSHOT
              cp -rf $(Pipeline.Workspace)/lst-bench-0.1-SNAPSHOT/* ~/lst-bench-0.1-SNAPSHOT/
              chmod +x ~/lst-bench-0.1-SNAPSHOT/launcher.sh

# Run LST-Bench (setup external tables)
- stage: setup_experiment
  jobs:
  - deployment: StartEngine
    displayName: "Starting Engine"
    environment:
      name: 'lst-bench-github'
      resourceType: VirtualMachine
      resourceName: 'lst-bench-head'
    variables:
      process.clean: false
    strategy:
      runOnce:
        deploy:
          steps:
          - download: none
          - bash: |
              cd ~/trino-420
              ./stop-cluster.sh && ./start-cluster.sh
              sleep 20
              trino_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p')
              echo "##vso[task.setvariable variable=trino_head_node;isOutput=true]${trino_head_node}"
            name: engine_start_step
  - deployment: RunSetupExperiment
    dependsOn: StartEngine
    displayName: "Setup Experiment"
    environment:
      name: 'lst-bench-github'
      resourceType: VirtualMachine
      resourceName: 'lst-bench-client'
    variables:
      trino_master_host: $[ dependencies.StartEngine.outputs['deploy_lst-bench-head.engine_start_step.trino_head_node'] ]
    timeoutInMinutes: 0
    strategy:
      runOnce:
        deploy:
          steps:
          - download: none
          - bash: |
              cd ~/lst-bench-0.1-SNAPSHOT
              ./launcher.sh -c run/trino-420/azure-pipelines/config/connections_config.yaml \
                            -e run/trino-420/azure-pipelines/config/setup_experiment_config.yaml \
                            -t run/trino-420/azure-pipelines/config/telemetry_config.yaml \
                            -l run/trino-420/config/tpcds/library.yaml \
                            -w run/trino-420/config/tpcds/setup_experiment.yaml
  - deployment: StopEngine
    dependsOn: RunSetupExperiment
    displayName: "Stopping Engine"
    environment:
      name: 'lst-bench-github'
      resourceType: VirtualMachine
      resourceName: 'lst-bench-head'
    strategy:
      runOnce:
        deploy:
          steps:
          - download: none
          - bash: |
              cd ~/trino-420
              ./stop-cluster.sh

# Run LST-Bench
- ${{ each lst in parameters.lsts }}:
  - ${{ each workload in parameters.workloads }}:
    - stage: test_${{ lst.mode }}_${{ lst.table_format }}_${{ workload }}
      jobs:
      - deployment: StartEngine
        displayName: "Starting Engine (${{ lst.mode }}, ${{ lst.table_format }}, ${{ workload }})"
        environment:
          name: 'lst-bench-github'
          resourceType: VirtualMachine
          resourceName: 'lst-bench-head'
        variables:
          process.clean: false
        strategy:
          runOnce:
            deploy:
              steps:
              - download: none
              - bash: |
                  cd ~/trino-420
                  ./stop-cluster.sh && ./start-cluster.sh ${{ lst.table_format }}
                  sleep 20
                  trino_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p')
                  echo "##vso[task.setvariable variable=trino_head_node;isOutput=true]${trino_head_node}"
                name: engine_start_step
      - deployment: RunExperiment
        dependsOn: StartEngine
        displayName: "Running Experiment (${{ lst.mode }}, ${{ lst.table_format }}, ${{ workload }})"
        environment:
          name: 'lst-bench-github'
          resourceType: VirtualMachine
          resourceName: 'lst-bench-client'
        variables:
          trino_master_host: $[ dependencies.StartEngine.outputs['deploy_lst-bench-head.engine_start_step.trino_head_node'] ]
        timeoutInMinutes: 0
        strategy:
          runOnce:
            deploy:
              steps:
              - download: none
              - bash: |
                  cd ~/lst-bench-0.1-SNAPSHOT
                  echo "${{ workload }}"
                  export EXP_NAME="${{ workload }}"
                  ./launcher.sh -c run/trino-420/azure-pipelines/config/connections_config.yaml \
                                -e run/trino-420/azure-pipelines/config/experiment_config-${{ lst.mode }}-${{ lst.table_format }}.yaml \
                                -t run/trino-420/azure-pipelines/config/telemetry_config.yaml \
                                -l run/trino-420/config/tpcds/library.yaml \
                                -w run/trino-420/config/tpcds/${{ workload }}.yaml
      - deployment: StopEngine
        dependsOn: RunExperiment
        displayName: "Stopping Engine (${{ lst.mode }}, ${{ lst.table_format }}, ${{ workload }})"
        environment:
          name: 'lst-bench-github'
          resourceType: VirtualMachine
          resourceName: 'lst-bench-head'
        strategy:
          runOnce:
            deploy:
              steps:
              - download: none
              - bash: |
                  cd ~/trino-420
                  ./stop-cluster.sh
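
For reference, once this definition is registered in Azure DevOps, a run with non-default parameters can be triggered from the CLI. This sketch assumes the `azure-devops` CLI extension and uses placeholder organization, project, and pipeline names:

```bash
az extension add --name azure-devops   # one-time setup
az pipelines run \
  --org "https://dev.azure.com/<your-org>" \
  --project "<your-project>" \
  --name "<pipeline-registered-from-run-lst-bench.yml>" \
  --parameters exp_scale_factor=1000 exp_cluster_size=16
```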
@@ -0,0 +1,5 @@
coordinator=true
node-scheduler.include-coordinator=false
http-server.http.port=8080
discovery.uri=http://$TRINO_MASTER_HOST:8080
query.max-memory=378GB
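
With `node-scheduler.include-coordinator=false`, the coordinator schedules no work on itself, so all workers must register against `discovery.uri` before a run. One way to confirm cluster membership is to query `system.runtime.nodes` from the Trino CLI (illustrative):

```bash
trino --server "http://${TRINO_MASTER_HOST}:8080" --execute \
  "SELECT node_id, coordinator, state FROM system.runtime.nodes"
```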