diff --git a/run/README.md b/run/README.md index 14e74f6c..aa45d28c 100644 --- a/run/README.md +++ b/run/README.md @@ -24,9 +24,9 @@ This folder contains configurations for running LST-Bench on various systems as - [x] Delta Lake 2.2.0 - [x] Apache Hudi 0.12.2 - [x] Apache Iceberg 1.1.0 -- [ ] Trino 420 - - [ ] Delta Lake - - [ ] Apache Iceberg +- [x] Trino 420 + - [x] Delta Lake + - [x] Apache Iceberg ## Folder Structure While the folder for each engine may have a slightly different structure, they generally contain the following: diff --git a/run/spark-3.3.1/azure-pipelines/README.md b/run/spark-3.3.1/azure-pipelines/README.md index 4488a12e..6e5e17de 100644 --- a/run/spark-3.3.1/azure-pipelines/README.md +++ b/run/spark-3.3.1/azure-pipelines/README.md @@ -32,10 +32,11 @@ This directory comprises the necessary tooling for executing LST-Bench on Apache - A VMSS cluster, that will serve as the Spark worker nodes, within the same VNet as the head node. - An Azure Storage Account accessible by both the VMSS and head node. - An Azure SQL Database (or SQL Server flavored RDBMS) that will be running Hive Metastore. - The Hive Metastore schema for version 2.3.0 should already be installed in the instance. + The Hive Metastore schema for version 2.3.9 should already be installed in the instance. - Prior to running the pipeline, several variables need definition in your Azure Pipeline: - `data_storage_account`: Name of the Azure Blob Storage account where the source data for the experiment is stored. - `data_storage_account_shared_key` (secret): Shared key for the Azure Blob Storage account where the source data for the experiment is stored. + - `data_storage_account_container`: Name of the container in the Azure Blob Storage account where the source data for the experiment is stored. - `hms_jdbc_driver`: JDBC driver for the Hive Metastore. - `hms_jdbc_url`: JDBC URL for the Hive Metastore. - `hms_jdbc_user`: Username for the Hive Metastore. diff --git a/run/spark-3.3.1/azure-pipelines/sh/hms.sh b/run/spark-3.3.1/azure-pipelines/sh/hms.sh index 4d78cbff..531b57a0 100755 --- a/run/spark-3.3.1/azure-pipelines/sh/hms.sh +++ b/run/spark-3.3.1/azure-pipelines/sh/hms.sh @@ -5,6 +5,10 @@ if [ "$#" -ne 7 ]; then fi source env.sh +if [ -z "${USER}" ]; then + echo "ERROR: USER is not defined." + exit 1 +fi if [ -z "${HADOOP_HOME}" ]; then echo "ERROR: HADOOP_HOME is not defined." exit 1 diff --git a/run/trino-420/azure-pipelines/README.md b/run/trino-420/azure-pipelines/README.md new file mode 100644 index 00000000..17330460 --- /dev/null +++ b/run/trino-420/azure-pipelines/README.md @@ -0,0 +1,56 @@ + + +# Azure Pipelines Deployment for LST-Bench on Trino 420 +This directory comprises the necessary tooling for executing LST-Bench on Trino 420 with different LSTs using Azure Pipelines. The included tooling consists of: +- `run-lst-bench.yml`: + An Azure Pipelines script designed to deploy Trino and execute LST-Bench. +- `sh/`: + A directory containing shell scripts and engine configuration files supporting the deployment of Trino and the execution of experiments. +- `config/`: + A directory with LST-Bench configuration files necessary for executing the experiments that are part of the results. + +## Prerequisites +- Automation for deploying the infrastructure in Azure to run LST-Bench is not implemented. As a result, the Azure Pipeline script expects the following setup: + - A VM named 'lst-bench-client' connected to the pipeline environment to run the LST-Bench client. 
+ - A VM named 'lst-bench-head' to run the coordinator node of the Trino cluster, also connected to the pipeline environment. + - A VMSS cluster, that will serve as the Trino worker nodes, within the same VNet as the coordinator node. + - An Azure Storage Account accessible by both the VMSS and coordinator node. + - An Azure SQL Database (or SQL Server flavored RDBMS) that will be running Hive Metastore. + The Hive Metastore schema for version 2.3.9 should already be installed in the instance. +- Prior to running the pipeline, several variables need definition in your Azure Pipeline: + - `data_storage_account`: Name of the Azure Blob Storage account where the source data for the experiment is stored. + - `data_storage_account_shared_key` (secret): Shared key for the Azure Blob Storage account where the source data for the experiment is stored. + - `data_storage_account_container`: Name of the container in the Azure Blob Storage account where the source data for the experiment is stored. + - `hms_jdbc_driver`: JDBC driver for the Hive Metastore. + - `hms_jdbc_url`: JDBC URL for the Hive Metastore. + - `hms_jdbc_user`: Username for the Hive Metastore. + - `hms_jdbc_password` (secret): Password for the Hive Metastore. + - `hms_storage_account`: Name of the Azure Blob Storage account where the Hive Metastore will store data associated with the catalog (can be the same as the data_storage_account). + - `hms_storage_account_shared_key` (secret): Shared key for the Azure Blob Storage account where the Hive Metastore will store data associated with the catalog. + - `hms_storage_account_container`: Name of the container in the Azure Blob Storage account where the Hive Metastore will store data associated with the catalog. +- The LSTs to run experiments on can be modified via input parameters for the pipelines in the Azure Pipelines YAML file or from the Web UI. + Default values are assigned to these parameters. + Parameters also include experiment scale factor, machine type, and cluster size. + Note that these parameters are not used to deploy the data or the infrastructure, as this process is not automated in the pipeline. + Instead, they are recorded in the experiment telemetry for proper categorization and visualization of results later on. + +## Additional Notes +For workloads within LST-Bench that include an `optimize` step, particularly those involving partitioned tables, a [custom task](/docs/workloads.md#custom-tasks) is used to execute this step. +The task divides the `optimize` operation into batches, each containing up to 100 partitions (the parameter value is configurable). +This approach was implemented to address issues where Trino would crash if the optimization step were applied to the entire table. 
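For reference, a minimal sketch of how this batched step is wired into a workload, mirroring the `optimize_split` template in `run/trino-420/config/tpcds/library.yaml` (which points at `com.microsoft.lst_bench.task.custom.DependentTaskExecutor`) and the phase definitions updated in `wp2_resilience.yaml` and `wp3_rw_concurrency.yaml` further down in this patch:

```yaml
# Sketch of a workload phase using the batched optimize task.
# dependent_task_batch_size caps the number of partitions handled per batch
# (100 here, matching the value used in the wp2/wp3 workload files below).
- id: optimize_1
  sessions:
  - tasks:
    - template_id: optimize_split
      task_executor_arguments:
        dependent_task_batch_size: 100
```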
diff --git a/run/trino-420/azure-pipelines/config/connections_config.yaml b/run/trino-420/azure-pipelines/config/connections_config.yaml new file mode 100644 index 00000000..e73d8451 --- /dev/null +++ b/run/trino-420/azure-pipelines/config/connections_config.yaml @@ -0,0 +1,9 @@ +# Description: Connections Configuration +--- +version: 1 +connections: +- id: trino_0 + driver: io.trino.jdbc.TrinoDriver + url: jdbc:trino://${TRINO_MASTER_HOST}:8080 + username: admin + password: '' diff --git a/run/trino-420/azure-pipelines/config/experiment_config-cow-delta.yaml b/run/trino-420/azure-pipelines/config/experiment_config-cow-delta.yaml new file mode 100644 index 00000000..ef8dbd07 --- /dev/null +++ b/run/trino-420/azure-pipelines/config/experiment_config-cow-delta.yaml @@ -0,0 +1,30 @@ +# Description: Experiment Configuration +--- +version: 1 +id: "${EXP_NAME}" +repetitions: 1 +# Metadata accepts any key-value that we want to register together with the experiment run. +metadata: + system: trino + system_version: 420 + table_format: delta + table_format_version: undefined + scale_factor: "${EXP_SCALE_FACTOR}" + mode: cow + machine: "${EXP_MACHINE}" + cluster_size: "${EXP_CLUSTER_SIZE}" +# The following parameter values will be used to replace the variables in the workload statements. +parameter_values: + external_catalog: hive + external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}" + external_table_format: textfile + external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/" + external_options_suffix: '' + external_tblproperties_suffix: ", textfile_field_separator=',', null_format='', skip_header_line_count=1" + catalog: delta + database: "delta_${EXP_NAME}" + table_format: delta + data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/delta/sf_${EXP_SCALE_FACTOR}/' + options_suffix: '' + tblproperties_suffix: '' + partition_spec_keyword: 'partitioned_by' diff --git a/run/trino-420/azure-pipelines/config/experiment_config-mor-iceberg.yaml b/run/trino-420/azure-pipelines/config/experiment_config-mor-iceberg.yaml new file mode 100644 index 00000000..502f7fa8 --- /dev/null +++ b/run/trino-420/azure-pipelines/config/experiment_config-mor-iceberg.yaml @@ -0,0 +1,30 @@ +# Description: Experiment Configuration +--- +version: 1 +id: "${EXP_NAME}" +repetitions: 1 +# Metadata accepts any key-value that we want to register together with the experiment run. +metadata: + system: trino + system_version: 420 + table_format: iceberg + table_format_version: undefined + scale_factor: "${EXP_SCALE_FACTOR}" + mode: mor + machine: "${EXP_MACHINE}" + cluster_size: "${EXP_CLUSTER_SIZE}" +# The following parameter values will be used to replace the variables in the workload statements. 
+parameter_values: + external_catalog: hive + external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}" + external_table_format: textfile + external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/" + external_options_suffix: '' + external_tblproperties_suffix: ", textfile_field_separator=',', null_format='', skip_header_line_count=1" + catalog: iceberg + database: "iceberg_${EXP_NAME}" + table_format: iceberg + data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/iceberg/sf_${EXP_SCALE_FACTOR}/' + options_suffix: '' + tblproperties_suffix: '' + partition_spec_keyword: 'partitioning' diff --git a/run/trino-420/azure-pipelines/config/setup_experiment_config.yaml b/run/trino-420/azure-pipelines/config/setup_experiment_config.yaml new file mode 100644 index 00000000..b164151b --- /dev/null +++ b/run/trino-420/azure-pipelines/config/setup_experiment_config.yaml @@ -0,0 +1,20 @@ +# Description: Experiment Configuration +--- +version: 1 +id: setup_experiment +repetitions: 1 +# Metadata accepts any key-value that we want to register together with the experiment run. +metadata: + system: trino + system_version: 420 + scale_factor: "${EXP_SCALE_FACTOR}" + machine: "${EXP_MACHINE}" + cluster_size: "${EXP_CLUSTER_SIZE}" +# The following parameter values will be used to replace the variables in the workload statements. +parameter_values: + external_catalog: hive + external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}" + external_table_format: textfile + external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/" + external_options_suffix: '' + external_tblproperties_suffix: ", textfile_field_separator=',', null_format='', skip_header_line_count=1" diff --git a/run/trino-420/azure-pipelines/config/telemetry_config.yaml b/run/trino-420/azure-pipelines/config/telemetry_config.yaml new file mode 100644 index 00000000..baa9e63c --- /dev/null +++ b/run/trino-420/azure-pipelines/config/telemetry_config.yaml @@ -0,0 +1,13 @@ +# Description: Telemetry Configuration +--- +version: 1 +connection: + id: duckdb_0 + driver: org.duckdb.DuckDBDriver + url: jdbc:duckdb:./telemetry-trino-420 +execute_ddl: true +ddl_file: 'src/main/resources/scripts/logging/duckdb/ddl.sql' +insert_file: 'src/main/resources/scripts/logging/duckdb/insert.sql' +# The following parameter values will be used to replace the variables in the logging statements. +parameter_values: + data_path: '' \ No newline at end of file diff --git a/run/trino-420/azure-pipelines/run-lst-bench.yml b/run/trino-420/azure-pipelines/run-lst-bench.yml new file mode 100644 index 00000000..6e5e7871 --- /dev/null +++ b/run/trino-420/azure-pipelines/run-lst-bench.yml @@ -0,0 +1,249 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +trigger: none + +parameters: +- name: lsts + type: object + default: + - table_format: "delta" + mode: "cow" + - table_format: "iceberg" + mode: "mor" +- name: workloads + type: object + default: + - "wp1_longevity" + - "wp2_resilience" + - "wp3_rw_concurrency" +- name: exp_scale_factor + type: number + default: 100 +- name: exp_machine + type: string + default: "Standard_E8s_v5" +- name: exp_cluster_size + type: number + default: 8 + +variables: + MAVEN_CACHE_FOLDER: $(Pipeline.Workspace)/.m2/repository + MAVEN_OPTS: '-ntp -B -Dmaven.repo.local=$(MAVEN_CACHE_FOLDER)' + EXP_SCALE_FACTOR: ${{ parameters.exp_scale_factor }} + EXP_MACHINE: ${{ parameters.exp_machine }} + EXP_CLUSTER_SIZE: ${{ parameters.exp_cluster_size }} + +stages: +# Build LST-Bench and create artifact to deploy to target VM +- stage: build + jobs: + - job: Build + pool: + vmImage: 'ubuntu-latest' + steps: + - task: Cache@2 + displayName: Cache Maven local repo + inputs: + key: 'maven | "$(Agent.OS)" | **/pom.xml' + restoreKeys: | + maven | "$(Agent.OS)" + maven + path: $(MAVEN_CACHE_FOLDER) + - task: Maven@4 + inputs: + mavenPomFile: 'pom.xml' + options: $(MAVEN_OPTS) + javaHomeOption: 'JDKVersion' + jdkVersionOption: '1.11' + publishJUnitResults: false + goals: 'package -DskipTests -Ptrino-jdbc' + - task: CopyFiles@2 + displayName: 'Copy Artifacts to: $(TargetFolder)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)' + TargetFolder: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/' + - task: PublishPipelineArtifact@1 + inputs: + targetPath: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/' + artifact: lst-bench-0.1-SNAPSHOT + +# Set up engine and deploy LST-Bench +- stage: deploy + jobs: + - deployment: EngineDeploy + displayName: 'Deploying engine' + workspace: + clean: all + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + strategy: + runOnce: + deploy: + steps: + - bash: | + echo 'Deploy engine' + mkdir -p ~/trino-420 + cp $(Pipeline.Workspace)/lst-bench-0.1-SNAPSHOT/run/trino-420/azure-pipelines/sh/* ~/trino-420/ + cd ~/trino-420 + chmod +x ./* + trino_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p') + ./init.sh 'true' "${trino_head_node}" "$(data_storage_account)" "$(data_storage_account_shared_key)" + ./hms.sh "$(hms_jdbc_driver)" "$(hms_jdbc_url)" "$(hms_jdbc_user)" "$(hms_jdbc_password)" "$(hms_storage_account)" "$(hms_storage_account_shared_key)" "$(hms_storage_account_container)" + ./dist-setup.sh + ./dist-exec.sh trino-420 init.sh 'false' "${trino_head_node}" "$(data_storage_account)" "$(data_storage_account_shared_key)" + - deployment: ClientDeploy + displayName: 'Deploying LST-Bench client' + workspace: + clean: all + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-client' + strategy: + runOnce: + deploy: + steps: + - bash: | + echo 'Deploy LST-Bench client' + sudo apt install -y openjdk-11-jdk + mkdir -p ~/lst-bench-0.1-SNAPSHOT + cp -rf $(Pipeline.Workspace)/lst-bench-0.1-SNAPSHOT/* ~/lst-bench-0.1-SNAPSHOT/ + chmod +x ~/lst-bench-0.1-SNAPSHOT/launcher.sh + +# Run LST-Bench (setup external tables) +- stage: setup_experiment + jobs: + - deployment: StartEngine + displayName: "Starting Engine" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + variables: + process.clean: false + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/trino-420 + ./stop-cluster.sh && 
./start-cluster.sh + sleep 20 + trino_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p') + echo "##vso[task.setvariable variable=trino_head_node;isOutput=true]${trino_head_node}" + name: engine_start_step + - deployment: RunSetupExperiment + dependsOn: StartEngine + displayName: "Setup Experiment" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-client' + variables: + trino_master_host: $[ dependencies.StartEngine.outputs['deploy_lst-bench-head.engine_start_step.trino_head_node'] ] + timeoutInMinutes: 0 + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/lst-bench-0.1-SNAPSHOT + ./launcher.sh -c run/trino-420/azure-pipelines/config/connections_config.yaml \ + -e run/trino-420/azure-pipelines/config/setup_experiment_config.yaml \ + -t run/trino-420/azure-pipelines/config/telemetry_config.yaml \ + -l run/trino-420/config/tpcds/library.yaml \ + -w run/trino-420/config/tpcds/setup_experiment.yaml + - deployment: StopEngine + dependsOn: RunSetupExperiment + displayName: "Stopping Engine" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/trino-420 + ./stop-cluster.sh + +# Run LST-Bench +- ${{ each lst in parameters.lsts }}: + - ${{ each workload in parameters.workloads }}: + - stage: test_${{ lst.mode }}_${{ lst.table_format }}_${{ workload }} + jobs: + - deployment: StartEngine + displayName: "Starting Engine (${{ lst.mode }}, ${{ lst.table_format }}, ${{ workload }})" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + variables: + process.clean: false + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/trino-420 + ./stop-cluster.sh && ./start-cluster.sh ${{ lst.table_format }} + sleep 20 + trino_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p') + echo "##vso[task.setvariable variable=trino_head_node;isOutput=true]${trino_head_node}" + name: engine_start_step + - deployment: RunExperiment + dependsOn: StartEngine + displayName: "Running Experiment (${{ lst.mode }}, ${{ lst.table_format }}, ${{ workload }})" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-client' + variables: + trino_master_host: $[ dependencies.StartEngine.outputs['deploy_lst-bench-head.engine_start_step.trino_head_node'] ] + timeoutInMinutes: 0 + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/lst-bench-0.1-SNAPSHOT + echo "${{ workload }}" + export EXP_NAME="${{ workload }}" + ./launcher.sh -c run/trino-420/azure-pipelines/config/connections_config.yaml \ + -e run/trino-420/azure-pipelines/config/experiment_config-${{ lst.mode }}-${{ lst.table_format }}.yaml \ + -t run/trino-420/azure-pipelines/config/telemetry_config.yaml \ + -l run/trino-420/config/tpcds/library.yaml \ + -w run/trino-420/config/tpcds/${{ workload }}.yaml + - deployment: StopEngine + dependsOn: RunExperiment + displayName: "Stopping Engine (${{ lst.mode }}, ${{ lst.table_format }}, ${{ workload }})" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/trino-420 + ./stop-cluster.sh diff --git a/run/trino-420/azure-pipelines/sh/coordinator-config.properties.template 
b/run/trino-420/azure-pipelines/sh/coordinator-config.properties.template new file mode 100644 index 00000000..a09f60f8 --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/coordinator-config.properties.template @@ -0,0 +1,5 @@ +coordinator=true +node-scheduler.include-coordinator=false +http-server.http.port=8080 +discovery.uri=http://$TRINO_MASTER_HOST:8080 +query.max-memory=378GB \ No newline at end of file diff --git a/run/trino-420/azure-pipelines/sh/delta.properties.template b/run/trino-420/azure-pipelines/sh/delta.properties.template new file mode 100644 index 00000000..efd7cef8 --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/delta.properties.template @@ -0,0 +1,6 @@ +connector.name=delta_lake +hive.metastore.uri=thrift://${TRINO_MASTER_HOST}:9083 +hive.azure.abfs-storage-account=${DATA_STORAGE_ACCOUNT} +hive.azure.abfs-access-key=${DATA_STORAGE_ACCOUNT_SHARED_KEY} +delta.max-partitions-per-writer=2500 +delta.compression-codec=GZIP \ No newline at end of file diff --git a/run/trino-420/azure-pipelines/sh/dist-exec.sh b/run/trino-420/azure-pipelines/sh/dist-exec.sh new file mode 100755 index 00000000..bd7c3ca6 --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/dist-exec.sh @@ -0,0 +1,18 @@ +#!/bin/bash -e +source env.sh +if [ -z "${HOSTS}" ]; then + echo "ERROR: HOSTS is not defined." + exit 1 +fi + +if [ "$#" -lt 2 ]; then + echo "Error: Please provide at least two input parameters." + exit 1 +fi +deploy_dir=$1 +script_file=$2 + +for node in $HOSTS ; do ssh -t $node "mkdir -p ~/$deploy_dir" ; done +for node in $HOSTS ; do scp *.template $node:~/$deploy_dir ; done +for node in $HOSTS ; do scp $script_file $node:~/$deploy_dir ; done +for node in $HOSTS ; do ssh -t $node "cd ~/$deploy_dir && chmod +x ./$script_file && ./$script_file ${@:3}" ; done diff --git a/run/trino-420/azure-pipelines/sh/dist-setup.sh b/run/trino-420/azure-pipelines/sh/dist-setup.sh new file mode 100755 index 00000000..99edc490 --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/dist-setup.sh @@ -0,0 +1,21 @@ +#!/bin/bash -e +if [ -z "${HOME}" ]; then + echo "ERROR: HOME is not defined." 
+ exit 1 +fi + +# Install packages +sudo apt install -y net-tools nmap + +# Configure hosts +my_ip=$(/sbin/ifconfig eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p') +ip_range=${my_ip%.*}.* +nmap -sn $ip_range | grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' | grep -v "^$my_ip$" > $HOME/hostiplist + +export HOSTS=$(<$HOME/hostiplist) + +for node in $HOSTS ; do scp ~/.ssh/id_rsa* $node:~/.ssh/ ; done + +# Push to environment +echo "export HOSTS=\"${HOSTS}\"" >> env.sh +echo "source $(pwd)/env.sh" >> ~/.bashrc diff --git a/run/trino-420/azure-pipelines/sh/hive-site.xml.template b/run/trino-420/azure-pipelines/sh/hive-site.xml.template new file mode 100644 index 00000000..0e79ed7b --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/hive-site.xml.template @@ -0,0 +1,36 @@ + + + javax.jdo.option.ConnectionURL + ${HMS_JDBC_URL} + + + + javax.jdo.option.ConnectionDriverName + ${HMS_JDBC_DRIVER} + + + + javax.jdo.option.ConnectionUserName + ${HMS_JDBC_USER} + + + + javax.jdo.option.ConnectionPassword + ${HMS_JDBC_PASSWORD} + + + + hive.metastore.warehouse.dir + abfss://${HMS_STORAGE_ACCOUNT_CONTAINER}@${HMS_STORAGE_ACCOUNT}.dfs.core.windows.net/hive/warehouse + + + + fs.azure.account.auth.type.${HMS_STORAGE_ACCOUNT}.dfs.core.windows.net + SharedKey + + + + fs.azure.account.key.${HMS_STORAGE_ACCOUNT}.dfs.core.windows.net + ${HMS_STORAGE_ACCOUNT_SHARED_KEY} + + \ No newline at end of file diff --git a/run/trino-420/azure-pipelines/sh/hive.properties.template b/run/trino-420/azure-pipelines/sh/hive.properties.template new file mode 100644 index 00000000..c052a1c8 --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/hive.properties.template @@ -0,0 +1,5 @@ +connector.name=hive +hive.metastore.uri=thrift://${TRINO_MASTER_HOST}:9083 +hive.allow-drop-table=true +hive.azure.abfs-storage-account=${DATA_STORAGE_ACCOUNT} +hive.azure.abfs-access-key=${DATA_STORAGE_ACCOUNT_SHARED_KEY} \ No newline at end of file diff --git a/run/trino-420/azure-pipelines/sh/hms.sh b/run/trino-420/azure-pipelines/sh/hms.sh new file mode 100755 index 00000000..907c2bad --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/hms.sh @@ -0,0 +1,47 @@ +#!/bin/bash -e +if [ "$#" -ne 7 ]; then + echo "Usage: $0 HMS_JDBC_DRIVER HMS_JDBC_URL HMS_JDBC_USER HMS_JDBC_PASSWORD HMS_STORAGE_ACCOUNT HMS_STORAGE_ACCOUNT_SHARED_KEY HMS_STORAGE_ACCOUNT_CONTAINER" + exit 1 +fi + +if [ -z "${USER}" ]; then + echo "ERROR: USER is not defined." 
+ exit 1 +fi + +export HMS_JDBC_DRIVER=$1 +export HMS_JDBC_URL=$2 +export HMS_JDBC_USER=$3 +export HMS_JDBC_PASSWORD=$4 +export HMS_STORAGE_ACCOUNT=$5 +export HMS_STORAGE_ACCOUNT_SHARED_KEY=$6 +export HMS_STORAGE_ACCOUNT_CONTAINER=$7 +export HADOOP_HOME=/home/$USER/hadoop +export HIVE_HOME=/home/$USER/hive + +# Install Hadoop +rm -rf hadoop-3.3.1 +wget -nv -N https://archive.apache.org/dist/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz +tar -xzf hadoop-3.3.1.tar.gz +ln -sf $(pwd)/hadoop-3.3.1 $HADOOP_HOME + +# Install Hive (needed for HMS) +rm -rf apache-hive-2.3.9-bin +wget -nv -N https://downloads.apache.org/hive/hive-2.3.9/apache-hive-2.3.9-bin.tar.gz +tar -xzf apache-hive-2.3.9-bin.tar.gz +ln -sf $(pwd)/apache-hive-2.3.9-bin $HIVE_HOME + +# Configure HMS +envsubst < "hive-site.xml.template" > "$HIVE_HOME/conf/hive-site.xml" + +# Copy Azure dependencies to Hive classpath +cp $HADOOP_HOME/share/hadoop/tools/lib/hadoop-azure* $HIVE_HOME/lib/ + +# Install MSSQL driver +wget -nv -N https://repo1.maven.org/maven2/com/microsoft/sqlserver/mssql-jdbc/6.2.1.jre8/mssql-jdbc-6.2.1.jre8.jar +ln -sf $(pwd)/mssql-jdbc-6.2.1.jre8.jar $HIVE_HOME/lib/mssql-jdbc.jar + +# Push to environment +echo "export HADOOP_HOME=${HADOOP_HOME} +export HIVE_HOME=${HIVE_HOME}" >> env.sh +echo "source $(pwd)/env.sh" >> ~/.bashrc diff --git a/run/trino-420/azure-pipelines/sh/iceberg.properties.template b/run/trino-420/azure-pipelines/sh/iceberg.properties.template new file mode 100644 index 00000000..d29aa613 --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/iceberg.properties.template @@ -0,0 +1,7 @@ +connector.name=iceberg +hive.metastore.uri=thrift://${TRINO_MASTER_HOST}:9083 +hive.azure.abfs-storage-account=${DATA_STORAGE_ACCOUNT} +hive.azure.abfs-access-key=${DATA_STORAGE_ACCOUNT_SHARED_KEY} +iceberg.max-partitions-per-writer=2500 +iceberg.file-format=PARQUET +iceberg.compression-codec=GZIP \ No newline at end of file diff --git a/run/trino-420/azure-pipelines/sh/init.sh b/run/trino-420/azure-pipelines/sh/init.sh new file mode 100755 index 00000000..711cf653 --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/init.sh @@ -0,0 +1,66 @@ +#!/bin/bash -e +if [ "$#" -ne 4 ]; then + echo "Usage: $0 IS_COORDINATOR TRINO_MASTER_HOST DATA_STORAGE_ACCOUNT DATA_STORAGE_ACCOUNT_SHARED_KEY" + exit 1 +fi + +if [ -z "${USER}" ]; then + echo "ERROR: USER is not defined." 
+ exit 1 +fi + +export HOSTNAME=$(hostname) +export IS_COORDINATOR=$1 +export TRINO_MASTER_HOST=$2 +export TRINO_HOME=/home/$USER/trino +export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 +export DATA_STORAGE_ACCOUNT=$3 +export DATA_STORAGE_ACCOUNT_SHARED_KEY=$4 + +# Update dependencies and install packages +sudo apt update -y +sudo apt install -y openjdk-17-jdk python wget + +# Install Trino +rm -rf trino-server-420 +wget -nv -N https://repo1.maven.org/maven2/io/trino/trino-server/420/trino-server-420.tar.gz +tar -xzf trino-server-420.tar.gz +ln -sf $(pwd)/trino-server-420 $TRINO_HOME + +# Configure Trino +sudo mkdir -p /mnt/local_resource/ +sudo mkdir -p /mnt/local_resource/trino_data/ +sudo chown $USER:$USER /mnt/local_resource/trino_data +sudo mkdir -p /mnt/local_resource/trino_tmp/ +sudo chown $USER:$USER /mnt/local_resource/trino_tmp + +sudo mkdir ${TRINO_HOME}/etc +sudo chown $USER:$USER ${TRINO_HOME}/etc/ +envsubst < "node.properties.template" > "$TRINO_HOME/etc/node.properties" +envsubst < "jvm.config.template" > "$TRINO_HOME/etc/jvm.config" +if [ "$IS_COORDINATOR" = true ]; then + envsubst < "coordinator-config.properties.template" > "$TRINO_HOME/etc/config.properties" +elif [ "$IS_COORDINATOR" = false ]; then + envsubst < "worker-config.properties.template" > "$TRINO_HOME/etc/config.properties" +else + echo "IS_COORDINATOR must be either 'true' or 'false'" + exit 1 +fi +envsubst < "log.properties.template" > "$TRINO_HOME/etc/log.properties" + +# Configure Trino connectors +sudo mkdir ${TRINO_HOME}/etc/catalog +sudo chown $USER:$USER ${TRINO_HOME}/etc/catalog/ +envsubst < "hive.properties.template" > "$TRINO_HOME/etc/catalog/hive.properties" +envsubst < "delta.properties.template" > "$TRINO_HOME/etc/catalog/delta.properties" +envsubst < "iceberg.properties.template" > "$TRINO_HOME/etc/catalog/iceberg.properties" + +# Set Linux OS limits required for Trino +echo "trino soft nofile 131072 +trino hard nofile 131072" | sudo tee -a /etc/security/limits.conf + +# Push to environment +echo "export TRINO_HOME=${TRINO_HOME} +export JAVA_HOME=${JAVA_HOME} +export PATH=${PATH}:${TRINO_HOME}/bin" >> env.sh +echo "source $(pwd)/env.sh" >> ~/.bashrc diff --git a/run/trino-420/azure-pipelines/sh/jvm.config.template b/run/trino-420/azure-pipelines/sh/jvm.config.template new file mode 100644 index 00000000..4a852a53 --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/jvm.config.template @@ -0,0 +1,18 @@ +-server +-Xmx54G +-XX:InitialRAMPercentage=80 +-XX:MaxRAMPercentage=80 +-XX:G1HeapRegionSize=32M +-XX:+ExplicitGCInvokesConcurrent +-XX:+ExitOnOutOfMemoryError +-XX:+HeapDumpOnOutOfMemoryError +-XX:-OmitStackTraceInFastThrow +-XX:ReservedCodeCacheSize=512M +-XX:PerMethodRecompilationCutoff=10000 +-XX:PerBytecodeRecompilationCutoff=10000 +-Djdk.attach.allowAttachSelf=true +-Djdk.nio.maxCachedBufferSize=2000000 +-XX:+UnlockDiagnosticVMOptions +-XX:+UseAESCTRIntrinsics +# Disable Preventive GC for performance reasons (JDK-8293861) +-XX:-G1UsePreventiveGC \ No newline at end of file diff --git a/run/trino-420/azure-pipelines/sh/log.properties.template b/run/trino-420/azure-pipelines/sh/log.properties.template new file mode 100644 index 00000000..d253499a --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/log.properties.template @@ -0,0 +1 @@ +io.trino=INFO \ No newline at end of file diff --git a/run/trino-420/azure-pipelines/sh/node.properties.template b/run/trino-420/azure-pipelines/sh/node.properties.template new file mode 100644 index 00000000..a2a65764 --- /dev/null +++ 
b/run/trino-420/azure-pipelines/sh/node.properties.template @@ -0,0 +1,3 @@ +node.environment=production +node.id=$HOSTNAME +node.data-dir=/mnt/local_resource/trino_data \ No newline at end of file diff --git a/run/trino-420/azure-pipelines/sh/start-cluster.sh b/run/trino-420/azure-pipelines/sh/start-cluster.sh new file mode 100755 index 00000000..0ccbf698 --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/start-cluster.sh @@ -0,0 +1,25 @@ +#!/bin/bash -e +source env.sh +if [ -z "${HIVE_HOME}" ]; then + echo "ERROR: HIVE_HOME is not defined." + exit 1 +fi +if [ -z "${TRINO_HOME}" ]; then + echo "ERROR: TRINO_HOME is not defined." + exit 1 +fi +if [ -z "${HOSTS}" ]; then + echo "ERROR: HOSTS is not defined." + exit 1 +fi + +echo "Starting HMS" +cd $HIVE_HOME +./bin/hive --service metastore & + +echo "Starting Trino cluster" +echo "Starting Trino coordinator" +cd $TRINO_HOME +./bin/launcher start +echo "Starting Trino workers" +for node in $HOSTS ; do ssh -t $node "cd ${TRINO_HOME} && ./bin/launcher start" ; done diff --git a/run/trino-420/azure-pipelines/sh/stop-cluster.sh b/run/trino-420/azure-pipelines/sh/stop-cluster.sh new file mode 100755 index 00000000..000acd27 --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/stop-cluster.sh @@ -0,0 +1,20 @@ +#!/bin/bash -e +source env.sh +if [ -z "${HOSTS}" ]; then + echo "ERROR: HOSTS is not defined." + exit 1 +fi +if [ -z "${TRINO_HOME}" ]; then + echo "ERROR: TRINO_HOME is not defined." + exit 1 +fi + +echo "Stopping Trino cluster" +echo "Stopping Trino workers" +for node in $HOSTS ; do ssh -t $node "cd ${TRINO_HOME} && ./bin/launcher stop" ; done +echo "Stopping Trino coordinator" +cd $TRINO_HOME +./bin/launcher stop + +echo "Stopping HMS" +pkill -f "metastore" || true diff --git a/run/trino-420/azure-pipelines/sh/worker-config.properties.template b/run/trino-420/azure-pipelines/sh/worker-config.properties.template new file mode 100644 index 00000000..96a4c6fd --- /dev/null +++ b/run/trino-420/azure-pipelines/sh/worker-config.properties.template @@ -0,0 +1,3 @@ +coordinator=false +http-server.http.port=8080 +discovery.uri=http://$TRINO_MASTER_HOST:8080 \ No newline at end of file diff --git a/run/trino-420/config/tpcds/library.yaml b/run/trino-420/config/tpcds/library.yaml index 38804842..5fd99c01 100644 --- a/run/trino-420/config/tpcds/library.yaml +++ b/run/trino-420/config/tpcds/library.yaml @@ -241,6 +241,7 @@ task_templates: # Execution of optimize on all benchmark tables but splitting optimization # of partitioned tables into batches by relying on dependent task executor - id: optimize_split + custom_task_executor: com.microsoft.lst_bench.task.custom.DependentTaskExecutor files: - run/trino-420/scripts/tpcds/optimize/o_call_center.sql - run/trino-420/scripts/tpcds/optimize/o_catalog_page.sql diff --git a/run/trino-420/config/tpcds/setup_experiment.yaml b/run/trino-420/config/tpcds/setup_experiment.yaml new file mode 100644 index 00000000..d122811f --- /dev/null +++ b/run/trino-420/config/tpcds/setup_experiment.yaml @@ -0,0 +1,32 @@ +# Description: Setup experiment +--- +version: 1 +id: setup_experiment +phases: +- id: setup + sessions: + - tasks: + - template_id: setup +- id: setup_data_maintenance + sessions: + - tasks: + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: 
setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance diff --git a/run/trino-420/config/tpcds/wp1_longevity.yaml b/run/trino-420/config/tpcds/wp1_longevity.yaml index 936169fd..1a200455 100644 --- a/run/trino-420/config/tpcds/wp1_longevity.yaml +++ b/run/trino-420/config/tpcds/wp1_longevity.yaml @@ -3,23 +3,6 @@ version: 1 id: wp1_longevity phases: -- id: setup - sessions: - - tasks: - - template_id: setup -- id: setup_data_maintenance - sessions: - - tasks: - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - id: init sessions: - tasks: diff --git a/run/trino-420/config/tpcds/wp2_resilience.yaml b/run/trino-420/config/tpcds/wp2_resilience.yaml index d95edafe..58b0bd7a 100644 --- a/run/trino-420/config/tpcds/wp2_resilience.yaml +++ b/run/trino-420/config/tpcds/wp2_resilience.yaml @@ -3,25 +3,6 @@ version: 1 id: wp2_resilience phases: -- id: setup - sessions: - - tasks: - - template_id: setup -- id: setup_data_maintenance - sessions: - - tasks: - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - id: init sessions: - tasks: @@ -46,7 +27,9 @@ phases: - id: optimize_1 sessions: - tasks: - - template_id: optimize + - template_id: optimize_split + task_executor_arguments: + dependent_task_batch_size: 100 - id: single_user_2o sessions: - tasks: @@ -65,7 +48,9 @@ phases: - id: optimize_2 sessions: - tasks: - - template_id: optimize + - template_id: optimize_split + task_executor_arguments: + dependent_task_batch_size: 100 - id: single_user_3o sessions: - tasks: @@ -86,7 +71,9 @@ phases: - id: optimize_3 sessions: - tasks: - - template_id: optimize + - template_id: optimize_split + task_executor_arguments: + dependent_task_batch_size: 100 - id: single_user_4o sessions: - tasks: diff --git a/run/trino-420/config/tpcds/wp3_rw_concurrency.yaml b/run/trino-420/config/tpcds/wp3_rw_concurrency.yaml index a3ff60db..26afcbb5 100644 --- a/run/trino-420/config/tpcds/wp3_rw_concurrency.yaml +++ b/run/trino-420/config/tpcds/wp3_rw_concurrency.yaml @@ -3,25 +3,6 @@ version: 1 id: wp3_rw_concurrency phases: -- id: setup - sessions: - - tasks: - - template_id: setup -- id: setup_data_maintenance - sessions: - - tasks: - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: 
setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - id: init sessions: - tasks: @@ -42,7 +23,9 @@ phases: - tasks: - template_id: single_user - tasks: - - template_id: optimize + - template_id: optimize_split + task_executor_arguments: + dependent_task_batch_size: 100 - id: single_user_2o_data_maintenance_2 sessions: - tasks: @@ -57,7 +40,9 @@ phases: - tasks: - template_id: single_user - tasks: - - template_id: optimize + - template_id: optimize_split + task_executor_arguments: + dependent_task_batch_size: 100 - id: single_user_3o_data_maintenance_3 sessions: - tasks: @@ -74,4 +59,6 @@ phases: - tasks: - template_id: single_user - tasks: - - template_id: optimize + - template_id: optimize_split + task_executor_arguments: + dependent_task_batch_size: 100 diff --git a/run/trino-420/results/trino-420-2024-02-01-8xStandard_E8s_v5.duckdb b/run/trino-420/results/trino-420-2024-02-01-8xStandard_E8s_v5.duckdb new file mode 100644 index 00000000..a23b7a21 Binary files /dev/null and b/run/trino-420/results/trino-420-2024-02-01-8xStandard_E8s_v5.duckdb differ diff --git a/src/main/java/com/microsoft/lst_bench/client/QueryResult.java b/src/main/java/com/microsoft/lst_bench/client/QueryResult.java index 5c49f935..540642b0 100644 --- a/src/main/java/com/microsoft/lst_bench/client/QueryResult.java +++ b/src/main/java/com/microsoft/lst_bench/client/QueryResult.java @@ -31,11 +31,13 @@ */ public class QueryResult { + private final Map columnTypes; private final Map> valueList; private static final String RESULT = "Result"; public QueryResult() { + this.columnTypes = new HashMap<>(); this.valueList = new HashMap<>(); } @@ -45,6 +47,7 @@ public void populate(ResultSet rs) throws SQLException { ResultSetMetaData rsmd = rs.getMetaData(); for (int j = 1; j <= rsmd.getColumnCount(); j++) { + columnTypes.put(rsmd.getColumnName(j), rsmd.getColumnType(j)); valueList.put(rsmd.getColumnName(j), new ArrayList<>()); } @@ -65,24 +68,30 @@ public Integer getValueListSize() { } public boolean containsEmptyResultColumnOnly() { - if (valueList.keySet().size() == 1 + return valueList.keySet().size() == 1 && valueList.containsKey(RESULT) - && valueList.get(RESULT).size() == 0) { - return true; - } - return false; + && valueList.get(RESULT).isEmpty(); } public Map getStringMappings(int listMin, int listMax) { Map result = new HashMap<>(); - for (String key : this.valueList.keySet()) { + for (String key : valueList.keySet()) { List localList = - this.valueList.get(key).subList(listMin, listMax).stream() - .map(s -> s.toString()) + valueList.get(key).subList(listMin, listMax).stream() + .map(Object::toString) .collect(Collectors.toUnmodifiableList()); - // TODO: This assumes a VARCHAR type (or implicit casting by the engine), - // we should probably handle it more generically using data types. - result.put(key, "'" + String.join("','", localList) + "'"); + switch (columnTypes.get(key)) { + case java.sql.Types.BIGINT: + case java.sql.Types.INTEGER: + case java.sql.Types.SMALLINT: + case java.sql.Types.TINYINT: + result.put(key, String.join(",", localList)); + break; + default: + // Currently assumes String for all other types. + // TODO: Better handling and testing of data types across engines. + result.put(key, "'" + String.join("','", localList) + "'"); + } } return result; }
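The per-type formatting rule introduced above can be summarized with a small standalone sketch (`TypeFormattingSketch` is a hypothetical illustration, not part of this patch): numeric SQL types are joined bare, while all other types keep the previous quoted-string behavior.

```java
import java.sql.Types;
import java.util.List;

// Hypothetical illustration of the switch added to getStringMappings:
// integer-like columns are emitted as 1,2,3 while other columns are
// emitted as 'a','b', matching the original VARCHAR assumption.
public class TypeFormattingSketch {
  static String format(int sqlType, List<String> values) {
    switch (sqlType) {
      case Types.BIGINT:
      case Types.INTEGER:
      case Types.SMALLINT:
      case Types.TINYINT:
        return String.join(",", values);
      default:
        return "'" + String.join("','", values) + "'";
    }
  }

  public static void main(String[] args) {
    System.out.println(format(Types.INTEGER, List.of("1", "2", "3"))); // 1,2,3
    System.out.println(format(Types.VARCHAR, List.of("a", "b")));      // 'a','b'
  }
}
```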