Skip to content

Commit

Permalink
CAB adapter implementation (#339)
Browse files Browse the repository at this point in the history
  • Loading branch information
jcamachor authored Sep 24, 2024
1 parent 7fee082 commit 53140ad
Show file tree
Hide file tree
Showing 55 changed files with 24,169 additions and 8 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ The core of LST-Bench is organized into two modules:
The Python module performs data processing, analysis, and visualization to facilitate a deeper understanding of the experimental results.

Additionally, the **Adapters** module is designed to handle integration with external tools and systems by converting outputs from third-party benchmarks into formats compatible with LST-Bench.
One example of this is the **CAB to LST-Bench converter**, which transforms the output files generated by the Cloud Analytics Benchmark (CAB) into the input format used by LST-Bench.

### LST-Bench Concepts
In LST-Bench, we utilize specific concepts to define and organize SQL workloads, with a focus on maximizing flexibility and facilitating reusability across various workloads. For detailed information, refer to our [documentation](docs/workloads.md).
Expand Down
15 changes: 14 additions & 1 deletion adapters/cab-converter/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,17 @@ The CAB project is a separate C++ application with its own build process. For pl
Once CAB is built and its output files are generated, you can run the `cab-converter` to transform those files into the format required by LST-Bench.

#### Running the Converter
_TODO: Add more details here._
After building LST-Bench, run `cab-converter.sh` (Linux/macOS) or run `cab-converter.ps1` from a PowerShell prompt (Windows) to display the usage options.

```bash
usage: ./cab-converter.sh [-c <mode>] -d <directory> -o <directory> [-s <boolean>]
-c,--connections-gen-mode <mode> Connection generation mode. Options: 'single' (a single connection for all
streams), 'per-db' (one connection per target database), 'per-stream' (one
connection per stream), 'per-stream-type' (one connection per stream type,
i.e., read/write) (default: 'single')
-d,--cab-streams-dir <directory> Path to the directory containing the query streams generated by CAB-gen
-o,--output-dir <directory> Path to the directory where the output files from the CAB conversion will be
saved
-s,--split-read-write-streams <boolean> Whether to split each input query stream into separate read/write streams
(default: false)
```
6 changes: 6 additions & 0 deletions adapters/cab-converter/cab-converter.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Constants
# Directory containing this script. Mirrors cab-converter.sh, which resolves the
# script's own location; Get-Location would instead use the caller's current
# directory and break the classpath when invoked from elsewhere.
$CAB_CONVERTER_HOME = $PSScriptRoot
# Classpath for the converter. Fixed: the third entry previously referenced the
# undefined variable $CAB_CONVERTER (missing the _HOME suffix), which expanded
# to an empty string and dropped target\classes from the classpath.
$CAB_CONVERTER_CLASSPATH = "$CAB_CONVERTER_HOME\target\*;$CAB_CONVERTER_HOME\target\lib\*;$CAB_CONVERTER_HOME\target\classes\*"

# Run Java command, forwarding all script arguments to the converter driver.
java -cp $CAB_CONVERTER_CLASSPATH com.microsoft.lst_bench.cab_converter.Driver $args
8 changes: 8 additions & 0 deletions adapters/cab-converter/cab-converter.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash -e

# Constants
# Directory of the script (resolved through symlinks), so the classpath is
# correct regardless of the caller's current working directory.
export CAB_CONVERTER_HOME="$(dirname "$(readlink -f "$0")")"
# The '*' wildcard entries must reach the JVM literally: Java expands them
# itself. Note each entry is colon-separated.
CAB_CONVERTER_CLASSPATH="$CAB_CONVERTER_HOME/target/*:$CAB_CONVERTER_HOME/target/lib/*:$CAB_CONVERTER_HOME/target/classes/*"

# Run the converter, forwarding all script arguments. The classpath is quoted
# so the shell cannot word-split or glob-expand it into multiple arguments.
java -cp "${CAB_CONVERTER_CLASSPATH}" com.microsoft.lst_bench.cab_converter.Driver "$@"
52 changes: 47 additions & 5 deletions adapters/cab-converter/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,60 @@

<dependencies>
  <!-- Core LST-Bench model and I/O classes reused by the converter.
       Versions for all managed dependencies come from the parent POM. -->
  <dependency>
    <groupId>com.microsoft.lst-bench</groupId>
    <artifactId>lst-bench-core</artifactId>
  </dependency>

  <dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
  </dependency>
  <!-- Command-line option parsing for the Driver entry point. -->
  <dependency>
    <groupId>commons-cli</groupId>
    <artifactId>commons-cli</artifactId>
  </dependency>
  <!-- Logging: log4j2 API + core, with the slf4j binding so slf4j-based
       libraries route through log4j2 as well. -->
  <dependency>
    <groupId>org.apache.logging.log4j</groupId>
    <artifactId>log4j-api</artifactId>
  </dependency>
  <dependency>
    <groupId>org.apache.logging.log4j</groupId>
    <artifactId>log4j-core</artifactId>
  </dependency>
  <dependency>
    <groupId>org.apache.logging.log4j</groupId>
    <artifactId>log4j-slf4j-impl</artifactId>
  </dependency>
  <!-- YAML serialization of the generated LST-Bench input files. -->
  <dependency>
    <groupId>com.fasterxml.jackson.dataformat</groupId>
    <artifactId>jackson-dataformat-yaml</artifactId>
  </dependency>

  <dependency>
    <groupId>org.junit.jupiter</groupId>
    <artifactId>junit-jupiter</artifactId>
    <scope>test</scope>
  </dependency>
</dependencies>

<build>
  <plugins>
    <!-- Copy runtime dependencies into target/lib at package time so the
         launcher scripts (cab-converter.sh / cab-converter.ps1) can assemble
         the classpath from target/*, target/lib/*, and target/classes/*. -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-dependency-plugin</artifactId>
      <version>${maven-dependency-plugin.version}</version>
      <executions>
        <execution>
          <phase>package</phase>
          <goals>
            <goal>copy-dependencies</goal>
          </goals>
          <configuration>
            <!-- Provided-scope artifacts are supplied by the runtime; do not bundle them. -->
            <excludeScope>provided</excludeScope>
            <outputDirectory>${project.build.directory}/lib</outputDirectory>
          </configuration>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>
Expand Down
191 changes: 191 additions & 0 deletions adapters/cab-converter/sql/spark-3.3.1/build/build.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
-- Create the target schema, then (re)build the TPC-H-style customer table
-- and load it from the external source tables.
CREATE SCHEMA IF NOT EXISTS ${catalog}.${database};

DROP TABLE IF EXISTS ${catalog}.${database}.customer;
CREATE TABLE ${catalog}.${database}.customer (
    c_custkey    BIGINT,
    c_name       VARCHAR(25),
    c_address    VARCHAR(40),
    c_nationkey  BIGINT,
    c_phone      CHAR(15),
    c_acctbal    DECIMAL,
    c_comment    VARCHAR(117),
    c_mktsegment CHAR(10)
)
USING ${table_format}
OPTIONS (PATH '${data_path}${experiment_start_time}/${repetition}/customer/')
TBLPROPERTIES ('primaryKey' = 'c_custkey' ${tblproperties_suffix});

INSERT INTO ${catalog}.${database}.customer
SELECT *
FROM ${external_catalog}.${external_database}.customer;

-- (Re)build the lineitem table, partitioned by ship date, and load it
-- from the external source.
DROP TABLE IF EXISTS ${catalog}.${database}.lineitem;
CREATE TABLE ${catalog}.${database}.lineitem (
    l_orderkey      BIGINT,
    l_partkey       BIGINT,
    l_suppkey       BIGINT,
    l_linenumber    INT,
    l_quantity      DECIMAL,
    l_extendedprice DECIMAL,
    l_discount      DECIMAL,
    l_tax           DECIMAL,
    l_returnflag    CHAR(1),
    l_linestatus    CHAR(1),
    l_commitdate    DATE,
    l_receiptdate   DATE,
    l_shipinstruct  CHAR(25),
    l_shipmode      CHAR(10),
    l_comment       VARCHAR(44),
    l_shipdate      DATE
)
USING ${table_format}
OPTIONS (PATH '${data_path}${experiment_start_time}/${repetition}/lineitem/')
PARTITIONED BY (l_shipdate)
TBLPROPERTIES ('primaryKey' = 'l_orderkey,l_linenumber' ${tblproperties_suffix});

INSERT INTO ${catalog}.${database}.lineitem
SELECT *
FROM ${external_catalog}.${external_database}.lineitem;

-- (Re)build the nation dimension table and load it from the external source.
DROP TABLE IF EXISTS ${catalog}.${database}.nation;
CREATE TABLE ${catalog}.${database}.nation (
    n_nationkey BIGINT,
    n_name      CHAR(25),
    n_regionkey BIGINT,
    n_comment   VARCHAR(152)
)
USING ${table_format}
OPTIONS (PATH '${data_path}${experiment_start_time}/${repetition}/nation/')
TBLPROPERTIES ('primaryKey' = 'n_nationkey' ${tblproperties_suffix});

INSERT INTO ${catalog}.${database}.nation
SELECT *
FROM ${external_catalog}.${external_database}.nation;

-- (Re)build the orders table, partitioned by order date, and load it
-- from the external source.
DROP TABLE IF EXISTS ${catalog}.${database}.orders;
CREATE TABLE ${catalog}.${database}.orders (
    o_orderkey      BIGINT,
    o_custkey       BIGINT,
    o_orderstatus   CHAR(1),
    o_totalprice    DECIMAL,
    o_orderpriority CHAR(15),
    o_clerk         CHAR(15),
    o_shippriority  INT,
    o_comment       VARCHAR(79),
    o_orderdate     DATE
)
USING ${table_format}
OPTIONS (PATH '${data_path}${experiment_start_time}/${repetition}/orders/')
PARTITIONED BY (o_orderdate)
TBLPROPERTIES ('primaryKey' = 'o_orderkey' ${tblproperties_suffix});

INSERT INTO ${catalog}.${database}.orders
SELECT *
FROM ${external_catalog}.${external_database}.orders;

-- (Re)build the part table and load it from the external source.
DROP TABLE IF EXISTS ${catalog}.${database}.part;
CREATE TABLE ${catalog}.${database}.part (
    p_partkey     BIGINT,
    p_name        VARCHAR(55),
    p_mfgr        CHAR(25),
    p_type        VARCHAR(25),
    p_size        INT,
    p_container   CHAR(10),
    p_retailprice DECIMAL,
    p_comment     VARCHAR(23),
    p_brand       CHAR(10)
)
USING ${table_format}
OPTIONS (PATH '${data_path}${experiment_start_time}/${repetition}/part/')
TBLPROPERTIES ('primaryKey' = 'p_partkey' ${tblproperties_suffix});

INSERT INTO ${catalog}.${database}.part
SELECT *
FROM ${external_catalog}.${external_database}.part;

-- (Re)build the partsupp association table and load it from the external source.
DROP TABLE IF EXISTS ${catalog}.${database}.partsupp;
CREATE TABLE ${catalog}.${database}.partsupp (
    ps_partkey    BIGINT,
    ps_suppkey    BIGINT,
    ps_availqty   INT,
    ps_supplycost DECIMAL,
    ps_comment    VARCHAR(199)
)
USING ${table_format}
OPTIONS (PATH '${data_path}${experiment_start_time}/${repetition}/partsupp/')
TBLPROPERTIES ('primaryKey' = 'ps_partkey,ps_suppkey' ${tblproperties_suffix});

INSERT INTO ${catalog}.${database}.partsupp
SELECT *
FROM ${external_catalog}.${external_database}.partsupp;

-- (Re)build the region dimension table and load it from the external source.
DROP TABLE IF EXISTS ${catalog}.${database}.region;
CREATE TABLE ${catalog}.${database}.region (
    r_regionkey BIGINT,
    r_name      CHAR(25),
    r_comment   VARCHAR(152)
)
USING ${table_format}
OPTIONS (PATH '${data_path}${experiment_start_time}/${repetition}/region/')
TBLPROPERTIES ('primaryKey' = 'r_regionkey' ${tblproperties_suffix});

INSERT INTO ${catalog}.${database}.region
SELECT *
FROM ${external_catalog}.${external_database}.region;

-- (Re)build the supplier table and load it from the external source.
DROP TABLE IF EXISTS ${catalog}.${database}.supplier;
CREATE TABLE ${catalog}.${database}.supplier (
    s_suppkey   BIGINT,
    s_name      CHAR(25),
    s_address   VARCHAR(40),
    s_nationkey BIGINT,
    s_phone     CHAR(15),
    s_acctbal   DECIMAL,
    s_comment   VARCHAR(101)
)
USING ${table_format}
OPTIONS (PATH '${data_path}${experiment_start_time}/${repetition}/supplier/')
TBLPROPERTIES ('primaryKey' = 's_suppkey' ${tblproperties_suffix});

INSERT INTO ${catalog}.${database}.supplier
SELECT *
FROM ${external_catalog}.${external_database}.supplier;
21 changes: 21 additions & 0 deletions adapters/cab-converter/sql/spark-3.3.1/run/query_1.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- TPC-H style Query 1: pricing summary report over line items shipped on or
-- before the cutoff date (1998-12-01 minus ${param1} days).
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM ${catalog}.${database}.lineitem
WHERE l_shipdate <= DATE '1998-12-01' - INTERVAL '${param1}' DAY
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
32 changes: 32 additions & 0 deletions adapters/cab-converter/sql/spark-3.3.1/run/query_10.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
-- TPC-H style Query 10: top 20 customers by revenue lost on returned items
-- ('R' return flag) for orders placed in the quarter starting at ${param1}.
-- Comma joins rewritten as explicit INNER JOINs; semantics unchanged.
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM ${catalog}.${database}.customer
INNER JOIN ${catalog}.${database}.orders
    ON c_custkey = o_custkey
INNER JOIN ${catalog}.${database}.lineitem
    ON l_orderkey = o_orderkey
INNER JOIN ${catalog}.${database}.nation
    ON c_nationkey = n_nationkey
WHERE o_orderdate >= DATE '${param1}'
    AND o_orderdate < DATE '${param1}' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC
LIMIT 20;
26 changes: 26 additions & 0 deletions adapters/cab-converter/sql/spark-3.3.1/run/query_11.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- TPC-H style Query 11: part/supplier stock values in nation ${param1} that
-- represent a significant share of total stock value (threshold scaled by
-- ${param2}). Comma joins rewritten as explicit INNER JOINs; keyword case
-- made consistent; semantics unchanged.
SELECT
    ps_partkey,
    SUM(ps_supplycost * ps_availqty) AS value
FROM ${catalog}.${database}.partsupp
INNER JOIN ${catalog}.${database}.supplier
    ON ps_suppkey = s_suppkey
INNER JOIN ${catalog}.${database}.nation
    ON s_nationkey = n_nationkey
WHERE n_name = '${param1}'
GROUP BY
    ps_partkey
HAVING SUM(ps_supplycost * ps_availqty) > (
    SELECT SUM(ps_supplycost * ps_availqty) * 0.0001 / ${param2}
    FROM ${catalog}.${database}.partsupp
    INNER JOIN ${catalog}.${database}.supplier
        ON ps_suppkey = s_suppkey
    INNER JOIN ${catalog}.${database}.nation
        ON s_nationkey = n_nationkey
    WHERE n_name = '${param1}'
)
ORDER BY
    value DESC;
Loading

0 comments on commit 53140ad

Please sign in to comment.