From f7c0f9f5363a442aa0e8fa77ce026eefff320d69 Mon Sep 17 00:00:00 2001
From: Tiemo Bang <tbang@berkeley.edu>
Date: Fri, 29 Mar 2024 07:42:39 -0700
Subject: [PATCH 1/3] Augment file parser to handle incorrect file paths

---
 .../microsoft/lst_bench/util/FileParser.java  | 21 ++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 src/main/java/com/microsoft/lst_bench/util/FileParser.java
diff --git a/src/main/java/com/microsoft/lst_bench/util/FileParser.java b/src/main/java/com/microsoft/lst_bench/util/FileParser.java
old mode 100644
new mode 100755
index 825a10a1..cf93223f
--- a/src/main/java/com/microsoft/lst_bench/util/FileParser.java
+++ b/src/main/java/com/microsoft/lst_bench/util/FileParser.java
@@ -33,7 +33,6 @@
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
-import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -172,13 +171,29 @@ public static TelemetryConfig loadTelemetryConfig(String filePath) throws IOExce
    */
   private static <T> T createObject(String filePath, Class<T> objectType, String schemaFilePath)
       throws IOException {
-    String resolvedYAMLContent = StringUtils.replaceEnvVars(new File(filePath));
+
+    // Verify that files exist
+    File file = new File(filePath);
+    File schemaFile = new File(schemaFilePath);
+    if (!file.exists()) {
+      throw new IllegalArgumentException("File does not exist: " + filePath);
+    }
+    if (!schemaFile.exists()) {
+      throw new IllegalArgumentException("Schema file does not exist: " + schemaFilePath);
+    }
+
+    String resolvedYAMLContent = StringUtils.replaceEnvVars(file);
+
+    if (resolvedYAMLContent == null) {
+      throw new IllegalArgumentException("Error resolving environment variables in YAML file");
+    }
+
     // Validate YAML file contents
     JsonSchemaFactory factory =
         JsonSchemaFactory.builder(JsonSchemaFactory.getInstance(SpecVersion.VersionFlag.V202012))
             .objectMapper(YAML_MAPPER)
             .build();
-    JsonSchema schema = factory.getSchema(Files.newInputStream(Paths.get(schemaFilePath)));
+    JsonSchema schema = factory.getSchema(Files.newInputStream(schemaFile.toPath()));
     JsonNode jsonNodeDirect = YAML_MAPPER.readTree(resolvedYAMLContent);
     Set<ValidationMessage> errorsFromFile = schema.validate(jsonNodeDirect);
     if (!errorsFromFile.isEmpty()) {

From 6f56b11718aa98c0e10324553e67b2041783dacd Mon Sep 17 00:00:00 2001
From: Tiemo Bang <tbang@berkeley.edu>
Date: Mon, 1 Apr 2024 18:56:13 -0700
Subject: [PATCH 2/3] Fix: Null handing and custom result paths for dashboard

---
 .gitignore            |   3 +
 metrics/app/README.md |   3 +
 metrics/app/main.py   | 391 ++++++++++++++++++++++++------------------
 metrics/app/utils.py  |  18 ++
 4 files changed, 246 insertions(+), 169 deletions(-)
 mode change 100644 => 100755 .gitignore
 mode change 100644 => 100755 metrics/app/README.md
 mode change 100644 => 100755 metrics/app/main.py
 mode change 100644 => 100755 metrics/app/utils.py

diff --git a/.gitignore b/.gitignore
old mode 100644
new mode 100755
index bee554f5..d7e1e172
--- a/.gitignore
+++ b/.gitignore
@@ -57,6 +57,9 @@ bin/
 # Local configuration file (sdk path, etc)
 local.properties
 
+# Python
+*.pyc
+
 # Others
 *~
 .DS_Store
diff --git a/metrics/app/README.md b/metrics/app/README.md
old mode 100644
new mode 100755
index e61e059c..86884783
--- a/metrics/app/README.md
+++ b/metrics/app/README.md
@@ -36,6 +36,8 @@ To include data from a new system, duplicate one of the directories in the [run
 For a deeper understanding of the directory structure, consult the [README file](/run/README.md). 
 The LST-Bench dashboard web app automatically retrieves results from the .duckdb files within those folders and displays them on the dashboard.
 
+Alternatively, you can provide your own paths to search for results via commandline arguments, see below.
+
 ## Dashboard Development
 To run the LST-Bench dashboard locally and test your changes, follow these steps:
 
@@ -67,6 +69,7 @@ With the dependencies installed, you can now start the Streamlit app by running
 
 ```bash
 python -m streamlit run main.py
+python -m streamlit run main.py -- --result_dirs DIR1 DIR2 ...
 ```
 
 This command will launch the LST-Bench dashboard locally in your browser.
diff --git a/metrics/app/main.py b/metrics/app/main.py
old mode 100644
new mode 100755
index aba25d25..654e4ddb
--- a/metrics/app/main.py
+++ b/metrics/app/main.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import argparse
+from typing import List
 import altair as alt
 import collections
 import duckdb
@@ -23,33 +25,49 @@
 
 
 @st.cache_resource
-def get_connection():
+def get_connection(*, result_dirs: List[str] = None):
+    # Either search for results in provided direcotries
+    # or use default assuming that that the CWD is the location of this script
+    result_dirs = result_dirs or ["./", "../../run/"]
+
     connection = duckdb.connect()
     # Get databases and attach them
     databases_list = []
 
     # Function to recursively find DuckDB files in a directory
     def find_duckdb_files(directory: str) -> collections.abc.Iterator[str]:
+        # Warning if the directory does not exist
+        if not os.path.exists(directory):
+            st.warning(f"Directory '{directory}' does not exist.")
+            return
+        
+        if os.path.isfile(directory) and directory.endswith('.duckdb'):
+            yield directory
+            return
+
         for root, dirs, files in os.walk(directory):
             for file in files:
                 if file.endswith('.duckdb'):
                     yield os.path.join(root, file)
 
-    # Combine the results of the current directory (used when deployed)
-    # and find_duckdb_files('../../run/') (used when developing)
-    for database_path in list(find_duckdb_files('./')) + list(find_duckdb_files('../../run/')):
-        database = os.path.basename(database_path)[:-3]
-        connection.execute(f"ATTACH DATABASE '{database_path}' AS \"{database}\" (READ_ONLY)")
-        databases_list.append(database)
+    # Combine the results from all directories
+    for result_dir in result_dirs:
+        for database_path in find_duckdb_files(result_dir):
+            database = os.path.basename(database_path)[:-3]
+            connection.execute(f"ATTACH DATABASE '{database_path}' AS \"{database}\" (READ_ONLY)")
+            databases_list.append(database)
+
+    if not databases_list:
+        st.error("No DuckDB files found in the provided directories.")
+        st.stop()
     # Create view encompassing all experiments
     union_sql = " UNION ".join([f"SELECT * FROM \"{database}\".experiment_telemetry" for database in databases_list])
     connection.execute(f"CREATE VIEW combined_experiment_telemetry AS {union_sql}")
     return connection
 
-
 @st.cache_data
-def get_systems():
-    connection = get_connection()
+def get_systems(*, result_dirs: List[str] = None):
+    connection = get_connection(result_dirs=result_dirs)
     df = connection.execute(
         f"""
         SELECT DISTINCT concat_ws('-', json(event_data)->>'system', json(event_data)->>'system_version') AS system 
@@ -58,12 +76,14 @@ def get_systems():
         ORDER BY system ASC;
         """
     ).df()
+    # Replace None with Pandas NA
+    df.fillna("N/A")
     return df['system']
 
 
 @st.cache_data
-def get_table_formats():
-    connection = get_connection()
+def get_table_formats(*, result_dirs: List[str] = None):
+    connection = get_connection(result_dirs=result_dirs)
     df = connection.execute(
         f"""
         SELECT DISTINCT concat_ws('-', json(event_data)->>'table_format', json(event_data)->>'table_format_version') AS table_format 
@@ -72,12 +92,14 @@ def get_table_formats():
         ORDER BY table_format ASC;
         """
     ).df()
+    # Replace None with Pandas NA
+    df.fillna("N/A", inplace=True)
     return df['table_format']
 
 
 @st.cache_data
-def get_modes():
-    connection = get_connection()
+def get_modes(*, result_dirs: List[str] = None):
+    connection = get_connection(result_dirs=result_dirs)
     df = connection.execute(
         f"""
         SELECT DISTINCT json(event_data)->>'mode' AS mode 
@@ -86,12 +108,14 @@ def get_modes():
         ORDER BY mode ASC;
         """
     ).df()
+    # Replace None with Pandas NA
+    df.fillna("N/A", inplace=True)
     return df['mode']
 
 
 @st.cache_data
-def get_cluster_sizes():
-    connection = get_connection()
+def get_cluster_sizes(*, result_dirs: List[str] = None):
+    connection = get_connection(result_dirs=result_dirs)
     df = connection.execute(
         f"""
         SELECT DISTINCT json(event_data)->>'cluster_size' AS cluster_size 
@@ -100,12 +124,14 @@ def get_cluster_sizes():
         ORDER BY cluster_size ASC;
         """
     ).df()
+    # Replace None with Pandas NA
+    df.fillna("N/A", inplace=True)
     return df['cluster_size']
 
 
 @st.cache_data
-def get_machines():
-    connection = get_connection()
+def get_machines(*, result_dirs: List[str] = None):
+    connection = get_connection(result_dirs=result_dirs)
     df = connection.execute(
         f"""
         SELECT DISTINCT json(event_data)->>'machine' AS machine 
@@ -114,12 +140,14 @@ def get_machines():
         ORDER BY machine ASC;
         """
     ).df()
+    # Replace None with Pandas NA
+    df.fillna("N/A", inplace=True)
     return df['machine']
 
 
 @st.cache_data
-def get_workloads():
-    connection = get_connection()
+def get_workloads(*, result_dirs: List[str] = None):
+    connection = get_connection(result_dirs=result_dirs)
     df = connection.execute(
         f"""
         SELECT DISTINCT event_id AS workload 
@@ -128,12 +156,14 @@ def get_workloads():
         ORDER BY workload ASC;
         """
     ).df()
+    # Replace None with Pandas NA
+    df['workload'] = df['workload'].replace('None', "N/A")
     return df['workload']
 
 
 @st.cache_data
-def get_scale_factors():
-    connection = get_connection()
+def get_scale_factors(*, result_dirs: List[str] = None):
+    connection = get_connection(result_dirs=result_dirs)
     df = connection.execute(
         f"""
         SELECT DISTINCT json(event_data)->>'scale_factor' AS scale_factor 
@@ -142,6 +172,8 @@ def get_scale_factors():
         ORDER BY scale_factor ASC;
         """
     ).df()
+    # Replace None with Pandas NA
+    df.fillna("N/A", inplace=True)
     return df['scale_factor']
 
 
@@ -152,8 +184,10 @@ def get_experiments_selected(
         _modes_selected: list[str],
         _cluster_sizes_selected: list[str],
         _machines_selected: list[str],
-        _scale_factors_selected: list[str]) -> pd.DataFrame:
-    connection = get_connection()
+        _scale_factors_selected: list[str],
+        *, result_dirs: List[str] = None) -> pd.DataFrame:
+    connection = get_connection(result_dirs=result_dirs)
+   
     df = connection.execute(
         f"""
         SELECT run_id, event_start_time, event_end_time, event_id, 
@@ -165,23 +199,32 @@ def get_experiments_selected(
                cast(json(event_data)->>'scale_factor' AS VARCHAR) AS scale_factor 
         FROM combined_experiment_telemetry 
         WHERE event_type = 'EXEC_EXPERIMENT' AND event_status='SUCCESS' AND event_id = '{_workload_selected}'
-              AND concat_ws('-', json(event_data)->>'system', json(event_data)->>'system_version') IN ({', '.join(["'" + system + "'" for system in _systems_selected])}) 
-              AND concat_ws('-', json(event_data)->>'table_format', json(event_data)->>'table_format_version') IN ({', '.join(["'" + table_format + "'" for table_format in _table_formats_selected])}) 
-              AND cast(json(event_data)->>'mode' AS VARCHAR) IN ({', '.join(["'" + mode + "'" for mode in _modes_selected])}) 
-              AND cast(json(event_data)->>'cluster_size' AS VARCHAR) IN ({', '.join(["'" + cluster_size + "'" for cluster_size in _cluster_sizes_selected])}) 
-              AND cast(json(event_data)->>'machine' AS VARCHAR) IN ({', '.join(["'" + machine + "'" for machine in _machines_selected])}) 
-              AND cast(json(event_data)->>'scale_factor' AS VARCHAR) IN ({', '.join(["'" + scale_factor + "'" for scale_factor in _scale_factors_selected])}) 
+        AND {utils.generate_sql_in_with_null('system', _systems_selected)}
+        AND {utils.generate_sql_in_with_null('table_format', _table_formats_selected)}
+        AND {utils.generate_sql_in_with_null('mode', _modes_selected)}
+        AND {utils.generate_sql_in_with_null('cluster_size', _cluster_sizes_selected)}
+        AND {utils.generate_sql_in_with_null('machine', _machines_selected)}
+        AND {utils.generate_sql_in_with_null('scale_factor', _scale_factors_selected)}
         ORDER BY cast(event_start_time AS TIMESTAMP) ASC;
         """
     ).df()
+    df.fillna("N/A", inplace=True)
     logging.debug(df)
+    if len(df) == 0:
+        st.error("No data found for the selected dimensions.")
+        st.stop()
     return df
-
+    #return df_unfiltered
 
 @st.cache_data
-def get_experiments_data(experiments_df: pd.DataFrame, target_granularity: str) -> pd.DataFrame:
-    connection = get_connection()
+def get_experiments_data(experiments_df: pd.DataFrame, target_granularity: str,
+                         *, result_dirs: List[str] = None) -> pd.DataFrame:
+    connection = get_connection(result_dirs=result_dirs)
     df = experiments_df
+    if len(df) == 0:
+        st.error("Empty experiments data.")
+        st.stop()
+
     granularities = {
         'phase': 'EXEC_PHASE',
         'session': 'EXEC_SESSION',
@@ -212,6 +255,8 @@ def get_experiments_data(experiments_df: pd.DataFrame, target_granularity: str)
         df = new_experiments_data_df
         if granularity == target_granularity:
             break
+    # Replace None with Pandas NA
+    df.fillna("N/A", inplace=True)
     logging.debug(df)
     df['configuration'] = df.apply(
         lambda row: (row['system'] + ", "
@@ -225,146 +270,154 @@ def get_experiments_data(experiments_df: pd.DataFrame, target_granularity: str)
         axis=1)
     return df
 
+def run(*, result_dirs: List[str] = None):
+    st.set_page_config(
+        page_title="LST-Bench - Dashboard",
+        page_icon=":bar_chart:",
+        layout="wide")
+    st.title('LST-Bench - Dashboard')
+    st.write("[Project Page](https://github.com/microsoft/lst-bench/) | "
+            "[Technical Report](https://arxiv.org/abs/2305.01120) | "
+            "[Evaluation](https://github.com/microsoft/lst-bench/tree/main/metrics/app#evaluation) | "
+            "[Adding a New Result](https://github.com/microsoft/lst-bench/tree/main/metrics/app#adding-a-new-result)")
 
-st.set_page_config(
-    page_title="LST-Bench - Dashboard",
-    page_icon=":bar_chart:",
-    layout="wide")
-st.title('LST-Bench - Dashboard')
-st.write("[Project Page](https://github.com/microsoft/lst-bench/) | "
-         "[Technical Report](https://arxiv.org/abs/2305.01120) | "
-         "[Evaluation](https://github.com/microsoft/lst-bench/tree/main/metrics/app#evaluation) | "
-         "[Adding a New Result](https://github.com/microsoft/lst-bench/tree/main/metrics/app#adding-a-new-result)")
-
-workloads = get_workloads()
-workload_selected = st.sidebar.selectbox('Workload', workloads, index=0)
-
-systems = get_systems()
-systems_selected = st.sidebar.multiselect('System', systems, default=systems)
-
-table_formats = get_table_formats()
-table_formats_selected = st.sidebar.multiselect('Table Format', table_formats, default=table_formats)
-
-modes = get_modes()
-modes_selected = st.sidebar.multiselect('Mode', modes, default=modes)
-
-cluster_sizes = get_cluster_sizes()
-cluster_sizes_selected = st.sidebar.multiselect('Cluster Size', cluster_sizes, default=cluster_sizes)
-
-machines = get_machines()
-machines_selected = st.sidebar.multiselect('Machine', machines, default=machines)
-
-scale_factors = get_scale_factors()
-scale_factors_selected = st.sidebar.multiselect('Scale Factor', scale_factors, default=scale_factors)
-
-# Bail out if any of the dimensions if empty
-if any(len(arr) == 0 for arr in [systems_selected, table_formats_selected,
-                                 modes_selected, cluster_sizes_selected,
-                                 machines_selected, scale_factors_selected]):
-    st.error("Please ensure you have selected at least one option for each dimension.")
-    st.stop()
-
-# Create tabs for current selection
-exec_time_tab = None  # This tab shows execution time.
-performance_degradation_tab = None  # This tab shows degradation rate.
-# TODO
-io_tab = None  # This tab will show I/O metrics, such as bytes read/written.
-io_api_calls_tab = None  # This tab will show I/O API call metrics.
-cpu_utilization_tab = None  # This tab will show CPU utilization metrics.
-
-if workload_selected == 'wp1_longevity':
-    exec_time_tab, performance_degradation_tab = st.tabs(['Execution Time', 'Performance Degradation'])
-else:
-    exec_time_tab = st.tabs(['Execution Time'])[0]
-
-if exec_time_tab is not None:
-    granularity_selected = exec_time_tab.radio(
-        'Granularity:',
-        ['phase', 'session', 'task', 'file'],
-        horizontal=True)
-    regex = exec_time_tab.text_input('Filter Results:', placeholder='Regular Expression (Regex)')
-
-    # --- Data manipulations --- #
-    experiments_selected_df = get_experiments_selected(workload_selected, systems_selected, table_formats_selected,
-                                                       modes_selected, cluster_sizes_selected, machines_selected,
-                                                       scale_factors_selected)
-    experiments_data_df = get_experiments_data(experiments_selected_df, granularity_selected)
-    experiments_data_df = experiments_data_df[experiments_data_df['event_id'].str.contains(regex, regex=True)]
-
-    if len(experiments_data_df) > 3000:
-        st.error(
-            "Too many rows in the result. "
-            "Please refine your dimension selection or apply a regex filter to narrow down the results.")
+    workloads = get_workloads(result_dirs=result_dirs)
+    workload_selected = st.sidebar.selectbox('Workload', workloads, index=0)
+
+    systems = get_systems(result_dirs=result_dirs)
+    systems_selected = st.sidebar.multiselect('System', systems, default=systems)
+
+    table_formats = get_table_formats(result_dirs=result_dirs)
+    table_formats_selected = st.sidebar.multiselect('Table Format', table_formats, default=table_formats)
+
+    modes = get_modes(result_dirs=result_dirs)
+    modes_selected = st.sidebar.multiselect('Mode', modes, default=modes)
+
+    cluster_sizes = get_cluster_sizes(result_dirs=result_dirs)
+    cluster_sizes_selected = st.sidebar.multiselect('Cluster Size', cluster_sizes, default=cluster_sizes)
+
+    machines = get_machines(result_dirs=result_dirs)
+    machines_selected = st.sidebar.multiselect('Machine', machines, default=machines)
+
+    scale_factors = get_scale_factors(result_dirs=result_dirs)
+    scale_factors_selected = st.sidebar.multiselect('Scale Factor', scale_factors, default=scale_factors)
+
+    # Bail out if any of the dimensions if empty
+    if any(len(arr) == 0 for arr in [systems_selected, table_formats_selected,
+                                    modes_selected, cluster_sizes_selected,
+                                    machines_selected, scale_factors_selected]):
+        st.error("Please ensure you have selected at least one option for each dimension.")
         st.stop()
 
-    # --- Plot the data --- #
-    chart = (
-        alt.Chart(experiments_data_df)
-        .mark_bar()
-        .encode(
-            alt.X("configuration:N", axis=None, title='Configuration', stack=None),
-            alt.Y("time_diff_in_mins:Q", title='Latency (mins)', axis=alt.Axis(titleFontWeight='bold')),
-            alt.Color("configuration:N", legend=alt.Legend(titleFontWeight='bold', labelLimit=400),
-                      title='Configuration'),
-            alt.Column("event_id:N", title="",
-                       header=alt.Header(orient='bottom', labelFontWeight='bold', labelAlign='right',
-                                         labelAngle=-45, labelPadding=20),
-                       sort=alt.SortField("event_start_time", order="ascending"))
-        )
-        .configure_range(
-            category={'scheme': 'dark2'}
-        )
-    )
-    exec_time_tab.markdown('#')
-    exec_time_tab.altair_chart(chart, theme=None)
-
-if performance_degradation_tab is not None:
-    # --- Data manipulations --- #
-    experiments_selected_df = get_experiments_selected(workload_selected, systems_selected, table_formats_selected,
-                                                       modes_selected, cluster_sizes_selected, machines_selected,
-                                                       scale_factors_selected)
-    experiments_data_df = get_experiments_data(experiments_selected_df, 'phase')
-    # Filter rows with event_id following the format <name>_<numeric>
-    experiments_data_df = experiments_data_df[experiments_data_df['event_id'].str.match(r'^.+_\d+$')]
-    # Extract name part from event_id
-    experiments_data_df['phase_type'] = experiments_data_df['event_id'].str.extract(r'^(.+)_\d+$')
-    # Group by each distinct 'configuration' and 'phase_type'
-    grouped_df = experiments_data_df.groupby(['configuration', 'phase_type'])
-    # Compute performance degradation
-    grouped_df = grouped_df['time_diff_in_mins'].agg(performance_degradation_rate=utils.performance_degradation)
-    grouped_df = grouped_df.reset_index()
-
-    # --- Plot the data --- #
-    # X axis: phase type
-    # Y axis: configuration
-    # score: degradation rate
-    base = (
-        alt.Chart(grouped_df)
-        .encode(
-            alt.X("phase_type:N", title='', axis=alt.Axis(labelFontWeight='bold', labelAngle=-45)),
-            alt.Y("configuration:N", title='Configuration',
-                  axis=alt.Axis(titleFontWeight='bold', maxExtent=430, labelLimit=400))
+    # Create tabs for current selection
+    exec_time_tab = None  # This tab shows execution time.
+    performance_degradation_tab = None  # This tab shows degradation rate.
+    # TODO
+    io_tab = None  # This tab will show I/O metrics, such as bytes read/written.
+    io_api_calls_tab = None  # This tab will show I/O API call metrics.
+    cpu_utilization_tab = None  # This tab will show CPU utilization metrics.
+
+    if workload_selected == 'wp1_longevity':
+        exec_time_tab, performance_degradation_tab = st.tabs(['Execution Time', 'Performance Degradation'])
+    else:
+        exec_time_tab = st.tabs(['Execution Time'])[0]
+
+    if exec_time_tab is not None:
+        granularity_selected = exec_time_tab.radio(
+            'Granularity:',
+            ['phase', 'session', 'task', 'file'],
+            horizontal=True)
+        regex = exec_time_tab.text_input('Filter Results:', placeholder='Regular Expression (Regex)')
+
+        # --- Data manipulations --- #
+        experiments_selected_df = get_experiments_selected(workload_selected, systems_selected, table_formats_selected,
+                                                        modes_selected, cluster_sizes_selected, machines_selected,
+                                                        scale_factors_selected,
+                                                        result_dirs=result_dirs)
+        experiments_data_df = get_experiments_data(experiments_selected_df, granularity_selected, result_dirs=result_dirs)
+        experiments_data_df = experiments_data_df[experiments_data_df['event_id'].str.contains(regex, regex=True)]
+
+        if len(experiments_data_df) > 3000:
+            st.error(
+                "Too many rows in the result. "
+                "Please refine your dimension selection or apply a regex filter to narrow down the results.")
+            st.stop()
+
+        # --- Plot the data --- #
+        chart = (
+            alt.Chart(experiments_data_df)
+            .mark_bar()
+            .encode(
+                alt.X("configuration:N", axis=None, title='Configuration', stack=None),
+                alt.Y("time_diff_in_mins:Q", title='Latency (mins)', axis=alt.Axis(titleFontWeight='bold')),
+                alt.Color("configuration:N", legend=alt.Legend(titleFontWeight='bold', labelLimit=400),
+                        title='Configuration'),
+                alt.Column("event_id:N", title="",
+                        header=alt.Header(orient='bottom', labelFontWeight='bold', labelAlign='right',
+                                            labelAngle=-45, labelPadding=20),
+                        sort=alt.SortField("event_start_time", order="ascending"))
+            )
+            .configure_range(
+                category={'scheme': 'dark2'}
+            )
         )
-    )
-    heatmap = (
-        base.mark_rect()
-        .encode(
-            alt.Color('performance_degradation_rate:Q',
-                      scale=alt.Scale(scheme='redblue', reverse=True),
-                      title='Performance Degradation Rate',
-                      legend=alt.Legend(titleFontWeight='bold', titleLimit=400, direction="horizontal"))
+        exec_time_tab.markdown('#')
+        exec_time_tab.altair_chart(chart, theme=None)
+
+    if performance_degradation_tab is not None:
+        # --- Data manipulations --- #
+        experiments_selected_df = get_experiments_selected(workload_selected, systems_selected, table_formats_selected,
+                                                        modes_selected, cluster_sizes_selected, machines_selected,
+                                                        scale_factors_selected, result_dirs=result_dirs)
+        experiments_data_df = get_experiments_data(experiments_selected_df, 'phase', result_dirs=result_dirs)
+        # Filter rows with event_id following the format <name>_<numeric>
+        experiments_data_df = experiments_data_df[experiments_data_df['event_id'].str.match(r'^.+_\d+$')]
+        # Extract name part from event_id
+        experiments_data_df['phase_type'] = experiments_data_df['event_id'].str.extract(r'^(.+)_\d+$')
+        # Group by each distinct 'configuration' and 'phase_type'
+        grouped_df = experiments_data_df.groupby(['configuration', 'phase_type'])
+        # Compute performance degradation
+        grouped_df = grouped_df['time_diff_in_mins'].agg(performance_degradation_rate=utils.performance_degradation)
+        grouped_df = grouped_df.reset_index()
+
+        # --- Plot the data --- #
+        # X axis: phase type
+        # Y axis: configuration
+        # score: degradation rate
+        base = (
+            alt.Chart(grouped_df)
+            .encode(
+                alt.X("phase_type:N", title='', axis=alt.Axis(labelFontWeight='bold', labelAngle=-45)),
+                alt.Y("configuration:N", title='Configuration',
+                    axis=alt.Axis(titleFontWeight='bold', maxExtent=430, labelLimit=400))
+            )
         )
-        .properties(
-            height={"step": 50},
-            width={"step": 50}
+        heatmap = (
+            base.mark_rect()
+            .encode(
+                alt.Color('performance_degradation_rate:Q',
+                        scale=alt.Scale(scheme='redblue', reverse=True),
+                        title='Performance Degradation Rate',
+                        legend=alt.Legend(titleFontWeight='bold', titleLimit=400, direction="horizontal"))
+            )
+            .properties(
+                height={"step": 50},
+                width={"step": 50}
+            )
         )
-    )
-    text = (
-        base.mark_text()
-        .encode(
-            alt.Text('performance_degradation_rate:Q', format=".2f"),
-            color=alt.condition(alt.datum.performance_degradation_rate > 0.8, alt.value("black"), alt.value("white"))
+        text = (
+            base.mark_text()
+            .encode(
+                alt.Text('performance_degradation_rate:Q', format=".2f"),
+                color=alt.condition(alt.datum.performance_degradation_rate > 0.8, alt.value("black"), alt.value("white"))
+            )
         )
-    )
-    performance_degradation_tab.markdown('#')
-    performance_degradation_tab.altair_chart(heatmap + text, theme=None)
+        performance_degradation_tab.markdown('#')
+        performance_degradation_tab.altair_chart(heatmap + text, theme=None)
+
+if __name__ == '__main__':
+    # Parse arguments
+    parser = argparse.ArgumentParser(description='LST-Bench Dashboard')
+    parser.add_argument('--result_dirs', type=str, nargs='+', help='Directories containing the result files')
+    args = parser.parse_args()
+    run(result_dirs=args.result_dirs)
diff --git a/metrics/app/utils.py b/metrics/app/utils.py
old mode 100644
new mode 100755
index 4595fc8b..82463ec6
--- a/metrics/app/utils.py
+++ b/metrics/app/utils.py
@@ -60,3 +60,21 @@ def performance_degradation(values: pd.DataFrame) -> float:
     # TODO: Handle multiple runs for more comprehensive analysis.
 
     return degradation_rate
+
+# -------- SQL GENERATION -------- #
+def generate_sql_in_with_null(lhs: str, values: list, NA_value = "N/A") -> str:
+    """
+    Generates a string of comma-separated values from a list
+    with None values converted to NULL.
+
+    Args:
+    - values (list): A list of values to be converted to a string.
+
+    Returns:
+    - str: A string of comma-separated values.
+    """
+
+    str_list = ', '.join(["'" + str(value) + "'" for value in values if value is not NA_value])
+    null_predicate = '' if NA_value in values else 'NOT'
+
+    return f"({lhs} IN ({str_list}) OR {lhs} IS {null_predicate} NULL)"
\ No newline at end of file

From d1ef6f15345ee52be33db5f9660badd947963766 Mon Sep 17 00:00:00 2001
From: Tiemo Bang <tbang@berkeley.edu>
Date: Fri, 5 Apr 2024 17:32:48 -0700
Subject: [PATCH 3/3] Fix Packaged schemas and path variables for config files

---
 launcher.sh                                   |  8 +++--
 .../microsoft/lst_bench/util/FileParser.java  | 34 ++++++++++++-------
 .../microsoft/lst_bench/util/StringUtils.java | 12 ++++---
 3 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/launcher.sh b/launcher.sh
index c6f26a00..32611c21 100755
--- a/launcher.sh
+++ b/launcher.sh
@@ -1,7 +1,11 @@
 #!/bin/bash -e
 
 # Constants
-LST_BENCH_HOME="$PWD"
+# Directory of the script
+export LST_BENCH_HOME="$(dirname "$(readlink -f "$0")")"
 LST_BENCH_CLASSPATH="$LST_BENCH_HOME/target/*:$LST_BENCH_HOME/target/lib/*:$LST_BENCH_HOME/target/classes/*"
 
-java -cp $LST_BENCH_CLASSPATH com.microsoft.lst_bench.Driver "$@"
+SPARK_CLASSPATH="${SPARK_HOME}/jars/*"
+CLASSPATH="${LST_BENCH_CLASSPATH}:${SPARK_CLASSPATH}"
+
+java -cp ${CLASSPATH} com.microsoft.lst_bench.Driver "$@"
diff --git a/src/main/java/com/microsoft/lst_bench/util/FileParser.java b/src/main/java/com/microsoft/lst_bench/util/FileParser.java
index cf93223f..f6fb950a 100755
--- a/src/main/java/com/microsoft/lst_bench/util/FileParser.java
+++ b/src/main/java/com/microsoft/lst_bench/util/FileParser.java
@@ -30,6 +30,7 @@
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
@@ -44,15 +45,7 @@
 public class FileParser {
 
   private static final ObjectMapper YAML_MAPPER = new YAMLMapper();
-  private static final String SCHEMAS_PATH =
-      "src"
-          + File.separator
-          + "main"
-          + File.separator
-          + "resources"
-          + File.separator
-          + "schemas"
-          + File.separator;
+  private static final String SCHEMAS_PATH = "schemas" + File.separator;
 
   private FileParser() {
     // Defeat instantiation
@@ -126,8 +119,12 @@ public static Map<String, Object> getParameterValues(String parameterValuesFile,
   /**
    * Reads the YAML file and replaces all environment variables (if present). Validates the YAML
    * file according to the schema. Creates and returns a `TaskLibrary` object.
+   *
+   * <p>Exports LIB_PATH for the directory of the file, so that the file contents can reference it
+   * as ${LIB_PATH}.
    */
   public static Library loadLibrary(String filePath) throws IOException {
+    exportFilePath(filePath, "LIB_PATH");
     return createObject(filePath, Library.class, SCHEMAS_PATH + "library.json");
   }
 
@@ -136,6 +133,7 @@ public static Library loadLibrary(String filePath) throws IOException {
    * file according to the schema. Creates and returns a `Workload` object.
    */
   public static Workload loadWorkload(String filePath) throws IOException {
+    exportFilePath(filePath, "WL_PATH");
     return createObject(filePath, Workload.class, SCHEMAS_PATH + "workload.json");
   }
 
@@ -144,6 +142,7 @@ public static Workload loadWorkload(String filePath) throws IOException {
    * file according to the schema. Creates and returns a `ConnectionsConfig` object.
    */
   public static ConnectionsConfig loadConnectionsConfig(String filePath) throws IOException {
+    exportFilePath(filePath, "CON_PATH");
     return createObject(
         filePath, ConnectionsConfig.class, SCHEMAS_PATH + "connections_config.json");
   }
@@ -153,6 +152,7 @@ public static ConnectionsConfig loadConnectionsConfig(String filePath) throws IO
    * file according to the schema. Creates and returns a `ExperimentConfig` object.
    */
   public static ExperimentConfig loadExperimentConfig(String filePath) throws IOException {
+    exportFilePath(filePath, "EXP_PATH");
     return createObject(filePath, ExperimentConfig.class, SCHEMAS_PATH + "experiment_config.json");
   }
 
@@ -161,6 +161,7 @@ public static ExperimentConfig loadExperimentConfig(String filePath) throws IOEx
    * file according to the schema. Creates and returns a `TelemetryConfig` object.
    */
   public static TelemetryConfig loadTelemetryConfig(String filePath) throws IOException {
+    exportFilePath(filePath, "TEL_PATH");
     return createObject(filePath, TelemetryConfig.class, SCHEMAS_PATH + "telemetry_config.json");
   }
 
@@ -174,11 +175,13 @@ private static <T> T createObject(String filePath, Class<T> objectType, String s
 
     // Verify that files exist
     File file = new File(filePath);
-    File schemaFile = new File(schemaFilePath);
     if (!file.exists()) {
       throw new IllegalArgumentException("File does not exist: " + filePath);
     }
-    if (!schemaFile.exists()) {
+
+    InputStream schemaInputStream =
+        FileParser.class.getClassLoader().getResourceAsStream(schemaFilePath);
+    if (schemaInputStream == null) {
       throw new IllegalArgumentException("Schema file does not exist: " + schemaFilePath);
     }
 
@@ -193,7 +196,7 @@ private static <T> T createObject(String filePath, Class<T> objectType, String s
         JsonSchemaFactory.builder(JsonSchemaFactory.getInstance(SpecVersion.VersionFlag.V202012))
             .objectMapper(YAML_MAPPER)
             .build();
-    JsonSchema schema = factory.getSchema(Files.newInputStream(schemaFile.toPath()));
+    JsonSchema schema = factory.getSchema(schemaInputStream);
     JsonNode jsonNodeDirect = YAML_MAPPER.readTree(resolvedYAMLContent);
     Set<ValidationMessage> errorsFromFile = schema.validate(jsonNodeDirect);
     if (!errorsFromFile.isEmpty()) {
@@ -202,4 +205,11 @@ private static <T> T createObject(String filePath, Class<T> objectType, String s
     // Create and return POJO
     return YAML_MAPPER.treeToValue(jsonNodeDirect, objectType);
   }
+
+  /** Exports the directory of the file as an environment variable. */
+  private static void exportFilePath(String file, String variableName) {
+    File f = new File(file);
+    String directory = f.isDirectory() ? file : f.getParent();
+    System.setProperty(variableName, directory);
+  }
 }
diff --git a/src/main/java/com/microsoft/lst_bench/util/StringUtils.java b/src/main/java/com/microsoft/lst_bench/util/StringUtils.java
index e81e21ff..eccddcf6 100644
--- a/src/main/java/com/microsoft/lst_bench/util/StringUtils.java
+++ b/src/main/java/com/microsoft/lst_bench/util/StringUtils.java
@@ -22,6 +22,7 @@
 import java.io.File;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
 import java.util.Map;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
@@ -92,9 +93,10 @@ public static FileExec replaceRegex(FileExec f, String regex, String replacement
   }
 
   /**
-   * Reads the contents of the `sourceFile` and replaces any environment variables if present. If
-   * the environment variable is not set, the default value is used if specified. All other
-   * parameters are ignored.
+   * Reads the contents of the `sourceFile` and replaces any environment variables and JVM
+   * properties if present. JVM properties take precedence over environment variables. If a
+   * environment variable is not set, the default value is used if specified. All other parameters
+   * are ignored.
    */
   public static String replaceEnvVars(File sourceFile) throws IOException {
     if (sourceFile == null || !sourceFile.isFile()) {
@@ -102,7 +104,9 @@ public static String replaceEnvVars(File sourceFile) throws IOException {
       LOGGER.debug("replaceEnvVars received a null or missing file.");
       return null;
     }
-    StringSubstitutor envSub = new StringSubstitutor(System.getenv());
+    Map<String, String> env = new HashMap<>(System.getenv());
+    System.getProperties().forEach((k, v) -> env.put(k.toString(), v.toString()));
+    StringSubstitutor envSub = new StringSubstitutor(env);
     return envSub.replace(FileUtils.readFileToString(sourceFile, StandardCharsets.UTF_8));
   }
 }