From f7c0f9f5363a442aa0e8fa77ce026eefff320d69 Mon Sep 17 00:00:00 2001 From: Tiemo Bang Date: Fri, 29 Mar 2024 07:42:39 -0700 Subject: [PATCH 1/3] Augment file parser to handle incorrect file paths --- .../microsoft/lst_bench/util/FileParser.java | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) mode change 100644 => 100755 src/main/java/com/microsoft/lst_bench/util/FileParser.java diff --git a/src/main/java/com/microsoft/lst_bench/util/FileParser.java b/src/main/java/com/microsoft/lst_bench/util/FileParser.java old mode 100644 new mode 100755 index 825a10a1..cf93223f --- a/src/main/java/com/microsoft/lst_bench/util/FileParser.java +++ b/src/main/java/com/microsoft/lst_bench/util/FileParser.java @@ -33,7 +33,6 @@ import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -172,13 +171,29 @@ public static TelemetryConfig loadTelemetryConfig(String filePath) throws IOExce */ private static T createObject(String filePath, Class objectType, String schemaFilePath) throws IOException { - String resolvedYAMLContent = StringUtils.replaceEnvVars(new File(filePath)); + + // Verify that files exist + File file = new File(filePath); + File schemaFile = new File(schemaFilePath); + if (!file.exists()) { + throw new IllegalArgumentException("File does not exist: " + filePath); + } + if (!schemaFile.exists()) { + throw new IllegalArgumentException("Schema file does not exist: " + schemaFilePath); + } + + String resolvedYAMLContent = StringUtils.replaceEnvVars(file); + + if (resolvedYAMLContent == null) { + throw new IllegalArgumentException("Error resolving environment variables in YAML file"); + } + // Validate YAML file contents JsonSchemaFactory factory = JsonSchemaFactory.builder(JsonSchemaFactory.getInstance(SpecVersion.VersionFlag.V202012)) .objectMapper(YAML_MAPPER) .build(); - JsonSchema schema = factory.getSchema(Files.newInputStream(Paths.get(schemaFilePath))); + JsonSchema schema = factory.getSchema(Files.newInputStream(schemaFile.toPath())); JsonNode jsonNodeDirect = YAML_MAPPER.readTree(resolvedYAMLContent); Set errorsFromFile = schema.validate(jsonNodeDirect); if (!errorsFromFile.isEmpty()) { From 6f56b11718aa98c0e10324553e67b2041783dacd Mon Sep 17 00:00:00 2001 From: Tiemo Bang Date: Mon, 1 Apr 2024 18:56:13 -0700 Subject: [PATCH 2/3] Fix: Null handing and custom result paths for dashboard --- .gitignore | 3 + metrics/app/README.md | 3 + metrics/app/main.py | 391 ++++++++++++++++++++++++------------------ metrics/app/utils.py | 18 ++ 4 files changed, 246 insertions(+), 169 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 metrics/app/README.md mode change 100644 => 100755 metrics/app/main.py mode change 100644 => 100755 metrics/app/utils.py diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index bee554f5..d7e1e172 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,9 @@ bin/ # Local configuration file (sdk path, etc) local.properties +# Python +*.pyc + # Others *~ .DS_Store diff --git a/metrics/app/README.md b/metrics/app/README.md old mode 100644 new mode 100755 index e61e059c..86884783 --- a/metrics/app/README.md +++ b/metrics/app/README.md @@ -36,6 +36,8 @@ To include data from a new system, duplicate one of the directories in the [run For a deeper understanding of the directory structure, consult the [README file](/run/README.md). The LST-Bench dashboard web app automatically retrieves results from the .duckdb files within those folders and displays them on the dashboard. +Alternatively, you can provide your own paths to search for results via commandline arguments, see below. + ## Dashboard Development To run the LST-Bench dashboard locally and test your changes, follow these steps: @@ -67,6 +69,7 @@ With the dependencies installed, you can now start the Streamlit app by running ```bash python -m streamlit run main.py +python -m streamlit run main.py -- --result_dirs DIR1 DIR2 ... ``` This command will launch the LST-Bench dashboard locally in your browser. diff --git a/metrics/app/main.py b/metrics/app/main.py old mode 100644 new mode 100755 index aba25d25..654e4ddb --- a/metrics/app/main.py +++ b/metrics/app/main.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import argparse +from typing import List import altair as alt import collections import duckdb @@ -23,33 +25,49 @@ @st.cache_resource -def get_connection(): +def get_connection(*, result_dirs: List[str] = None): + # Either search for results in provided direcotries + # or use default assuming that that the CWD is the location of this script + result_dirs = result_dirs or ["./", "../../run/"] + connection = duckdb.connect() # Get databases and attach them databases_list = [] # Function to recursively find DuckDB files in a directory def find_duckdb_files(directory: str) -> collections.abc.Iterator[str]: + # Warning if the directory does not exist + if not os.path.exists(directory): + st.warning(f"Directory '{directory}' does not exist.") + return + + if os.path.isfile(directory) and directory.endswith('.duckdb'): + yield directory + return + for root, dirs, files in os.walk(directory): for file in files: if file.endswith('.duckdb'): yield os.path.join(root, file) - # Combine the results of the current directory (used when deployed) - # and find_duckdb_files('../../run/') (used when developing) - for database_path in list(find_duckdb_files('./')) + list(find_duckdb_files('../../run/')): - database = os.path.basename(database_path)[:-3] - connection.execute(f"ATTACH DATABASE '{database_path}' AS \"{database}\" (READ_ONLY)") - databases_list.append(database) + # Combine the results from all directories + for result_dir in result_dirs: + for database_path in find_duckdb_files(result_dir): + database = os.path.basename(database_path)[:-3] + connection.execute(f"ATTACH DATABASE '{database_path}' AS \"{database}\" (READ_ONLY)") + databases_list.append(database) + + if not databases_list: + st.error("No DuckDB files found in the provided directories.") + st.stop() # Create view encompassing all experiments union_sql = " UNION ".join([f"SELECT * FROM \"{database}\".experiment_telemetry" for database in databases_list]) connection.execute(f"CREATE VIEW combined_experiment_telemetry AS {union_sql}") return connection - @st.cache_data -def get_systems(): - connection = get_connection() +def get_systems(*, result_dirs: List[str] = None): + connection = get_connection(result_dirs=result_dirs) df = connection.execute( f""" SELECT DISTINCT concat_ws('-', json(event_data)->>'system', json(event_data)->>'system_version') AS system @@ -58,12 +76,14 @@ def get_systems(): ORDER BY system ASC; """ ).df() + # Replace None with Pandas NA + df.fillna("N/A") return df['system'] @st.cache_data -def get_table_formats(): - connection = get_connection() +def get_table_formats(*, result_dirs: List[str] = None): + connection = get_connection(result_dirs=result_dirs) df = connection.execute( f""" SELECT DISTINCT concat_ws('-', json(event_data)->>'table_format', json(event_data)->>'table_format_version') AS table_format @@ -72,12 +92,14 @@ def get_table_formats(): ORDER BY table_format ASC; """ ).df() + # Replace None with Pandas NA + df.fillna("N/A", inplace=True) return df['table_format'] @st.cache_data -def get_modes(): - connection = get_connection() +def get_modes(*, result_dirs: List[str] = None): + connection = get_connection(result_dirs=result_dirs) df = connection.execute( f""" SELECT DISTINCT json(event_data)->>'mode' AS mode @@ -86,12 +108,14 @@ def get_modes(): ORDER BY mode ASC; """ ).df() + # Replace None with Pandas NA + df.fillna("N/A", inplace=True) return df['mode'] @st.cache_data -def get_cluster_sizes(): - connection = get_connection() +def get_cluster_sizes(*, result_dirs: List[str] = None): + connection = get_connection(result_dirs=result_dirs) df = connection.execute( f""" SELECT DISTINCT json(event_data)->>'cluster_size' AS cluster_size @@ -100,12 +124,14 @@ def get_cluster_sizes(): ORDER BY cluster_size ASC; """ ).df() + # Replace None with Pandas NA + df.fillna("N/A", inplace=True) return df['cluster_size'] @st.cache_data -def get_machines(): - connection = get_connection() +def get_machines(*, result_dirs: List[str] = None): + connection = get_connection(result_dirs=result_dirs) df = connection.execute( f""" SELECT DISTINCT json(event_data)->>'machine' AS machine @@ -114,12 +140,14 @@ def get_machines(): ORDER BY machine ASC; """ ).df() + # Replace None with Pandas NA + df.fillna("N/A", inplace=True) return df['machine'] @st.cache_data -def get_workloads(): - connection = get_connection() +def get_workloads(*, result_dirs: List[str] = None): + connection = get_connection(result_dirs=result_dirs) df = connection.execute( f""" SELECT DISTINCT event_id AS workload @@ -128,12 +156,14 @@ def get_workloads(): ORDER BY workload ASC; """ ).df() + # Replace None with Pandas NA + df['workload'] = df['workload'].replace('None', "N/A") return df['workload'] @st.cache_data -def get_scale_factors(): - connection = get_connection() +def get_scale_factors(*, result_dirs: List[str] = None): + connection = get_connection(result_dirs=result_dirs) df = connection.execute( f""" SELECT DISTINCT json(event_data)->>'scale_factor' AS scale_factor @@ -142,6 +172,8 @@ def get_scale_factors(): ORDER BY scale_factor ASC; """ ).df() + # Replace None with Pandas NA + df.fillna("N/A", inplace=True) return df['scale_factor'] @@ -152,8 +184,10 @@ def get_experiments_selected( _modes_selected: list[str], _cluster_sizes_selected: list[str], _machines_selected: list[str], - _scale_factors_selected: list[str]) -> pd.DataFrame: - connection = get_connection() + _scale_factors_selected: list[str], + *, result_dirs: List[str] = None) -> pd.DataFrame: + connection = get_connection(result_dirs=result_dirs) + df = connection.execute( f""" SELECT run_id, event_start_time, event_end_time, event_id, @@ -165,23 +199,32 @@ def get_experiments_selected( cast(json(event_data)->>'scale_factor' AS VARCHAR) AS scale_factor FROM combined_experiment_telemetry WHERE event_type = 'EXEC_EXPERIMENT' AND event_status='SUCCESS' AND event_id = '{_workload_selected}' - AND concat_ws('-', json(event_data)->>'system', json(event_data)->>'system_version') IN ({', '.join(["'" + system + "'" for system in _systems_selected])}) - AND concat_ws('-', json(event_data)->>'table_format', json(event_data)->>'table_format_version') IN ({', '.join(["'" + table_format + "'" for table_format in _table_formats_selected])}) - AND cast(json(event_data)->>'mode' AS VARCHAR) IN ({', '.join(["'" + mode + "'" for mode in _modes_selected])}) - AND cast(json(event_data)->>'cluster_size' AS VARCHAR) IN ({', '.join(["'" + cluster_size + "'" for cluster_size in _cluster_sizes_selected])}) - AND cast(json(event_data)->>'machine' AS VARCHAR) IN ({', '.join(["'" + machine + "'" for machine in _machines_selected])}) - AND cast(json(event_data)->>'scale_factor' AS VARCHAR) IN ({', '.join(["'" + scale_factor + "'" for scale_factor in _scale_factors_selected])}) + AND {utils.generate_sql_in_with_null('system', _systems_selected)} + AND {utils.generate_sql_in_with_null('table_format', _table_formats_selected)} + AND {utils.generate_sql_in_with_null('mode', _modes_selected)} + AND {utils.generate_sql_in_with_null('cluster_size', _cluster_sizes_selected)} + AND {utils.generate_sql_in_with_null('machine', _machines_selected)} + AND {utils.generate_sql_in_with_null('scale_factor', _scale_factors_selected)} ORDER BY cast(event_start_time AS TIMESTAMP) ASC; """ ).df() + df.fillna("N/A", inplace=True) logging.debug(df) + if len(df) == 0: + st.error("No data found for the selected dimensions.") + st.stop() return df - + #return df_unfiltered @st.cache_data -def get_experiments_data(experiments_df: pd.DataFrame, target_granularity: str) -> pd.DataFrame: - connection = get_connection() +def get_experiments_data(experiments_df: pd.DataFrame, target_granularity: str, + *, result_dirs: List[str] = None) -> pd.DataFrame: + connection = get_connection(result_dirs=result_dirs) df = experiments_df + if len(df) == 0: + st.error("Empty experiments data.") + st.stop() + granularities = { 'phase': 'EXEC_PHASE', 'session': 'EXEC_SESSION', @@ -212,6 +255,8 @@ def get_experiments_data(experiments_df: pd.DataFrame, target_granularity: str) df = new_experiments_data_df if granularity == target_granularity: break + # Replace None with Pandas NA + df.fillna("N/A", inplace=True) logging.debug(df) df['configuration'] = df.apply( lambda row: (row['system'] + ", " @@ -225,146 +270,154 @@ def get_experiments_data(experiments_df: pd.DataFrame, target_granularity: str) axis=1) return df +def run(*, result_dirs: List[str] = None): + st.set_page_config( + page_title="LST-Bench - Dashboard", + page_icon=":bar_chart:", + layout="wide") + st.title('LST-Bench - Dashboard') + st.write("[Project Page](https://github.com/microsoft/lst-bench/) | " + "[Technical Report](https://arxiv.org/abs/2305.01120) | " + "[Evaluation](https://github.com/microsoft/lst-bench/tree/main/metrics/app#evaluation) | " + "[Adding a New Result](https://github.com/microsoft/lst-bench/tree/main/metrics/app#adding-a-new-result)") -st.set_page_config( - page_title="LST-Bench - Dashboard", - page_icon=":bar_chart:", - layout="wide") -st.title('LST-Bench - Dashboard') -st.write("[Project Page](https://github.com/microsoft/lst-bench/) | " - "[Technical Report](https://arxiv.org/abs/2305.01120) | " - "[Evaluation](https://github.com/microsoft/lst-bench/tree/main/metrics/app#evaluation) | " - "[Adding a New Result](https://github.com/microsoft/lst-bench/tree/main/metrics/app#adding-a-new-result)") - -workloads = get_workloads() -workload_selected = st.sidebar.selectbox('Workload', workloads, index=0) - -systems = get_systems() -systems_selected = st.sidebar.multiselect('System', systems, default=systems) - -table_formats = get_table_formats() -table_formats_selected = st.sidebar.multiselect('Table Format', table_formats, default=table_formats) - -modes = get_modes() -modes_selected = st.sidebar.multiselect('Mode', modes, default=modes) - -cluster_sizes = get_cluster_sizes() -cluster_sizes_selected = st.sidebar.multiselect('Cluster Size', cluster_sizes, default=cluster_sizes) - -machines = get_machines() -machines_selected = st.sidebar.multiselect('Machine', machines, default=machines) - -scale_factors = get_scale_factors() -scale_factors_selected = st.sidebar.multiselect('Scale Factor', scale_factors, default=scale_factors) - -# Bail out if any of the dimensions if empty -if any(len(arr) == 0 for arr in [systems_selected, table_formats_selected, - modes_selected, cluster_sizes_selected, - machines_selected, scale_factors_selected]): - st.error("Please ensure you have selected at least one option for each dimension.") - st.stop() - -# Create tabs for current selection -exec_time_tab = None # This tab shows execution time. -performance_degradation_tab = None # This tab shows degradation rate. -# TODO -io_tab = None # This tab will show I/O metrics, such as bytes read/written. -io_api_calls_tab = None # This tab will show I/O API call metrics. -cpu_utilization_tab = None # This tab will show CPU utilization metrics. - -if workload_selected == 'wp1_longevity': - exec_time_tab, performance_degradation_tab = st.tabs(['Execution Time', 'Performance Degradation']) -else: - exec_time_tab = st.tabs(['Execution Time'])[0] - -if exec_time_tab is not None: - granularity_selected = exec_time_tab.radio( - 'Granularity:', - ['phase', 'session', 'task', 'file'], - horizontal=True) - regex = exec_time_tab.text_input('Filter Results:', placeholder='Regular Expression (Regex)') - - # --- Data manipulations --- # - experiments_selected_df = get_experiments_selected(workload_selected, systems_selected, table_formats_selected, - modes_selected, cluster_sizes_selected, machines_selected, - scale_factors_selected) - experiments_data_df = get_experiments_data(experiments_selected_df, granularity_selected) - experiments_data_df = experiments_data_df[experiments_data_df['event_id'].str.contains(regex, regex=True)] - - if len(experiments_data_df) > 3000: - st.error( - "Too many rows in the result. " - "Please refine your dimension selection or apply a regex filter to narrow down the results.") + workloads = get_workloads(result_dirs=result_dirs) + workload_selected = st.sidebar.selectbox('Workload', workloads, index=0) + + systems = get_systems(result_dirs=result_dirs) + systems_selected = st.sidebar.multiselect('System', systems, default=systems) + + table_formats = get_table_formats(result_dirs=result_dirs) + table_formats_selected = st.sidebar.multiselect('Table Format', table_formats, default=table_formats) + + modes = get_modes(result_dirs=result_dirs) + modes_selected = st.sidebar.multiselect('Mode', modes, default=modes) + + cluster_sizes = get_cluster_sizes(result_dirs=result_dirs) + cluster_sizes_selected = st.sidebar.multiselect('Cluster Size', cluster_sizes, default=cluster_sizes) + + machines = get_machines(result_dirs=result_dirs) + machines_selected = st.sidebar.multiselect('Machine', machines, default=machines) + + scale_factors = get_scale_factors(result_dirs=result_dirs) + scale_factors_selected = st.sidebar.multiselect('Scale Factor', scale_factors, default=scale_factors) + + # Bail out if any of the dimensions if empty + if any(len(arr) == 0 for arr in [systems_selected, table_formats_selected, + modes_selected, cluster_sizes_selected, + machines_selected, scale_factors_selected]): + st.error("Please ensure you have selected at least one option for each dimension.") st.stop() - # --- Plot the data --- # - chart = ( - alt.Chart(experiments_data_df) - .mark_bar() - .encode( - alt.X("configuration:N", axis=None, title='Configuration', stack=None), - alt.Y("time_diff_in_mins:Q", title='Latency (mins)', axis=alt.Axis(titleFontWeight='bold')), - alt.Color("configuration:N", legend=alt.Legend(titleFontWeight='bold', labelLimit=400), - title='Configuration'), - alt.Column("event_id:N", title="", - header=alt.Header(orient='bottom', labelFontWeight='bold', labelAlign='right', - labelAngle=-45, labelPadding=20), - sort=alt.SortField("event_start_time", order="ascending")) - ) - .configure_range( - category={'scheme': 'dark2'} - ) - ) - exec_time_tab.markdown('#') - exec_time_tab.altair_chart(chart, theme=None) - -if performance_degradation_tab is not None: - # --- Data manipulations --- # - experiments_selected_df = get_experiments_selected(workload_selected, systems_selected, table_formats_selected, - modes_selected, cluster_sizes_selected, machines_selected, - scale_factors_selected) - experiments_data_df = get_experiments_data(experiments_selected_df, 'phase') - # Filter rows with event_id following the format _ - experiments_data_df = experiments_data_df[experiments_data_df['event_id'].str.match(r'^.+_\d+$')] - # Extract name part from event_id - experiments_data_df['phase_type'] = experiments_data_df['event_id'].str.extract(r'^(.+)_\d+$') - # Group by each distinct 'configuration' and 'phase_type' - grouped_df = experiments_data_df.groupby(['configuration', 'phase_type']) - # Compute performance degradation - grouped_df = grouped_df['time_diff_in_mins'].agg(performance_degradation_rate=utils.performance_degradation) - grouped_df = grouped_df.reset_index() - - # --- Plot the data --- # - # X axis: phase type - # Y axis: configuration - # score: degradation rate - base = ( - alt.Chart(grouped_df) - .encode( - alt.X("phase_type:N", title='', axis=alt.Axis(labelFontWeight='bold', labelAngle=-45)), - alt.Y("configuration:N", title='Configuration', - axis=alt.Axis(titleFontWeight='bold', maxExtent=430, labelLimit=400)) + # Create tabs for current selection + exec_time_tab = None # This tab shows execution time. + performance_degradation_tab = None # This tab shows degradation rate. + # TODO + io_tab = None # This tab will show I/O metrics, such as bytes read/written. + io_api_calls_tab = None # This tab will show I/O API call metrics. + cpu_utilization_tab = None # This tab will show CPU utilization metrics. + + if workload_selected == 'wp1_longevity': + exec_time_tab, performance_degradation_tab = st.tabs(['Execution Time', 'Performance Degradation']) + else: + exec_time_tab = st.tabs(['Execution Time'])[0] + + if exec_time_tab is not None: + granularity_selected = exec_time_tab.radio( + 'Granularity:', + ['phase', 'session', 'task', 'file'], + horizontal=True) + regex = exec_time_tab.text_input('Filter Results:', placeholder='Regular Expression (Regex)') + + # --- Data manipulations --- # + experiments_selected_df = get_experiments_selected(workload_selected, systems_selected, table_formats_selected, + modes_selected, cluster_sizes_selected, machines_selected, + scale_factors_selected, + result_dirs=result_dirs) + experiments_data_df = get_experiments_data(experiments_selected_df, granularity_selected, result_dirs=result_dirs) + experiments_data_df = experiments_data_df[experiments_data_df['event_id'].str.contains(regex, regex=True)] + + if len(experiments_data_df) > 3000: + st.error( + "Too many rows in the result. " + "Please refine your dimension selection or apply a regex filter to narrow down the results.") + st.stop() + + # --- Plot the data --- # + chart = ( + alt.Chart(experiments_data_df) + .mark_bar() + .encode( + alt.X("configuration:N", axis=None, title='Configuration', stack=None), + alt.Y("time_diff_in_mins:Q", title='Latency (mins)', axis=alt.Axis(titleFontWeight='bold')), + alt.Color("configuration:N", legend=alt.Legend(titleFontWeight='bold', labelLimit=400), + title='Configuration'), + alt.Column("event_id:N", title="", + header=alt.Header(orient='bottom', labelFontWeight='bold', labelAlign='right', + labelAngle=-45, labelPadding=20), + sort=alt.SortField("event_start_time", order="ascending")) + ) + .configure_range( + category={'scheme': 'dark2'} + ) ) - ) - heatmap = ( - base.mark_rect() - .encode( - alt.Color('performance_degradation_rate:Q', - scale=alt.Scale(scheme='redblue', reverse=True), - title='Performance Degradation Rate', - legend=alt.Legend(titleFontWeight='bold', titleLimit=400, direction="horizontal")) + exec_time_tab.markdown('#') + exec_time_tab.altair_chart(chart, theme=None) + + if performance_degradation_tab is not None: + # --- Data manipulations --- # + experiments_selected_df = get_experiments_selected(workload_selected, systems_selected, table_formats_selected, + modes_selected, cluster_sizes_selected, machines_selected, + scale_factors_selected, result_dirs=result_dirs) + experiments_data_df = get_experiments_data(experiments_selected_df, 'phase', result_dirs=result_dirs) + # Filter rows with event_id following the format _ + experiments_data_df = experiments_data_df[experiments_data_df['event_id'].str.match(r'^.+_\d+$')] + # Extract name part from event_id + experiments_data_df['phase_type'] = experiments_data_df['event_id'].str.extract(r'^(.+)_\d+$') + # Group by each distinct 'configuration' and 'phase_type' + grouped_df = experiments_data_df.groupby(['configuration', 'phase_type']) + # Compute performance degradation + grouped_df = grouped_df['time_diff_in_mins'].agg(performance_degradation_rate=utils.performance_degradation) + grouped_df = grouped_df.reset_index() + + # --- Plot the data --- # + # X axis: phase type + # Y axis: configuration + # score: degradation rate + base = ( + alt.Chart(grouped_df) + .encode( + alt.X("phase_type:N", title='', axis=alt.Axis(labelFontWeight='bold', labelAngle=-45)), + alt.Y("configuration:N", title='Configuration', + axis=alt.Axis(titleFontWeight='bold', maxExtent=430, labelLimit=400)) + ) ) - .properties( - height={"step": 50}, - width={"step": 50} + heatmap = ( + base.mark_rect() + .encode( + alt.Color('performance_degradation_rate:Q', + scale=alt.Scale(scheme='redblue', reverse=True), + title='Performance Degradation Rate', + legend=alt.Legend(titleFontWeight='bold', titleLimit=400, direction="horizontal")) + ) + .properties( + height={"step": 50}, + width={"step": 50} + ) ) - ) - text = ( - base.mark_text() - .encode( - alt.Text('performance_degradation_rate:Q', format=".2f"), - color=alt.condition(alt.datum.performance_degradation_rate > 0.8, alt.value("black"), alt.value("white")) + text = ( + base.mark_text() + .encode( + alt.Text('performance_degradation_rate:Q', format=".2f"), + color=alt.condition(alt.datum.performance_degradation_rate > 0.8, alt.value("black"), alt.value("white")) + ) ) - ) - performance_degradation_tab.markdown('#') - performance_degradation_tab.altair_chart(heatmap + text, theme=None) + performance_degradation_tab.markdown('#') + performance_degradation_tab.altair_chart(heatmap + text, theme=None) + +if __name__ == '__main__': + # Parse arguments + parser = argparse.ArgumentParser(description='LST-Bench Dashboard') + parser.add_argument('--result_dirs', type=str, nargs='+', help='Directories containing the result files') + args = parser.parse_args() + run(result_dirs=args.result_dirs) diff --git a/metrics/app/utils.py b/metrics/app/utils.py old mode 100644 new mode 100755 index 4595fc8b..82463ec6 --- a/metrics/app/utils.py +++ b/metrics/app/utils.py @@ -60,3 +60,21 @@ def performance_degradation(values: pd.DataFrame) -> float: # TODO: Handle multiple runs for more comprehensive analysis. return degradation_rate + +# -------- SQL GENERATION -------- # +def generate_sql_in_with_null(lhs: str, values: list, NA_value = "N/A") -> str: + """ + Generates a string of comma-separated values from a list + with None values converted to NULL. + + Args: + - values (list): A list of values to be converted to a string. + + Returns: + - str: A string of comma-separated values. + """ + + str_list = ', '.join(["'" + str(value) + "'" for value in values if value is not NA_value]) + null_predicate = '' if NA_value in values else 'NOT' + + return f"({lhs} IN ({str_list}) OR {lhs} IS {null_predicate} NULL)" \ No newline at end of file From d1ef6f15345ee52be33db5f9660badd947963766 Mon Sep 17 00:00:00 2001 From: Tiemo Bang Date: Fri, 5 Apr 2024 17:32:48 -0700 Subject: [PATCH 3/3] Fix Packaged schemas and path variables for config files --- launcher.sh | 8 +++-- .../microsoft/lst_bench/util/FileParser.java | 34 ++++++++++++------- .../microsoft/lst_bench/util/StringUtils.java | 12 ++++--- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/launcher.sh b/launcher.sh index c6f26a00..32611c21 100755 --- a/launcher.sh +++ b/launcher.sh @@ -1,7 +1,11 @@ #!/bin/bash -e # Constants -LST_BENCH_HOME="$PWD" +# Directory of the script +export LST_BENCH_HOME="$(dirname "$(readlink -f "$0")")" LST_BENCH_CLASSPATH="$LST_BENCH_HOME/target/*:$LST_BENCH_HOME/target/lib/*:$LST_BENCH_HOME/target/classes/*" -java -cp $LST_BENCH_CLASSPATH com.microsoft.lst_bench.Driver "$@" +SPARK_CLASSPATH="${SPARK_HOME}/jars/*" +CLASSPATH="${LST_BENCH_CLASSPATH}:${SPARK_CLASSPATH}" + +java -cp ${CLASSPATH} com.microsoft.lst_bench.Driver "$@" diff --git a/src/main/java/com/microsoft/lst_bench/util/FileParser.java b/src/main/java/com/microsoft/lst_bench/util/FileParser.java index cf93223f..f6fb950a 100755 --- a/src/main/java/com/microsoft/lst_bench/util/FileParser.java +++ b/src/main/java/com/microsoft/lst_bench/util/FileParser.java @@ -30,6 +30,7 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -44,15 +45,7 @@ public class FileParser { private static final ObjectMapper YAML_MAPPER = new YAMLMapper(); - private static final String SCHEMAS_PATH = - "src" - + File.separator - + "main" - + File.separator - + "resources" - + File.separator - + "schemas" - + File.separator; + private static final String SCHEMAS_PATH = "schemas" + File.separator; private FileParser() { // Defeat instantiation @@ -126,8 +119,12 @@ public static Map getParameterValues(String parameterValuesFile, /** * Reads the YAML file and replaces all environment variables (if present). Validates the YAML * file according to the schema. Creates and returns a `TaskLibrary` object. + * + *

Exports LIB_PATH for the directory of the file, so that the file contents can reference it + * as ${LIB_PATH}. */ public static Library loadLibrary(String filePath) throws IOException { + exportFilePath(filePath, "LIB_PATH"); return createObject(filePath, Library.class, SCHEMAS_PATH + "library.json"); } @@ -136,6 +133,7 @@ public static Library loadLibrary(String filePath) throws IOException { * file according to the schema. Creates and returns a `Workload` object. */ public static Workload loadWorkload(String filePath) throws IOException { + exportFilePath(filePath, "WL_PATH"); return createObject(filePath, Workload.class, SCHEMAS_PATH + "workload.json"); } @@ -144,6 +142,7 @@ public static Workload loadWorkload(String filePath) throws IOException { * file according to the schema. Creates and returns a `ConnectionsConfig` object. */ public static ConnectionsConfig loadConnectionsConfig(String filePath) throws IOException { + exportFilePath(filePath, "CON_PATH"); return createObject( filePath, ConnectionsConfig.class, SCHEMAS_PATH + "connections_config.json"); } @@ -153,6 +152,7 @@ public static ConnectionsConfig loadConnectionsConfig(String filePath) throws IO * file according to the schema. Creates and returns a `ExperimentConfig` object. */ public static ExperimentConfig loadExperimentConfig(String filePath) throws IOException { + exportFilePath(filePath, "EXP_PATH"); return createObject(filePath, ExperimentConfig.class, SCHEMAS_PATH + "experiment_config.json"); } @@ -161,6 +161,7 @@ public static ExperimentConfig loadExperimentConfig(String filePath) throws IOEx * file according to the schema. Creates and returns a `TelemetryConfig` object. */ public static TelemetryConfig loadTelemetryConfig(String filePath) throws IOException { + exportFilePath(filePath, "TEL_PATH"); return createObject(filePath, TelemetryConfig.class, SCHEMAS_PATH + "telemetry_config.json"); } @@ -174,11 +175,13 @@ private static T createObject(String filePath, Class objectType, String s // Verify that files exist File file = new File(filePath); - File schemaFile = new File(schemaFilePath); if (!file.exists()) { throw new IllegalArgumentException("File does not exist: " + filePath); } - if (!schemaFile.exists()) { + + InputStream schemaInputStream = + FileParser.class.getClassLoader().getResourceAsStream(schemaFilePath); + if (schemaInputStream == null) { throw new IllegalArgumentException("Schema file does not exist: " + schemaFilePath); } @@ -193,7 +196,7 @@ private static T createObject(String filePath, Class objectType, String s JsonSchemaFactory.builder(JsonSchemaFactory.getInstance(SpecVersion.VersionFlag.V202012)) .objectMapper(YAML_MAPPER) .build(); - JsonSchema schema = factory.getSchema(Files.newInputStream(schemaFile.toPath())); + JsonSchema schema = factory.getSchema(schemaInputStream); JsonNode jsonNodeDirect = YAML_MAPPER.readTree(resolvedYAMLContent); Set errorsFromFile = schema.validate(jsonNodeDirect); if (!errorsFromFile.isEmpty()) { @@ -202,4 +205,11 @@ private static T createObject(String filePath, Class objectType, String s // Create and return POJO return YAML_MAPPER.treeToValue(jsonNodeDirect, objectType); } + + /** Exports the directory of the file as an environment variable. */ + private static void exportFilePath(String file, String variableName) { + File f = new File(file); + String directory = f.isDirectory() ? file : f.getParent(); + System.setProperty(variableName, directory); + } } diff --git a/src/main/java/com/microsoft/lst_bench/util/StringUtils.java b/src/main/java/com/microsoft/lst_bench/util/StringUtils.java index e81e21ff..eccddcf6 100644 --- a/src/main/java/com/microsoft/lst_bench/util/StringUtils.java +++ b/src/main/java/com/microsoft/lst_bench/util/StringUtils.java @@ -22,6 +22,7 @@ import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.util.HashMap; import java.util.Map; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -92,9 +93,10 @@ public static FileExec replaceRegex(FileExec f, String regex, String replacement } /** - * Reads the contents of the `sourceFile` and replaces any environment variables if present. If - * the environment variable is not set, the default value is used if specified. All other - * parameters are ignored. + * Reads the contents of the `sourceFile` and replaces any environment variables and JVM + * properties if present. JVM properties take precedence over environment variables. If a + * environment variable is not set, the default value is used if specified. All other parameters + * are ignored. */ public static String replaceEnvVars(File sourceFile) throws IOException { if (sourceFile == null || !sourceFile.isFile()) { @@ -102,7 +104,9 @@ public static String replaceEnvVars(File sourceFile) throws IOException { LOGGER.debug("replaceEnvVars received a null or missing file."); return null; } - StringSubstitutor envSub = new StringSubstitutor(System.getenv()); + Map env = new HashMap<>(System.getenv()); + System.getProperties().forEach((k, v) -> env.put(k.toString(), v.toString())); + StringSubstitutor envSub = new StringSubstitutor(env); return envSub.replace(FileUtils.readFileToString(sourceFile, StandardCharsets.UTF_8)); } }