import argparse
+import itertools
import pickle
+import warnings

import numpy as np
import pandas as pd
import pytablewriter
import seaborn
from matplotlib import pyplot as plt

+try:
+    from rliable import library as rly  # pytype: disable=import-error
+    from rliable import metrics, plot_utils  # pytype: disable=import-error
+except ImportError:
+    rly = None
+
+from score_normalization import normalize_score
+

# From https://github.com/mwaskom/seaborn/blob/master/seaborn/categorical.py
def restyle_boxplot(artist_dict, color, gray="#222222", linewidth=1, fliersize=5):
@@ -42,6 +52,10 @@ def restyle_boxplot(artist_dict, color, gray="#222222", linewidth=1, fliersize=5
parser.add_argument("--fontsize", help="Font size", type=int, default=14)
parser.add_argument("-l", "--labels", help="Custom labels", type=str, nargs="+")
parser.add_argument("-b", "--boxplot", help="Enable boxplot", action="store_true", default=False)
+parser.add_argument("-r", "--rliable", help="Enable rliable plots", action="store_true", default=False)
+parser.add_argument("-vs", "--versus", help="Enable probability of improvement plot", action="store_true", default=False)
+parser.add_argument("-iqm", "--iqm", help="Enable IQM sample efficiency plot", action="store_true", default=False)
+parser.add_argument("-ci", "--ci-size", help="Confidence interval size (for rliable)", type=float, default=0.95)
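+# Example (hypothetical invocation; the input results-file arguments are omitted here, as
+# they depend on how this script is called): passing `--rliable --iqm --versus -ci 0.95`
+# enables the rliable aggregate plots, the IQM sample-efficiency curve and the
+# probability-of-improvement plot.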
parser.add_argument("-latex", "--latex", help="Enable latex support", action="store_true", default=False)
parser.add_argument("--merge", help="Merge with other results files", nargs="+", default=[], type=str)

@@ -132,21 +146,192 @@ def restyle_boxplot(artist_dict, color, gray="#222222", linewidth=1, fliersize=5

# Convert to pandas dataframe, in order to use seaborn
labels_df, envs_df, scores = [], [], []
+# Post-process the results so they can be used with rliable
+# Maps algo label -> normalized scores of shape (n_runs, n_envs)
+normalized_score_dict = {}
+# Maps algo label -> normalized scores of shape (n_runs, n_envs, n_eval)
+all_eval_normalized_scores_dict = {}
+# Convert env key to env id for score normalization
+env_key_to_env_id = {
+    "Half": "HalfCheetahBulletEnv-v0",
+    "Ant": "AntBulletEnv-v0",
+    "Hopper": "HopperBulletEnv-v0",
+    "Walker": "Walker2DBulletEnv-v0",
+}
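+# NOTE: `normalize_score` (imported from `score_normalization`) is assumed here to min-max
+# normalize raw episodic returns with per-environment reference scores, roughly:
+#   normalized = (raw - ref_min[env_id]) / (ref_max[env_id] - ref_min[env_id])
+# so that scores are comparable across environments before aggregation.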
+# Backward compat
+skip_all_algos_dict = False
+
for key in keys:
+    algo_scores, all_algo_scores = [], []
    for env in envs:
        if isinstance(results[env][key]["last_evals"], (np.float32, np.float64)):
            # Not enough timesteps
            print(f"Skipping {env}-{key}")
            continue
+
        for score in results[env][key]["last_evals"]:
            labels_df.append(labels[key])
            # convert to int if needed
            # labels_df.append(int(labels[key]))
            envs_df.append(env)
            scores.append(score)

+        algo_scores.append(results[env][key]["last_evals"])
+
+        # Backward compat: mean_per_eval key may not be present
+        if "mean_per_eval" in results[env][key]:
+            all_algo_scores.append(results[env][key]["mean_per_eval"])
+        else:
+            skip_all_algos_dict = True
+
+        # Normalize the score; the env key must match an entry in `env_key_to_env_id`
+        if env in env_key_to_env_id:
+            algo_scores[-1] = normalize_score(algo_scores[-1], env_key_to_env_id[env])
+            if not skip_all_algos_dict:
+                all_algo_scores[-1] = normalize_score(all_algo_scores[-1], env_key_to_env_id[env])
+        elif env not in env_key_to_env_id and args.rliable:
+            warnings.warn(f"{env} not found for normalizing scores, you should update `env_key_to_env_id`")
+
+    # Truncate to the smallest number of runs so the scores can be stacked into a matrix
+    min_runs = min([len(algo_score) for algo_score in algo_scores])
+    if min_runs > 0:
+        algo_scores = [algo_score[:min_runs] for algo_score in algo_scores]
+        # shape: (n_envs, n_runs) -> (n_runs, n_envs)
+        normalized_score_dict[labels[key]] = np.array(algo_scores).T
+        if not skip_all_algos_dict:
+            all_algo_scores = [all_algo_score[:, :min_runs] for all_algo_score in all_algo_scores]
+            # shape: (n_envs, n_eval, n_runs) -> (n_runs, n_envs, n_eval)
+            all_eval_normalized_scores_dict[labels[key]] = np.array(all_algo_scores).transpose((2, 0, 1))
+
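+# After this loop, `normalized_score_dict` maps each algorithm label to an array of shape
+# (n_runs, n_envs) and `all_eval_normalized_scores_dict` (when `mean_per_eval` is available)
+# maps each label to an array of shape (n_runs, n_envs, n_eval); these are the inputs used
+# by rliable below.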
data_frame = pd.DataFrame(data=dict(Method=labels_df, Environment=envs_df, Score=scores))

+# Rliable plots, see https://github.com/google-research/rliable
+if args.rliable:
+
+    if rly is None:
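+        # rliable can typically be installed with `pip install rliable`
+        # (see https://github.com/google-research/rliable); it requires Python 3.7+.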
+        raise ImportError("You must install the rliable package to use this feature. Note: Python 3.7+ is required in that case.")
+
+    print("Computing bootstrap CI ...")
+    algorithms = list(labels.values())
+    # Scores as a dictionary mapping algorithms to their normalized
+    # score matrices, each of which is of size `(num_runs x num_envs)`.
+
+    aggregate_func = lambda x: np.array(  # noqa: E731
+        [
+            metrics.aggregate_median(x),
+            metrics.aggregate_iqm(x),
+            metrics.aggregate_mean(x),
+            metrics.aggregate_optimality_gap(x),
+        ]
+    )
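+    # IQM is the mean of the middle 50% of runs (more robust to outliers than the plain mean);
+    # the optimality gap measures how far scores fall below a target normalized score
+    # (1.0 by default in rliable). See Agarwal et al. (2021), "Deep RL at the Edge of the
+    # Statistical Precipice".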
+    aggregate_scores, aggregate_interval_estimates = rly.get_interval_estimates(
+        normalized_score_dict,
+        aggregate_func,
+        # Default was 50000
+        reps=2000,  # Number of bootstrap replications.
+        confidence_interval_size=args.ci_size,  # Coverage of confidence interval. Defaults to 95%.
+    )
+
+    fig, axes = plot_utils.plot_interval_estimates(
+        aggregate_scores,
+        aggregate_interval_estimates,
+        metric_names=["Median", "IQM", "Mean", "Optimality Gap"],
+        algorithms=algorithms,
+        xlabel="Normalized Score",
+        xlabel_y_coordinate=0.02,
+        subfigure_width=5,
+        row_height=1,
+        max_ticks=4,
+        interval_height=0.6,
+    )
+    fig.canvas.manager.set_window_title("Rliable metrics")
+    # Adjust margin to see the x label
+    plt.tight_layout()
+    plt.subplots_adjust(bottom=0.2)
+
+    # Performance profiles
+    # Normalized score thresholds
+    normalized_score_thresholds = np.linspace(0.0, 1.5, 50)
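+    # A performance profile reports, for each threshold tau in this range, the fraction of
+    # runs (pooled across environments) whose normalized score exceeds tau.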
+    score_distributions, score_distributions_cis = rly.create_performance_profile(
+        normalized_score_dict,
+        normalized_score_thresholds,
+        reps=2000,
+        confidence_interval_size=args.ci_size,
+    )
+    # Plot score distributions
+    fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
+    plot_utils.plot_performance_profiles(
+        score_distributions,
+        normalized_score_thresholds,
+        performance_profile_cis=score_distributions_cis,
+        colors=dict(zip(algorithms, seaborn.color_palette("colorblind"))),
+        xlabel=r"Normalized Score $(\tau)$",
+        ax=ax,
+    )
+    fig.canvas.manager.set_window_title("Performance profiles")
+    plt.legend()
+
+    # Probability of improvement
+    # Scores as a dictionary containing pairs of normalized score
+    # matrices for pairs of algorithms we want to compare
+    algorithm_pairs_keys = itertools.combinations(algorithms, 2)
+    # algorithm_pairs = {.. , 'x,y': (score_x, score_y), ..}
+    algorithm_pairs = {}
+    for algo1, algo2 in algorithm_pairs_keys:
+        algorithm_pairs[f"{algo1}, {algo2}"] = (normalized_score_dict[algo1], normalized_score_dict[algo2])
+
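+    # Probability of improvement P(X > Y): the probability that algorithm X outperforms
+    # algorithm Y on a randomly selected task (and run), see the rliable documentation.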
+    if args.versus:
+        average_probabilities, average_prob_cis = rly.get_interval_estimates(
+            algorithm_pairs,
+            metrics.probability_of_improvement,
+            reps=1000,  # Default was 50000
+            confidence_interval_size=args.ci_size,
+        )
+        plot_utils.plot_probability_of_improvement(
+            average_probabilities,
+            average_prob_cis,
+            figsize=(10, 8),
+            interval_height=0.6,
+        )
+        plt.gcf().canvas.manager.set_window_title("Probability of Improvement")
+        plt.tight_layout()
+
+    if args.iqm:
+        # Load scores as a dictionary mapping algorithms to their normalized
+        # score matrices across all evaluations, each of which is of size
+        # `(n_runs, n_envs, n_eval)` where scores are recorded every n steps.
+        # Only compute CI for 1/4 of the evaluations and keep the first and last eval
+        downsample_factor = 4
+        n_evals = all_eval_normalized_scores_dict[algorithms[0]].shape[-1]
+        eval_indices = np.arange(n_evals - 1)[::downsample_factor]
+        eval_indices = np.concatenate((eval_indices, [n_evals - 1]))
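+        # e.g. with n_evals = 20 and downsample_factor = 4, this keeps indices [0, 4, 8, 12, 16, 19]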
+        eval_indices_scores_dict = {
+            algorithm: score[:, :, eval_indices] for algorithm, score in all_eval_normalized_scores_dict.items()
+        }
+        iqm = lambda scores: np.array(  # noqa: E731
+            [metrics.aggregate_iqm(scores[..., eval_idx]) for eval_idx in range(scores.shape[-1])]
+        )
+        iqm_scores, iqm_cis = rly.get_interval_estimates(
+            eval_indices_scores_dict,
+            iqm,
+            reps=2000,
+            confidence_interval_size=args.ci_size,
+        )
+        plot_utils.plot_sample_efficiency_curve(
+            eval_indices + 1,
+            iqm_scores,
+            iqm_cis,
+            algorithms=algorithms,
+            # TODO: convert evaluation indices to timesteps (using the recorded timesteps)
+            xlabel=r"Number of Evaluations",
+            ylabel="IQM Normalized Score",
+        )
+        plt.gcf().canvas.manager.set_window_title("IQM Normalized Score - Sample Efficiency Curve")
+        plt.legend()
+        plt.tight_layout()
+
+    plt.show()
+
# Plot final results with env as x axis
plt.figure("Sensitivity plot", figsize=args.figsize)
plt.title("Sensitivity plot", fontsize=args.fontsize)