CliMA · trontrytel · Mar 6, 2024 · Aug 20, 2023
diff --git a/Project.toml b/Project.toml
@@ -11,10 +11,19 @@ RootSolvers = "7181ea78-2dcb-4de3-ab41-2b8ab5a31e74"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Thermodynamics = "b60c26fb-14c3-4610-9d3e-2d17fe7ff00c"
 
+[weakdeps]
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
+
+[extensions]
+EmulatorModelsExt = ["DataFrames", "MLJ"]
+
 [compat]
 ClimaParams = "0.10"
+DataFrames = "1.6"
 DocStringExtensions = "0.8, 0.9"
 ForwardDiff = "0.10"
+MLJ = "0.20"
 RootSolvers = "0.3, 0.4"
 SpecialFunctions = "1, 2"
 Thermodynamics = "0.12.4"

diff --git a/ext/Common.jl b/ext/Common.jl
@@ -0,0 +1,77 @@
+import CSV
+import DataFrames as DF
+using DataFramesMeta
+
+function get_num_modes(df::DataFrame)
+    i = 1
+    while true
+        if !("mode_$(i)_N" in names(df))
+            return i - 1
+        end
+        i += 1
+    end
+end
+
+function get_num_modes(data_row::NamedTuple)
+    i = 1
+    while true
+        if !(Symbol("mode_$(i)_N") in keys(data_row))
+            return i - 1
+        end
+        i += 1
+    end
+end
+
+function read_aerosol_dataset(
+    dataset_filename::String,
+    Y_name = :mode_1_act_frac_S_interp,
+)
+    initial_data = DF.DataFrame(CSV.File(dataset_filename))
+    df = filter(row -> row.S_max > 0 && row.S_max < 0.2, initial_data)
+    selected_columns_X = []
+    num_modes = get_num_modes(df)
+    @info(num_modes)
+    for i in 1:num_modes
+        append!(
+            selected_columns_X,
+            Symbol.([
+                "mode_$(i)_N",
+                "mode_$(i)_mean",
+                "mode_$(i)_stdev",
+                "mode_$(i)_kappa",
+            ]),
+        )
+    end
+    append!(
+        selected_columns_X,
+        [:velocity, :initial_temperature, :initial_pressure],
+    )
+    X = df[:, selected_columns_X]
+    Y = df[:, Y_name]
+    return (X, Y, initial_data)
+end
+
+function preprocess_aerosol_data(X::DataFrame)
+    num_modes = get_num_modes(X)
+    for i in 1:num_modes
+        X = DF.transform(
+            X,
+            Symbol("mode_$(i)_N") => ByRow(log) => Symbol("mode_$(i)_N"),
+        )
+        X = DF.transform(
+            X,
+            Symbol("mode_$(i)_mean") =>
+                ByRow(log) => Symbol("mode_$(i)_mean"),
+        )
+    end
+    X = DF.transform(X, :velocity => ByRow(log) => :velocity)
+    return X
+end
+
+function target_transform(act_frac)
+    return @. atanh(2.0 * 0.99 * (act_frac - 0.5))
+end
+
+function inverse_target_transform(transformed_act_frac)
+    return @. (1.0 / (2.0 * 0.99)) * tanh(transformed_act_frac) + 0.5
+end
diff --git a/ext/EmulatorModelsExt.jl b/ext/EmulatorModelsExt.jl
@@ -0,0 +1,47 @@
+module EmulatorModelsExt
+
+import MLJ
+import DataFrames as DF
+
+import Thermodynamics as TD
+import Thermodynamics.Parameters as TDP
+import CloudMicrophysics.AerosolActivation as AA
+import CloudMicrophysics.AerosolModel as AM
+import CloudMicrophysics.Parameters as CMP
+
+function AA.N_activated_per_mode(
+    machine::MLJ.Machine,
+    ap::CMP.AerosolActivationParameters,
+    ad::CMP.AerosolDistributionType,
+    aip::CMP.AirProperties,
+    tps::TDP.ThermodynamicsParameters,
+    T::FT,
+    p::FT,
+    w::FT,
+    q::TD.PhasePartition{FT},
+) where {FT <: Real}
+    hygro = AA.mean_hygroscopicity_parameter(ap, ad)
+    return ntuple(Val(AM.n_modes(ad))) do i
+        # Model predicts activation of the first mode. So, swap each mode
+        # with the first mode repeatedly to predict all activations.
+        modes_perm = collect(1:AM.n_modes(ad))
+        modes_perm[[1, i]] = modes_perm[[i, 1]]
+        per_mode_data = [
+            (;
+                Symbol("mode_$(j)_N") => ad.Modes[modes_perm[j]].N,
+                Symbol("mode_$(j)_mean") => ad.Modes[modes_perm[j]].r_dry,
+                Symbol("mode_$(j)_stdev") => ad.Modes[modes_perm[j]].stdev,
+                Symbol("mode_$(j)_kappa") => hygro[modes_perm[j]],
+            ) for j in 1:AM.n_modes(ad)
+        ]
+        additional_data = (;
+            :velocity => w,
+            :initial_temperature => T,
+            :initial_pressure => p,
+        )
+        X = DF.DataFrame([merge(reduce(merge, per_mode_data), additional_data)])
+        max(FT(0), min(FT(1), MLJ.predict(machine, X)[1])) * ad.Modes[i].N
+    end
+end
+
+end # module
diff --git a/test/Project.toml b/test/Project.toml
@@ -1,11 +1,18 @@
 [deps]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
-ClimaParams = "5c42b081-d73a-476f-9059-fd94b934656c"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ClimaComms = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
+ClimaParams = "5c42b081-d73a-476f-9059-fd94b934656c"
 CloudMicrophysics = "6a9e3e04-43cd-43ba-94b9-e8782df3c71b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+DataFramesMeta = "1313f7d8-7da2-5740-9ea0-a2ca25f37964"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
+MLJFlux = "094fc8d1-fd35-5302-93ea-dabda2abf845"
+MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7"
 RootSolvers = "7181ea78-2dcb-4de3-ab41-2b8ab5a31e74"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

diff --git a/test/emulator_NN.jl b/test/emulator_NN.jl
@@ -0,0 +1,165 @@
+# Get ML packages
+import MLJ
+import Flux
+import MLJModels
+import MLJFlux
+Standardizer = MLJ.@load Standardizer pkg = MLJModels
+NeuralNetworkRegressor = MLJ.@load NeuralNetworkRegressor pkg = MLJFlux
+
+# Get the testing package
+import Test as TT
+
+# Get the CliMA packages
+import CloudMicrophysics as CM
+import ClimaParams as CP
+import Thermodynamics as TD
+import CloudMicrophysics.AerosolModel as AM
+import CloudMicrophysics.AerosolActivation as AA
+import CloudMicrophysics.Parameters as CMP
+
+# Container for the NN we are about to build
+struct NNBuilder <: MLJFlux.Builder
+    layer_sizes::Vector{Integer}
+    dropout::Vector
+end
+
+# Define the NN structure
+function MLJFlux.build(builder::NNBuilder, rng, n_in, n_out)
+    @assert length(builder.layer_sizes) == length(builder.dropout)
+    num_hidden_layers = length(builder.layer_sizes)
+    init = Flux.glorot_uniform(rng)
+    layers::Vector{Any} = []
+    if num_hidden_layers == 0
+        push!(layers, Flux.Dense(n_in => n_out, init = init))
+    else
+        push!(
+            layers,
+            Flux.Dense(
+                n_in => builder.layer_sizes[1],
+                Flux.sigmoid_fast,
+                init = init,
+            ),
+        )
+    end
+    for i in 1:num_hidden_layers
+        push!(layers, Flux.Dropout(builder.dropout[i]))
+        if i == num_hidden_layers
+            push!(
+                layers,
+                Flux.Dense(builder.layer_sizes[i] => n_out, init = init),
+            )
+        else
+            push!(
+                layers,
+                Flux.Dense(
+                    builder.layer_sizes[i] => builder.layer_sizes[i + 1],
+                    Flux.sigmoid_fast,
+                    init = init,
+                ),
+            )
+        end
+    end
+    return Flux.Chain(layers...)
+end
+
+# Load aerosol data reading and preprocessing functions
+include(joinpath(pkgdir(CM), "ext", "Common.jl"))
+
+# Get the ML model
+function get_2modal_NN_model_FT32()
+    FT = Float32
+    machine_name = "2modal_nn_machine_naive.jls"
+
+    # If the ML model already exists load it in.
+    # If it does not exist, train a NN
+    fpath = joinpath(pkgdir(CM), "test")
+    if isfile(joinpath(fpath, machine_name))
+        # Read-in the saved ML model
+        emulator_filepath = joinpath(fpath, machine_name)
+        return MLJ.machine(emulator_filepath)
+    else
+        # Download data files if you don't have them
+        # (not the most elegant way...)
+        fname = "2modal_dataset1_train.csv"
+        if !isfile(joinpath(fpath, "data", fname))
+            # For reasons unknown to humankind uploading/downloading the file
+            # from Caltech box changes the file. We are using dropbox for now.
+            # In the end we should upload the files to Caltech Data
+            # (and pray they are not changed there...)
+            url = "https://www.dropbox.com/scl/fi/qgq6ujvqenebjkskqvht5/2modal_dataset1_train.csv?rlkey=53qtqz0mtce993gy5jtnpdfz5&dl=0"
+            download(url, fname)
+            mkdir(joinpath(fpath, "data"))
+            mv(fname, joinpath(fpath, "data", fname), force = true)
+        end
+
+        # Read the aerosol data
+        X_train, Y_train, initial_data =
+            read_aerosol_dataset(joinpath(fpath, "data", fname))
+
+        # Define the training pipeline
+        pipeline =
+            preprocess_aerosol_data |>
+            Standardizer() |>
+            MLJ.TransformedTargetModel(
+                NeuralNetworkRegressor(
+                    builder = NNBuilder([250, 50, 5], [FT(0.3), FT(0), FT(0)]),
+                    optimiser = Flux.Optimise.Adam(0.001, (0.9, 0.999), 1.0e-8),
+                    epochs = 2000,
+                    loss = Flux.mse,
+                    batch_size = 1000,
+                ),
+                transformer = target_transform,
+                inverse = inverse_target_transform,
+            )
+        # Create the untrained ML model
+        mach = MLJ.machine(pipeline, X_train, Y_train)
+        # Train a new ML model
+        @info("Training a new ML model. This may take a minute or two.")
+        MLJ.fit!(mach, verbosity = 0)
+        # Save the ML model to a binary file that can be re-used by CloudMicrophysics
+        MLJ.save(joinpath(fpath, machine_name), mach)
+        return mach
+    end
+end
+
+function test_emulator_NN(FT)
+
+    aip = CMP.AirProperties(FT)
+    tps = TD.Parameters.ThermodynamicsParameters(FT)
+    ap = CMP.AerosolActivationParameters(FT)
+
+    # Atmospheric conditions
+    T = FT(294)    # air temperature K
+    p = FT(1e5)    # air pressure Pa
+    w = FT(0.5)    # vertical velocity m/s
+    p_vs = TD.saturation_vapor_pressure(tps, T, TD.Liquid())
+    q_vs = 1 / (1 - TD.Parameters.molmass_ratio(tps) * (p_vs - p) / p_vs)
+    q = TD.PhasePartition(q_vs)
+
+    # Aerosol size distribution
+    salt = CMP.Seasalt(FT)
+    # Accumulation mode
+    r1 = FT(0.243 * 1e-6) # m
+    σ1 = FT(1.4)          # -
+    N1 = FT(100 * 1e6)    # 1/m3
+    # Coarse Mode
+    r2 = FT(1.5 * 1e-6)   # m
+    σ2 = FT(2.1)          # -
+    N2 = FT(1e6)          # 1/m3
+    acc = AM.Mode_κ(r1, σ1, N1, (FT(1.0),), (FT(1.0),), (salt.M,), (salt.κ,), 1)
+    crs = AM.Mode_κ(r2, σ2, N2, (FT(1.0),), (FT(1.0),), (salt.M,), (salt.κ,), 1)
+    ad = AM.AerosolDistribution((crs, acc))
+
+    # Get the ML model
+    mach = get_2modal_NN_model_FT32()
+
+    TT.@test AA.N_activated_per_mode(mach, ap, ad, aip, tps, T, p, w, q)[1] ≈
+             AA.N_activated_per_mode(ap, ad, aip, tps, T, p, w, q)[1] rtol =
+        1e-5
+    TT.@test AA.N_activated_per_mode(mach, ap, ad, aip, tps, T, p, w, q)[2] ≈
+             AA.N_activated_per_mode(ap, ad, aip, tps, T, p, w, q)[2] rtol =
+        1e-3
+end
+
+@info "Aerosol activation NN test"
+test_emulator_NN(Float32)
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -13,4 +13,5 @@ include("common_types_tests.jl")
 include("nucleation_unit_tests.jl")
 include("precipitation_susceptibility_tests.jl")
 include("p3_tests.jl")
+include("emulator_NN.jl")
 include("aqua.jl")