From d38910d1de91e3178eb29fd3e5f910bfb0357392 Mon Sep 17 00:00:00 2001 From: Mikhail Mints Date: Sun, 20 Aug 2023 09:32:10 -0700 Subject: [PATCH] NN aerosol activation extension --- Project.toml | 9 +++ ext/Common.jl | 77 ++++++++++++++++++ ext/EmulatorModelsExt.jl | 47 +++++++++++ test/Project.toml | 9 ++- test/emulator_NN.jl | 165 +++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 6 files changed, 307 insertions(+), 1 deletion(-) create mode 100644 ext/Common.jl create mode 100644 ext/EmulatorModelsExt.jl create mode 100644 test/emulator_NN.jl diff --git a/Project.toml b/Project.toml index c554c7d16..52ca40a09 100644 --- a/Project.toml +++ b/Project.toml @@ -11,10 +11,19 @@ RootSolvers = "7181ea78-2dcb-4de3-ab41-2b8ab5a31e74" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Thermodynamics = "b60c26fb-14c3-4610-9d3e-2d17fe7ff00c" +[weakdeps] +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" + +[extensions] +EmulatorModelsExt = ["DataFrames", "MLJ"] + [compat] ClimaParams = "0.10" +DataFrames = "1.6" DocStringExtensions = "0.8, 0.9" ForwardDiff = "0.10" +MLJ = "0.20" RootSolvers = "0.3, 0.4" SpecialFunctions = "1, 2" Thermodynamics = "0.12.4" diff --git a/ext/Common.jl b/ext/Common.jl new file mode 100644 index 000000000..b09274649 --- /dev/null +++ b/ext/Common.jl @@ -0,0 +1,77 @@ +import CSV +import DataFrames as DF +using DataFramesMeta + +function get_num_modes(df::DataFrame) + i = 1 + while true + if !("mode_$(i)_N" in names(df)) + return i - 1 + end + i += 1 + end +end + +function get_num_modes(data_row::NamedTuple) + i = 1 + while true + if !(Symbol("mode_$(i)_N") in keys(data_row)) + return i - 1 + end + i += 1 + end +end + +function read_aerosol_dataset( + dataset_filename::String, + Y_name = :mode_1_act_frac_S_interp, +) + initial_data = DF.DataFrame(CSV.File(dataset_filename)) + df = filter(row -> row.S_max > 0 && row.S_max < 0.2, initial_data) + selected_columns_X = [] + num_modes = get_num_modes(df) + @info(num_modes) + for i in 1:num_modes + append!( + selected_columns_X, + Symbol.([ + "mode_$(i)_N", + "mode_$(i)_mean", + "mode_$(i)_stdev", + "mode_$(i)_kappa", + ]), + ) + end + append!( + selected_columns_X, + [:velocity, :initial_temperature, :initial_pressure], + ) + X = df[:, selected_columns_X] + Y = df[:, Y_name] + return (X, Y, initial_data) +end + +function preprocess_aerosol_data(X::DataFrame) + num_modes = get_num_modes(X) + for i in 1:num_modes + X = DF.transform( + X, + Symbol("mode_$(i)_N") => ByRow(log) => Symbol("mode_$(i)_N"), + ) + X = DF.transform( + X, + Symbol("mode_$(i)_mean") => + ByRow(log) => Symbol("mode_$(i)_mean"), + ) + end + X = DF.transform(X, :velocity => ByRow(log) => :velocity) + return X +end + +function target_transform(act_frac) + return @. atanh(2.0 * 0.99 * (act_frac - 0.5)) +end + +function inverse_target_transform(transformed_act_frac) + return @. (1.0 / (2.0 * 0.99)) * tanh(transformed_act_frac) + 0.5 +end diff --git a/ext/EmulatorModelsExt.jl b/ext/EmulatorModelsExt.jl new file mode 100644 index 000000000..4926667f4 --- /dev/null +++ b/ext/EmulatorModelsExt.jl @@ -0,0 +1,47 @@ +module EmulatorModelsExt + +import MLJ +import DataFrames as DF + +import Thermodynamics as TD +import Thermodynamics.Parameters as TDP +import CloudMicrophysics.AerosolActivation as AA +import CloudMicrophysics.AerosolModel as AM +import CloudMicrophysics.Parameters as CMP + +function AA.N_activated_per_mode( + machine::MLJ.Machine, + ap::CMP.AerosolActivationParameters, + ad::CMP.AerosolDistributionType, + aip::CMP.AirProperties, + tps::TDP.ThermodynamicsParameters, + T::FT, + p::FT, + w::FT, + q::TD.PhasePartition{FT}, +) where {FT <: Real} + hygro = AA.mean_hygroscopicity_parameter(ap, ad) + return ntuple(Val(AM.n_modes(ad))) do i + # Model predicts activation of the first mode. So, swap each mode + # with the first mode repeatedly to predict all activations. + modes_perm = collect(1:AM.n_modes(ad)) + modes_perm[[1, i]] = modes_perm[[i, 1]] + per_mode_data = [ + (; + Symbol("mode_$(j)_N") => ad.Modes[modes_perm[j]].N, + Symbol("mode_$(j)_mean") => ad.Modes[modes_perm[j]].r_dry, + Symbol("mode_$(j)_stdev") => ad.Modes[modes_perm[j]].stdev, + Symbol("mode_$(j)_kappa") => hygro[modes_perm[j]], + ) for j in 1:AM.n_modes(ad) + ] + additional_data = (; + :velocity => w, + :initial_temperature => T, + :initial_pressure => p, + ) + X = DF.DataFrame([merge(reduce(merge, per_mode_data), additional_data)]) + max(FT(0), min(FT(1), MLJ.predict(machine, X)[1])) * ad.Modes[i].N + end +end + +end # module diff --git a/test/Project.toml b/test/Project.toml index 238e5446c..d1b797297 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,11 +1,18 @@ [deps] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -ClimaParams = "5c42b081-d73a-476f-9059-fd94b934656c" +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ClimaComms = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" +ClimaParams = "5c42b081-d73a-476f-9059-fd94b934656c" CloudMicrophysics = "6a9e3e04-43cd-43ba-94b9-e8782df3c71b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DataFramesMeta = "1313f7d8-7da2-5740-9ea0-a2ca25f37964" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" +MLJFlux = "094fc8d1-fd35-5302-93ea-dabda2abf845" +MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7" RootSolvers = "7181ea78-2dcb-4de3-ab41-2b8ab5a31e74" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/emulator_NN.jl b/test/emulator_NN.jl new file mode 100644 index 000000000..e0ee5bed5 --- /dev/null +++ b/test/emulator_NN.jl @@ -0,0 +1,165 @@ +# Get ML packages +import MLJ +import Flux +import MLJModels +import MLJFlux +Standardizer = MLJ.@load Standardizer pkg = MLJModels +NeuralNetworkRegressor = MLJ.@load NeuralNetworkRegressor pkg = MLJFlux + +# Get the testing package +import Test as TT + +# Get the CliMA packages +import CloudMicrophysics as CM +import ClimaParams as CP +import Thermodynamics as TD +import CloudMicrophysics.AerosolModel as AM +import CloudMicrophysics.AerosolActivation as AA +import CloudMicrophysics.Parameters as CMP + +# Container for the NN we are about to build +struct NNBuilder <: MLJFlux.Builder + layer_sizes::Vector{Integer} + dropout::Vector +end + +# Define the NN structure +function MLJFlux.build(builder::NNBuilder, rng, n_in, n_out) + @assert length(builder.layer_sizes) == length(builder.dropout) + num_hidden_layers = length(builder.layer_sizes) + init = Flux.glorot_uniform(rng) + layers::Vector{Any} = [] + if num_hidden_layers == 0 + push!(layers, Flux.Dense(n_in => n_out, init = init)) + else + push!( + layers, + Flux.Dense( + n_in => builder.layer_sizes[1], + Flux.sigmoid_fast, + init = init, + ), + ) + end + for i in 1:num_hidden_layers + push!(layers, Flux.Dropout(builder.dropout[i])) + if i == num_hidden_layers + push!( + layers, + Flux.Dense(builder.layer_sizes[i] => n_out, init = init), + ) + else + push!( + layers, + Flux.Dense( + builder.layer_sizes[i] => builder.layer_sizes[i + 1], + Flux.sigmoid_fast, + init = init, + ), + ) + end + end + return Flux.Chain(layers...) +end + +# Load aerosol data reading and preprocessing functions +include(joinpath(pkgdir(CM), "ext", "Common.jl")) + +# Get the ML model +function get_2modal_NN_model_FT32() + FT = Float32 + machine_name = "2modal_nn_machine_naive.jls" + + # If the ML model already exists load it in. + # If it does not exist, train a NN + fpath = joinpath(pkgdir(CM), "test") + if isfile(joinpath(fpath, machine_name)) + # Read-in the saved ML model + emulator_filepath = joinpath(fpath, machine_name) + return MLJ.machine(emulator_filepath) + else + # Download data files if you don't have them + # (not the most elegant way...) + fname = "2modal_dataset1_train.csv" + if !isfile(joinpath(fpath, "data", fname)) + # For reasons unknown to humankind uploading/downloading the file + # from Caltech box changes the file. We are using dropbox for now. + # In the end we should upload the files to Caltech Data + # (and pray they are not changed there...) + url = "https://www.dropbox.com/scl/fi/qgq6ujvqenebjkskqvht5/2modal_dataset1_train.csv?rlkey=53qtqz0mtce993gy5jtnpdfz5&dl=0" + download(url, fname) + mkdir(joinpath(fpath, "data")) + mv(fname, joinpath(fpath, "data", fname), force = true) + end + + # Read the aerosol data + X_train, Y_train, initial_data = + read_aerosol_dataset(joinpath(fpath, "data", fname)) + + # Define the training pipeline + pipeline = + preprocess_aerosol_data |> + Standardizer() |> + MLJ.TransformedTargetModel( + NeuralNetworkRegressor( + builder = NNBuilder([250, 50, 5], [FT(0.3), FT(0), FT(0)]), + optimiser = Flux.Optimise.Adam(0.001, (0.9, 0.999), 1.0e-8), + epochs = 2000, + loss = Flux.mse, + batch_size = 1000, + ), + transformer = target_transform, + inverse = inverse_target_transform, + ) + # Create the untrained ML model + mach = MLJ.machine(pipeline, X_train, Y_train) + # Train a new ML model + @info("Training a new ML model. This may take a minute or two.") + MLJ.fit!(mach, verbosity = 0) + # Save the ML model to a binary file that can be re-used by CloudMicrophysics + MLJ.save(joinpath(fpath, machine_name), mach) + return mach + end +end + +function test_emulator_NN(FT) + + aip = CMP.AirProperties(FT) + tps = TD.Parameters.ThermodynamicsParameters(FT) + ap = CMP.AerosolActivationParameters(FT) + + # Atmospheric conditions + T = FT(294) # air temperature K + p = FT(1e5) # air pressure Pa + w = FT(0.5) # vertical velocity m/s + p_vs = TD.saturation_vapor_pressure(tps, T, TD.Liquid()) + q_vs = 1 / (1 - TD.Parameters.molmass_ratio(tps) * (p_vs - p) / p_vs) + q = TD.PhasePartition(q_vs) + + # Aerosol size distribution + salt = CMP.Seasalt(FT) + # Accumulation mode + r1 = FT(0.243 * 1e-6) # m + σ1 = FT(1.4) # - + N1 = FT(100 * 1e6) # 1/m3 + # Coarse Mode + r2 = FT(1.5 * 1e-6) # m + σ2 = FT(2.1) # - + N2 = FT(1e6) # 1/m3 + acc = AM.Mode_κ(r1, σ1, N1, (FT(1.0),), (FT(1.0),), (salt.M,), (salt.κ,), 1) + crs = AM.Mode_κ(r2, σ2, N2, (FT(1.0),), (FT(1.0),), (salt.M,), (salt.κ,), 1) + ad = AM.AerosolDistribution((crs, acc)) + + # Get the ML model + mach = get_2modal_NN_model_FT32() + + TT.@test AA.N_activated_per_mode(mach, ap, ad, aip, tps, T, p, w, q)[1] ≈ + AA.N_activated_per_mode(ap, ad, aip, tps, T, p, w, q)[1] rtol = + 1e-5 + TT.@test AA.N_activated_per_mode(mach, ap, ad, aip, tps, T, p, w, q)[2] ≈ + AA.N_activated_per_mode(ap, ad, aip, tps, T, p, w, q)[2] rtol = + 1e-3 +end + +@info "Aerosol activation NN test" +test_emulator_NN(Float32) diff --git a/test/runtests.jl b/test/runtests.jl index 31e94bdd8..13347a7d8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,4 +13,5 @@ include("common_types_tests.jl") include("nucleation_unit_tests.jl") include("precipitation_susceptibility_tests.jl") include("p3_tests.jl") +include("emulator_NN.jl") include("aqua.jl")