diff --git a/ext/EmulatorModelsExt.jl b/ext/EmulatorModelsExt.jl index bf7cb60ece..4926667f4a 100644 --- a/ext/EmulatorModelsExt.jl +++ b/ext/EmulatorModelsExt.jl @@ -12,6 +12,7 @@ import CloudMicrophysics.Parameters as CMP function AA.N_activated_per_mode( machine::MLJ.Machine, ap::CMP.AerosolActivationParameters, + ad::CMP.AerosolDistributionType, aip::CMP.AirProperties, tps::TDP.ThermodynamicsParameters, T::FT, @@ -19,7 +20,7 @@ function AA.N_activated_per_mode( w::FT, q::TD.PhasePartition{FT}, ) where {FT <: Real} - hygro = mean_hygroscopicity_parameter(ap, ad) + hygro = AA.mean_hygroscopicity_parameter(ap, ad) return ntuple(Val(AM.n_modes(ad))) do i # Model predicts activation of the first mode. So, swap each mode # with the first mode repeatedly to predict all activations. diff --git a/ml_test/Common.jl b/ml_test/Common.jl new file mode 100644 index 0000000000..834cc9cbb6 --- /dev/null +++ b/ml_test/Common.jl @@ -0,0 +1,77 @@ +import CSV +import DataFrames as DF +using DataFramesMeta + +function get_num_modes(df::DataFrame) + i = 1 + while true + if !("mode_$(i)_N" in names(df)) + return i - 1 + end + i += 1 + end +end + +function get_num_modes(data_row::NamedTuple) + i = 1 + while true + if !(Symbol("mode_$(i)_N") in keys(data_row)) + return i - 1 + end + i += 1 + end +end + +function read_aerosol_dataset( + dataset_filename::String, + Y_name = :mode_1_act_frac_S_interp +) + initial_data = DF.DataFrame(CSV.File(dataset_filename)) + df = filter(row -> row.S_max > 0 && row.S_max < 0.2, initial_data) + selected_columns_X = [] + num_modes = get_num_modes(df) + @info(num_modes) + for i in 1:num_modes + append!( + selected_columns_X, + Symbol.([ + "mode_$(i)_N", + "mode_$(i)_mean", + "mode_$(i)_stdev", + "mode_$(i)_kappa", + ]), + ) + end + append!( + selected_columns_X, + [:velocity, :initial_temperature, :initial_pressure], + ) + X = df[:, selected_columns_X] + Y = df[:, Y_name] + return (X, Y, initial_data) +end + +function preprocess_aerosol_data(X::DataFrame) + num_modes = get_num_modes(X) + for i in 1:num_modes + X = DF.transform( + X, + Symbol("mode_$(i)_N") => ByRow(log) => Symbol("mode_$(i)_N"), + ) + X = DF.transform( + X, + Symbol("mode_$(i)_mean") => + ByRow(log) => Symbol("mode_$(i)_mean"), + ) + end + X = DF.transform(X, :velocity => ByRow(log) => :velocity) + return X +end + +function target_transform(act_frac) + return @. atanh(2.0 * 0.99 * (act_frac - 0.5)) +end + +function inverse_target_transform(transformed_act_frac) + return @. (1.0 / (2.0 * 0.99)) * tanh(transformed_act_frac) + 0.5 +end diff --git a/ml_test/PreprocessAerosolData.jl b/ml_test/PreprocessAerosolData.jl deleted file mode 100644 index 62d5b80323..0000000000 --- a/ml_test/PreprocessAerosolData.jl +++ /dev/null @@ -1,207 +0,0 @@ -#module PreprocessAerosolData - -import CloudMicrophysics as CM - -import CloudMicrophysics.AerosolActivation as AA -import CloudMicrophysics.AerosolModel as AM -import CloudMicrophysics.Parameters as CMP -import CloudMicrophysics.Parameters as CMP -import CloudMicrophysics.Common as CO - -import ClimaParams as CP -import Thermodynamics as TD -import DataFrames as DF -using DataFramesMeta - -const FT = Float64 -#const APS = CMP.ParametersType - -function get_num_modes(df::DataFrame) - i = 1 - while true - if !("mode_$(i)_N" in names(df)) - return i - 1 - end - i += 1 - end -end - -function get_num_modes(data_row::NamedTuple) - i = 1 - while true - if !(Symbol("mode_$(i)_N") in keys(data_row)) - return i - 1 - end - i += 1 - end -end - -#function convert_to_ARG_params(data_row::NamedTuple, tps) -# -# num_modes = get_num_modes(data_row) -# @assert num_modes > 0 -# mode_Ns = [] -# mode_means = [] -# mode_stdevs = [] -# mode_kappas = [] -# w = data_row.velocity -# T = data_row.initial_temperature -# p = data_row.initial_pressure -# for i in 1:num_modes -# push!(mode_Ns, data_row[Symbol("mode_$(i)_N")]) -# push!(mode_means, data_row[Symbol("mode_$(i)_mean")]) -# push!(mode_stdevs, data_row[Symbol("mode_$(i)_stdev")]) -# push!(mode_kappas, data_row[Symbol("mode_$(i)_kappa")]) -# end -# ad = AM.AerosolDistribution( -# Tuple( -# AM.Mode_κ( -# mode_means[i], -# mode_stdevs[i], -# mode_Ns[i], -# FT(1), -# FT(1), -# FT(0), -# mode_kappas[i], -# 1, -# ) for i in 1:num_modes -# ), -# ) -# pv0 = TD.saturation_vapor_pressure(tps, T, TD.Liquid()) -# vapor_mix_ratio = pv0 / TD.Parameters.molmass_ratio(tps) / (p - pv0) -# q_vap = vapor_mix_ratio / (vapor_mix_ratio + 1) -# q = TD.PhasePartition(q_vap, FT(0), FT(0)) -# return (; ad, T, p, w, q, mode_Ns) -#end -# -#function convert_to_ARG_params(data_row::NamedTuple) -# return convert_to_ARG_params(data_row, default_param_set) -#end -# -#function get_ARG_S_max(data_row::NamedTuple, param_set::APS) -# (; ad, T, p, w, q) = convert_to_ARG_params(data_row, param_set) -# max_supersaturation = -# AA.max_supersaturation(param_set, CT.ARG2000Type(), ad, T, p, w, q) -# return max_supersaturation -#end -# -#function get_ARG_S_max(data_row::NamedTuple) -# return get_ARG_S_max(data_row, default_param_set) -#end -# -#function get_ARG_S_max(X::DataFrame, param_set::APS) -# return get_ARG_S_max.(NamedTuple.(eachrow(X)), param_set) -#end -# -#function get_ARG_S_max(X::DataFrame) -# return get_ARG_S_max(X, default_param_set) -#end -# -#function get_ARG_S_crit(data_row::NamedTuple, param_set::APS) -# (; ad, T) = convert_to_ARG_params(data_row, param_set) -# return AA.critical_supersaturation(param_set, ad, T) -#end -# -#function get_ARG_S_crit(data_row::NamedTuple) -# return get_ARG_S_crit(data_row, default_param_set) -#end -# -#function get_ARG_S_crit(X::DataFrame, param_set::APS) -# return get_ARG_S_crit.(NamedTuple.(eachrow(X)), param_set) -#end -# -#function get_ARG_S_crit(X::DataFrame) -# return get_ARG_S_crit(X, default_param_set) -#end -# -#function get_ARG_act_N(data_row::NamedTuple, param_set::APS, S_max = nothing) -# (; ad, T, p, w, q) = convert_to_ARG_params(data_row, param_set) -# if S_max === nothing -# return collect( -# AA.N_activated_per_mode( -# param_set, -# CT.ARG2000Type(), -# ad, -# T, -# p, -# w, -# q, -# ), -# ) -# else -# critical_supersaturation = AA.critical_supersaturation(param_set, ad, T) -# return collect( -# AA.N_activated_per_mode( -# param_set, -# CT.ARG2000Type(), -# ad, -# T, -# p, -# w, -# q, -# S_max, -# critical_supersaturation, -# ), -# ) -# end -#end -# -#function get_ARG_act_N(data_row::NamedTuple, S_max = nothing) -# return get_ARG_act_N(data_row, default_param_set, S_max) -#end -# -#function get_ARG_act_N(X::DataFrame, param_set::APS, S_max = nothing) -# return transpose( -# hcat(get_ARG_act_N.(NamedTuple.(eachrow(X)), param_set, S_max)...), -# ) -#end -# -#function get_ARG_act_N(X::DataFrame, S_max = nothing) -# return get_ARG_act_N(X, default_param_set, S_max) -#end -# -#function get_ARG_act_frac(data_row::NamedTuple, param_set::APS, S_max = nothing) -# (; mode_Ns) = convert_to_ARG_params(data_row, param_set) -# return get_ARG_act_N(data_row, param_set, S_max) ./ mode_Ns -#end -# -#function get_ARG_act_frac(data_row::NamedTuple, S_max = nothing) -# return get_ARG_act_frac(data_row, default_param_set, S_max) -#end -# -#function get_ARG_act_frac(X::DataFrame, param_set::APS, S_max = nothing) -# return transpose( -# hcat(get_ARG_act_frac.(NamedTuple.(eachrow(X)), param_set, S_max)...), -# ) -#end -# -#function get_ARG_act_frac(X::DataFrame, S_max = nothing) -# return get_ARG_act_frac(X, default_param_set, S_max) -#end - -function preprocess_aerosol_data(X::DataFrame) - num_modes = get_num_modes(X) - for i in 1:num_modes - X = DF.transform( - X, - Symbol("mode_$(i)_N") => ByRow(log) => Symbol("mode_$(i)_N"), - ) - X = DF.transform( - X, - Symbol("mode_$(i)_mean") => - ByRow(log) => Symbol("mode_$(i)_mean"), - ) - end - X = DF.transform(X, :velocity => ByRow(log) => :velocity) - return X -end - -function target_transform(act_frac) - return @. atanh(2.0 * 0.99 * (act_frac - 0.5)) -end - -function inverse_target_transform(transformed_act_frac) - return @. (1.0 / (2.0 * 0.99)) * tanh(transformed_act_frac) + 0.5 -end - -#end # module diff --git a/ml_test/Project.toml b/ml_test/Project.toml index f505bcaec6..48af46c9e1 100644 --- a/ml_test/Project.toml +++ b/ml_test/Project.toml @@ -1,12 +1,11 @@ [deps] -CLIMAParameters = "6eacf6c3-8458-43b9-ae03-caf5306d3d53" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +ClimaParams = "5c42b081-d73a-476f-9059-fd94b934656c" CloudMicrophysics = "6a9e3e04-43cd-43ba-94b9-e8782df3c71b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DataFramesMeta = "1313f7d8-7da2-5740-9ea0-a2ca25f37964" -EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" -JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" MLJFlux = "094fc8d1-fd35-5302-93ea-dabda2abf845" MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7" diff --git a/ml_test/testing.jl b/ml_test/testing.jl deleted file mode 100644 index ab73bd8b97..0000000000 --- a/ml_test/testing.jl +++ /dev/null @@ -1,63 +0,0 @@ -import CloudMicrophysics -import MLJ -import MLJFlux -import Flux - -include(joinpath( - pkgdir(CloudMicrophysics), - "ml_test", - "PreprocessAerosolData.jl") -) - -@info(PreprocessAerosolData) - -struct NNBuilder <: MLJFlux.Builder - layer_sizes::Vector{Integer} - dropout::Vector{Float64} -end - -function MLJFlux.build(builder::NNBuilder, rng, n_in, n_out) - @assert length(builder.layer_sizes) == length(builder.dropout) - num_hidden_layers = length(builder.layer_sizes) - init = Flux.glorot_uniform(rng) - layers::Vector{Any} = [] - if num_hidden_layers == 0 - push!(layers, Flux.Dense(n_in => n_out, init = init)) - else - push!( - layers, - Flux.Dense( - n_in => builder.layer_sizes[1], - Flux.sigmoid_fast, - init = init, - ), - ) - end - for i in 1:num_hidden_layers - push!(layers, Flux.Dropout(builder.dropout[i])) - if i == num_hidden_layers - push!( - layers, - Flux.Dense(builder.layer_sizes[i] => n_out, init = init), - ) - else - push!( - layers, - Flux.Dense( - builder.layer_sizes[i] => builder.layer_sizes[i + 1], - Flux.sigmoid_fast, - init = init, - ), - ) - end - end - return Flux.Chain(layers...) -end - -emulator_filepath = joinpath( - pkgdir(CloudMicrophysics), - "ml_test", - "2modal_nn_machine_naive.jls", -) - -machine = MLJ.machine(emulator_filepath) diff --git a/ml_test/train_emulator.jl b/ml_test/train_emulator.jl new file mode 100644 index 0000000000..dbe2fb6958 --- /dev/null +++ b/ml_test/train_emulator.jl @@ -0,0 +1,155 @@ +# Get ML packages +import MLJ +import Flux +import MLJModels +import MLJFlux +Standardizer = MLJ.@load Standardizer pkg = MLJModels +NeuralNetworkRegressor = MLJ.@load NeuralNetworkRegressor pkg = MLJFlux + +# path to our ML testing folder +import CloudMicrophysics as CM +fpath = joinpath(pkgdir(CM), "ml_test") + +# We will be precise +const FT = Float64 + +# Container for the NN we are about to build +struct NNBuilder <: MLJFlux.Builder + layer_sizes::Vector{Integer} + dropout::Vector{Float64} +end + +# TODO - what does it do exactly? +function MLJFlux.build(builder::NNBuilder, rng, n_in, n_out) + @assert length(builder.layer_sizes) == length(builder.dropout) + num_hidden_layers = length(builder.layer_sizes) + init = Flux.glorot_uniform(rng) + layers::Vector{Any} = [] + if num_hidden_layers == 0 + push!(layers, Flux.Dense(n_in => n_out, init = init)) + else + push!( + layers, + Flux.Dense( + n_in => builder.layer_sizes[1], + Flux.sigmoid_fast, + init = init, + ), + ) + end + for i in 1:num_hidden_layers + push!(layers, Flux.Dropout(builder.dropout[i])) + if i == num_hidden_layers + push!( + layers, + Flux.Dense(builder.layer_sizes[i] => n_out, init = init), + ) + else + push!( + layers, + Flux.Dense( + builder.layer_sizes[i] => builder.layer_sizes[i + 1], + Flux.sigmoid_fast, + init = init, + ), + ) + end + end + return Flux.Chain(layers...) +end + +# Load aerosol data reading and preprocessing functions +include(joinpath(fpath, "Common.jl")) + +# Get the ML model +function get_ML_model(machine_name) + # If the ML model already exists load it in. + # If it does not exist, train a NN + if isfile(joinpath(fpath, machine_name)) + # Read-in the saved ML model + emulator_filepath = joinpath(fpath, machine_name) + return MLJ.machine(emulator_filepath) + else + # Download data files if you don't have them + # (not the most elegant way...) + fname = "2modal_dataset1_train.csv" + if !isfile(joinpath(fpath, "data", fname)) + # For reasons unknown to humankind uploading/downloading the file + # from Caltech box changes the file. We are using dropbox for now. + # In the end we should upload the files to Caltech Data + # (and pray they are not changed there...) + url = "https://www.dropbox.com/scl/fi/qgq6ujvqenebjkskqvht5/2modal_dataset1_train.csv?rlkey=53qtqz0mtce993gy5jtnpdfz5&dl=0" + download(url, fname) + mkdir(joinpath(fpath, "data")) + mv(joinpath(pkgdir(CM), fname), joinpath(fpath, "data", fname), force=true) + end + + # Read the aerosol data + X_train, Y_train, initial_data = + read_aerosol_dataset(joinpath(fpath, "data", fname)) + + # Define the training pipeline + pipeline = + preprocess_aerosol_data |> + Standardizer() |> + MLJ.TransformedTargetModel( + NeuralNetworkRegressor( + builder = NNBuilder([250, 50, 5], [0.3, 0.0, 0.0]), + optimiser = Flux.Optimise.Adam(0.001, (0.9, 0.999), 1.0e-8), + epochs = 2000, + loss = Flux.mse, + batch_size = 1000, + ), + transformer = target_transform, + inverse = inverse_target_transform, + ) + # Create the untrained ML model + mach = MLJ.machine(pipeline, X_train, Y_train) + # Train a new ML model + @info("Training a new ML model. This may take a minute or two.") + MLJ.fit!(mach, verbosity = 0) + # Save the ML model to a binary file that can be re-used by CloudMicrophysics + MLJ.save(joinpath(fpath, machine_name), mach) + return mach + end +end + +# Try using the ML model and the default aerosol activation functions +import ClimaParams as CP +import Thermodynamics as TD + +import CloudMicrophysics.AerosolModel as AM +import CloudMicrophysics.AerosolActivation as AA +import CloudMicrophysics.Parameters as CMP + +aip = CMP.AirProperties(FT) +tps = TD.Parameters.ThermodynamicsParameters(FT) +ap = CMP.AerosolActivationParameters(FT) + +# Atmospheric conditions +T = FT(294) # air temperature K +p = FT(1e5) # air pressure Pa +w = FT(0.5) # vertical velocity m/s +p_vs = TD.saturation_vapor_pressure(tps, T, TD.Liquid()) +q_vs = 1 / (1 - TD.Parameters.molmass_ratio(tps) * (p_vs - p) / p_vs) +q = TD.PhasePartition(q_vs) + +# Aerosol size distribution +salt = CMP.Seasalt(FT) +# Accumulation mode +r1 = FT(0.243 * 1e-6) # m +σ1 = FT(1.4) # - +N1 = FT(100 * 1e6) # 1/m3 +# Coarse Mode +r2 = FT(1.5 * 1e-6) # m +σ2 = FT(2.1) # - +N2 = FT(1e6) # 1/m3 +acc = AM.Mode_κ(r1, σ1, N1, (FT(1.0),), (FT(1.0),), (salt.M,), (salt.κ,), 1) +crs = AM.Mode_κ(r2, σ2, N2, (FT(1.0),), (FT(1.0),), (salt.M,), (salt.κ,), 1) +ad = AM.AerosolDistribution((crs, acc)) + +# Get the ML model +mach = get_ML_model("2modal_nn_machine_naive.jls") + +@info("ML version: ", AA.N_activated_per_mode(mach, ap, ad, aip, tps, T, p, w, q)) +@info("CM version: ", AA.N_activated_per_mode(ap, ad, aip, tps, T, p, w, q))