From f9b2927131803620d9af3ac671006bbb2de6b83e Mon Sep 17 00:00:00 2001 From: huebleruwm <37674341+huebleruwm@users.noreply.github.com> Date: Mon, 10 Jul 2023 18:42:07 -0500 Subject: [PATCH 1/7] Openacc tweaks and cleanup (#1097) * Chaning acc declare statements to acc enter data statement * Making acc statements more consistent * Making lapack useable while using openacc. Lapack is still run on the CPU * Updating setup_clubb_core to now accept clubb_config_flags as an input, and adding warning in case clubb is running with lapack but was compiled with openacc. --- src/CLUBB_core/adg1_adg2_3d_luhar_pdf.F90 | 4 +- src/CLUBB_core/advance_clubb_core_module.F90 | 196 ++++++++++-------- src/CLUBB_core/advance_helper_module.F90 | 40 +++- .../advance_windm_edsclrm_module.F90 | 4 +- src/CLUBB_core/advance_wp2_wp3_module.F90 | 38 ++-- src/CLUBB_core/advance_xm_wpxp_module.F90 | 85 ++++---- src/CLUBB_core/advance_xp2_xpyp_module.F90 | 4 +- src/CLUBB_core/clip_explicit.F90 | 6 +- src/CLUBB_core/clubb_api_module.F90 | 140 +------------ src/CLUBB_core/diffusion.F90 | 24 +-- src/CLUBB_core/fill_holes.F90 | 12 +- src/CLUBB_core/grid_class.F90 | 8 +- src/CLUBB_core/interpolation.F90 | 2 +- src/CLUBB_core/matrix_solver_wrapper.F90 | 73 +++---- src/CLUBB_core/mean_adv.F90 | 4 +- src/CLUBB_core/mixing_length.F90 | 42 ++-- src/CLUBB_core/mono_flux_limiter.F90 | 41 ++-- src/CLUBB_core/sfc_varnce_module.F90 | 52 ++--- src/CLUBB_core/sigma_sqd_w_module.F90 | 4 +- src/CLUBB_core/turbulent_adv_pdf.F90 | 8 +- src/clubb_driver.F90 | 10 +- 21 files changed, 373 insertions(+), 424 deletions(-) diff --git a/src/CLUBB_core/adg1_adg2_3d_luhar_pdf.F90 b/src/CLUBB_core/adg1_adg2_3d_luhar_pdf.F90 index 222b26b32..09334a800 100644 --- a/src/CLUBB_core/adg1_adg2_3d_luhar_pdf.F90 +++ b/src/CLUBB_core/adg1_adg2_3d_luhar_pdf.F90 @@ -147,7 +147,7 @@ subroutine ADG1_pdf_driver( nz, ngrdcol, & ! In integer :: j ! Loop index - !$acc declare create( w_1_n, w_2_n ) + !$acc enter data create( w_1_n, w_2_n ) ! Calculate the mixture fraction and the PDF component means and variances ! of w. @@ -204,6 +204,8 @@ subroutine ADG1_pdf_driver( nz, ngrdcol, & ! In enddo ! i=1, sclr_dim endif ! l_scalar_calc + !$acc exit data delete( w_1_n, w_2_n ) + return end subroutine ADG1_pdf_driver diff --git a/src/CLUBB_core/advance_clubb_core_module.F90 b/src/CLUBB_core/advance_clubb_core_module.F90 index 3a8ba25bc..925448535 100644 --- a/src/CLUBB_core/advance_clubb_core_module.F90 +++ b/src/CLUBB_core/advance_clubb_core_module.F90 @@ -1136,20 +1136,22 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) ! only be used to compute the variance at the surface. -dschanen 8 Sept 2009 if ( .not. l_host_applies_sfc_fluxes ) then - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol wpthlp(i,1) = wpthlp_sfc(i) wprtp(i,1) = wprtp_sfc(i) upwp(i,1) = upwp_sfc(i) vpwp(i,1) = vpwp_sfc(i) end do + !$acc end parallel loop if ( clubb_config_flags%l_linearize_pbl_winds ) then - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol upwp_pert(i,1) = upwp_sfc_pert(i) vpwp_pert(i,1) = vpwp_sfc_pert(i) end do + !$acc end parallel loop endif ! l_linearize_pbl_winds ! Set fluxes for passive scalars (if enabled) @@ -1160,6 +1162,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! 
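! --- Illustrative sketch, not part of the patch --------------------------------
! A minimal, self-contained example of the conversion pattern described in the
! commit message above: local device arrays move from "!$acc declare create" to
! an explicit "enter data" on entry with a matching "exit data" before return,
! and parallel loops gain explicit "gang vector" clauses and
! "!$acc end parallel loop" markers. The subroutine and array names here
! (sketch_enter_exit_data, work, field) are hypothetical, not CLUBB routines.
subroutine sketch_enter_exit_data( nz, ngrdcol, field )

  use clubb_precision, only: &
      core_rknd ! Variable(s)

  implicit none

  integer, intent(in) :: &
    nz,      & ! Number of vertical levels
    ngrdcol    ! Number of grid columns

  real( kind = core_rknd ), dimension(ngrdcol,nz), intent(inout) :: &
    field ! Field updated on the device

  real( kind = core_rknd ), dimension(ngrdcol,nz) :: &
    work ! Local scratch array

  integer :: i, k

  ! Old form was:  !$acc declare create( work )
  !$acc enter data copyin( field ) create( work )

  !$acc parallel loop gang vector collapse(2) default(present)
  do k = 1, nz
    do i = 1, ngrdcol
      work(i,k)  = max( field(i,k), 0.0_core_rknd )
      field(i,k) = work(i,k)
    end do
  end do
  !$acc end parallel loop

  !$acc exit data copyout( field ) delete( work )

  return

end subroutine sketch_enter_exit_data
! --- End sketch -----------------------------------------------------------------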
intent(in) wpsclrp(i,1,j) = wpsclrp_sfc(i,j) end do end do + !$acc end parallel loop end if if ( edsclr_dim > 0 ) then @@ -1169,17 +1172,19 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) wpedsclrp(i,1,j) = wpedsclrp_sfc(i,j) end do end do + !$acc end parallel loop end if else - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol wpthlp(i,1) = 0.0_core_rknd wprtp(i,1) = 0.0_core_rknd upwp(i,1) = 0.0_core_rknd vpwp(i,1) = 0.0_core_rknd end do + !$acc end parallel loop ! Set fluxes for passive scalars (if enabled) if ( sclr_dim > 0 ) then @@ -1189,6 +1194,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) wpsclrp(i,1,j) = 0.0_core_rknd end do end do + !$acc end parallel loop end if if ( edsclr_dim > 0 ) then @@ -1198,20 +1204,23 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) wpedsclrp(i,1,j) = 0.0_core_rknd end do end do + !$acc end parallel loop end if end if ! ~l_host_applies_sfc_fluxes #ifdef CLUBBND_CAM - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol newmu(i) = varmu(i) end do + !$acc end parallel loop #else - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol newmu(i) = mu end do + !$acc end parallel loop #endif if ( clubb_config_flags%ipdf_call_placement == ipdf_pre_advance_fields & @@ -1304,6 +1313,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) wp2_zt(i,k) = max( wp2_zt(i,k), w_tol_sqd ) end do end do + !$acc end parallel loop beta = clubb_params(ibeta) Skw_denom_coef = clubb_params(iSkw_denom_coef) @@ -1336,6 +1346,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) *exp( -(1.0_core_rknd/2.0_core_rknd) * (Skw_zm(i,k)/gamma_coefc)**2 ) end do end do + !$acc end parallel loop else !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -1343,6 +1354,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) gamma_Skw_fnc(i,k) = gamma_coef end do end do + !$acc end parallel loop endif ! Compute sigma_sqd_w (dimensionless PDF width parameter) @@ -1361,6 +1373,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) sigma_sqd_w(i,k) = max( zero_threshold, sigma_sqd_w(i,k) ) ! Pos. def. quantity end do end do + !$acc end parallel loop endif ! clubb_config_flags%ipdf_call_placement == ipdf_post_advance_fields @@ -1381,6 +1394,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) a3_coef(i,k) = -2._core_rknd * ( 1._core_rknd - sigma_sqd_w(i,k) )**2 + 3.0_core_rknd end do end do + !$acc end parallel loop ! We found we obtain fewer spikes in wp3 when we clip a3 to be no greater ! than -1.4 -dschanen 4 Jan 2011 @@ -1391,6 +1405,8 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) a3_coef(i,k) = max( a3_coef(i,k), a3_coef_min ) end do end do + !$acc end parallel loop + a3_coef_zt(:,:) = zm2zt( nz, ngrdcol, gr, a3_coef(:,:) ) ! Interpolate thlp2, rtp2, and rtpthlp to thermodynamic levels. @@ -1405,6 +1421,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) rtp2_zt(i,k) = max( rtp2_zt(i,k), rt_tol**2 ) end do end do + !$acc end parallel loop ! Compute wp3 / wp2 on zt levels. Always use the interpolated value in the ! denominator since it's less likely to create spikes @@ -1414,6 +1431,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! 
intent(in) wp3_on_wp2_zt(i,k) = ( wp3(i,k) / max( wp2_zt(i,k), w_tol_sqd ) ) end do end do + !$acc end parallel loop ! Clip wp3_on_wp2_zt if it's too large !$acc parallel loop gang vector collapse(2) default(present) @@ -1426,6 +1444,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) end if end do end do + !$acc end parallel loop ! Compute wp3_on_wp2 by interpolating wp3_on_wp2_zt wp3_on_wp2(:,:) = zt2zm( nz, ngrdcol, gr, wp3_on_wp2_zt(:,:) ) @@ -1451,6 +1470,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) em(i,k) = three_halves * wp2(i,k) end do end do + !$acc end parallel loop else !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -1458,6 +1478,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) em(i,k) = 0.5_core_rknd * ( wp2(i,k) + vp2(i,k) + up2(i,k) ) end do end do + !$acc end parallel loop end if sqrt_em_zt(:,:) = zm2zt( nz, ngrdcol, gr, em(:,:) ) @@ -1468,6 +1489,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) sqrt_em_zt(i,k) = sqrt( max( em_min, sqrt_em_zt(i,k) ) ) end do end do + !$acc end parallel loop !---------------------------------------------------------------- ! Compute mixing length and dissipation time @@ -1503,6 +1525,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) tau_zt(i,k) = min( Lscale(i,k) / sqrt_em_zt(i,k), taumax ) end do end do + !$acc end parallel loop tau_zm(:,:) = zt2zm( nz, ngrdcol, gr, Lscale(:,:) ) @@ -1513,6 +1536,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) / sqrt( max( em_min, em(i,k) ) ) ), taumax ) end do end do + !$acc end parallel loop !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -1528,6 +1552,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) tau_max_zt(i,k) = taumax end do end do + !$acc end parallel loop ! End Vince Larson's replacement. @@ -1601,6 +1626,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) Kh_zt(i,k) = c_K * Lscale(i,k) * sqrt_em_zt(i,k) end do end do + !$acc end parallel loop Lscale_zm(:,:) = zt2zm( nz, ngrdcol, gr, Lscale(:,:) ) @@ -1611,6 +1637,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) * sqrt( max( em(i,k), em_min ) ) end do end do + !$acc end parallel loop ! calculate Brunt-Vaisala frequency used for splatting brunt_vaisala_freq_sqd_splat & @@ -1703,6 +1730,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) mixt_frac_zm(i,k) = pdf_params_zm%mixt_frac(i,k) end do end do + !$acc end parallel loop else w_1_zm(:,:) = zt2zm( nz, ngrdcol, gr, pdf_params%w_1(:,:) ) w_2_zm(:,:) = zt2zm( nz, ngrdcol, gr, pdf_params%w_2(:,:) ) @@ -1746,6 +1774,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) invrs_tau_C1_zm(i,k) = invrs_tau_N2_zm(i,k) end do end do + !$acc end parallel loop else !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -1755,6 +1784,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) invrs_tau_C1_zm(i,k) = invrs_tau_wp2_zm(i,k) end do end do + !$acc end parallel loop end if ! l_stability_correction ! Set invrs_tau variables for C4 and C14 @@ -1764,6 +1794,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) invrs_tau_C14_zm(i,k) = invrs_tau_wp2_zm(i,k) end do end do + !$acc end parallel loop if ( .not. l_use_invrs_tau_N2_iso ) then !$acc parallel loop gang vector collapse(2) default(present) @@ -1772,6 +1803,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! 
intent(in) invrs_tau_C4_zm(i,k) = invrs_tau_wp2_zm(i,k) end do end do + !$acc end parallel loop else !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -1779,6 +1811,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) invrs_tau_C4_zm(i,k) = invrs_tau_N2_iso(i,k) end do end do + !$acc end parallel loop end if if ( l_stats_samp ) then @@ -1843,6 +1876,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) Cx_fnc_Richardson(i,k) = 0.0 end do end do + !$acc end parallel loop end if ! Loop over the 4 main advance subroutines -- advance_xm_wpxp, @@ -2160,6 +2194,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) Kmh_zm(i,k) = Kh_zm(i,k) * C_K10h ! Coefficient for thermo end do end do + !$acc end parallel loop if ( edsclr_dim > 1 .and. clubb_config_flags%l_do_expldiff_rtm_thlm ) then !$acc parallel loop gang vector collapse(2) default(present) @@ -2169,6 +2204,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) edsclrm(i,k,edsclr_dim) = rtm(i,k) end do end do + !$acc end parallel loop end if call advance_windm_edsclrm( nz, ngrdcol, gr, dt, & ! intent(in) @@ -2211,6 +2247,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) end if end do end do + !$acc end parallel loop end if @@ -2317,6 +2354,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) vp2_zt(i,k) = max( vp2_zt(i,k), w_tol_sqd ) end do end do + !$acc end parallel loop if ( clubb_config_flags%iiPDF_type == iiPDF_ADG1 ) then @@ -2330,6 +2368,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) sigma_sqd_w_zt(i,k) = max( sigma_sqd_w_zt(i,k), zero_threshold ) end do end do + !$acc end parallel loop call xp3_LG_2005_ansatz( nz, ngrdcol, Skw_zt, wpthlp_zt, wp2_zt, & thlp2_zt, sigma_sqd_w_zt, & @@ -2362,6 +2401,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) sclrp2_zt(i,k) = max( sclrp2_zt(i,k), sclr_tol(j)**2 ) end do end do + !$acc end parallel loop call xp3_LG_2005_ansatz( nz, ngrdcol, Skw_zt, wpsclrp_zt, wp2_zt, & sclrp2_zt, sigma_sqd_w_zt, & @@ -2532,6 +2572,7 @@ subroutine advance_clubb_core ( gr, nz, ngrdcol, & ! intent(in) qclvar(i,k) = rcp2_zt(i,k) end do end do + !$acc end parallel loop #endif @@ -3732,7 +3773,7 @@ subroutine pdf_closure_driver( gr, nz, ngrdcol, & ! Intent(in) ! Clip if extrap. causes sclrm_zm to be less than sclr_tol - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol sclrm_zm(i,nz,j) = max( sclrm_zm(i,nz,j), sclr_tol(j) ) end do @@ -3747,7 +3788,7 @@ subroutine pdf_closure_driver( gr, nz, ngrdcol, & ! Intent(in) ! p_sfc, which is p_in_Pa(1). Thus, p_in_Pa_zm(1) = p_in_Pa(1). p_in_Pa_zm(:,:) = zt2zm( nz, ngrdcol, gr, p_in_Pa(:,:) ) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol p_in_Pa_zm(i,1) = p_in_Pa(i,1) @@ -3768,7 +3809,7 @@ subroutine pdf_closure_driver( gr, nz, ngrdcol, & ! Intent(in) rtm_zm(:,:) = zt2zm( nz, ngrdcol, gr, rtm(:,:) ) thlm_zm(:,:) = zt2zm( nz, ngrdcol, gr, thlm(:,:) ) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Clip if extrapolation at the top level causes rtm_zm to be < rt_tol rtm_zm(i,nz) = max( rtm_zm(i,nz), rt_tol ) @@ -3859,7 +3900,7 @@ subroutine pdf_closure_driver( gr, nz, ngrdcol, & ! Intent(in) end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! 
Since top momentum level is higher than top thermo level, ! set variables at top momentum level to 0. @@ -3886,7 +3927,7 @@ subroutine pdf_closure_driver( gr, nz, ngrdcol, & ! Intent(in) end do !$acc end parallel loop #ifndef CLUBB_CAM - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol rcp2(i,nz) = zero end do @@ -3906,7 +3947,7 @@ subroutine pdf_closure_driver( gr, nz, ngrdcol, & ! Intent(in) wp2up2(:,:) = zt2zm( nz, ngrdcol, gr, wp2up2_zt(:,:) ) wp2vp2(:,:) = zt2zm( nz, ngrdcol, gr, wp2vp2_zt(:,:) ) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol wpthvp(i,nz) = 0.0_core_rknd thlpthvp(i,nz) = 0.0_core_rknd @@ -3940,7 +3981,7 @@ subroutine pdf_closure_driver( gr, nz, ngrdcol, & ! Intent(in) sclrpthvp(:,:,j) = zt2zm( nz, ngrdcol, gr, sclrpthvp_zt(:,:,j) ) sclrprcp(:,:,j) = zt2zm( nz, ngrdcol, gr, sclrprcp_zt(:,:,j) ) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do k = 1, nz do i = 1, ngrdcol sclrpthvp(i,nz,j) = 0.0_core_rknd @@ -4105,15 +4146,7 @@ subroutine setup_clubb_core & #ifdef GFDL I_sat_sphum, & ! intent(in) h1g, 2010-06-16 #endif - iiPDF_type, & ! intent(in) - ipdf_call_placement, & ! intent(in) - l_predict_upwp_vpwp, & ! intent(in) - l_min_xp2_from_corr_wx, & ! intent(in) - l_prescribed_avg_deltaz, & ! intent(in) - l_damp_wp2_using_em, & ! intent(in) - l_stability_correct_tau_zm, & ! intent(in) - l_enable_relaxed_clipping, & ! intent(in) - l_diag_Lscale_from_tau, & ! intent(in) + clubb_config_flags, & ! intent(in) #ifdef GFDL cloud_frac_min, & ! intent(in) h1g, 2010-06-16 @@ -4174,6 +4207,7 @@ subroutine setup_clubb_core & iiPDF_TSDADG, & iiPDF_LY93, & iiPDF_new_hybrid, & + lapack, & l_explicit_turbulent_adv_wpxp use clubb_precision, only: & @@ -4215,35 +4249,9 @@ subroutine setup_clubb_core & logical, intent(in) :: & l_input_fields ! Flag for whether LES input fields are being used - integer, intent(in) :: & - iiPDF_type, & ! Selected option for the two-component normal - ! (double Gaussian) PDF type to use for the w, - ! rt, and theta-l (or w, chi, and eta) portion of - ! CLUBB's multivariate, two-component PDF. - ipdf_call_placement ! Selected option for the placement of the call to - ! CLUBB's PDF. - - logical, intent(in) :: & - l_predict_upwp_vpwp, & ! Flag to predict and along with and - ! alongside the advancement of , , , , - ! , and in subroutine advance_xm_wpxp. - ! Otherwise, and are still approximated by eddy - ! diffusivity when and are advanced in subroutine - ! advance_windm_edsclrm. - l_min_xp2_from_corr_wx, & ! Flag to base the threshold minimum value of xp2 (rtp2 and - ! thlp2) on keeping the overall correlation of w and x within - ! the limits of -max_mag_correlation_flux to - ! max_mag_correlation_flux. - l_prescribed_avg_deltaz, & ! used in adj_low_res_nu. If .true., avg_deltaz = deltaz - l_stability_correct_tau_zm, & ! Use tau_N2_zm instead of tau_zm in wpxp_pr1 stability - ! correction - l_damp_wp2_using_em, & ! In wp2 equation, use a dissipation formula of - ! -(2/3)*em/tau_zm, as in Bougeault (1981) - l_enable_relaxed_clipping, & ! Flag to relax clipping on wpxp in - ! xm_wpxp_clipping_and_stats - l_diag_Lscale_from_tau ! First diagnose dissipation time tau, and - ! then diagnose the mixing length scale as - ! Lscale = tau * tke + type(clubb_config_flags_type), intent(in) :: & + clubb_config_flags + #ifdef GFDL logical, intent(in) :: & ! 
h1g, 2010-06-16 begin mod @@ -4261,17 +4269,29 @@ subroutine setup_clubb_core & err_code_out = clubb_no_error ! Initialize to no error value call initialize_error_headers +#ifdef _OPENACC + if ( clubb_config_flags%penta_solve_method == lapack ) then + write(fstderr,*) "WARNING: The penta-diagonal lapack solver is not GPU accelerated" + write(fstderr,*) " Set penta_solve_method = 2, to use an accelerated penta-diagonal solver" + end if + + if ( clubb_config_flags%tridiag_solve_method == lapack ) then + write(fstderr,*) "WARNING: The tri-diagonal lapack solver is not GPU accelerated" + write(fstderr,*) " Set tridiag_solve_method = 2, to use an accelerated tri-diagonal solver" + end if +#endif + ! Sanity check if ( clubb_at_least_debug_level( 0 ) ) then - if ( l_damp_wp2_using_em .and. & + if ( clubb_config_flags%l_damp_wp2_using_em .and. & (abs(params(iC1) - params(iC14)) > abs(params(iC1) + params(iC14)) / 2 * eps .or. & - l_stability_correct_tau_zm) ) then + clubb_config_flags%l_stability_correct_tau_zm) ) then write(fstderr,*) "l_damp_wp2_using_em = T requires C1=C14 and" & // " l_stability_correct_tau_zm = F" write(fstderr,*) "C1 = ", params(iC1) write(fstderr,*) "C14 = ", params(iC14) - write(fstderr,*) "l_stability_correct_tau_zm = ", l_stability_correct_tau_zm + write(fstderr,*) "l_stability_correct_tau_zm = ", clubb_config_flags%l_stability_correct_tau_zm write(fstderr,*) "Fatal error in setup_clubb_core" err_code = clubb_fatal_error err_code_out = clubb_fatal_error @@ -4307,9 +4327,10 @@ subroutine setup_clubb_core & ! Check for the type of two component normal (double Gaussian) PDF being ! used for w, rt, and theta-l (or w, chi, and eta). - if ( iiPDF_type < iiPDF_ADG1 .or. iiPDF_type > iiPDF_new_hybrid ) then - write(fstderr,*) "Unknown type of double Gaussian PDF selected: ", iiPDF_type - write(fstderr,*) "iiPDF_type = ", iiPDF_type + if ( clubb_config_flags%iiPDF_type < iiPDF_ADG1 & + .or. clubb_config_flags%iiPDF_type > iiPDF_new_hybrid ) then + write(fstderr,*) "Unknown type of double Gaussian PDF selected: ", clubb_config_flags%iiPDF_type + write(fstderr,*) "iiPDF_type = ", clubb_config_flags%iiPDF_type write(fstderr,*) "Fatal error in setup_clubb_core" err_code = clubb_fatal_error err_code_out = clubb_fatal_error @@ -4317,11 +4338,11 @@ subroutine setup_clubb_core & endif ! iiPDF_type < iiPDF_ADG1 or iiPDF_type > iiPDF_lY93 ! The ADG2 and 3D Luhar PDFs can only be used as part of input fields. - if ( iiPDF_type == iiPDF_ADG2 ) then + if ( clubb_config_flags%iiPDF_type == iiPDF_ADG2 ) then if ( .not. l_input_fields ) then write(fstderr,*) "The ADG2 PDF can only be used with" & // " input fields (l_input_fields = .true.)." - write(fstderr,*) "iiPDF_type = ", iiPDF_type + write(fstderr,*) "iiPDF_type = ", clubb_config_flags%iiPDF_type write(fstderr,*) "l_input_fields = ", l_input_fields write(fstderr,*) "Fatal error in setup_clubb_core" err_code = clubb_fatal_error @@ -4330,11 +4351,11 @@ subroutine setup_clubb_core & endif ! .not. l_input_fields endif ! iiPDF_type == iiPDF_ADG2 - if ( iiPDF_type == iiPDF_3D_Luhar ) then + if ( clubb_config_flags%iiPDF_type == iiPDF_3D_Luhar ) then if ( .not. l_input_fields ) then write(fstderr,*) "The 3D Luhar PDF can only be used with" & // " input fields (l_input_fields = .true.)." 
- write(fstderr,*) "iiPDF_type = ", iiPDF_type + write(fstderr,*) "iiPDF_type = ", clubb_config_flags%iiPDF_type write(fstderr,*) "l_input_fields = ", l_input_fields write(fstderr,*) "Fatal error in setup_clubb_core" err_code = clubb_fatal_error @@ -4345,11 +4366,11 @@ subroutine setup_clubb_core & ! This also currently applies to the new PDF until it has been fully ! implemented. - if ( iiPDF_type == iiPDF_new ) then + if ( clubb_config_flags%iiPDF_type == iiPDF_new ) then if ( .not. l_input_fields ) then write(fstderr,*) "The new PDF can only be used with" & // " input fields (l_input_fields = .true.)." - write(fstderr,*) "iiPDF_type = ", iiPDF_type + write(fstderr,*) "iiPDF_type = ", clubb_config_flags%iiPDF_type write(fstderr,*) "l_input_fields = ", l_input_fields write(fstderr,*) "Fatal error in setup_clubb_core" err_code = clubb_fatal_error @@ -4360,11 +4381,11 @@ subroutine setup_clubb_core & ! This also currently applies to the TSDADG PDF until it has been fully ! implemented. - if ( iiPDF_type == iiPDF_TSDADG ) then + if ( clubb_config_flags%iiPDF_type == iiPDF_TSDADG ) then if ( .not. l_input_fields ) then write(fstderr,*) "The new TSDADG PDF can only be used with" & // " input fields (l_input_fields = .true.)." - write(fstderr,*) "iiPDF_type = ", iiPDF_type + write(fstderr,*) "iiPDF_type = ", clubb_config_flags%iiPDF_type write(fstderr,*) "l_input_fields = ", l_input_fields write(fstderr,*) "Fatal error in setup_clubb_core" err_code = clubb_fatal_error @@ -4374,11 +4395,11 @@ subroutine setup_clubb_core & endif ! iiPDF_type == iiPDF_TSDADG ! This also applies to Lewellen and Yoh (1993). - if ( iiPDF_type == iiPDF_LY93 ) then + if ( clubb_config_flags%iiPDF_type == iiPDF_LY93 ) then if ( .not. l_input_fields ) then write(fstderr,*) "The Lewellen and Yoh PDF can only be used with" & // " input fields (l_input_fields = .true.)." - write(fstderr,*) "iiPDF_type = ", iiPDF_type + write(fstderr,*) "iiPDF_type = ", clubb_config_flags%iiPDF_type write(fstderr,*) "l_input_fields = ", l_input_fields write(fstderr,*) "Fatal error in setup_clubb_core" err_code = clubb_fatal_error @@ -4388,9 +4409,10 @@ subroutine setup_clubb_core & endif ! iiPDF_type == iiPDF_LY93 ! Check the option for the placement of the call to CLUBB's PDF. - if ( ipdf_call_placement < ipdf_pre_advance_fields & - .or. ipdf_call_placement > ipdf_pre_post_advance_fields ) then - write(fstderr,*) "Invalid option selected for ipdf_call_placement: ", ipdf_call_placement + if ( clubb_config_flags%ipdf_call_placement < ipdf_pre_advance_fields & + .or. clubb_config_flags%ipdf_call_placement > ipdf_pre_post_advance_fields ) then + write(fstderr,*) "Invalid option selected for ipdf_call_placement: ", & + clubb_config_flags%ipdf_call_placement write(fstderr,*) "Fatal error in setup_clubb_core" err_code = clubb_fatal_error err_code_out = clubb_fatal_error @@ -4399,7 +4421,7 @@ subroutine setup_clubb_core & ! The l_predict_upwp_vpwp flag requires that the ADG1 PDF is used ! implicitly in subroutine advance_xm_wpxp. - if ( l_predict_upwp_vpwp ) then + if ( clubb_config_flags%l_predict_upwp_vpwp ) then ! When l_predict_upwp_vpwp is enabled, the ! l_explicit_turbulent_adv_wpxp flag must be turned off. @@ -4420,8 +4442,8 @@ subroutine setup_clubb_core & ! the ADG1 PDF or the new hybrid PDF. The other PDFs are not currently ! set up to calculate variables needed for implicit or semi-implicit ! turbulent advection, such as coef_wp2up_implicit, etc. - if ( ( iiPDF_type /= iiPDF_ADG1 ) & - .and. 
( iiPDF_type /= iiPDF_new_hybrid ) ) then + if ( ( clubb_config_flags%iiPDF_type /= iiPDF_ADG1 ) & + .and. ( clubb_config_flags%iiPDF_type /= iiPDF_new_hybrid ) ) then write(fstderr,*) "Currently, only the ADG1 PDF and the new hybrid" & // " PDF are set up for use with the" & // " l_predict_upwp_vpwp code." @@ -4435,8 +4457,8 @@ subroutine setup_clubb_core & ! The flags l_min_xp2_from_corr_wx and l_enable_relaxed_clipping must ! have opposite values. - if ( ( l_min_xp2_from_corr_wx ) & - .and. ( l_enable_relaxed_clipping ) ) then + if ( ( clubb_config_flags%l_min_xp2_from_corr_wx ) & + .and. ( clubb_config_flags%l_enable_relaxed_clipping ) ) then write(fstderr,*) "Invalid configuration: l_min_xp2_from_corr_wx = T " & // "and l_enable_relaxed_clipping = T" write(fstderr,*) "They must have opposite values" @@ -4444,8 +4466,8 @@ subroutine setup_clubb_core & err_code = clubb_fatal_error err_code_out = clubb_fatal_error return - elseif ( ( .not. l_min_xp2_from_corr_wx ) & - .and. ( .not. l_enable_relaxed_clipping ) ) then + elseif ( ( .not. clubb_config_flags%l_min_xp2_from_corr_wx ) & + .and. ( .not. clubb_config_flags%l_enable_relaxed_clipping ) ) then write(fstderr,*) "Invalid configuration: l_min_xp2_from_corr_wx = F " & // "and l_enable_relaxed_clipping = F" write(fstderr,*) "They must have opposite values" @@ -4555,7 +4577,7 @@ subroutine setup_clubb_core & ! Checking that when the l_diag_Lscale_from_tau is enabled, the ! relevant Cx tunable parameters are all set to a value of 1 (as ! you're supposed to tune the C_invrs_tau_ parameters instead). - if ( l_diag_Lscale_from_tau ) then + if ( clubb_config_flags%l_diag_Lscale_from_tau ) then ! Note: someday when we can successfully run with all these parameters ! having a value of 1, the "Warning" messages should be removed and the @@ -4864,7 +4886,7 @@ subroutine trapezoidal_rule_zt( nz, ngrdcol, gr, l_call_pdf_closure_twice, & ! ! Since top momentum level is higher than top thermo. level, ! set variables at top momentum level to 0. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol wprtp2_zm(i,nz) = 0.0_core_rknd wpthlp2_zm(i,nz) = 0.0_core_rknd @@ -4881,7 +4903,7 @@ subroutine trapezoidal_rule_zt( nz, ngrdcol, gr, l_call_pdf_closure_twice, & ! wpsclrp2_zm(:,:,sclr) = zt2zm( nz, ngrdcol, gr, wpsclrp2(:,:,sclr) ) wpsclrpthlp_zm(:,:,sclr) = zt2zm( nz, ngrdcol, gr, wpsclrpthlp(:,:,sclr) ) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol wpsclrprtp_zm(i,nz,sclr) = 0.0_core_rknd wpsclrp2_zm(i,nz,sclr) = 0.0_core_rknd @@ -5055,7 +5077,7 @@ subroutine calc_trapezoid_zt( nz, ngrdcol, gr, & ! ---------------- Begin Code ---------------- ! Boundary condition: trapezoidal rule not valid at zt level 1 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol trapezoid_zt(i,1) = variable_zt(i,1) end do @@ -5118,7 +5140,7 @@ subroutine calc_trapezoid_zm( nz, ngrdcol, gr, variable_zm, variable_zt, & ! Boundary conditions: trapezoidal rule not valid at top zm level, nzmax. ! Trapezoidal rule also not used at zm level 1. 
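! --- Illustrative sketch, not part of the patch --------------------------------
! The setup_clubb_core hunks above replace nine separate flag arguments with a
! single clubb_config_flags argument of clubb_config_flags_type, whose
! components (iiPDF_type, ipdf_call_placement, l_damp_wp2_using_em, etc.) are
! read where the old scalar dummies were used. A minimal consumer of that
! convention might look like the hypothetical routine below; the routine name
! is made up, and the module holding fstderr is assumed.
subroutine sketch_use_config_flags( clubb_config_flags )

  use model_flags, only: &
      clubb_config_flags_type ! Type

  use constants_clubb, only: &
      fstderr ! Constant(s) (assumed location)

  implicit none

  type(clubb_config_flags_type), intent(in) :: &
    clubb_config_flags

  ! Options are read as components of the derived type rather than as
  ! individual dummy arguments.
  if ( clubb_config_flags%l_damp_wp2_using_em &
       .and. clubb_config_flags%l_stability_correct_tau_zm ) then
    write(fstderr,*) "l_damp_wp2_using_em = T requires" &
                     // " l_stability_correct_tau_zm = F"
  end if

  return

end subroutine sketch_use_config_flags
! --- End sketch -----------------------------------------------------------------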
- !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol trapezoid_zm(i,1) = variable_zm(i,1) trapezoid_zm(i,nz) = variable_zm(i,nz) @@ -5210,7 +5232,8 @@ subroutine compute_cloud_cover( gr, nz, ngrdcol, & !------------------------ Begin code ------------------------ - !$acc declare create( chi_mean, vert_cloud_frac_upper, vert_cloud_frac_lower, vert_cloud_frac ) + !$acc enter data create( chi_mean, vert_cloud_frac_upper, & + !$acc vert_cloud_frac_lower, vert_cloud_frac ) !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -5307,7 +5330,7 @@ subroutine compute_cloud_cover( gr, nz, ngrdcol, & end do ! k = 2, gr%nz-1, 1 !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol cloud_cover(i,1) = cloud_frac(i,1) cloud_cover(i,nz) = cloud_frac(i,nz) @@ -5334,6 +5357,9 @@ subroutine compute_cloud_cover( gr, nz, ngrdcol, & end if end if + !$acc exit data delete( chi_mean, vert_cloud_frac_upper, & + !$acc vert_cloud_frac_lower, vert_cloud_frac ) + return end subroutine compute_cloud_cover @@ -5471,13 +5497,13 @@ subroutine set_Lscale_max( ngrdcol, l_implemented, host_dx, host_dy, & ! Determine the maximum allowable value for Lscale (in meters). if ( l_implemented ) then - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol Lscale_max(i) = 0.25_core_rknd * min( host_dx(i), host_dy(i) ) end do !$acc end parallel loop else - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol Lscale_max(i) = 1.0e5_core_rknd end do diff --git a/src/CLUBB_core/advance_helper_module.F90 b/src/CLUBB_core/advance_helper_module.F90 index 6ab79056c..19602e375 100644 --- a/src/CLUBB_core/advance_helper_module.F90 +++ b/src/CLUBB_core/advance_helper_module.F90 @@ -293,9 +293,9 @@ subroutine calc_stability_correction( nz, ngrdcol, gr, & !------------ Begin Code -------------- - !$acc declare create( brunt_vaisala_freq_sqd, brunt_vaisala_freq_sqd_mixed, & - !$acc brunt_vaisala_freq_sqd_moist, brunt_vaisala_freq_sqd_dry, & - !$acc brunt_vaisala_freq_sqd_plus, lambda0_stability, Lscale_zm ) + !$acc enter data create( brunt_vaisala_freq_sqd, brunt_vaisala_freq_sqd_mixed, & + !$acc brunt_vaisala_freq_sqd_moist, brunt_vaisala_freq_sqd_dry, & + !$acc brunt_vaisala_freq_sqd_plus, lambda0_stability, Lscale_zm ) call calc_brunt_vaisala_freq_sqd( nz, ngrdcol, gr, thlm, & ! intent(in) exner, rtm, rcm, p_in_Pa, thvm, & ! 
intent(in) @@ -332,6 +332,10 @@ subroutine calc_stability_correction( nz, ngrdcol, gr, & end do !$acc end parallel loop + !$acc exit data delete( brunt_vaisala_freq_sqd, brunt_vaisala_freq_sqd_mixed, & + !$acc brunt_vaisala_freq_sqd_moist, brunt_vaisala_freq_sqd_dry, & + !$acc brunt_vaisala_freq_sqd_plus, lambda0_stability, Lscale_zm ) + return end subroutine calc_stability_correction @@ -724,11 +728,12 @@ subroutine compute_Cx_fnc_Richardson( nz, ngrdcol, gr, & !------------------------------ Begin Code ------------------------------ - !$acc declare create( brunt_vaisala_freq_sqd, brunt_vaisala_freq_sqd_mixed, brunt_vaisala_freq_sqd_dry, & - !$acc brunt_vaisala_freq_sqd_moist, brunt_vaisala_freq_sqd_plus, Richardson_num, & - !$acc Ri_zm, ddzt_um, ddzt_vm, shear_sqd, turb_freq_sqd, Lscale_zm, Cx_fnc_interp, & - !$acc Richardson_num_clipped, Cx_fnc_Richardson_avg, fnc_Richardson, & - !$acc fnc_Richardson_clipped, fnc_Richardson_smooth ) + !$acc enter data create( brunt_vaisala_freq_sqd, brunt_vaisala_freq_sqd_mixed, & + !$acc brunt_vaisala_freq_sqd_dry, brunt_vaisala_freq_sqd_moist, & + !$acc brunt_vaisala_freq_sqd_plus, Richardson_num, Cx_fnc_interp, & + !$acc Ri_zm, ddzt_um, ddzt_vm, shear_sqd, turb_freq_sqd, Lscale_zm, & + !$acc Richardson_num_clipped, Cx_fnc_Richardson_avg, fnc_Richardson, & + !$acc fnc_Richardson_clipped, fnc_Richardson_smooth ) if ( l_modify_limiters_for_cnvg_test .and. l_use_shear_turb_freq_sqd ) then error stop "ERROR: l_modify_limiters_for_cnvg_test .and. l_use_shear_turb_freq_sqd "// & @@ -910,6 +915,13 @@ subroutine compute_Cx_fnc_Richardson( nz, ngrdcol, gr, & end do end if + !$acc exit data delete( brunt_vaisala_freq_sqd, brunt_vaisala_freq_sqd_mixed, & + !$acc brunt_vaisala_freq_sqd_dry, brunt_vaisala_freq_sqd_moist, & + !$acc brunt_vaisala_freq_sqd_plus, Richardson_num, Cx_fnc_interp, & + !$acc Ri_zm, ddzt_um, ddzt_vm, shear_sqd, turb_freq_sqd, Lscale_zm, & + !$acc Richardson_num_clipped, Cx_fnc_Richardson_avg, fnc_Richardson, & + !$acc fnc_Richardson_clipped, fnc_Richardson_smooth ) + return end subroutine compute_Cx_fnc_Richardson @@ -979,7 +991,7 @@ function Lscale_width_vert_avg( nz, ngrdcol, gr, smth_type, & !-------------------------- Begin Code -------------------------- - !$acc declare create( one_half_avg_width, numer_terms, denom_terms ) + !$acc enter data create( one_half_avg_width, numer_terms, denom_terms ) if ( smth_type == 1 ) then !$acc parallel loop gang vector collapse(2) default(present) @@ -1079,6 +1091,8 @@ function Lscale_width_vert_avg( nz, ngrdcol, gr, smth_type, & end do end do + !$acc exit data delete( one_half_avg_width, numer_terms, denom_terms ) + return end function Lscale_width_vert_avg @@ -1133,7 +1147,7 @@ subroutine wp2_term_splat_lhs( nz, ngrdcol, gr, C_wp2_splat, & !----------------------------- Begin Code ----------------------------- - !$acc declare create( brunt_vaisala_freq_splat_clipped, brunt_vaisala_freq_splat_smooth ) + !$acc enter data create( brunt_vaisala_freq_splat_clipped, brunt_vaisala_freq_splat_smooth ) !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -1155,6 +1169,8 @@ subroutine wp2_term_splat_lhs( nz, ngrdcol, gr, C_wp2_splat, & end do !$acc end parallel loop + !$acc exit data delete( brunt_vaisala_freq_splat_clipped, brunt_vaisala_freq_splat_smooth ) + return end subroutine wp2_term_splat_lhs @@ -1211,7 +1227,7 @@ subroutine wp3_term_splat_lhs( nz, ngrdcol, gr, C_wp2_splat, & !----------------------------- Begin Code ----------------------------- - !$acc declare create( 
brunt_vaisala_freq_splat_clipped, brunt_vaisala_freq_splat_smooth ) + !$acc enter data create( brunt_vaisala_freq_splat_clipped, brunt_vaisala_freq_splat_smooth ) !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -1234,6 +1250,8 @@ subroutine wp3_term_splat_lhs( nz, ngrdcol, gr, C_wp2_splat, & end do !$acc end parallel loop + !$acc exit data delete( brunt_vaisala_freq_splat_clipped, brunt_vaisala_freq_splat_smooth ) + return end subroutine wp3_term_splat_lhs diff --git a/src/CLUBB_core/advance_windm_edsclrm_module.F90 b/src/CLUBB_core/advance_windm_edsclrm_module.F90 index ce47bcd70..37c639c24 100644 --- a/src/CLUBB_core/advance_windm_edsclrm_module.F90 +++ b/src/CLUBB_core/advance_windm_edsclrm_module.F90 @@ -2133,7 +2133,7 @@ subroutine compute_uv_tndcy( nz, ngrdcol, solve_type, & ! -------------------------- Begin Code -------------------------- - !$acc declare create( xm_gf, xm_cf ) + !$acc enter data create( xm_gf, xm_cf ) if ( .not. l_implemented ) then ! Only compute the Coriolis term if the model is running on it's own, @@ -2241,6 +2241,8 @@ subroutine compute_uv_tndcy( nz, ngrdcol, solve_type, & endif + !$acc exit data delete( xm_gf, xm_cf ) + return end subroutine compute_uv_tndcy diff --git a/src/CLUBB_core/advance_wp2_wp3_module.F90 b/src/CLUBB_core/advance_wp2_wp3_module.F90 index 3b14504fe..a79562f8f 100644 --- a/src/CLUBB_core/advance_wp2_wp3_module.F90 +++ b/src/CLUBB_core/advance_wp2_wp3_module.F90 @@ -413,17 +413,17 @@ subroutine advance_wp2_wp3( nz, ngrdcol, gr, dt, & ! i integer :: k, i, b - !$acc declare create( wp2_old, wp3_old, C1_Skw_fnc, C11_Skw_fnc, C16_fnc, & - !$acc wp3_term_ta_lhs_result, wp3_pr3_lhs, lhs_ta_wp2, & - !$acc lhs_tp_wp3, lhs_adv_tp_wp3, lhs_pr_tp_wp3, & - !$acc lhs_ta_wp3, lhs_dp1_wp2, rhs_dp1_wp2, lhs_pr1_wp2, & - !$acc rhs_pr1_wp2, lhs_pr1_wp3, rhs_pr1_wp3, rhs_bp_pr2_wp2, & - !$acc rhs_pr_dfsn_wp2, rhs_bp1_pr2_wp3, rhs_pr3_wp2, & - !$acc rhs_pr3_wp3, rhs_ta_wp3, rhs_pr_turb_wp3, rhs_pr_dfsn_wp3, & - !$acc lhs_diff_zm, lhs_diff_zt, lhs_diff_zm_crank, lhs_diff_zt_crank, & - !$acc lhs_ma_zm, lhs_ma_zt, lhs_ac_pr2_wp2, lhs_ac_pr2_wp3, & - !$acc coef_wp4_implicit_zt, coef_wp4_implicit, a1, a1_zt, & - !$acc dum_dz, dvm_dz, lhs, rhs, Kw1, Kw8, Kw1_zm, Kw8_zt ) + !$acc enter data create( wp2_old, wp3_old, C1_Skw_fnc, C11_Skw_fnc, C16_fnc, & + !$acc wp3_term_ta_lhs_result, wp3_pr3_lhs, lhs_ta_wp2, & + !$acc lhs_tp_wp3, lhs_adv_tp_wp3, lhs_pr_tp_wp3, & + !$acc lhs_ta_wp3, lhs_dp1_wp2, rhs_dp1_wp2, lhs_pr1_wp2, & + !$acc rhs_pr1_wp2, lhs_pr1_wp3, rhs_pr1_wp3, rhs_bp_pr2_wp2, & + !$acc rhs_pr_dfsn_wp2, rhs_bp1_pr2_wp3, rhs_pr3_wp2, & + !$acc rhs_pr3_wp3, rhs_ta_wp3, rhs_pr_turb_wp3, rhs_pr_dfsn_wp3, & + !$acc lhs_diff_zm, lhs_diff_zt, lhs_diff_zm_crank, lhs_diff_zt_crank, & + !$acc lhs_ma_zm, lhs_ma_zt, lhs_ac_pr2_wp2, lhs_ac_pr2_wp3, & + !$acc coef_wp4_implicit_zt, coef_wp4_implicit, a1, a1_zt, & + !$acc dum_dz, dvm_dz, lhs, rhs, Kw1, Kw8, Kw1_zm, Kw8_zt ) !----------------------------------------------------------------------- @@ -1160,6 +1160,18 @@ subroutine advance_wp2_wp3( nz, ngrdcol, gr, dt, & ! i end if ! 
fatal error end if + !$acc exit data delete( wp2_old, wp3_old, C1_Skw_fnc, C11_Skw_fnc, C16_fnc, & + !$acc wp3_term_ta_lhs_result, wp3_pr3_lhs, lhs_ta_wp2, & + !$acc lhs_tp_wp3, lhs_adv_tp_wp3, lhs_pr_tp_wp3, & + !$acc lhs_ta_wp3, lhs_dp1_wp2, rhs_dp1_wp2, lhs_pr1_wp2, & + !$acc rhs_pr1_wp2, lhs_pr1_wp3, rhs_pr1_wp3, rhs_bp_pr2_wp2, & + !$acc rhs_pr_dfsn_wp2, rhs_bp1_pr2_wp3, rhs_pr3_wp2, & + !$acc rhs_pr3_wp3, rhs_ta_wp3, rhs_pr_turb_wp3, rhs_pr_dfsn_wp3, & + !$acc lhs_diff_zm, lhs_diff_zt, lhs_diff_zm_crank, lhs_diff_zt_crank, & + !$acc lhs_ma_zm, lhs_ma_zt, lhs_ac_pr2_wp2, lhs_ac_pr2_wp3, & + !$acc coef_wp4_implicit_zt, coef_wp4_implicit, a1, a1_zt, & + !$acc dum_dz, dvm_dz, lhs, rhs, Kw1, Kw8, Kw1_zm, Kw8_zt ) + return end subroutine advance_wp2_wp3 @@ -1392,7 +1404,7 @@ subroutine wp23_solve( nz, ngrdcol, gr, dt, lhs, rhs, & !------------------------- Begin Code ------------------------- - !$acc declare create( rhs_save, solut, old_solut, rcond, threshold_array ) + !$acc enter data create( rhs_save, solut, old_solut, rcond, threshold_array ) ! Save the value of rhs, which will be overwritten with the solution as ! part of the solving routine. @@ -1815,6 +1827,8 @@ subroutine wp23_solve( nz, ngrdcol, gr, dt, lhs, rhs, & ! Compute wp3_zm for output purposes wp3_zm(:,:) = zt2zm( nz, ngrdcol, gr, wp3 ) + !$acc exit data delete( rhs_save, solut, old_solut, rcond, threshold_array ) + return end subroutine wp23_solve diff --git a/src/CLUBB_core/advance_xm_wpxp_module.F90 b/src/CLUBB_core/advance_xm_wpxp_module.F90 index 9a28e41e5..78648bbad 100644 --- a/src/CLUBB_core/advance_xm_wpxp_module.F90 +++ b/src/CLUBB_core/advance_xm_wpxp_module.F90 @@ -1111,17 +1111,17 @@ subroutine advance_xm_wpxp( nz, ngrdcol, gr, dt, sigma_sqd_w, wm_zm, wm_zt, wp2, end if ! l_predict_upwp_vpwp !$acc exit data delete( C6rt_Skw_fnc, C6thl_Skw_fnc, C7_Skw_fnc, C6_term, Kw6, & - !$acc low_lev_effect, high_lev_effect, rtm_old, wprtp_old, thlm_old, & - !$acc wpthlp_old, um_old, upwp_old, vm_old, & - !$acc vpwp_old, lhs_diff_zm, lhs_diff_zt, lhs_ma_zt, lhs_ma_zm, & - !$acc lhs_ta_wprtp, lhs_ta_wpthlp, lhs_ta_wpup, lhs_ta_wpvp, & - !$acc rhs_ta_wprtp, rhs_ta_wpthlp, rhs_ta_wpup, & - !$acc rhs_ta_wpvp, lhs_tp, lhs_ta_xm, lhs_ac_pr2, & - !$acc lhs_pr1_wprtp, lhs_pr1_wpthlp ) + !$acc low_lev_effect, high_lev_effect, rtm_old, wprtp_old, thlm_old, & + !$acc wpthlp_old, um_old, upwp_old, vm_old, & + !$acc vpwp_old, lhs_diff_zm, lhs_diff_zt, lhs_ma_zt, lhs_ma_zm, & + !$acc lhs_ta_wprtp, lhs_ta_wpthlp, lhs_ta_wpup, lhs_ta_wpvp, & + !$acc rhs_ta_wprtp, rhs_ta_wpthlp, rhs_ta_wpup, & + !$acc rhs_ta_wpvp, lhs_tp, lhs_ta_xm, lhs_ac_pr2, & + !$acc lhs_pr1_wprtp, lhs_pr1_wpthlp ) !$acc exit data if( sclr_dim > 0 ) & - !$acc delete( sclrm_old, wpsclrp_old, lhs_ta_wpsclrp, & - !$acc rhs_ta_wpsclrp, lhs_pr1_wpsclrp ) + !$acc delete( sclrm_old, wpsclrp_old, lhs_ta_wpsclrp, & + !$acc rhs_ta_wpsclrp, lhs_pr1_wpsclrp ) return @@ -1292,21 +1292,16 @@ subroutine xm_wpxp_lhs( nz, ngrdcol, l_iter, dt, wpxp, wm_zt, C7_Skw_fnc, & real (kind = core_rknd) :: & invrs_dt - - real( kind = core_rknd ), dimension(ngrdcol,nz) :: & - zero_vector ! Vector of 0s integer :: i !------------------- Begin Code ------------------- - - !$acc declare create( zero_vector ) ! Initializations/precalculations invrs_dt = 1.0_core_rknd / dt ! 
Lower boundary for xm, lhs(:,1) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs(1,i,1) = 0.0_core_rknd lhs(2,i,1) = 0.0_core_rknd @@ -1317,7 +1312,7 @@ subroutine xm_wpxp_lhs( nz, ngrdcol, l_iter, dt, wpxp, wm_zt, C7_Skw_fnc, & !$acc end parallel loop ! Lower boundary for w'x', lhs(:,2) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs(1,i,2) = 0.0_core_rknd lhs(2,i,2) = 0.0_core_rknd @@ -1368,7 +1363,7 @@ subroutine xm_wpxp_lhs( nz, ngrdcol, l_iter, dt, wpxp, wm_zt, C7_Skw_fnc, & ! Upper boundary for w'x', , lhs(:,2*gr%nz) ! These were set in the loop above for simplicity, so they must be set properly here - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs(1,i,2*nz) = 0.0_core_rknd lhs(2,i,2*nz) = 0.0_core_rknd @@ -1562,7 +1557,7 @@ subroutine calc_xm_wpxp_lhs_terms( nz, ngrdcol, gr, Kh_zm, wm_zm, wm_zt, wp2, !------------------- Begin Code ------------------- - !$acc declare create( Kh_N2_zm, K_zm, K_zt, Kw6_zm, zeros_array ) + !$acc enter data create( Kh_N2_zm, K_zm, K_zt, Kw6_zm, zeros_array ) ! Initializations/precalculations constant_nu = 0.1_core_rknd @@ -1646,7 +1641,7 @@ subroutine calc_xm_wpxp_lhs_terms( nz, ngrdcol, gr, Kh_zm, wm_zm, wm_zt, wp2, end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol zeros_array(i) = zero end do @@ -1666,6 +1661,8 @@ subroutine calc_xm_wpxp_lhs_terms( nz, ngrdcol, gr, Kh_zm, wm_zm, wm_zt, wp2, lhs_ma_zt ) ! Intent(out) end if + !$acc exit data delete( Kh_N2_zm, K_zm, K_zt, Kw6_zm, zeros_array ) + return end subroutine calc_xm_wpxp_lhs_terms @@ -1828,7 +1825,7 @@ subroutine xm_wpxp_rhs( nz, ngrdcol, solve_type, l_iter, dt, xm, wpxp, & ! In !------------------- Begin Code ------------------- - !$acc declare create( rhs_bp_pr3 ) + !$acc enter data create( rhs_bp_pr3 ) ! Initialize output array and precalculate the reciprocal of dt invrs_dt = 1.0_core_rknd / dt @@ -1837,7 +1834,7 @@ subroutine xm_wpxp_rhs( nz, ngrdcol, solve_type, l_iter, dt, xm, wpxp, & ! In call wpxp_terms_bp_pr3_rhs( nz, ngrdcol, C7_Skw_fnc, thv_ds_zm, xpthvp, & ! intent(in) rhs_bp_pr3 ) ! intent(out) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Set lower boundary for xm rhs(i,1) = xm(i,1) @@ -1873,7 +1870,7 @@ subroutine xm_wpxp_rhs( nz, ngrdcol, solve_type, l_iter, dt, xm, wpxp, & ! In end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Upper boundary for xm rhs(i,2*nz-1) = xm(i,nz) * invrs_dt + xm_forcing(i,nz) @@ -2027,6 +2024,8 @@ subroutine xm_wpxp_rhs( nz, ngrdcol, solve_type, l_iter, dt, xm, wpxp, & ! In endif ! l_stats_samp + !$acc exit data delete( rhs_bp_pr3 ) + return end subroutine xm_wpxp_rhs @@ -4565,7 +4564,7 @@ subroutine xm_wpxp_clipping_and_stats( & ! --------------------------- Begin code --------------------------- - !$acc declare create( xm_old, wpxp_pd, xm_pd, wpxp_chnge, xp2_relaxed ) + !$acc enter data create( xm_old, wpxp_pd, xm_pd, wpxp_chnge, xp2_relaxed ) select case ( solve_type ) @@ -4678,7 +4677,7 @@ subroutine xm_wpxp_clipping_and_stats( & !$acc end parallel loop ! 
Lower boundary condition on xm - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol xm(i,1) = xm(i,2) end do @@ -4911,7 +4910,7 @@ subroutine xm_wpxp_clipping_and_stats( & ! Hole filling does not affect the below ground level, perform a blunt clipping ! here on that level to prevent small values of xm(1) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if ( any( xm(i,:) < xm_threshold) ) then xm(i,1) = max( xm(i,1), xm_tol ) @@ -5044,7 +5043,9 @@ subroutine xm_wpxp_clipping_and_stats( & wpxp_chnge, gr%invrs_dzt, & ! intent(in) stats_zt, & ! intent(inout) xm ) ! intent(inout) - end if + end if + + !$acc exit data delete( xm_old, wpxp_pd, xm_pd, wpxp_chnge, xp2_relaxed ) return @@ -5135,7 +5136,7 @@ pure subroutine xm_term_ta_lhs( nz, ngrdcol, gr, & integer :: i, k ! Vertical level index ! Set lower boundary condition to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs_ta_xm(k_mdiag,i,1) = zero lhs_ta_xm(km1_mdiag,i,1) = zero @@ -5241,7 +5242,7 @@ pure subroutine wpxp_term_tp_lhs( nz, ngrdcol, gr, wp2, & integer :: i, k ! Vertical level index ! Set lower boundary to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs_tp(1,i,1) = zero lhs_tp(2,i,1) = zero @@ -5264,7 +5265,7 @@ pure subroutine wpxp_term_tp_lhs( nz, ngrdcol, gr, wp2, & !$acc end parallel loop ! Set upper boundary to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs_tp(1,i,nz) = 0.0_core_rknd lhs_tp(2,i,nz) = 0.0_core_rknd @@ -5360,7 +5361,7 @@ pure subroutine wpxp_terms_ac_pr2_lhs( nz, ngrdcol, C7_Skw_fnc, & !$acc copyout( lhs_ac_pr2 ) ! Set lower boundary to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs_ac_pr2(i,1) = zero end do @@ -5378,7 +5379,7 @@ pure subroutine wpxp_terms_ac_pr2_lhs( nz, ngrdcol, C7_Skw_fnc, & !$acc end parallel loop ! Set upper boundary to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs_ac_pr2(i,nz) = zero end do @@ -5471,7 +5472,7 @@ pure subroutine wpxp_term_pr1_lhs( nz, ngrdcol, C6rt_Skw_fnc, C6thl_Skw_fnc, C7_ end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Set lower boundary to 0 @@ -5502,7 +5503,7 @@ pure subroutine wpxp_term_pr1_lhs( nz, ngrdcol, C6rt_Skw_fnc, C6thl_Skw_fnc, C7_ end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Set lower boundary to 0 @@ -5578,7 +5579,7 @@ pure subroutine wpxp_terms_bp_pr3_rhs( nz, ngrdcol, C7_Skw_fnc, thv_ds_zm, xpthv !$acc copyout( rhs_bp_pr3 ) ! Set lower boundary to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol rhs_bp_pr3(i,1) = zero end do @@ -5592,7 +5593,7 @@ pure subroutine wpxp_terms_bp_pr3_rhs( nz, ngrdcol, C7_Skw_fnc, thv_ds_zm, xpthv end do ! k = 2, nz-1 ! 
Set upper boundary to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol rhs_bp_pr3(i,nz) = zero end do @@ -5775,7 +5776,7 @@ subroutine xm_correction_wpxp_cl( nz, ngrdcol, solve_type, dt, & !---------------------------- Begin Code ---------------------------- - !$acc declare create( xm_tndcy_wpxp_cl, l_clipping_needed, l_any_clipping_needed ) + !$acc enter data create( xm_tndcy_wpxp_cl, l_clipping_needed, l_any_clipping_needed ) l_any_clipping_needed = .false. @@ -5831,6 +5832,8 @@ subroutine xm_correction_wpxp_cl( nz, ngrdcol, solve_type, dt, & end do endif + !$acc exit data delete( xm_tndcy_wpxp_cl, l_clipping_needed, l_any_clipping_needed ) + return end subroutine xm_correction_wpxp_cl @@ -5953,7 +5956,7 @@ pure subroutine diagnose_upxp( nz, ngrdcol, gr, ypwp, xm, wpxp, ym, & !----------------------------- Begin Code ------------------------------ - !$acc declare create( ddzt_xm, ddzt_ym ) + !$acc enter data create( ddzt_xm, ddzt_ym ) ddzt_xm = ddzt( nz, ngrdcol, gr, xm ) ddzt_ym = ddzt( nz, ngrdcol, gr, ym ) @@ -5964,9 +5967,11 @@ pure subroutine diagnose_upxp( nz, ngrdcol, gr, ypwp, xm, wpxp, ym, & ypxp(i,k) = ( tau_C6_zm(i,k) / C6x_Skw_fnc(i,k) ) & * ( - ypwp(i,k) * ddzt_xm(i,k) - (one - C7_Skw_fnc(i,k) ) & * ( wpxp(i,k) * ddzt_ym(i,k) ) ) - end do - end do - !$acc end parallel loop + end do + end do + !$acc end parallel loop + + !$acc exit data delete( ddzt_xm, ddzt_ym ) return diff --git a/src/CLUBB_core/advance_xp2_xpyp_module.F90 b/src/CLUBB_core/advance_xp2_xpyp_module.F90 index 7ac3d31ac..3c91399a2 100644 --- a/src/CLUBB_core/advance_xp2_xpyp_module.F90 +++ b/src/CLUBB_core/advance_xp2_xpyp_module.F90 @@ -3053,7 +3053,7 @@ subroutine xp2_xpyp_uv_rhs( nz, ngrdcol, gr, solve_type, dt, & ! In !----------------------------- Begin Code ---------------------------------- - !$acc declare create( rhs_pr2 ) + !$acc enter data create( rhs_pr2 ) select case ( solve_type ) case ( xp2_xpyp_vp2 ) @@ -3228,6 +3228,8 @@ subroutine xp2_xpyp_uv_rhs( nz, ngrdcol, gr, solve_type, dt, & ! In rhs(i,nz) = w_tol_sqd end do !$acc end parallel loop + + !$acc exit data delete( rhs_pr2 ) return diff --git a/src/CLUBB_core/clip_explicit.F90 b/src/CLUBB_core/clip_explicit.F90 index 2f6801fad..40825cad1 100644 --- a/src/CLUBB_core/clip_explicit.F90 +++ b/src/CLUBB_core/clip_explicit.F90 @@ -619,7 +619,7 @@ subroutine clip_covar( nz, ngrdcol, gr, solve_type, l_first_clip_ts, & ! Since there is no covariance clipping at the upper or lower boundaries, ! the change in x'y' due to covariance clipping at those levels is 0. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol xpyp_chnge(i,1) = 0.0_core_rknd xpyp_chnge(i,nz) = 0.0_core_rknd @@ -1180,7 +1180,7 @@ subroutine clip_skewness_core( nz, ngrdcol, gr, sfc_elevation, & !----------------------- Begin Code----------------------- - !$acc declare create( wp2_zt_cubed, wp3_lim_sqd, zagl_thresh, H_zagl ) + !$acc enter data create( wp2_zt_cubed, wp3_lim_sqd, zagl_thresh, H_zagl ) ! Compute the upper and lower limits of w'^3 at every level, ! 
based on the skewness of w, Sk_w, such that: @@ -1277,6 +1277,8 @@ subroutine clip_skewness_core( nz, ngrdcol, gr, sfc_elevation, & end do !$acc end parallel + !$acc exit data delete( wp2_zt_cubed, wp3_lim_sqd, zagl_thresh, H_zagl ) + end subroutine clip_skewness_core !=============================================================================== diff --git a/src/CLUBB_core/clubb_api_module.F90 b/src/CLUBB_core/clubb_api_module.F90 index 1008ea0b4..4a54552dc 100644 --- a/src/CLUBB_core/clubb_api_module.F90 +++ b/src/CLUBB_core/clubb_api_module.F90 @@ -252,6 +252,9 @@ module clubb_api_module ! Calculates liquid water potential temperature from absolute temperature T_in_K2thlm_api => T_in_K2thlm + use advance_clubb_core_module, only: & + setup_clubb_core_api => setup_clubb_core + use stats_type, only: stats ! Type implicit none @@ -1697,143 +1700,6 @@ subroutine advance_clubb_core_api_multi_col( gr, nz, ngrdcol, & end subroutine advance_clubb_core_api_multi_col - !================================================================================================ - ! setup_clubb_core - Sets up the model for execution. - !================================================================================================ - - subroutine setup_clubb_core_api( & - nzmax, T0_in, ts_nudge_in, & ! intent(in) - hydromet_dim_in, sclr_dim_in, & ! intent(in) - sclr_tol_in, edsclr_dim_in, params, & ! intent(in) - l_host_applies_sfc_fluxes, & ! intent(in) - saturation_formula, & ! intent(in) - l_input_fields, & ! intent(in) -#ifdef GFDL - I_sat_sphum, & ! intent(in) h1g, 2010-06-16 -#endif - iiPDF_type, & ! intent(in) - ipdf_call_placement, & ! intent(in) - l_predict_upwp_vpwp, & ! intent(in) - l_min_xp2_from_corr_wx, & ! intent(in) - l_prescribed_avg_deltaz, & ! intent(in) - l_damp_wp2_using_em, & ! intent(in) - l_stability_correct_tau_zm, & ! intent(in) - l_enable_relaxed_clipping, & ! intent(in) - l_diag_Lscale_from_tau, & ! intent(in) -#ifdef GFDL - cloud_frac_min , & ! intent(in) h1g, 2010-06-16 -#endif - err_code_api ) ! intent(out) - - use advance_clubb_core_module, only : setup_clubb_core - - use parameter_indices, only: & - nparams ! Variable(s) - - use model_flags, only: & - clubb_config_flags_type ! Type - -! TODO: This should be called from the api, but all the host models appear to call -! it directly or not at all. -! use model_flags, only: & -! setup_model_flags ! Subroutine - - implicit none - - ! Input Variables - - integer, intent(in) :: nzmax ! Vertical grid levels [#] - - ! Model parameters - real( kind = core_rknd ), intent(in) :: & - T0_in, ts_nudge_in - - integer, intent(in) :: & - hydromet_dim_in, & ! Number of hydrometeor species - sclr_dim_in, & ! Number of passive scalars - edsclr_dim_in ! Number of eddy-diff. passive scalars - - real( kind = core_rknd ), intent(in), dimension(sclr_dim_in) :: & - sclr_tol_in ! Thresholds for passive scalars - - real( kind = core_rknd ), intent(in), dimension(nparams) :: & - params ! Including C1, nu1, nu2, etc. - - ! Flags - logical, intent(in) :: & - l_host_applies_sfc_fluxes ! Whether to apply for the surface flux - - character(len=*), intent(in) :: & - saturation_formula ! Approximation for saturation vapor pressure - - logical, intent(in) :: & - l_input_fields ! Flag for whether LES input fields are used - - integer, intent(in) :: & - iiPDF_type, & ! Selected option for the two-component normal - ! (double Gaussian) PDF type to use for the w, - ! rt, and theta-l (or w, chi, and eta) portion of - ! CLUBB's multivariate, two-component PDF. 
- ipdf_call_placement ! Selected option for the placement of the call to - ! CLUBB's PDF. - - logical, intent(in) :: & - l_predict_upwp_vpwp, & ! Flag to predict and along with and - ! alongside the advancement of , , , , - ! , and in subroutine advance_xm_wpxp. - ! Otherwise, and are still approximated by eddy - ! diffusivity when and are advanced in subroutine - ! advance_windm_edsclrm. - l_min_xp2_from_corr_wx, & ! Flag to base the threshold minimum value of xp2 (rtp2 and - ! thlp2) on keeping the overall correlation of w and x within - ! the limits of -max_mag_correlation_flux to - ! max_mag_correlation_flux. - l_prescribed_avg_deltaz, & ! used in adj_low_res_nu. If .true., avg_deltaz = deltaz - l_damp_wp2_using_em, & - l_stability_correct_tau_zm, & - l_enable_relaxed_clipping, & ! Flag to relax clipping on wpxp in - ! xm_wpxp_clipping_and_stats - l_diag_Lscale_from_tau ! First diagnose dissipation time tau, and - ! then diagnose the mixing length scale as - ! Lscale = tau * tke - -#ifdef GFDL - logical, intent(in) :: & ! h1g, 2010-06-16 begin mod - I_sat_sphum - - real( kind = core_rknd ), intent(in) :: & - cloud_frac_min ! h1g, 2010-06-16 end mod -#endif - - integer, intent(out) :: & - err_code_api ! Diagnostic for a problem with the setup - - call setup_clubb_core( & - nzmax, T0_in, ts_nudge_in, & ! intent(in) - hydromet_dim_in, sclr_dim_in, & ! intent(in) - sclr_tol_in, edsclr_dim_in, params, & ! intent(in) - l_host_applies_sfc_fluxes, & ! intent(in) - saturation_formula, & ! intent(in) - l_input_fields, & ! intent(in) -#ifdef GFDL - I_sat_sphum, & ! intent(in) h1g, 2010-06-16 -#endif - iiPDF_type, & ! intent(in) - ipdf_call_placement, & ! intent(in) - l_predict_upwp_vpwp, & ! intent(in) - l_min_xp2_from_corr_wx, & ! intent(in) - l_prescribed_avg_deltaz, & ! intent(in) - l_damp_wp2_using_em, & ! intent(in) - l_stability_correct_tau_zm, & ! intent(in) - l_enable_relaxed_clipping, & ! intent(in) - l_diag_Lscale_from_tau, & ! intent(in) -#ifdef GFDL - cloud_frac_min, & ! intent(in) h1g, 2010-06-16 -#endif - err_code_api ) ! intent(out) - - end subroutine setup_clubb_core_api - !================================================================================================ ! cleanup_clubb_core_api - Frees memory used by the model. !================================================================================================ diff --git a/src/CLUBB_core/diffusion.F90 b/src/CLUBB_core/diffusion.F90 index 3903c9a0e..f0ce296e6 100644 --- a/src/CLUBB_core/diffusion.F90 +++ b/src/CLUBB_core/diffusion.F90 @@ -340,7 +340,7 @@ pure subroutine diffusion_zt_lhs( nz, ngrdcol, gr, K_zm, K_zt, nu, & ! In ! extra terms with upwind scheme ! k = 1 (bottom level); lowere boundary level - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs_upwind(kp1_tdiag,i,1) = + min( drhoKdz_zt(i,1) , zero ) * gr%invrs_dzm(i,1) lhs_upwind(k_tdiag,i,1) = - min( drhoKdz_zt(i,1) , zero ) * gr%invrs_dzm(i,1) @@ -369,7 +369,7 @@ pure subroutine diffusion_zt_lhs( nz, ngrdcol, gr, K_zm, K_zt, nu, & ! In ! k = nz (top level); upper boundary level. ! Only relevant if zero-flux boundary conditions are used. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs_upwind(kp1_tdiag,i,nz) = zero lhs_upwind(k_tdiag,i,nz) = + max( drhoKdz_zt(i,nz) , zero ) * gr%invrs_dzm(i,nz-1) @@ -387,7 +387,7 @@ pure subroutine diffusion_zt_lhs( nz, ngrdcol, gr, K_zm, K_zt, nu, & ! In if ( .not. 
l_upwind_Kh_dp_term ) then - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Thermodynamic superdiagonal: [ x var_zt(k+1,) ] lhs(kp1_tdiag,i,1) = - gr%invrs_dzt(i,1) * invrs_rho_ds_zt(i,1) & @@ -404,7 +404,7 @@ pure subroutine diffusion_zt_lhs( nz, ngrdcol, gr, K_zm, K_zt, nu, & ! In else - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Thermodynamic superdiagonal: [ x var_zt(k+1,) ] lhs(kp1_tdiag,i,1) = - gr%invrs_dzt(i,1) * ( K_zt(i,1) + nu(i) ) * gr%invrs_dzm(i,1) & @@ -477,7 +477,7 @@ pure subroutine diffusion_zt_lhs( nz, ngrdcol, gr, K_zm, K_zt, nu, & ! In if ( .not. l_upwind_Kh_dp_term ) then - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Thermodynamic superdiagonal: [ x var_zt(k+1,) ] lhs(kp1_tdiag,i,nz) = zero @@ -496,7 +496,7 @@ pure subroutine diffusion_zt_lhs( nz, ngrdcol, gr, K_zm, K_zt, nu, & ! In else - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Thermodynamic superdiagonal: [ x var_zt(k+1,) ] lhs(kp1_tdiag,i,nz) = zero + lhs_upwind(kp1_tdiag,i,nz) @@ -990,7 +990,7 @@ pure subroutine diffusion_zm_lhs( nz, ngrdcol, gr, K_zt, K_zm, nu, & ! In ! extra terms with upwind scheme ! k = 1 (bottom level); lowere boundary level - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs_upwind(kp1_mdiag,i,1) = + min( drhoKdz_zm(i,1) , zero ) * gr%invrs_dzt(i,2) lhs_upwind(k_mdiag,i,1) = - min( drhoKdz_zm(i,1) , zero ) * gr%invrs_dzt(i,2) @@ -1015,7 +1015,7 @@ pure subroutine diffusion_zm_lhs( nz, ngrdcol, gr, K_zt, K_zm, nu, & ! In ! k = nz (top level); upper boundary level. ! Only relevant if zero-flux boundary conditions are used. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lhs_upwind(kp1_mdiag,i,nz) = zero lhs_upwind(k_mdiag,i,nz) = + max( drhoKdz_zm(i,nz) , zero ) * gr%invrs_dzt(i,nz) @@ -1031,7 +1031,7 @@ pure subroutine diffusion_zm_lhs( nz, ngrdcol, gr, K_zt, K_zm, nu, & ! In if ( .not. l_upwind_Kh_dp_term ) then - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Momentum superdiagonal: [ x var_zm(k+1,) ] lhs(kp1_mdiag,i,1) = - gr%invrs_dzm(i,1) * invrs_rho_ds_zm(i,1) & @@ -1048,7 +1048,7 @@ pure subroutine diffusion_zm_lhs( nz, ngrdcol, gr, K_zt, K_zm, nu, & ! In else - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Momentum superdiagonal: [ x var_zm(k+1,) ] lhs(kp1_mdiag,i,1) = - gr%invrs_dzm(i,1) * ( K_zm(i,1) + nu(i) ) * gr%invrs_dzt(i,2) & @@ -1122,7 +1122,7 @@ pure subroutine diffusion_zm_lhs( nz, ngrdcol, gr, K_zt, K_zm, nu, & ! In if ( .not. l_upwind_Kh_dp_term ) then - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Momentum superdiagonal: [ x var_zm(k+1,) ] lhs(kp1_mdiag,i,nz) = zero @@ -1141,7 +1141,7 @@ pure subroutine diffusion_zm_lhs( nz, ngrdcol, gr, K_zt, K_zm, nu, & ! In else - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! 
Momentum superdiagonal: [ x var_zm(k+1,) ] lhs(kp1_mdiag,i,nz) = zero & diff --git a/src/CLUBB_core/fill_holes.F90 b/src/CLUBB_core/fill_holes.F90 index b3f92135a..81b49d48a 100644 --- a/src/CLUBB_core/fill_holes.F90 +++ b/src/CLUBB_core/fill_holes.F90 @@ -143,7 +143,7 @@ subroutine fill_holes_vertical( nz, ngrdcol, num_draw_pts, threshold, upper_hf_l return end if - !$acc parallel loop collapse(2) default(present) + !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz do i = 1, ngrdcol rho_ds_dz(i,k) = rho_ds(i,k) * dz(i,k) @@ -246,14 +246,14 @@ subroutine fill_holes_vertical( nz, ngrdcol, num_draw_pts, threshold, upper_hf_l ! if any holes need filling before the final step of updating the field. ! Compute the numerator and denominator integrals - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol numer_integral_global(i) = 0.0_core_rknd denom_integral_global(i) = 0.0_core_rknd end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol do k = 2, upper_hf_level numer_integral_global(i) = numer_integral_global(i) + rho_ds_dz(i,k) * field(i,k) @@ -264,7 +264,7 @@ subroutine fill_holes_vertical( nz, ngrdcol, num_draw_pts, threshold, upper_hf_l !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Find the vertical average of field, using the precomputed numerator and denominator, @@ -285,7 +285,7 @@ subroutine fill_holes_vertical( nz, ngrdcol, num_draw_pts, threshold, upper_hf_l !$acc end parallel loop ! To compute the clipped field's vertical integral we only need to recompute the numerator - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol numer_integral_global(i) = 0.0_core_rknd do k = 2, upper_hf_level @@ -294,7 +294,7 @@ subroutine fill_holes_vertical( nz, ngrdcol, num_draw_pts, threshold, upper_hf_l end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Do not complete calculations or update field values for this diff --git a/src/CLUBB_core/grid_class.F90 b/src/CLUBB_core/grid_class.F90 index 7381f7100..a94de605c 100644 --- a/src/CLUBB_core/grid_class.F90 +++ b/src/CLUBB_core/grid_class.F90 @@ -1505,7 +1505,7 @@ pure subroutine linear_interpolated_azm_2D( nz, ngrdcol, gr, azt, & ! Use a linear extension based on the values of azt at levels gr%nz and ! gr%nz-1 to find the value of azm at level gr%nz (the uppermost level ! in the model). - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol linear_interpolated_azm(i,nz) & = ( ( azt(i,nz) - azt(i,nz-1) ) / ( gr%zt(i,nz) - gr%zt(i,nz-1) ) ) & @@ -1964,7 +1964,7 @@ pure subroutine linear_interpolated_azt_2D( nz, ngrdcol, gr, azm, & ! thermodynamic levels is azt. ! Use a linear extension based on the values of azm at levels 1 and 2 to ! find the value of azt at level 1 (the lowermost level in the model). 
- !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol linear_interpolated_azt(i,1) & = ( ( azm(i,2) - azm(i,1) ) / ( gr%zm(i,2) - gr%zm(i,1) ) ) & @@ -2317,7 +2317,7 @@ pure function gradzm_2D( nz, ngrdcol, gr, azm ) !$acc data copyin( gr, gr%invrs_dzt, azm ) & !$acc copyout( gradzm_2D ) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol gradzm_2D(i,1) = ( azm(i,2) - azm(i,1) ) * gr%invrs_dzt(i,2) end do @@ -2411,7 +2411,7 @@ pure function gradzt_2D( nz, ngrdcol, gr, azt ) !$acc data copyin( gr, gr%invrs_dzm, azt ) & !$acc copyout( gradzt_2D ) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol gradzt_2D(i,nz) = ( azt(i,nz) - azt(i,nz-1) ) * gr%invrs_dzm(i,nz-1) end do diff --git a/src/CLUBB_core/interpolation.F90 b/src/CLUBB_core/interpolation.F90 index 8ee884f35..680cc28b0 100644 --- a/src/CLUBB_core/interpolation.F90 +++ b/src/CLUBB_core/interpolation.F90 @@ -541,7 +541,7 @@ subroutine pvertinterp( nz, ngrdcol, & ! If we've fallen through the k=1,nz-1 loop, we cannot interpolate and ! must extrapolate from the bottom or top data level for at least some ! of the longitude points. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if ( p_out >= p_mid(i,1) ) then diff --git a/src/CLUBB_core/matrix_solver_wrapper.F90 b/src/CLUBB_core/matrix_solver_wrapper.F90 index 46aa831cf..f22be91d9 100644 --- a/src/CLUBB_core/matrix_solver_wrapper.F90 +++ b/src/CLUBB_core/matrix_solver_wrapper.F90 @@ -107,16 +107,10 @@ subroutine band_solve_single_rhs_multiple_lhs( & ! ----------------------- Begin Code ----------------------- -#ifdef _OPENACC - if ( penta_solve_method /= penta_lu ) then - write(fstderr,*) "Running on GPUs requires penta_solve_method = penta_lu = 2" - err_code = clubb_no_error - return - end if -#endif - if ( present(rcond) ) then + !$acc update host( lhs, rhs ) + ! Lapack overwrites lhs and rhs, so we'll give it copies of them. lhs_copy = lhs rhs_copy = rhs @@ -128,17 +122,23 @@ subroutine band_solve_single_rhs_multiple_lhs( & lhs_copy, rhs_copy, & ! Intent(inout) dummy_soln, rcond ) ! Intent(out) + !$acc update device( rcond ) + end if if ( penta_solve_method == lapack ) then + !$acc update host( lhs, rhs ) + ! Perform LU decomp and solve system (LAPACK) call lapack_band_solve( "xm_wpxp", nsup, nsub, & ! Intent(in) ndim, 1, ngrdcol, & ! Intent(in) lhs, rhs, & ! Intent(inout) soln ) ! Intent(out) + !$acc update device( soln ) + else if ( penta_solve_method == penta_lu ) then ! Solve the system with a penta-diagonal specific LU decomp @@ -225,16 +225,10 @@ subroutine band_solve_multiple_rhs_lhs( & ! ----------------------- Begin Code ----------------------- -#ifdef _OPENACC - if ( penta_solve_method /= penta_lu ) then - write(fstderr,*) "Running on GPUs requires penta_solve_method = penta_lu = 2" - err_code = clubb_no_error - return - end if -#endif - if ( present(rcond) ) then + !$acc update host( lhs, rhs ) + ! Lapack overwrites lhs and rhs, so we'll give it copies of them. lhs_copy = lhs rhs_copy = rhs @@ -246,17 +240,23 @@ subroutine band_solve_multiple_rhs_lhs( & lhs_copy, rhs_copy, & ! Intent(inout) dummy_soln, rcond ) ! Intent(out) + !$acc update device( rcond ) + end if if ( penta_solve_method == lapack ) then + !$acc update host( lhs, rhs ) + ! Perform LU decomp and solve system (LAPACK) call lapack_band_solve( "xm_wpxp", nsup, nsub, & ! 
Intent(in) ndim, nrhs, ngrdcol, & ! Intent(in) lhs, rhs, & ! Intent(inout) soln ) ! Intent(out) + !$acc update device( soln ) + else if ( penta_solve_method == penta_lu ) then ! Solve the system with a penta-diagonal specific LU decomp @@ -339,14 +339,6 @@ subroutine tridiag_solve_single_rhs_lhs( & ! ----------------------- Begin Code ----------------------- -#ifdef _OPENACC - if ( tridiag_solve_method /= tridiag_lu ) then - write(fstderr,*) "Running on GPUs requires tridiag_solve_method = tridiag_lu = 2" - err_code = clubb_no_error - return - end if -#endif - if ( present(rcond) ) then ! Lapack overwrites lhs and rhs, so we'll give it copies of them. @@ -357,7 +349,6 @@ subroutine tridiag_solve_single_rhs_lhs( & call lapack_tridiag_solvex( solve_name, ndim, 1, 1, & ! Intent(in) lhs_copy, rhs_copy, & ! Intent(inout) dummy_soln, rcond ) ! Intent(out) - end if @@ -448,16 +439,10 @@ subroutine tridiag_solve_single_rhs_multiple_lhs( & ! ----------------------- Begin Code ----------------------- -#ifdef _OPENACC - if ( tridiag_solve_method /= tridiag_lu ) then - write(fstderr,*) "Running on GPUs requires tridiag_solve_method = tridiag_lu = 2" - err_code = clubb_no_error - return - end if -#endif - if ( present(rcond) ) then + !$acc update host( lhs, rhs ) + ! Lapack overwrites lhs and rhs, so we'll give it copies of them. lhs_copy = lhs rhs_copy = rhs @@ -466,17 +451,23 @@ subroutine tridiag_solve_single_rhs_multiple_lhs( & call lapack_tridiag_solvex( solve_name, ndim, 1, ngrdcol, & ! Intent(in) lhs_copy, rhs_copy, & ! Intent(inout) dummy_soln, rcond ) ! Intent(out) + + !$acc update device( rcond ) end if if ( tridiag_solve_method == lapack ) then + !$acc update host( lhs, rhs ) + ! Perform LU decomp and solve system (LAPACK) call lapack_tridiag_solve( solve_name, ndim, 1, ngrdcol, & ! Intent(in) lhs, rhs, & ! Intent(inout) soln ) ! Intent(out) + !$acc update device( soln ) + else if ( tridiag_solve_method == tridiag_lu ) then ! Solve the system with a tri-diagonal specific LU decomp @@ -555,17 +546,11 @@ subroutine tridiag_solve_multiple_rhs_lhs( & integer :: i ! ----------------------- Begin Code ----------------------- - -#ifdef _OPENACC - if ( tridiag_solve_method /= tridiag_lu ) then - write(fstderr,*) "Running on GPUs requires tridiag_solve_method = tridiag_lu = 2" - err_code = clubb_no_error - return - end if -#endif if ( present(rcond) ) then + !$acc update host( lhs, rhs ) + ! Lapack overwrites lhs and rhs, so we'll give it copies of them. lhs_copy = lhs rhs_copy = rhs @@ -574,17 +559,23 @@ subroutine tridiag_solve_multiple_rhs_lhs( & call lapack_tridiag_solvex( solve_name, ndim, nrhs, ngrdcol, & ! Intent(in) lhs_copy, rhs_copy, & ! Intent(inout) dummy_soln, rcond ) ! Intent(out) + + !$acc update device( rcond ) end if if ( tridiag_solve_method == lapack ) then + !$acc update host( lhs, rhs ) + ! Perform LU decomp and solve system (LAPACK) call lapack_tridiag_solve( solve_name, ndim, nrhs, ngrdcol, & ! Intent(in) lhs, rhs, & ! Intent(inout) soln ) ! Intent(out) + !$acc update device( soln ) + else if ( tridiag_solve_method == tridiag_lu ) then ! Solve the system with a tri-diagonal specific LU decomp diff --git a/src/CLUBB_core/mean_adv.F90 b/src/CLUBB_core/mean_adv.F90 index 3ff37ca26..f111495f1 100644 --- a/src/CLUBB_core/mean_adv.F90 +++ b/src/CLUBB_core/mean_adv.F90 @@ -266,7 +266,7 @@ pure subroutine term_ma_zt_lhs( nz, ngrdcol, wm_zt, weights_zt2zm, & ! Intent(in ! derivative d(var_zt)/dz over the model top is set to 0, in order ! 
to stay consistent with the zero-flux boundary condition option ! in the eddy diffusion code. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Thermodynamic superdiagonal: [ x var_zt(k+1,) ] @@ -317,7 +317,7 @@ pure subroutine term_ma_zt_lhs( nz, ngrdcol, wm_zt, weights_zt2zm, & ! Intent(in !$acc end parallel loop ! Upper Boundary - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if ( wm_zt(i,nz) >= zero ) then ! Mean wind is in upward direction diff --git a/src/CLUBB_core/mixing_length.F90 b/src/CLUBB_core/mixing_length.F90 index f5917a9ab..b324e3fc3 100644 --- a/src/CLUBB_core/mixing_length.F90 +++ b/src/CLUBB_core/mixing_length.F90 @@ -232,12 +232,12 @@ subroutine compute_mixing_length( nz, ngrdcol, gr, thvm, thlm, & ! --------------------------------- Begin Code --------------------------------- - !$acc declare create( exp_mu_dzm, invrs_dzm_on_mu, grav_on_thvm, Lv_coef, & - !$acc entrain_coef, thl_par_j_precalc, rt_par_j_precalc, & - !$acc tl_par_1, rt_par_1, rsatl_par_1, thl_par_1, dCAPE_dz_1, & - !$acc s_par_1, rc_par_1, CAPE_incr_1, thv_par_1, tke_i ) + !$acc enter data create( exp_mu_dzm, invrs_dzm_on_mu, grav_on_thvm, Lv_coef, & + !$acc entrain_coef, thl_par_j_precalc, rt_par_j_precalc, & + !$acc tl_par_1, rt_par_1, rsatl_par_1, thl_par_1, dCAPE_dz_1, & + !$acc s_par_1, rc_par_1, CAPE_incr_1, thv_par_1, tke_i ) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if ( abs(mu(i)) < eps ) then err_code = clubb_fatal_error @@ -855,6 +855,11 @@ subroutine compute_mixing_length( nz, ngrdcol, gr, thvm, thlm, & end if + !$acc exit data delete( exp_mu_dzm, invrs_dzm_on_mu, grav_on_thvm, Lv_coef, & + !$acc entrain_coef, thl_par_j_precalc, rt_par_j_precalc, & + !$acc tl_par_1, rt_par_1, rsatl_par_1, thl_par_1, dCAPE_dz_1, & + !$acc s_par_1, rc_par_1, CAPE_incr_1, thv_par_1, tke_i ) + return end subroutine compute_mixing_length @@ -995,10 +1000,11 @@ subroutine calc_Lscale_directly ( ngrdcol, nz, gr, & !--------------------------------- Begin Code --------------------------------- - !$acc declare create( sign_rtpthlp, Lscale_pert_1, Lscale_pert_2, & - !$acc thlm_pert_1, thlm_pert_2, rtm_pert_1, rtm_pert_2, & - !$acc thlm_pert_pos_rt, thlm_pert_neg_rt, rtm_pert_pos_rt, rtm_pert_neg_rt, & - !$acc mu_pert_1, mu_pert_2, mu_pert_pos_rt, mu_pert_neg_rt ) + !$acc enter data create( sign_rtpthlp, Lscale_pert_1, Lscale_pert_2, & + !$acc thlm_pert_1, thlm_pert_2, rtm_pert_1, rtm_pert_2, & + !$acc thlm_pert_pos_rt, thlm_pert_neg_rt, rtm_pert_pos_rt, & + !$acc rtm_pert_neg_rt, & + !$acc mu_pert_1, mu_pert_2, mu_pert_pos_rt, mu_pert_neg_rt ) Lscale_mu_coef = clubb_params(iLscale_mu_coef) Lscale_pert_coef = clubb_params(iLscale_pert_coef) @@ -1044,7 +1050,7 @@ subroutine calc_Lscale_directly ( ngrdcol, nz, gr, & end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol mu_pert_1(i) = newmu(i) / Lscale_mu_coef end do @@ -1073,7 +1079,7 @@ subroutine calc_Lscale_directly ( ngrdcol, nz, gr, & end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol mu_pert_2(i) = newmu(i) * Lscale_mu_coef end do @@ -1138,7 +1144,7 @@ subroutine calc_Lscale_directly ( ngrdcol, nz, gr, & end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector 
default(present) do i = 1, ngrdcol mu_pert_pos_rt(i) = newmu(i) / Lscale_mu_coef mu_pert_neg_rt(i) = newmu(i) * Lscale_mu_coef @@ -1219,6 +1225,12 @@ subroutine calc_Lscale_directly ( ngrdcol, nz, gr, & end if end if + !$acc exit data delete( sign_rtpthlp, Lscale_pert_1, Lscale_pert_2, & + !$acc thlm_pert_1, thlm_pert_2, rtm_pert_1, rtm_pert_2, & + !$acc thlm_pert_pos_rt, thlm_pert_neg_rt, rtm_pert_pos_rt, & + !$acc rtm_pert_neg_rt, & + !$acc mu_pert_1, mu_pert_2, mu_pert_pos_rt, mu_pert_neg_rt ) + return end subroutine calc_Lscale_directly @@ -1456,7 +1468,7 @@ subroutine diagnose_Lscale_from_tau( nz, ngrdcol, gr, & !$acc tau_zm_unclipped, tau_zt_unclipped, sqrt_Ri_zm_smooth, em_clipped, & !$acc tmp_calc, tmp_calc_max, tmp_calc_min_max ) - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if ( gr%zm(i,1) - sfc_elevation(i) + z_displace < eps ) then err_code = clubb_fatal_error @@ -1500,7 +1512,7 @@ subroutine diagnose_Lscale_from_tau( nz, ngrdcol, gr, & if ( l_smooth_min_max ) then - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ustar(i) = smooth_max( ( upwp_sfc(i)**2 + vpwp_sfc(i)**2 )**one_fourth, ufmin, min_max_smth_mag ) end do @@ -1508,7 +1520,7 @@ subroutine diagnose_Lscale_from_tau( nz, ngrdcol, gr, & else - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ustar(i) = max( ( upwp_sfc(i)**2 + vpwp_sfc(i)**2 )**one_fourth, ufmin ) end do diff --git a/src/CLUBB_core/mono_flux_limiter.F90 b/src/CLUBB_core/mono_flux_limiter.F90 index 108854b03..4b95fce9a 100644 --- a/src/CLUBB_core/mono_flux_limiter.F90 +++ b/src/CLUBB_core/mono_flux_limiter.F90 @@ -464,10 +464,10 @@ subroutine monotonic_turbulent_flux_limit( nz, ngrdcol, gr, solve_type, dt, xm_o !---------------------------- Begin Code ---------------------------- - !$acc declare create( xp2_zt, xm_enter_mfl, xm_without_ta, wpxp_net_adjust, & - !$acc min_x_allowable_lev, max_x_allowable_lev, min_x_allowable, & - !$acc max_x_allowable, wpxp_mfl_max, wpxp_mfl_min, lhs_mfl_xm, & - !$acc rhs_mfl_xm, l_adjustment_needed, xm_mfl ) + !$acc enter data create( xp2_zt, xm_enter_mfl, xm_without_ta, wpxp_net_adjust, & + !$acc min_x_allowable_lev, max_x_allowable_lev, min_x_allowable, & + !$acc max_x_allowable, wpxp_mfl_max, wpxp_mfl_min, lhs_mfl_xm, & + !$acc rhs_mfl_xm, l_adjustment_needed, xm_mfl ) select case( solve_type ) case ( mono_flux_rtm ) ! rtm/wprtp @@ -616,7 +616,7 @@ subroutine monotonic_turbulent_flux_limit( nz, ngrdcol, gr, solve_type, dt, xm_o !$acc end parallel loop ! Boundary condition on xm_without_ta - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol xm_without_ta(i,1) = xm(i,1) min_x_allowable_lev(i,1) = min_x_allowable_lev(i,2) @@ -788,7 +788,7 @@ subroutine monotonic_turbulent_flux_limit( nz, ngrdcol, gr, solve_type, dt, xm_o !$acc end parallel loop ! Boundary conditions - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol min_x_allowable(i,1) = 0._core_rknd max_x_allowable(i,1) = 0._core_rknd @@ -927,7 +927,7 @@ subroutine monotonic_turbulent_flux_limit( nz, ngrdcol, gr, solve_type, dt, xm_o !$acc end parallel loop ! 
Boundary condition on xm - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol xm(i,1) = xm(i,2) end do @@ -941,7 +941,7 @@ subroutine monotonic_turbulent_flux_limit( nz, ngrdcol, gr, solve_type, dt, xm_o !enddo !Ensure there are no spikes at the top of the domain - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if (abs( xm(i,nz) - xm_enter_mfl(i,nz) ) > 10._core_rknd * xm_tol) then @@ -1024,6 +1024,11 @@ subroutine monotonic_turbulent_flux_limit( nz, ngrdcol, gr, solve_type, dt, xm_o end do endif + !$acc exit data delete( xp2_zt, xm_enter_mfl, xm_without_ta, wpxp_net_adjust, & + !$acc min_x_allowable_lev, max_x_allowable_lev, min_x_allowable, & + !$acc max_x_allowable, wpxp_mfl_max, wpxp_mfl_min, lhs_mfl_xm, & + !$acc rhs_mfl_xm, l_adjustment_needed, xm_mfl ) + return end subroutine monotonic_turbulent_flux_limit @@ -1235,7 +1240,7 @@ subroutine mfl_xm_rhs( nz, ngrdcol, dt, xm_old, wpxp, xm_forcing, & ! The value of xm at the lower boundary will remain the same. However, the ! value of xm at the lower boundary gets overwritten after the matrix is ! solved for the next timestep, such that xm(1) = xm(2). - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol rhs(i,1) = xm_old(i,1) end do @@ -1333,7 +1338,7 @@ subroutine mfl_xm_solve( nz, ngrdcol, solve_type, tridiag_solve_method, & end if ! Boundary condition on xm - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol xm(i,1) = xm(i,2) end do @@ -1435,7 +1440,7 @@ subroutine calc_turb_adv_range( nz, ngrdcol, gr, dt, & !------------------------- Begin Code ------------------------- - !$acc declare create( vert_vel_up, vert_vel_down, w_min ) + !$acc enter data create( vert_vel_up, vert_vel_down, w_min ) if ( l_constant_thickness ) then ! thickness is a constant value. @@ -1681,7 +1686,7 @@ subroutine calc_turb_adv_range( nz, ngrdcol, gr, dt, & ! Information for levels 1, 2, gr%nz-1, and gr%nz is not needed. ! However, set the values at these levels for purposes of not having odd ! values in the arrays. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol low_lev_effect(i,1) = 1 high_lev_effect(i,1) = 1 @@ -1694,6 +1699,8 @@ subroutine calc_turb_adv_range( nz, ngrdcol, gr, dt, & end do !$acc end parallel loop + !$acc exit data delete( vert_vel_up, vert_vel_down, w_min ) + return end subroutine calc_turb_adv_range @@ -1949,7 +1956,7 @@ subroutine mean_vert_vel_up_down( nz, ngrdcol, & !------------------------- Begin Code ------------------------- - !$acc declare create( mean_w_down_1st, mean_w_down_2nd, mean_w_up_1st, mean_w_up_2nd ) + !$acc enter data create( mean_w_down_1st, mean_w_down_2nd, mean_w_up_1st, mean_w_up_2nd ) call calc_mean_w_up_down_component( nz, ngrdcol, & ! intent(in) w_1_zm, varnce_w_1_zm, & ! intent(in) @@ -1992,6 +1999,8 @@ subroutine mean_vert_vel_up_down( nz, ngrdcol, & end do end if ! 
l_stats_samp + !$acc exit data delete( mean_w_down_1st, mean_w_down_2nd, mean_w_up_1st, mean_w_up_2nd ) + return end subroutine mean_vert_vel_up_down @@ -2071,7 +2080,7 @@ subroutine calc_mean_w_up_down_component( nz, ngrdcol, & !------------------------- Begin Code ------------------------- - !$acc declare create( erf_cache, exp_cache ) + !$acc enter data create( erf_cache, exp_cache ) invrs_sqrt_2pi = one / sqrt_2pi @@ -2124,7 +2133,7 @@ subroutine calc_mean_w_up_down_component( nz, ngrdcol, & !$acc end parallel loop ! Upper and lower levels are not used, set to 0 to besafe and avoid NaN problems - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol mean_w_down_i(i,1) = 0.0_core_rknd mean_w_up_i(i,1) = 0.0_core_rknd @@ -2134,6 +2143,8 @@ subroutine calc_mean_w_up_down_component( nz, ngrdcol, & end do !$acc end parallel loop + !$acc exit data delete( erf_cache, exp_cache ) + return end subroutine calc_mean_w_up_down_component diff --git a/src/CLUBB_core/sfc_varnce_module.F90 b/src/CLUBB_core/sfc_varnce_module.F90 index b0364d4e3..a73885ed0 100644 --- a/src/CLUBB_core/sfc_varnce_module.F90 +++ b/src/CLUBB_core/sfc_varnce_module.F90 @@ -204,9 +204,9 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & !-------------------------- Begin Code -------------------------- - !$acc declare create( uf, depth_pos_wpthlp, min_wp2_sfc_val, & - !$acc um_sfc_sqd, vm_sfc_sqd, usp2_sfc, vsp2_sfc, & - !$acc ustar, zeta, wp2_splat_sfc_correction ) + !$acc enter data create( uf, depth_pos_wpthlp, min_wp2_sfc_val, & + !$acc um_sfc_sqd, vm_sfc_sqd, usp2_sfc, vsp2_sfc, & + !$acc ustar, zeta, wp2_splat_sfc_correction ) up2_sfc_coef = clubb_params(iup2_sfc_coef) @@ -242,7 +242,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & end do end if - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ! Find thickness of layer near surface with positive heat flux. ! This is used when l_vary_convect_depth=.true. in order to determine wp2. @@ -270,7 +270,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & if ( l_andre_1978 ) then ! Calculate ^2 and ^2. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol um_sfc_sqd(i) = um(i,2)**2 vm_sfc_sqd(i) = vm(i,2)**2 @@ -278,13 +278,13 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & !$acc end parallel loop ! Calculate surface friction velocity, u*. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ustar(i) = max( ( upwp_sfc(i)**2 + vpwp_sfc(i)**2 )**(one_fourth), ufmin ) end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if ( abs(wpthlp(i,1)) > eps) then @@ -317,7 +317,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & ! zeta <= -0.212. ! 3) The surface correlation of rt & thl is 1. ! Brian Griffin; February 2, 2008. 
- !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if ( zeta(i) < zero ) then @@ -353,7 +353,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol thlp2(i,1) = max( thl_tol**2, thlp2(i,1) ) rtp2(i,1) = max( rt_tol**2, rtp2(i,1) ) @@ -365,7 +365,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & ! where z_i is the height of the mixed layer. The value of CLUBB's ! upward component of mixing length, Lscale_up, at the surface will be ! used as z_i. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol wstar = ( (one/T0) * grav * wpthlp(i,1) * Lscale_up(i,2) )**(one_third) @@ -399,7 +399,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & ! of (w,thl) or (w,rt) outside (-1,1). ! Perhaps in the future we should also ensure that the correlations ! of (w,u) and (w,v) are not outside (-1,1). - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol min_wp2_sfc_val(i) & = max( w_tol_sqd, & @@ -408,7 +408,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if ( wp2(i,1) - tau_zm(i,1) * lhs_splat_wp2(i,1) * wp2(i,1) & < min_wp2_sfc_val(i) ) then @@ -424,7 +424,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol usp2_sfc(i) = usp2_sfc(i) - 0.5_core_rknd * wp2_splat_sfc_correction(i) vsp2_sfc(i) = vsp2_sfc(i) - 0.5_core_rknd * wp2_splat_sfc_correction(i) @@ -436,7 +436,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & ! |_sfc = * [ ^2 / ( ^2 + ^2 ) ] ! + * [ ^2 / ( ^2 + ^2 ) ]; ! where ^2 + ^2 /= 0. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol up2(i,1) = usp2_sfc(i) * ( um_sfc_sqd(i) / max( um_sfc_sqd(i) + vm_sfc_sqd(i) , eps ) ) & + vsp2_sfc(i) * ( vm_sfc_sqd(i) / max( um_sfc_sqd(i) + vm_sfc_sqd(i) , eps ) ) @@ -448,7 +448,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & ! |_sfc = * [ ^2 / ( ^2 + ^2 ) ] ! + * [ ^2 / ( ^2 + ^2 ) ]; ! where ^2 + ^2 /= 0. - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol vp2(i,1) = vsp2_sfc(i) * ( um_sfc_sqd(i) / max( um_sfc_sqd(i) + vm_sfc_sqd(i) , eps ) ) & + usp2_sfc(i) * ( vm_sfc_sqd(i) / max( um_sfc_sqd(i) + vm_sfc_sqd(i) , eps ) ) @@ -516,7 +516,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & else ! Previous code. ! Compute ustar^2 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol ustar2 = sqrt( upwp_sfc(i) * upwp_sfc(i) + vpwp_sfc(i) * vpwp_sfc(i) ) @@ -543,7 +543,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & !$acc end parallel loop ! Compute estimate for surface second order moments - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol wp2(i,1) = a_const * uf(i)**2 up2(i,1) = up2_sfc_coef * a_const * uf(i)**2 ! From Andre, et al. 
1978 @@ -557,7 +557,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & ! Brian Griffin; February 2, 2008. if ( .not. l_vary_convect_depth ) then - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol thlp2(i,1) = 0.4_core_rknd * a_const * ( wpthlp(i,1) / uf(i) )**2 rtp2(i,1) = 0.4_core_rknd * a_const * ( wprtp_sfc(i) / uf(i) )**2 @@ -566,7 +566,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & end do !$acc end parallel loop else - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol thlp2(i,1) = ( wpthlp(i,1) / uf(i) )**2 / ( max_mag_correlation_flux**2 * a_const ) rtp2(i,1) = ( wprtp_sfc(i) / uf(i) )**2 / ( max_mag_correlation_flux**2 * a_const ) @@ -575,7 +575,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & !$acc end parallel loop end if - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol thlp2(i,1) = max( thl_tol**2, thlp2(i,1) ) rtp2(i,1) = max( rt_tol**2, rtp2(i,1) ) @@ -587,7 +587,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & ! of (w,thl) or (w,rt) outside (-1,1). ! Perhaps in the future we should also ensure that the correlations ! of (w,u) and (w,v) are not outside (-1,1). - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol min_wp2_sfc_val(i) & = max( w_tol_sqd, & @@ -596,7 +596,7 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if ( wp2(i,1) - tau_zm(i,1) * lhs_splat_wp2(i,1) * wp2(i,1) & < min_wp2_sfc_val(i) ) then @@ -674,13 +674,13 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & endif ! l_andre_1978 ! Clip wp2 at wp2_max, same as in advance_wp2_wp3 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol wp2(i,1) = min( wp2(i,1), wp2_max ) end do !$acc end parallel loop - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol if ( abs(gr%zm(i,1)-sfc_elevation(i)) > abs(gr%zm(i,1)+sfc_elevation(i))*eps/2 ) then @@ -790,6 +790,10 @@ subroutine calc_sfc_varnce( nz, ngrdcol, gr, dt, sfc_elevation, & endif ! clubb_at_least_debug_level ( 2 )! Update surface stats + !$acc exit data delete( uf, depth_pos_wpthlp, min_wp2_sfc_val, & + !$acc um_sfc_sqd, vm_sfc_sqd, usp2_sfc, vsp2_sfc, & + !$acc ustar, zeta, wp2_splat_sfc_correction ) + return end subroutine calc_sfc_varnce diff --git a/src/CLUBB_core/sigma_sqd_w_module.F90 b/src/CLUBB_core/sigma_sqd_w_module.F90 index 28027daed..25470cf27 100644 --- a/src/CLUBB_core/sigma_sqd_w_module.F90 +++ b/src/CLUBB_core/sigma_sqd_w_module.F90 @@ -94,7 +94,7 @@ subroutine compute_sigma_sqd_w( nz, ngrdcol, & ! ---- Begin Code ---- - !$acc declare create( max_corr_w_x_sqd ) + !$acc enter data create( max_corr_w_x_sqd ) !---------------------------------------------------------------- ! 
Compute sigma_sqd_w with new formula from Vince @@ -139,6 +139,8 @@ subroutine compute_sigma_sqd_w( nz, ngrdcol, & end do !$acc end parallel loop + !$acc exit data delete( max_corr_w_x_sqd ) + return end subroutine compute_sigma_sqd_w diff --git a/src/CLUBB_core/turbulent_adv_pdf.F90 b/src/CLUBB_core/turbulent_adv_pdf.F90 index a494a79af..46eb4241c 100644 --- a/src/CLUBB_core/turbulent_adv_pdf.F90 +++ b/src/CLUBB_core/turbulent_adv_pdf.F90 @@ -892,7 +892,7 @@ pure subroutine xpyp_term_ta_pdf_rhs( nz, ngrdcol, gr, term_wpxpyp_explicit, & ! Set lower boundary value to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol rhs_ta(i,1) = zero end do @@ -947,7 +947,7 @@ pure subroutine xpyp_term_ta_pdf_rhs( nz, ngrdcol, gr, term_wpxpyp_explicit, & end if ! Set upper boundary value to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol rhs_ta(i,nz) = zero end do @@ -1018,7 +1018,7 @@ pure subroutine xpyp_term_ta_pdf_rhs_godunov( nz, ngrdcol, gr, & ! Intent(in) !$acc copyout( rhs_ta ) ! Set lower boundary value to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol rhs_ta(i,1) = 0.0_core_rknd end do @@ -1042,7 +1042,7 @@ pure subroutine xpyp_term_ta_pdf_rhs_godunov( nz, ngrdcol, gr, & ! Intent(in) !$acc end parallel loop ! Set upper boundary value to 0 - !$acc parallel loop default(present) + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol rhs_ta(i,nz) = 0.0_core_rknd end do diff --git a/src/clubb_driver.F90 b/src/clubb_driver.F90 index 08dec564f..a8d8b6195 100644 --- a/src/clubb_driver.F90 +++ b/src/clubb_driver.F90 @@ -1513,15 +1513,7 @@ subroutine run_clubb & l_host_applies_sfc_fluxes, & ! Intent(in) saturation_formula, & ! Intent(in) l_input_fields, & ! Intent(in) - iiPDF_type, & ! intent(in) - ipdf_call_placement, & ! intent(in) - l_predict_upwp_vpwp, & ! intent(in) - l_min_xp2_from_corr_wx, & ! intent(in) - l_prescribed_avg_deltaz, & ! intent(in) - l_damp_wp2_using_em, & ! intent(in) - l_stability_correct_tau_zm, & ! intent(in) - l_enable_relaxed_clipping, & ! intent(in) - l_diag_Lscale_from_tau, & ! intent(in) + clubb_config_flags, & ! intent(in) err_code_dummy ) ! Intent(out) ! Setup grid From 398b8a728ce9f39a6ebc4be4e3cabba7c4bccc36 Mon Sep 17 00:00:00 2001 From: huebler Date: Tue, 11 Jul 2023 13:08:17 -0500 Subject: [PATCH 2/7] Improving run_bin_diff.py to accept an argument to print the differences, and an argument to define a threshold to ignore differences. 
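In outline, the new per-file comparison works variable by variable: it sums the absolute differences between the two datasets, skips the variable when that sum falls below the reporting threshold, and otherwise records the maximum, total, and average absolute differences plus a symmetric percent difference, 200*(a-b)/(a+b), computed on fields clipped away from zero. A minimal sketch of that logic follows (this is not the patch itself; the helper name and defaults are illustrative, and it imports netCDF4 explicitly, which the added comparison code relies on):

import netCDF4
import numpy as np

FIELD_CLIP = 1.0e-7   # ignore near-zero field values when forming the % diff

def summarize_nc_diffs(file1, file2, tot_abs_diff_thresh=0.0):
    # Open the two CLUBB output files to be compared.
    dset1 = netCDF4.Dataset(file1)
    dset2 = netCDF4.Dataset(file2)
    rows = []
    for var in dset1.variables:
        # Only multi-dimensional fields are of interest here.
        if dset1[var].ndim <= 1:
            continue
        abs_diff = np.abs(dset1[var][:] - dset2[var][:])
        # Ignore variables whose total absolute difference is below threshold.
        if np.sum(abs_diff) < tot_abs_diff_thresh:
            continue
        # Clip both fields away from zero, then form the symmetric percent
        # difference 200*(a-b)/(a+b).
        a = np.clip(dset1[var][:], a_min=FIELD_CLIP, a_max=None)
        b = np.clip(dset2[var][:], a_min=FIELD_CLIP, a_max=None)
        percent_diff = 200.0 * (a - b) / (a + b)
        rows.append((var, np.max(abs_diff), np.max(percent_diff),
                     np.sum(abs_diff), np.average(abs_diff)))
    dset1.close()
    dset2.close()
    return rows

Run once per case and per output form (_zm.nc, _zt.nc, _sfc.nc), this mirrors the gist of the run_py_diff changes below; the actual script additionally formats the rows with tabulate and honors the -o and -d flags.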
--- run_scripts/run_bindiff_all.py | 153 ++++++++++++++++++++++++++------- 1 file changed, 124 insertions(+), 29 deletions(-) mode change 100644 => 100755 run_scripts/run_bindiff_all.py diff --git a/run_scripts/run_bindiff_all.py b/run_scripts/run_bindiff_all.py old mode 100644 new mode 100755 index 816868333..9357322aa --- a/run_scripts/run_bindiff_all.py +++ b/run_scripts/run_bindiff_all.py @@ -5,52 +5,76 @@ import sys import os import subprocess +import numpy as np +import tabulate scriptPath = os.path.realpath(__file__)[0:-18] #print(scriptPath) +# Threshold used to ignore field values for calculating % diff +field_threshold = 1.0e-7 + def main(): parser = argparse.ArgumentParser() parser.add_argument("-o", "--output", action="store_true", help="add this option if you want to get output for each file compared put into a file made in the cwd.") + parser.add_argument("-d", "--diffs", action="store_true", help="print the differences found in variables") + parser.add_argument("-t", "--threshold", dest="threshold", type=float, action="store", help="print the differences found in variables") parser.add_argument("dirs", nargs=2, help="need 2 clubb output directories to diff. Usage: python run_bindiff_all.py dir_path1 dir_path2") args = parser.parse_args() paths_exist = os.path.exists(args.dirs[0]) and os.path.exists(args.dirs[1]) - if(paths_exist): + if( paths_exist ): + print("Directory 1 is", args.dirs[0]) - print("Directory 2 is", args.dirs[1] + "\n") + print("Directory 2 is", args.dirs[1]) + dir1_files = os.listdir(args.dirs[0]) dir2_files = os.listdir(args.dirs[1]) + #print(dir1_files, dir2_files) + cases = get_cases(dir1_files, dir2_files) - diffs = run_diff(cases, args.dirs[0], args.dirs[1]) - - diff_in_files = run_py_diff(diffs, args.dirs[0], args.dirs[1],args.output) - if(args.output and diff_in_files): + if ( args.threshold is not None ): + tot_abs_diff_thresh = args.threshold + else: + tot_abs_diff_thresh = 0.0 + + print( "Using reporting threshold: ", tot_abs_diff_thresh, "\n" ) + + diff_in_files = run_diff(cases, args.dirs[0], args.dirs[1], args.output, args.diffs, tot_abs_diff_thresh) + + if( args.output and diff_in_files ): print("\nOutput files made and placed in " + os.getcwd()) - if(diff_in_files): + if( diff_in_files ): print("\nThere were differences detected in netCDF (*.nc) files.") sys.exit(1) - if(args.output): - print("\nNo differences detected, so no files were made!") else: - print("\nNo differences detected!") + if( args.output ): + print("\nNo differences detected, so no files were made!") + else: + print("\nNo differences detected!") else: print("Chosen directories do not exist. Please input valid directories") sys.exit(1) + def get_cases(dir1_files, dir2_files): + cases = [] + with open(scriptPath + "RUN_CASES") as file: for line in file: #print(line.strip()) case = line.strip() - #If statement goes through every case in the RUN_CASES file and checks if the case is able to be run, and if that case has the files in the directories to diff it. If it passes all these conditions, the case is added to the runnable cases. + + #If statement goes through every case in the RUN_CASES file and checks if the case is able to be run, + # and if that case has the files in the directories to diff it. If it passes all these conditions, + # the case is added to the runnable cases. if(line[0] != "!" 
and len(line) > 1 and case + "_zt.nc" in dir1_files and case + "_zt.nc" in dir2_files and case + "_zm.nc" in dir1_files and case + "_zm.nc" in dir2_files and case + "_sfc.nc" in dir1_files and case + "_sfc.nc" in dir2_files): @@ -59,32 +83,103 @@ def get_cases(dir1_files, dir2_files): return cases -def run_diff(cases, dir1_path, dir2_path): - diff_list = [] +def run_diff(cases, dir1_path, dir2_path, l_output, l_diffs, thresh): + + diff_in_files = False + nc_data_formats = ["_zm.nc", "_zt.nc", "_sfc.nc"] -#This for loop runs through all the cases you have the files to check, and each netcdf format. It runs the linux diff on the bonary netcdf files to see which files are needed to be sent through the diff_netcdf_outputs.py script. + + # This for loop runs through all the cases you have the files to check, + # and each netcdf format. It runs the linux diff on the binary netcdf files + # to see which files should be looked at more closely for case in cases: + print("DIFFING " + case + " netCDF (*.nc) files") + for out_form in nc_data_formats: - if(len(subprocess.getoutput("diff " + dir1_path + "/" + case + out_form + " " + dir2_path + "/" + case + out_form)) > 0): - diff_list.append(case + out_form) - return diff_list + # Call linux diff command, if it shows a diff, check the netcdf values to confirm + if( len(subprocess.getoutput("diff " + dir1_path + "/" + case + out_form + " " + dir2_path + "/" + case + out_form)) > 0 ): + diff_in_files = run_py_diff(case+out_form, dir1_path, dir2_path, l_output, l_diffs, thresh) or diff_in_files + + return diff_in_files + + +def run_py_diff( test_file, dir1_path, dir2_path, l_save_output, l_print_diffs, tot_abs_diff_thresh ): -def run_py_diff(diff_list, dir1_path, dir2_path, make_out_files): diff_in_files = False - for test_file in diff_list: - #print("DIFFING " + test_file + "netCDF (*.nc) files") - #print(dir1_path + test_file) - #print(dir2_path + test_file) - output = subprocess.getoutput("python " + scriptPath + "diff_netcdf_outputs.py " + dir1_path + test_file + " " + dir2_path + test_file) - if(len(output) > 0): + any_diff_above_thresh = False + + #print("DIFFING " + test_file + "netCDF (*.nc) files") + #print(dir1_path + test_file) + #print(dir2_path + test_file) + + # Create datasets from nc files + dset1 = netCDF4.Dataset(dir1_path+"/"+test_file) + dset2 = netCDF4.Dataset(dir2_path+"/"+test_file) + + # Define table header, these are the values we're going to output + table = [[ 'Var', + 'Max Abs Diff', + 'Max % Diff', + 'Total Abs Diff', + 'Avg Abs Diff' ]] + + for var in dset1.variables: + + # The CLUBB netcdf variables we are interested in are all + # 4 dimensional (time, altitude, latitude, longitude), + # but currently we don't actually have latitude or longitude in clubb, so those + # dimensions are hardcoded to be 1. If in the future we remove those useless + # dimensions (unlikely), then the variables of interested would be 2D. So + # for futureproofing, we will just check all variables with more than 1 dimension. + if( dset1[var].ndim > 1 ): + diff_in_files = True - print("*** Differences detected in " + test_file + "! 
***") - if(make_out_files): - f = open(test_file+"-diff_out", "w") - f.write(output) - f.close() + + abs_diff = abs( dset1[var][:,:,:] - dset2[var][:,:,:] ) + + # If the sum of all absolute differences is less than the threshold, then ignore this var + if ( np.sum(abs_diff) < tot_abs_diff_thresh ): + continue + else: + any_diff_above_thresh = True + + # Clip fields to ignore tiny values for the % diff + field_1_clipped = np.clip( dset1[var][:,:,:], a_min = field_threshold, a_max = 9999999.0 ) + field_2_clipped = np.clip( dset2[var][:,:,:], a_min = field_threshold, a_max = 9999999.0 ) + + # Calculate the percent difference, 100 * (a-b) / ((a+b)/2) + percent_diff = 200.0 * ( field_1_clipped-field_2_clipped ) \ + / ( field_1_clipped+field_2_clipped ) + + # Append the table array with the values we want to print + table.append( [ var, + np.max(abs_diff), + np.max(percent_diff), + np.sum(abs_diff), + np.average(abs_diff) ] ) + + # Print results + header = "************** Differences detected in " + test_file + "! **************" + print( header ) + + if( any_diff_above_thresh ): + output = tabulate.tabulate(table, headers='firstrow') + else: + output = "Total absolute value of all differences are below threshold: " + str(tot_abs_diff_thresh) + + if( l_print_diffs ): + # Print a very pretty table of the values + print(output+"\n") + + if( l_save_output ): + # Save all the output to a file + f = open(test_file + "-diff_out", "w") + f.write(header) + f.write(output) + f.close() + return diff_in_files From 0b9f422518f7159d7c2d040cc3b2b818a71e9143 Mon Sep 17 00:00:00 2001 From: Tyler Cernik <80285381+cernikt@users.noreply.github.com> Date: Thu, 13 Jul 2023 21:23:09 -0500 Subject: [PATCH 3/7] Update requirements.txt updated Pillow version to resolve related dependabot alerts --- postprocessing/pyplotgen/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/postprocessing/pyplotgen/requirements.txt b/postprocessing/pyplotgen/requirements.txt index d91465a59..bc82ad7e9 100755 --- a/postprocessing/pyplotgen/requirements.txt +++ b/postprocessing/pyplotgen/requirements.txt @@ -1,4 +1,4 @@ -Pillow~=8.2.0 +Pillow~=10.0.0 netCDF4~=1.5.3 matplotlib~=3.2.1 numpy~=1.18.2 From 4c8e71aad6664c648a84ba517dada0994b796fe5 Mon Sep 17 00:00:00 2001 From: Tyler Cernik <80285381+cernikt@users.noreply.github.com> Date: Thu, 13 Jul 2023 21:37:32 -0500 Subject: [PATCH 4/7] Update requirements.txt Also updating numpy version to 1.25.1 --- postprocessing/pyplotgen/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/postprocessing/pyplotgen/requirements.txt b/postprocessing/pyplotgen/requirements.txt index bc82ad7e9..b2599ffd5 100755 --- a/postprocessing/pyplotgen/requirements.txt +++ b/postprocessing/pyplotgen/requirements.txt @@ -1,7 +1,7 @@ Pillow~=10.0.0 netCDF4~=1.5.3 matplotlib~=3.2.1 -numpy~=1.18.2 +numpy~=1.25.1 cycler~=0.10.0 fpdf~=1.7.2 opencv-python~=4.4.0 From 13de9b8e3a8d758a44b8408fb64cc2ac5ffd2c23 Mon Sep 17 00:00:00 2001 From: Vincent Larson Date: Sat, 15 Jul 2023 09:20:04 -0500 Subject: [PATCH 5/7] Loop to `nz-1` instead of `nz` because upper boundary condition is set in the lines immediately below. --- src/CLUBB_core/mean_adv.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CLUBB_core/mean_adv.F90 b/src/CLUBB_core/mean_adv.F90 index f111495f1..69a5e2f20 100644 --- a/src/CLUBB_core/mean_adv.F90 +++ b/src/CLUBB_core/mean_adv.F90 @@ -244,7 +244,7 @@ pure subroutine term_ma_zt_lhs( nz, ngrdcol, wm_zt, weights_zt2zm, & ! 
Intent(in ! Most of the interior model; normal conditions. !$acc parallel loop gang vector collapse(2) default(present) - do k = 2, nz, 1 + do k = 2, nz-1, 1 do i = 1, ngrdcol ! Thermodynamic superdiagonal: [ x var_zt(k+1,) ] From 29ad556fe9b55c4071e4fc4b6145b79d471f7a1a Mon Sep 17 00:00:00 2001 From: domkesteffen <42725900+domkesteffen@users.noreply.github.com> Date: Tue, 18 Jul 2023 17:49:07 -0500 Subject: [PATCH 6/7] Added code to subroutine run_clubb in src/clubb_driver.F90 which prints git log and git diff information to the _setup.txt output file. (#1101) CLUBB ticket #1098 --- src/clubb_driver.F90 | 51 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/src/clubb_driver.F90 b/src/clubb_driver.F90 index a8d8b6195..e82baa4c4 100644 --- a/src/clubb_driver.F90 +++ b/src/clubb_driver.F90 @@ -1148,6 +1148,39 @@ subroutine run_clubb & ! Print the date and time call write_date( l_write_to_file, iunit ) + call write_text( "", l_write_to_file, iunit ) + call write_text( "--------------------------------------------------", & + l_write_to_file, iunit ) + call write_text( "Latest git log entry", l_write_to_file, iunit ) + call write_text( "--------------------------------------------------", & + l_write_to_file, iunit ) + call write_text( "", l_write_to_file, iunit ) + if ( l_write_to_file ) close( unit=iunit ) + + if ( l_write_to_file ) then + call execute_command_line( 'git log -n 1' ) + call execute_command_line( 'echo "Branch name:" >> '//case_info_file ) + call execute_command_line( 'git rev-parse --abbrev-ref HEAD >> '//case_info_file ) + call execute_command_line( 'echo "(If no branch name is shown HEAD may be detached)"'// & + ' >> '//case_info_file ) + call execute_command_line( 'echo "" >> '//case_info_file ) + call execute_command_line( 'git log -n 1 >> '//case_info_file ) + end if + + if ( l_write_to_file ) then + open(unit=iunit, file=case_info_file, status='OLD', action='write', position='APPEND') + end if + + call write_text( NEW_LINE('A')//"A detailed git diff can be found "// & + "at the end of this file"//NEW_LINE('A'), & + l_write_to_file, iunit ) + + call write_text( "--------------------------------------------------", & + l_write_to_file, iunit ) + call write_text( "Tunable Parameters:", l_write_to_file, iunit) + call write_text( "--------------------------------------------------", & + l_write_to_file, iunit ) + ! Print the list of parameters that are being used before the run. call write_text( "Parameter Value", l_write_to_file, iunit, '(4x,A24)') call write_text( "--------- -----", l_write_to_file, iunit, '(4x,A24)') @@ -1497,9 +1530,23 @@ subroutine run_clubb & call write_text( "--------------------------------------------------", & l_write_to_file, iunit ) - call print_clubb_config_flags( iunit, clubb_config_flags ) ! Intent(in) + call print_clubb_config_flags( fstdout, clubb_config_flags ) ! Intent(in) + call print_clubb_config_flags( iunit, clubb_config_flags ) ! 
Intent(in) - if ( l_write_to_file ) close(unit=iunit) + call write_text( "--------------------------------------------------", & + l_write_to_file, iunit ) + call write_text( "git diff src/", l_write_to_file, iunit ) + call write_text( "--------------------------------------------------", & + l_write_to_file, iunit ) + + if ( l_write_to_file ) then + write(fstdout, *) "See *setup.txt file in output folder"//NEW_LINE('A') + close(unit=iunit) + end if + + if ( l_write_to_file ) then + call execute_command_line( 'git --no-pager diff >> '//case_info_file ) + end if end if ! clubb_at_least_debug_level( 1 ) From 1f4d0453a4d5c225be028a20ba8cfe44f3238336 Mon Sep 17 00:00:00 2001 From: huebleruwm <37674341+huebleruwm@users.noreply.github.com> Date: Tue, 18 Jul 2023 20:27:07 -0500 Subject: [PATCH 7/7] Openacc tweaks and cleanup 2 (#1099) * Making all end parallel directives specify end loop * Replacing last acc declare in a procedure with acc enter/exit data commands * Removing pure declarations, turns out they dont really improve performance and openmp isn't allowed within them. * Adding explicit directives to penta_lu and tridiag_lu, as opposed to the previously used kernels directive. Also splitting up a loop to improve GPU performance. * Splitting up a couple loops for performance reasons. * Missed one somehow --- src/CLUBB_core/adg1_adg2_3d_luhar_pdf.F90 | 2 +- src/CLUBB_core/advance_clubb_core_module.F90 | 4 +- src/CLUBB_core/advance_helper_module.F90 | 2 +- .../advance_windm_edsclrm_module.F90 | 108 +++++++++--------- src/CLUBB_core/advance_wp2_wp3_module.F90 | 57 +++++---- src/CLUBB_core/advance_xm_wpxp_module.F90 | 12 +- src/CLUBB_core/advance_xp2_xpyp_module.F90 | 8 +- src/CLUBB_core/advance_xp3_module.F90 | 4 +- src/CLUBB_core/calc_roots.F90 | 6 +- src/CLUBB_core/clip_explicit.F90 | 14 +-- src/CLUBB_core/clip_semi_implicit.F90 | 2 +- src/CLUBB_core/clubb_api_module.F90 | 2 +- src/CLUBB_core/corr_varnce_module.F90 | 2 +- src/CLUBB_core/diffusion.F90 | 6 +- src/CLUBB_core/fill_holes.F90 | 18 ++- src/CLUBB_core/grid_class.F90 | 18 +-- src/CLUBB_core/mean_adv.F90 | 4 +- src/CLUBB_core/mono_flux_limiter.F90 | 2 +- src/CLUBB_core/numerical_check.F90 | 2 +- src/CLUBB_core/penta_lu_solver.F90 | 102 ++++++++++------- src/CLUBB_core/tridiag_lu_solver.F90 | 48 +++++--- src/CLUBB_core/turbulent_adv_pdf.F90 | 8 +- 22 files changed, 245 insertions(+), 186 deletions(-) diff --git a/src/CLUBB_core/adg1_adg2_3d_luhar_pdf.F90 b/src/CLUBB_core/adg1_adg2_3d_luhar_pdf.F90 index 09334a800..acf9fe634 100644 --- a/src/CLUBB_core/adg1_adg2_3d_luhar_pdf.F90 +++ b/src/CLUBB_core/adg1_adg2_3d_luhar_pdf.F90 @@ -1360,7 +1360,7 @@ subroutine backsolve_Luhar_params( Sk_max, Skx, & ! In end subroutine backsolve_Luhar_params !============================================================================= - pure function max_cubic_root( a_coef, b_coef, c_coef, d_coef ) & + function max_cubic_root( a_coef, b_coef, c_coef, d_coef ) & result( max_root ) ! 
Description: diff --git a/src/CLUBB_core/advance_clubb_core_module.F90 b/src/CLUBB_core/advance_clubb_core_module.F90 index 925448535..4ac548303 100644 --- a/src/CLUBB_core/advance_clubb_core_module.F90 +++ b/src/CLUBB_core/advance_clubb_core_module.F90 @@ -5157,7 +5157,7 @@ subroutine calc_trapezoid_zm( nz, ngrdcol, gr, variable_zm, variable_zt, & * ( gr%zm(i,k) - gr%zt(i,k) ) * gr%invrs_dzm(i,k) end do end do - !$acc end parallel + !$acc end parallel loop return end subroutine calc_trapezoid_zm @@ -5514,7 +5514,7 @@ subroutine set_Lscale_max( ngrdcol, l_implemented, host_dx, host_dy, & end subroutine set_Lscale_max !=============================================================================== - pure subroutine calculate_thlp2_rad & + subroutine calculate_thlp2_rad & ( nz, rcm_zm, thlprcp, radht_zm, & ! Intent(in) clubb_params, & ! Intent(in) thlp2_forcing ) ! Intent(inout) diff --git a/src/CLUBB_core/advance_helper_module.F90 b/src/CLUBB_core/advance_helper_module.F90 index 19602e375..5fb4c88b2 100644 --- a/src/CLUBB_core/advance_helper_module.F90 +++ b/src/CLUBB_core/advance_helper_module.F90 @@ -2023,7 +2023,7 @@ function vertical_avg( total_idx, rho_ds, field, dz ) end function vertical_avg !============================================================================= - pure function vertical_integral( total_idx, rho_ds, & + function vertical_integral( total_idx, rho_ds, & field, dz ) ! Description: diff --git a/src/CLUBB_core/advance_windm_edsclrm_module.F90 b/src/CLUBB_core/advance_windm_edsclrm_module.F90 index 37c639c24..3e502d531 100644 --- a/src/CLUBB_core/advance_windm_edsclrm_module.F90 +++ b/src/CLUBB_core/advance_windm_edsclrm_module.F90 @@ -294,7 +294,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & do i = 1, ngrdcol nu_zero(i) = zero end do - !$acc end parallel + !$acc end parallel loop !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -302,7 +302,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & Km_zm_p_nu10(i,k) = Km_zm(i,k) + nu_vert_res_dep%nu10(i) end do end do - !$acc end parallel + !$acc end parallel loop l_perturbed_wind = ( .not. l_predict_upwp_vpwp ) .and. l_linearize_pbl_winds @@ -320,7 +320,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & end do end do end do - !$acc end parallel + !$acc end parallel loop end if if ( .not. l_predict_upwp_vpwp ) then @@ -333,7 +333,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & Km_zt(i,k) = max( Km_zt(i,k), zero ) end do end do - !$acc end parallel + !$acc end parallel loop ! Calculate diffusion terms call diffusion_zt_lhs( nz, ngrdcol, gr, Km_zm, Km_zt, nu_vert_res_dep%nu10, & ! In @@ -357,7 +357,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & ! Thermodynamic subdiagonal: [ x xm(k-1,) ] lhs_diff(km1_tdiag,i,2) = zero end do - !$acc end parallel + !$acc end parallel loop else !$acc parallel loop gang vector default(present) do i = 1, ngrdcol @@ -374,7 +374,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & ! Thermodynamic subdiagonal: [ x xm(k-1,) ] lhs_diff(km1_tdiag,i,2) = zero end do - !$acc end parallel + !$acc end parallel loop end if if ( l_lmm_stepping ) then @@ -385,7 +385,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & vm_old(i,k) = vm(i,k) end do end do - !$acc end parallel + !$acc end parallel loop end if ! 
l_lmm_stepping !---------------------------------------------------------------- @@ -419,14 +419,14 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & wind_speed(i,k) = max( sqrt( um(i,k)**2 + vm(i,k)**2 ), eps ) end do end do - !$acc end parallel + !$acc end parallel loop ! Compute u_star_sqd according to the definition of u_star. !$acc parallel loop gang vector default(present) do i = 1, ngrdcol u_star_sqd(i) = sqrt( upwp(i,1)**2 + vpwp(i,1)**2 ) end do - !$acc end parallel + !$acc end parallel loop ! Compute the explicit portion of the um equation. ! Build the right-hand side vector. @@ -464,7 +464,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & upwp(i,k) = -one_half * xpwp(i,k) end do end do - !$acc end parallel + !$acc end parallel loop call calc_xpwp( nz, ngrdcol, gr, & Km_zm_p_nu10, vm, & @@ -476,7 +476,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & vpwp(i,k) = -one_half * xpwp(i,k) end do end do - !$acc end parallel + !$acc end parallel loop ! A zero-flux boundary condition at the top of the model, d(xm)/dz = 0, ! means that x'w' at the top model level is 0, @@ -486,7 +486,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & upwp(i,nz) = zero vpwp(i,nz) = zero end do - !$acc end parallel + !$acc end parallel loop ! Compute the implicit portion of the um and vm equations. ! Build the left-hand side matrix. @@ -522,7 +522,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & um(i,k) = solution(i,k,windm_edsclrm_um) end do end do - !$acc end parallel + !$acc end parallel loop !---------------------------------------------------------------- ! Update meridional (south-to-north) component of mean wind, vm @@ -533,7 +533,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & vm(i,k) = solution(i,k,windm_edsclrm_vm) end do end do - !$acc end parallel + !$acc end parallel loop if ( l_stats_samp ) then @@ -571,7 +571,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & um(i,1) = um(i,2) vm(i,1) = vm(i,2) end do - !$acc end parallel + !$acc end parallel loop if ( l_lmm_stepping ) then !$acc parallel loop gang vector collapse(2) default(present) @@ -581,7 +581,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & vm(i,k) = one_half * ( vm_old(i,k) + vm(i,k) ) end do end do - !$acc end parallel + !$acc end parallel loop endif ! l_lmm_stepping ) then if ( uv_sponge_damp_settings%l_sponge_damping ) then @@ -645,7 +645,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & upwp(i,k) = upwp(i,k) - one_half * xpwp(i,k) end do end do - !$acc end parallel + !$acc end parallel loop call calc_xpwp( nz, ngrdcol, gr, & Km_zm_p_nu10, vm, & @@ -657,7 +657,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & vpwp(i,k) = vpwp(i,k) - one_half * xpwp(i,k) end do end do - !$acc end parallel + !$acc end parallel loop ! Adjust um and vm if nudging is turned on. if ( l_uv_nudge ) then @@ -680,7 +680,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & vm(i,k) = vm(i,k) - ( ( vm(i,k) - vm_ref(i,k) ) * (dt/ts_nudge) ) end do end do - !$acc end parallel + !$acc end parallel loop if ( l_stats_samp ) then !$acc update host( um, vm ) @@ -800,14 +800,14 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & wind_speed_pert(i,k) = max( sqrt( (um_pert(i,k))**2 + (vm_pert(i,k))**2 ), eps ) end do end do - !$acc end parallel + !$acc end parallel loop ! Compute u_star_sqd according to the definition of u_star. 
!$acc parallel loop gang vector default(present) do i = 1, ngrdcol u_star_sqd_pert(i) = sqrt( upwp_pert(i,1)**2 + vpwp_pert(i,1)**2 ) end do - !$acc end parallel + !$acc end parallel loop ! Compute the explicit portion of the um equation. ! Build the right-hand side vector. @@ -845,7 +845,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & upwp_pert(i,k) = -one_half * xpwp(i,k) end do end do - !$acc end parallel + !$acc end parallel loop call calc_xpwp( nz, ngrdcol, gr, & Km_zm_p_nu10, vm_pert, & @@ -857,7 +857,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & vpwp_pert(i,k) = -one_half * xpwp(i,k) end do end do - !$acc end parallel + !$acc end parallel loop ! A zero-flux boundary condition at the top of the model, d(xm)/dz = 0, ! means that x'w' at the top model level is 0, @@ -867,7 +867,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & upwp_pert(i,nz) = zero vpwp_pert(i,nz) = zero end do - !$acc end parallel + !$acc end parallel loop ! Compute the implicit portion of the um and vm equations. ! Build the left-hand side matrix. @@ -903,7 +903,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & um_pert(i,k) = solution(i,k,windm_edsclrm_um) end do end do - !$acc end parallel + !$acc end parallel loop !---------------------------------------------------------------- ! Update meridional (south-to-north) component of mean wind, vm @@ -914,7 +914,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & vm_pert(i,k) = solution(i,k,windm_edsclrm_vm) end do end do - !$acc end parallel + !$acc end parallel loop ! The values of um(1) and vm(1) are located below the model surface and ! do not affect the rest of the model. The values of um(1) or vm(1) are @@ -927,7 +927,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & um_pert(i,1) = um_pert(i,2) vm_pert(i,1) = vm_pert(i,2) end do - !$acc end parallel + !$acc end parallel loop ! Second part of momentum (implicit component) @@ -943,7 +943,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & upwp_pert(i,k) = upwp_pert(i,k) - one_half * xpwp(i,k) end do end do - !$acc end parallel + !$acc end parallel loop call calc_xpwp( nz, ngrdcol, gr, & Km_zm_p_nu10, vm_pert, & @@ -955,7 +955,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & vpwp_pert(i,k) = vpwp_pert(i,k) - one_half * xpwp(i,k) end do end do - !$acc end parallel + !$acc end parallel loop if ( l_tke_aniso ) then @@ -1034,7 +1034,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & Kmh_zt(i,k) = max( Kmh_zt(i,k), zero ) end do end do - !$acc end parallel + !$acc end parallel loop ! Calculate diffusion terms call diffusion_zt_lhs( nz, ngrdcol, gr, Kmh_zm, Kmh_zt, nu_zero, & ! intent(in) @@ -1059,7 +1059,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & ! Thermodynamic subdiagonal: [ x xm(k-1,) ] lhs_diff(km1_tdiag,i,2) = zero end do - !$acc end parallel + !$acc end parallel loop else @@ -1076,7 +1076,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & ! Thermodynamic subdiagonal: [ x xm(k-1,) ] lhs_diff(km1_tdiag,i,2) = zero end do - !$acc end parallel + !$acc end parallel loop end if @@ -1089,7 +1089,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & end do end do end do - !$acc end parallel + !$acc end parallel loop endif ! l_lmm_stepping ! 
Eddy-scalar surface fluxes, x'w'|_sfc, are applied through an explicit @@ -1130,7 +1130,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & wpedsclrp(i,k,edsclr) = -one_half * xpwp(i,k) end do end do - !$acc end parallel + !$acc end parallel loop end do ! A zero-flux boundary condition at the top of the model, d(xm)/dz = 0, @@ -1142,7 +1142,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & wpedsclrp(i,nz,j) = zero end do end do - !$acc end parallel + !$acc end parallel loop ! Compute the implicit portion of the xm (eddy-scalar) equations. ! Build the left-hand side matrix. @@ -1177,7 +1177,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & end do end do end do - !$acc end parallel + !$acc end parallel loop ! The value of edsclrm(1) is located below the model surface and does not ! effect the rest of the model. The value of edsclrm(1) is simply set to @@ -1188,7 +1188,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & edsclrm(i,1,edsclr) = edsclrm(i,2,edsclr) end do end do - !$acc end parallel + !$acc end parallel loop if ( l_lmm_stepping ) then !$acc parallel loop gang vector collapse(3) default(present) @@ -1199,7 +1199,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & end do end do end do - !$acc end parallel + !$acc end parallel loop endif ! l_lmm_stepping ! Second part of momentum (implicit component) @@ -1218,7 +1218,7 @@ subroutine advance_windm_edsclrm( nz, ngrdcol, gr, dt, & wpedsclrp(i,k,edsclr) = -one_half * xpwp(i,k) end do end do - !$acc end parallel + !$acc end parallel loop end do ! Note that the w'edsclr' terms are not clipped, since we don't compute @@ -2153,7 +2153,7 @@ subroutine compute_uv_tndcy( nz, ngrdcol, solve_type, & xm_gf(i,k) = - fcor(i) * perp_wind_g(i,k) end do end do - !$acc end parallel + !$acc end parallel loop !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -2161,7 +2161,7 @@ subroutine compute_uv_tndcy( nz, ngrdcol, solve_type, & xm_cf(i,k) = fcor(i) * perp_wind_m(i,k) end do end do - !$acc end parallel + !$acc end parallel loop case ( windm_edsclrm_vm ) @@ -2175,7 +2175,7 @@ subroutine compute_uv_tndcy( nz, ngrdcol, solve_type, & xm_gf(i,k) = fcor(i) * perp_wind_g(i,k) end do end do - !$acc end parallel + !$acc end parallel loop !$acc parallel loop gang vector collapse(2) default(present) do k = 1, nz @@ -2183,7 +2183,7 @@ subroutine compute_uv_tndcy( nz, ngrdcol, solve_type, & xm_cf(i,k) = -fcor(i) * perp_wind_m(i,k) end do end do - !$acc end parallel + !$acc end parallel loop case default @@ -2198,7 +2198,7 @@ subroutine compute_uv_tndcy( nz, ngrdcol, solve_type, & xm_cf(i,k) = 0._core_rknd end do end do - !$acc end parallel + !$acc end parallel loop end select @@ -2208,7 +2208,7 @@ subroutine compute_uv_tndcy( nz, ngrdcol, solve_type, & xm_tndcy(i,k) = xm_gf(i,k) + xm_cf(i,k) + xm_forcing(i,k) end do end do - !$acc end parallel + !$acc end parallel loop if ( l_stats_samp ) then @@ -2237,7 +2237,7 @@ subroutine compute_uv_tndcy( nz, ngrdcol, solve_type, & xm_tndcy(i,k) = 0.0_core_rknd end do end do - !$acc end parallel + !$acc end parallel loop endif @@ -2329,7 +2329,7 @@ subroutine windm_edsclrm_lhs( nz, ngrdcol, gr, dt, & lhs(2,i,1) = 1.0_core_rknd lhs(3,i,1) = 0.0_core_rknd end do - !$acc end parallel + !$acc end parallel loop ! 
Add terms to lhs !$acc parallel loop gang vector collapse(2) default(present) @@ -2345,7 +2345,7 @@ subroutine windm_edsclrm_lhs( nz, ngrdcol, gr, dt, & lhs(3,i,k) = 0.5_core_rknd * lhs_diff(3,i,k) end do end do - !$acc end parallel + !$acc end parallel loop ! LHS mean advection term. if ( .not. l_implemented ) then @@ -2355,7 +2355,7 @@ subroutine windm_edsclrm_lhs( nz, ngrdcol, gr, dt, & lhs(1:3,i,k) = lhs(1:3,i,k) + lhs_ma_zt(:,i,k) end do end do - !$acc end parallel + !$acc end parallel loop endif @@ -2367,7 +2367,7 @@ subroutine windm_edsclrm_lhs( nz, ngrdcol, gr, dt, & lhs(2,i,2) = lhs(2,i,2) + invrs_rho_ds_zt(i,2) * gr%invrs_dzt(i,2) & * rho_ds_zm(i,1) * ( u_star_sqd(i) / wind_speed(i,2) ) end do - !$acc end parallel + !$acc end parallel loop end if ! l_imp_sfc_momentum_flux return @@ -2488,7 +2488,7 @@ subroutine windm_edsclrm_rhs( nz, ngrdcol, gr, solve_type, dt, & do i = 1, ngrdcol rhs(i,1) = 0.0_core_rknd end do - !$acc end parallel + !$acc end parallel loop ! Non-boundary rhs calculation, this is a highly vectorized loop !$acc parallel loop gang vector collapse(2) default(present) @@ -2502,7 +2502,7 @@ subroutine windm_edsclrm_rhs( nz, ngrdcol, gr, solve_type, dt, & + invrs_dt * xm(i,k) ! RHS time tendency end do end do - !$acc end parallel + !$acc end parallel loop ! Upper boundary calculation !$acc parallel loop gang vector default(present) @@ -2513,7 +2513,7 @@ subroutine windm_edsclrm_rhs( nz, ngrdcol, gr, solve_type, dt, & + xm_tndcy(i,nz) & ! RHS forcings + invrs_dt * xm(i,nz) ! RHS time tendency end do - !$acc end parallel + !$acc end parallel loop if ( l_stats_samp .and. ixm_ta > 0 ) then @@ -2555,7 +2555,7 @@ subroutine windm_edsclrm_rhs( nz, ngrdcol, gr, solve_type, dt, & * gr%invrs_dzt(i,2) & * rho_ds_zm(i,1) * xpwp_sfc(i) end do - !$acc end parallel + !$acc end parallel loop if ( l_stats_samp .and. ixm_ta > 0 ) then diff --git a/src/CLUBB_core/advance_wp2_wp3_module.F90 b/src/CLUBB_core/advance_wp2_wp3_module.F90 index a79562f8f..921e4309e 100644 --- a/src/CLUBB_core/advance_wp2_wp3_module.F90 +++ b/src/CLUBB_core/advance_wp2_wp3_module.F90 @@ -1985,7 +1985,6 @@ subroutine wp23_lhs( nz, ngrdcol, gr, dt, & do i = 1, ngrdcol k_wp3 = 2*k - 1 - k_wp2 = 2*k ! ------ w'3 ------ @@ -2018,8 +2017,16 @@ subroutine wp23_lhs( nz, ngrdcol, gr, dt, & ! LHS mean advection (ma) and diffusion (diff) terms lhs(5,i,k_wp3) = lhs(5,i,k_wp3) + lhs_ma_zt(3,i,k) + lhs_diff_zt(3,i,k) + end do + end do + !$acc end parallel loop + !$acc parallel loop gang vector collapse(2) default(present) + do k = 2, nz-1, 1 + do i = 1, ngrdcol + k_wp2 = 2*k + ! ------ w'2 ------ ! LHS mean advection (ma) and diffusion (diff) terms @@ -2473,7 +2480,6 @@ subroutine wp23_rhs( nz, ngrdcol, gr, dt, & do i = 1, ngrdcol k_wp3 = 2*k - 1 - k_wp2 = 2*k ! ------ Combine terms for 3rd moment of vertical velocity, ------ ! @@ -2494,8 +2500,17 @@ subroutine wp23_rhs( nz, ngrdcol, gr, dt, & ! RHS "over implicit" pressure term 1 (pr1). rhs(i,k_wp3) = rhs(i,k_wp3) + ( one - gamma_over_implicit_ts ) & * ( - lhs_pr1_wp3(i,k) * wp3(i,k) ) + end do + end do + !$acc end parallel loop + !$acc parallel loop gang vector collapse(2) default(present) + do k = 2, nz-1 + do i = 1, ngrdcol + + k_wp2 = 2*k + ! ------ Combine terms for 2nd moment of vertical velocity, ------ ! ! RHS time tendency. 
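      ! Illustrative note (a hedged reading of the indexing above, not part of this
      ! patch): wp3 and wp2 share a single interleaved band system, with
      !     k_wp3 = 2*k - 1   ! odd rows hold the w'^3 equation at level k
      !     k_wp2 = 2*k       ! even rows hold the w'^2 equation at level k
      ! Splitting the original combined loop into one collapse(2) parallel loop per
      ! variable lets each OpenACC kernel touch only one of the two row families,
      ! e.g. (sketch only; "rhs(i,2*k) = ..." stands in for the actual w'^2 terms):
      !     !$acc parallel loop gang vector collapse(2) default(present)
      !     do k = 2, nz-1
      !       do i = 1, ngrdcol
      !         rhs(i,2*k) = ...   ! w'^2 rows only
      !       end do
      !     end do
      !     !$acc end parallel loop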
@@ -2906,7 +2921,7 @@ subroutine wp23_rhs( nz, ngrdcol, gr, dt, & end subroutine wp23_rhs !============================================================================= - pure subroutine wp2_term_ta_lhs( nz, ngrdcol, gr, & + subroutine wp2_term_ta_lhs( nz, ngrdcol, gr, & rho_ds_zt, invrs_rho_ds_zm, & lhs_ta_wp2 ) @@ -3027,7 +3042,7 @@ pure subroutine wp2_term_ta_lhs( nz, ngrdcol, gr, & end subroutine wp2_term_ta_lhs !============================================================================= - pure subroutine wp2_terms_ac_pr2_lhs( nz, ngrdcol, gr, C_uu_shr, wm_zt, & + subroutine wp2_terms_ac_pr2_lhs( nz, ngrdcol, gr, C_uu_shr, wm_zt, & lhs_ac_pr2_wp2 ) ! Description: @@ -3147,7 +3162,7 @@ pure subroutine wp2_terms_ac_pr2_lhs( nz, ngrdcol, gr, C_uu_shr, wm_zt, & end subroutine wp2_terms_ac_pr2_lhs !============================================================================= - pure subroutine wp2_term_dp1_lhs( nz, ngrdcol, & + subroutine wp2_term_dp1_lhs( nz, ngrdcol, & C1_Skw_fnc, invrs_tau1m, & lhs_dp1_wp2 ) @@ -3241,7 +3256,7 @@ pure subroutine wp2_term_dp1_lhs( nz, ngrdcol, & end subroutine wp2_term_dp1_lhs !============================================================================= - pure subroutine wp2_term_pr1_lhs( nz, ngrdcol, C4, invrs_tau_C4_zm, & + subroutine wp2_term_pr1_lhs( nz, ngrdcol, C4, invrs_tau_C4_zm, & lhs_pr1_wp2 ) ! Description @@ -3342,7 +3357,7 @@ pure subroutine wp2_term_pr1_lhs( nz, ngrdcol, C4, invrs_tau_C4_zm, & end subroutine wp2_term_pr1_lhs !============================================================================= - pure subroutine wp2_terms_bp_pr2_rhs( nz, ngrdcol, C_uu_buoy, & + subroutine wp2_terms_bp_pr2_rhs( nz, ngrdcol, C_uu_buoy, & thv_ds_zm, wpthvp, & rhs_bp_pr2_wp2 ) @@ -3435,7 +3450,7 @@ pure subroutine wp2_terms_bp_pr2_rhs( nz, ngrdcol, C_uu_buoy, & end subroutine wp2_terms_bp_pr2_rhs !============================================================================= - pure subroutine wp2_term_dp1_rhs( nz, ngrdcol, C1_Skw_fnc, & + subroutine wp2_term_dp1_rhs( nz, ngrdcol, C1_Skw_fnc, & invrs_tau1m, threshold, up2, vp2, & l_damp_wp2_using_em, & rhs_dp1_wp2 ) @@ -3547,7 +3562,7 @@ pure subroutine wp2_term_dp1_rhs( nz, ngrdcol, C1_Skw_fnc, & end subroutine wp2_term_dp1_rhs !============================================================================= - pure subroutine wp2_term_pr3_rhs( nz, ngrdcol, gr, C_uu_shr, C_uu_buoy, & + subroutine wp2_term_pr3_rhs( nz, ngrdcol, gr, C_uu_shr, C_uu_buoy, & thv_ds_zm, wpthvp, upwp, & um, vpwp, vm, & rhs_pr3_wp2 ) @@ -3685,7 +3700,7 @@ pure subroutine wp2_term_pr3_rhs( nz, ngrdcol, gr, C_uu_shr, C_uu_buoy, & end subroutine wp2_term_pr3_rhs !============================================================================= - pure subroutine wp2_term_pr1_rhs( nz, ngrdcol, C4, & + subroutine wp2_term_pr1_rhs( nz, ngrdcol, C4, & up2, vp2, invrs_tau_C4_zm, & rhs_pr1_wp2 ) @@ -3778,7 +3793,7 @@ pure subroutine wp2_term_pr1_rhs( nz, ngrdcol, C4, & end subroutine wp2_term_pr1_rhs !============================================================================= - pure subroutine wp2_term_pr_dfsn_rhs( nz, ngrdcol, gr, C_wp2_pr_dfsn, & + subroutine wp2_term_pr_dfsn_rhs( nz, ngrdcol, gr, C_wp2_pr_dfsn, & rho_ds_zt, invrs_rho_ds_zm, & wpup2, wpvp2, wp3, & rhs_pr_dfsn_wp2 ) @@ -3887,7 +3902,7 @@ pure subroutine wp2_term_pr_dfsn_rhs( nz, ngrdcol, gr, C_wp2_pr_dfsn, & end subroutine wp2_term_pr_dfsn_rhs !============================================================================= - pure subroutine wp3_term_ta_new_pdf_lhs( nz, ngrdcol, 
gr, coef_wp4_implicit, & + subroutine wp3_term_ta_new_pdf_lhs( nz, ngrdcol, gr, coef_wp4_implicit, & wp2, rho_ds_zm, invrs_rho_ds_zt, & lhs_ta_wp3 ) @@ -4047,7 +4062,7 @@ pure subroutine wp3_term_ta_new_pdf_lhs( nz, ngrdcol, gr, coef_wp4_implicit, & end subroutine wp3_term_ta_new_pdf_lhs !============================================================================= - pure subroutine wp3_term_ta_ADG1_lhs( nz, ngrdcol, gr, & + subroutine wp3_term_ta_ADG1_lhs( nz, ngrdcol, gr, & wp2, a1, a1_zt, a3, a3_zt, & wp3_on_wp2, rho_ds_zm, & rho_ds_zt, invrs_rho_ds_zt, & @@ -4384,7 +4399,7 @@ pure subroutine wp3_term_ta_ADG1_lhs( nz, ngrdcol, gr, & end subroutine wp3_term_ta_ADG1_lhs !============================================================================= - pure subroutine wp3_term_tp_lhs( nz, ngrdcol, gr, coef_wp3_tp, & + subroutine wp3_term_tp_lhs( nz, ngrdcol, gr, coef_wp3_tp, & wp2, rho_ds_zm, invrs_rho_ds_zt, & lhs_tp_wp3 ) @@ -4536,7 +4551,7 @@ pure subroutine wp3_term_tp_lhs( nz, ngrdcol, gr, coef_wp3_tp, & end subroutine wp3_term_tp_lhs !============================================================================= - pure subroutine wp3_terms_ac_pr2_lhs( nz, ngrdcol, gr, C11_Skw_fnc, wm_zm, & + subroutine wp3_terms_ac_pr2_lhs( nz, ngrdcol, gr, C11_Skw_fnc, wm_zm, & lhs_ac_pr2_wp3 ) ! Description: @@ -4655,7 +4670,7 @@ pure subroutine wp3_terms_ac_pr2_lhs( nz, ngrdcol, gr, C11_Skw_fnc, wm_zm, & end subroutine wp3_terms_ac_pr2_lhs !============================================================================= - pure subroutine wp3_term_pr1_lhs( nz, ngrdcol, C8, C8b, & + subroutine wp3_term_pr1_lhs( nz, ngrdcol, C8, C8b, & invrs_tau_wp3_zt, Skw_zt, & l_damp_wp3_Skw_squared, & lhs_pr1_wp3 ) @@ -4787,7 +4802,7 @@ pure subroutine wp3_term_pr1_lhs( nz, ngrdcol, C8, C8b, & end subroutine wp3_term_pr1_lhs !============================================================================= - pure subroutine wp3_term_ta_explicit_rhs( nz, ngrdcol, gr, & + subroutine wp3_term_ta_explicit_rhs( nz, ngrdcol, gr, & wp4, rho_ds_zm, invrs_rho_ds_zt, & rhs_ta_wp3 ) @@ -4898,7 +4913,7 @@ pure subroutine wp3_term_ta_explicit_rhs( nz, ngrdcol, gr, & end subroutine wp3_term_ta_explicit_rhs !============================================================================= - pure subroutine wp3_terms_bp1_pr2_rhs( nz, ngrdcol, C11_Skw_fnc, & + subroutine wp3_terms_bp1_pr2_rhs( nz, ngrdcol, C11_Skw_fnc, & thv_ds_zt, wp2thvp, & rhs_bp1_pr2_wp3 ) @@ -4986,7 +5001,7 @@ pure subroutine wp3_terms_bp1_pr2_rhs( nz, ngrdcol, C11_Skw_fnc, & end subroutine wp3_terms_bp1_pr2_rhs !============================================================================= - pure subroutine wp3_term_pr_turb_rhs( nz, ngrdcol, gr, C_wp3_pr_turb, Kh_zt, wpthvp, & + subroutine wp3_term_pr_turb_rhs( nz, ngrdcol, gr, C_wp3_pr_turb, Kh_zt, wpthvp, & dum_dz, dvm_dz, & upwp, vpwp, & thv_ds_zt, & @@ -5106,7 +5121,7 @@ pure subroutine wp3_term_pr_turb_rhs( nz, ngrdcol, gr, C_wp3_pr_turb, Kh_zt, wpt end subroutine wp3_term_pr_turb_rhs !============================================================================= - pure subroutine wp3_term_pr_dfsn_rhs( nz, ngrdcol, gr, C_wp3_pr_dfsn, & + subroutine wp3_term_pr_dfsn_rhs( nz, ngrdcol, gr, C_wp3_pr_dfsn, & rho_ds_zm, invrs_rho_ds_zt, & wp2up2, wp2vp2, wp4, & up2, vp2, wp2, & @@ -5220,7 +5235,7 @@ pure subroutine wp3_term_pr_dfsn_rhs( nz, ngrdcol, gr, C_wp3_pr_dfsn, & end subroutine wp3_term_pr_dfsn_rhs !============================================================================= - pure subroutine wp3_term_pr1_rhs( nz, 
ngrdcol, gr, C8, C8b, & + subroutine wp3_term_pr1_rhs( nz, ngrdcol, gr, C8, C8b, & invrs_tau_wp3_zt, Skw_zt, wp3, & l_damp_wp3_Skw_squared, & rhs_pr1_wp3 ) diff --git a/src/CLUBB_core/advance_xm_wpxp_module.F90 b/src/CLUBB_core/advance_xm_wpxp_module.F90 index 78648bbad..ca7d671c7 100644 --- a/src/CLUBB_core/advance_xm_wpxp_module.F90 +++ b/src/CLUBB_core/advance_xm_wpxp_module.F90 @@ -5052,7 +5052,7 @@ subroutine xm_wpxp_clipping_and_stats( & end subroutine xm_wpxp_clipping_and_stats !============================================================================= - pure subroutine xm_term_ta_lhs( nz, ngrdcol, gr, & + subroutine xm_term_ta_lhs( nz, ngrdcol, gr, & rho_ds_zm, invrs_rho_ds_zt, & lhs_ta_xm ) @@ -5164,7 +5164,7 @@ pure subroutine xm_term_ta_lhs( nz, ngrdcol, gr, & end subroutine xm_term_ta_lhs !============================================================================= - pure subroutine wpxp_term_tp_lhs( nz, ngrdcol, gr, wp2, & + subroutine wpxp_term_tp_lhs( nz, ngrdcol, gr, wp2, & lhs_tp ) ! Description: @@ -5277,7 +5277,7 @@ pure subroutine wpxp_term_tp_lhs( nz, ngrdcol, gr, wp2, & end subroutine wpxp_term_tp_lhs !============================================================================= - pure subroutine wpxp_terms_ac_pr2_lhs( nz, ngrdcol, C7_Skw_fnc, & + subroutine wpxp_terms_ac_pr2_lhs( nz, ngrdcol, C7_Skw_fnc, & wm_zt, invrs_dzm, & lhs_ac_pr2 ) @@ -5392,7 +5392,7 @@ pure subroutine wpxp_terms_ac_pr2_lhs( nz, ngrdcol, C7_Skw_fnc, & end subroutine wpxp_terms_ac_pr2_lhs !============================================================================= - pure subroutine wpxp_term_pr1_lhs( nz, ngrdcol, C6rt_Skw_fnc, C6thl_Skw_fnc, C7_Skw_fnc, & + subroutine wpxp_term_pr1_lhs( nz, ngrdcol, C6rt_Skw_fnc, C6thl_Skw_fnc, C7_Skw_fnc, & invrs_tau_C6_zm, l_scalar_calc, & lhs_pr1_wprtp, lhs_pr1_wpthlp, & lhs_pr1_wpsclrp ) @@ -5522,7 +5522,7 @@ pure subroutine wpxp_term_pr1_lhs( nz, ngrdcol, C6rt_Skw_fnc, C6thl_Skw_fnc, C7_ end subroutine wpxp_term_pr1_lhs !============================================================================= - pure subroutine wpxp_terms_bp_pr3_rhs( nz, ngrdcol, C7_Skw_fnc, thv_ds_zm, xpthvp, & + subroutine wpxp_terms_bp_pr3_rhs( nz, ngrdcol, C7_Skw_fnc, thv_ds_zm, xpthvp, & rhs_bp_pr3 ) ! Description: @@ -5902,7 +5902,7 @@ end subroutine damp_coefficient !----------------------------------------------------------------------- !===================================================================================== - pure subroutine diagnose_upxp( nz, ngrdcol, gr, ypwp, xm, wpxp, ym, & + subroutine diagnose_upxp( nz, ngrdcol, gr, ypwp, xm, wpxp, ym, & C6x_Skw_fnc, tau_C6_zm, C7_Skw_fnc, & ypxp ) ! 
Description: diff --git a/src/CLUBB_core/advance_xp2_xpyp_module.F90 b/src/CLUBB_core/advance_xp2_xpyp_module.F90 index 3c91399a2..b6a09be50 100644 --- a/src/CLUBB_core/advance_xp2_xpyp_module.F90 +++ b/src/CLUBB_core/advance_xp2_xpyp_module.F90 @@ -5193,7 +5193,7 @@ subroutine calc_xp2_xpyp_ta_terms( nz, ngrdcol, gr, wprtp, wprtp2, wpthlp, wpthl end subroutine calc_xp2_xpyp_ta_terms !============================================================================= - pure function term_tp( xamp1, xam, xbmp1, xbm, & + function term_tp( xamp1, xam, xbmp1, xbm, & wpxbp, wpxap, invrs_dzm ) & result( rhs ) !$acc routine seq @@ -5255,7 +5255,7 @@ pure function term_tp( xamp1, xam, xbmp1, xbm, & end function term_tp !============================================================================= - pure function term_dp1_lhs( Cn, invrs_tau_zm ) & + function term_dp1_lhs( Cn, invrs_tau_zm ) & result( lhs ) !$acc routine seq @@ -5327,7 +5327,7 @@ pure function term_dp1_lhs( Cn, invrs_tau_zm ) & end function term_dp1_lhs !============================================================================= - pure function term_dp1_rhs( Cn, invrs_tau_zm, threshold ) & + function term_dp1_rhs( Cn, invrs_tau_zm, threshold ) & result( rhs ) !$acc routine seq @@ -5390,7 +5390,7 @@ pure function term_dp1_rhs( Cn, invrs_tau_zm, threshold ) & end function term_dp1_rhs !============================================================================= - pure function term_pr1( C4, C14, xbp2, wp2, invrs_tau_C4_zm, invrs_tau_C14_zm ) & + function term_pr1( C4, C14, xbp2, wp2, invrs_tau_C4_zm, invrs_tau_C14_zm ) & result( rhs ) !$acc routine seq diff --git a/src/CLUBB_core/advance_xp3_module.F90 b/src/CLUBB_core/advance_xp3_module.F90 index 1ab734e79..a239c3391 100644 --- a/src/CLUBB_core/advance_xp3_module.F90 +++ b/src/CLUBB_core/advance_xp3_module.F90 @@ -464,7 +464,7 @@ subroutine advance_xp3_simplified( nz, ngrdcol, gr, solve_type, dt, & ! Intent(i end subroutine advance_xp3_simplified !============================================================================= - pure function term_tp_rhs( xp2_zt, wpxp, wpxpm1, & + function term_tp_rhs( xp2_zt, wpxp, wpxpm1, & rho_ds_zm, rho_ds_zmm1, & invrs_rho_ds_zt, & invrs_dzt ) & @@ -541,7 +541,7 @@ pure function term_tp_rhs( xp2_zt, wpxp, wpxpm1, & end function term_tp_rhs !============================================================================= - pure function term_ac_rhs( xm_zm, xm_zmm1, wpxp2, & + function term_ac_rhs( xm_zm, xm_zmm1, wpxp2, & invrs_dzt ) & result( term_ac ) diff --git a/src/CLUBB_core/calc_roots.F90 b/src/CLUBB_core/calc_roots.F90 index 9c2f663b4..15eff0c5f 100644 --- a/src/CLUBB_core/calc_roots.F90 +++ b/src/CLUBB_core/calc_roots.F90 @@ -14,7 +14,7 @@ module calc_roots contains !============================================================================= - pure function cubic_solve( nz, a_coef, b_coef, c_coef, d_coef ) & + function cubic_solve( nz, a_coef, b_coef, c_coef, d_coef ) & result( roots ) ! Description: @@ -172,7 +172,7 @@ pure function cubic_solve( nz, a_coef, b_coef, c_coef, d_coef ) & end function cubic_solve !============================================================================= - pure function quadratic_solve( nz, a_coef, b_coef, c_coef ) & + function quadratic_solve( nz, a_coef, b_coef, c_coef ) & result( roots ) ! 
Description: @@ -261,7 +261,7 @@ pure function quadratic_solve( nz, a_coef, b_coef, c_coef ) & end function quadratic_solve !============================================================================= - pure function cube_root( x ) + function cube_root( x ) ! Description: ! Calculates the cube root of x. diff --git a/src/CLUBB_core/clip_explicit.F90 b/src/CLUBB_core/clip_explicit.F90 index 40825cad1..22e3842a1 100644 --- a/src/CLUBB_core/clip_explicit.F90 +++ b/src/CLUBB_core/clip_explicit.F90 @@ -970,7 +970,7 @@ subroutine clip_variance( nz, ngrdcol, gr, solve_type, dt, threshold, & end if end do end do - !$acc end parallel + !$acc end parallel loop if ( l_stats_samp ) then !$acc update host( xp2 ) @@ -1205,7 +1205,7 @@ subroutine clip_skewness_core( nz, ngrdcol, gr, sfc_elevation, & wp2_zt_cubed(i,k) = wp2_zt(i,k)**3 end do end do - !$acc end parallel + !$acc end parallel loop if ( l_use_wp3_lim_with_smth_Heaviside ) then @@ -1217,7 +1217,7 @@ subroutine clip_skewness_core( nz, ngrdcol, gr, sfc_elevation, & zagl_thresh(i,k) = zagl_thresh(i,k) - 1.0_core_rknd end do end do - !$acc end parallel + !$acc end parallel loop H_zagl(:,:) = smooth_heaviside_peskin(nz, ngrdcol, zagl_thresh(:,:), 0.6_core_rknd) @@ -1230,7 +1230,7 @@ subroutine clip_skewness_core( nz, ngrdcol, gr, sfc_elevation, & * 0.0021_core_rknd *Skw_max_mag**2 ) end do end do - !$acc end parallel + !$acc end parallel loop else ! default method @@ -1248,7 +1248,7 @@ subroutine clip_skewness_core( nz, ngrdcol, gr, sfc_elevation, & endif end do end do - !$acc end parallel + !$acc end parallel loop end if @@ -1263,7 +1263,7 @@ subroutine clip_skewness_core( nz, ngrdcol, gr, sfc_elevation, & end if end do end do - !$acc end parallel + !$acc end parallel loop ! Clipping abs(wp3) to 100. This keeps wp3 from growing too large in some ! deep convective cases, which helps prevent these cases from blowing up. @@ -1275,7 +1275,7 @@ subroutine clip_skewness_core( nz, ngrdcol, gr, sfc_elevation, & end if end do end do - !$acc end parallel + !$acc end parallel loop !$acc exit data delete( wp2_zt_cubed, wp3_lim_sqd, zagl_thresh, H_zagl ) diff --git a/src/CLUBB_core/clip_semi_implicit.F90 b/src/CLUBB_core/clip_semi_implicit.F90 index ae1f53cad..65d7947df 100644 --- a/src/CLUBB_core/clip_semi_implicit.F90 +++ b/src/CLUBB_core/clip_semi_implicit.F90 @@ -366,7 +366,7 @@ function clip_semi_imp_lhs( dt, f_unclipped, & end function clip_semi_imp_lhs !============================================================================= - pure function compute_clip_lhs( dt_clip, B_fnc ) & + function compute_clip_lhs( dt_clip, B_fnc ) & result( lhs_contribution ) ! Description: diff --git a/src/CLUBB_core/clubb_api_module.F90 b/src/CLUBB_core/clubb_api_module.F90 index 4a54552dc..afd3038a2 100644 --- a/src/CLUBB_core/clubb_api_module.F90 +++ b/src/CLUBB_core/clubb_api_module.F90 @@ -3877,7 +3877,7 @@ end function calculate_spurious_source_api !================================================================================================ ! calculate_thlp2_rad - Computes the contribution of radiative cooling to thlp2 !================================================================================================ - pure subroutine calculate_thlp2_rad_api & + subroutine calculate_thlp2_rad_api & ( nz, rcm_zm, thlprcp, radht_zm, & ! Intent(in) clubb_params, & ! Intent(in) thlp2_forcing ) ! 
Intent(inout) diff --git a/src/CLUBB_core/corr_varnce_module.F90 b/src/CLUBB_core/corr_varnce_module.F90 index cb1269c96..ea254b639 100644 --- a/src/CLUBB_core/corr_varnce_module.F90 +++ b/src/CLUBB_core/corr_varnce_module.F90 @@ -224,7 +224,7 @@ subroutine init_default_corr_arrays( ) end subroutine init_default_corr_arrays !----------------------------------------------------------------------------- - pure function def_corr_idx( iiPDF_x ) result(ii_def_corr) + function def_corr_idx( iiPDF_x ) result(ii_def_corr) ! Description: ! Map from a iiPDF index to the corresponding index in the default diff --git a/src/CLUBB_core/diffusion.F90 b/src/CLUBB_core/diffusion.F90 index f0ce296e6..3ab8793f1 100644 --- a/src/CLUBB_core/diffusion.F90 +++ b/src/CLUBB_core/diffusion.F90 @@ -31,7 +31,7 @@ module diffusion contains !============================================================================= - pure subroutine diffusion_zt_lhs( nz, ngrdcol, gr, K_zm, K_zt, nu, & ! In + subroutine diffusion_zt_lhs( nz, ngrdcol, gr, K_zm, K_zt, nu, & ! In invrs_rho_ds_zt, rho_ds_zm, & ! In lhs ) ! Out @@ -522,7 +522,7 @@ pure subroutine diffusion_zt_lhs( nz, ngrdcol, gr, K_zm, K_zt, nu, & ! In end subroutine diffusion_zt_lhs !============================================================================= - pure function diffusion_cloud_frac_zt_lhs & + function diffusion_cloud_frac_zt_lhs & ( gr, K_zm, K_zmm1, cloud_frac_zt, cloud_frac_ztm1, & cloud_frac_ztp1, cloud_frac_zm, & cloud_frac_zmm1, & @@ -693,7 +693,7 @@ pure function diffusion_cloud_frac_zt_lhs & end function diffusion_cloud_frac_zt_lhs !============================================================================= - pure subroutine diffusion_zm_lhs( nz, ngrdcol, gr, K_zt, K_zm, nu, & ! In + subroutine diffusion_zm_lhs( nz, ngrdcol, gr, K_zt, K_zm, nu, & ! In invrs_rho_ds_zm, rho_ds_zt, & ! In lhs ) ! Out diff --git a/src/CLUBB_core/fill_holes.F90 b/src/CLUBB_core/fill_holes.F90 index 81b49d48a..96f6496ae 100644 --- a/src/CLUBB_core/fill_holes.F90 +++ b/src/CLUBB_core/fill_holes.F90 @@ -120,10 +120,8 @@ subroutine fill_holes_vertical( nz, ngrdcol, num_draw_pts, threshold, upper_hf_l ! --------------------- Begin Code --------------------- - !$acc declare copyin( dz, rho_ds ) & - !$acc copy( field ) & - !$acc create( invrs_denom_integral, field_clipped, denom_integral_global, rho_ds_dz, & - !$acc numer_integral_global, field_avg_global, mass_fraction_global ) + !$acc enter data create( invrs_denom_integral, field_clipped, denom_integral_global, rho_ds_dz, & + !$acc numer_integral_global, field_avg_global, mass_fraction_global ) l_field_below_threshold = .false. @@ -140,6 +138,8 @@ subroutine fill_holes_vertical( nz, ngrdcol, num_draw_pts, threshold, upper_hf_l ! If all field values are above the specified threshold, no hole filling is required if ( .not. l_field_below_threshold ) then + !$acc exit data delete( invrs_denom_integral, field_clipped, denom_integral_global, rho_ds_dz, & + !$acc numer_integral_global, field_avg_global, mass_fraction_global ) return end if @@ -236,6 +236,8 @@ subroutine fill_holes_vertical( nz, ngrdcol, num_draw_pts, threshold, upper_hf_l ! If all field values are above the threshold, no further hole filling is required if ( .not. 
l_field_below_threshold ) then + !$acc exit data delete( invrs_denom_integral, field_clipped, denom_integral_global, rho_ds_dz, & + !$acc numer_integral_global, field_avg_global, mass_fraction_global ) return end if @@ -322,6 +324,9 @@ subroutine fill_holes_vertical( nz, ngrdcol, num_draw_pts, threshold, upper_hf_l end do !$acc end parallel loop + + !$acc exit data delete( invrs_denom_integral, field_clipped, denom_integral_global, rho_ds_dz, & + !$acc numer_integral_global, field_avg_global, mass_fraction_global ) return @@ -822,12 +827,17 @@ subroutine fill_holes_driver( gr, nz, dt, hydromet_dim, & ! Intent(in) if ( hydromet_name(1:1) == "r" .and. l_hole_fill ) then + !$acc data copyin( gr, gr%dzt, rho_ds_zt ) & + !$acc copy( hydromet(:,i) ) + ! Apply the hole filling algorithm ! upper_hf_level = nz since we are filling the zt levels call fill_holes_vertical( gr%nz, 1, num_hf_draw_points, zero_threshold, gr%nz, & ! In gr%dzt, rho_ds_zt, & ! In hydromet(:,i) ) ! InOut + !$acc end data + endif ! Variable is a mixing ratio and l_hole_fill is true endif ! hydromet(:,i) < 0 diff --git a/src/CLUBB_core/grid_class.F90 b/src/CLUBB_core/grid_class.F90 index a94de605c..7bdb7f905 100644 --- a/src/CLUBB_core/grid_class.F90 +++ b/src/CLUBB_core/grid_class.F90 @@ -1447,7 +1447,7 @@ function redirect_interpolated_azt_2D( nz, ngrdcol, gr, azm ) end function redirect_interpolated_azt_2D !============================================================================= - pure subroutine linear_interpolated_azm_2D( nz, ngrdcol, gr, azt, & + subroutine linear_interpolated_azm_2D( nz, ngrdcol, gr, azt, & linear_interpolated_azm ) ! Description: @@ -1708,7 +1708,7 @@ function cubic_interpolated_azm_2D( nz, ngrdcol, gr, azt ) end function cubic_interpolated_azm_2D !============================================================================= - pure subroutine calc_zt2zm_weights( nz, ngrdcol, & + subroutine calc_zt2zm_weights( nz, ngrdcol, & gr ) ! Description: @@ -1918,7 +1918,7 @@ pure subroutine calc_zt2zm_weights( nz, ngrdcol, & end subroutine calc_zt2zm_weights !============================================================================= - pure subroutine linear_interpolated_azt_2D( nz, ngrdcol, gr, azm, & + subroutine linear_interpolated_azt_2D( nz, ngrdcol, gr, azm, & linear_interpolated_azt ) ! Description: @@ -2073,7 +2073,7 @@ function cubic_interpolated_azt_2D( nz, ngrdcol, gr, azm ) end function cubic_interpolated_azt_2D !============================================================================= - pure subroutine calc_zm2zt_weights( nz, ngrdcol, & + subroutine calc_zm2zt_weights( nz, ngrdcol, & gr ) ! Description: @@ -2286,7 +2286,7 @@ end subroutine calc_zm2zt_weights !============================================================================= ! Wrapped in interface ddzm - pure function gradzm_2D( nz, ngrdcol, gr, azm ) + function gradzm_2D( nz, ngrdcol, gr, azm ) ! Description: ! 2D version of gradzm @@ -2339,7 +2339,7 @@ end function gradzm_2D !============================================================================= ! Wrapped in interface ddzm - pure function gradzm_1D( gr, azm ) + function gradzm_1D( gr, azm ) ! Description: ! 2D version of gradzm @@ -2380,7 +2380,7 @@ end function gradzm_1D !============================================================================= ! Wrapped in interface ddzt - pure function gradzt_2D( nz, ngrdcol, gr, azt ) + function gradzt_2D( nz, ngrdcol, gr, azt ) ! Description: ! 
2D version of gradzt @@ -2433,7 +2433,7 @@ end function gradzt_2D !============================================================================= ! Wrapped in interface ddzt - pure function gradzt_1D( gr, azt ) + function gradzt_1D( gr, azt ) ! Description: ! 2D version of gradzt @@ -2474,7 +2474,7 @@ pure function gradzt_1D( gr, azt ) end function gradzt_1D !============================================================================= - pure function flip( x, xdim ) + function flip( x, xdim ) ! Description: ! Flips a single dimension array (i.e. a vector), so the first element diff --git a/src/CLUBB_core/mean_adv.F90 b/src/CLUBB_core/mean_adv.F90 index 69a5e2f20..8535cab6a 100644 --- a/src/CLUBB_core/mean_adv.F90 +++ b/src/CLUBB_core/mean_adv.F90 @@ -30,7 +30,7 @@ module mean_adv contains !============================================================================= - pure subroutine term_ma_zt_lhs( nz, ngrdcol, wm_zt, weights_zt2zm, & ! Intent(in) + subroutine term_ma_zt_lhs( nz, ngrdcol, wm_zt, weights_zt2zm, & ! Intent(in) invrs_dzt, invrs_dzm, & ! Intent(in) l_upwind_xm_ma, & ! Intent(in) lhs_ma ) ! Intent(out) @@ -354,7 +354,7 @@ pure subroutine term_ma_zt_lhs( nz, ngrdcol, wm_zt, weights_zt2zm, & ! Intent(in end subroutine term_ma_zt_lhs !============================================================================= - pure subroutine term_ma_zm_lhs( nz, ngrdcol, wm_zm, & + subroutine term_ma_zm_lhs( nz, ngrdcol, wm_zm, & invrs_dzm, weights_zm2zt, & lhs_ma ) diff --git a/src/CLUBB_core/mono_flux_limiter.F90 b/src/CLUBB_core/mono_flux_limiter.F90 index 4b95fce9a..dcf64118e 100644 --- a/src/CLUBB_core/mono_flux_limiter.F90 +++ b/src/CLUBB_core/mono_flux_limiter.F90 @@ -1342,7 +1342,7 @@ subroutine mfl_xm_solve( nz, ngrdcol, solve_type, tridiag_solve_method, & do i = 1, ngrdcol xm(i,1) = xm(i,2) end do - !$acc end parallel + !$acc end parallel loop return end subroutine mfl_xm_solve diff --git a/src/CLUBB_core/numerical_check.F90 b/src/CLUBB_core/numerical_check.F90 index d749a4ca1..a1569eb95 100644 --- a/src/CLUBB_core/numerical_check.F90 +++ b/src/CLUBB_core/numerical_check.F90 @@ -1035,7 +1035,7 @@ end subroutine check_nan_sclr !------------------------------------------------------------------------- !----------------------------------------------------------------------- - pure function calculate_spurious_source( integral_after, integral_before, & + function calculate_spurious_source( integral_after, integral_before, & flux_top, flux_sfc, & integral_forcing, dt ) & result( spurious_source ) diff --git a/src/CLUBB_core/penta_lu_solver.F90 b/src/CLUBB_core/penta_lu_solver.F90 index f7f503d8d..2bb2aad8b 100644 --- a/src/CLUBB_core/penta_lu_solver.F90 +++ b/src/CLUBB_core/penta_lu_solver.F90 @@ -125,18 +125,18 @@ subroutine penta_lu_solve_single_rhs_multiple_lhs( ndim, ngrdcol, lhs, rhs, & upper_1, & ! First U band upper_2, & ! Second U band lower_diag_invrs, & ! Inverse of the diagonal of L - lower_1 ! First L band + lower_1, & ! First L band + lower_2 ! Second L band integer :: i, k, j ! Loop variables ! 
----------------------- Begin Code ----------------------- - !$acc data create( upper_1, upper_2, lower_1, lower_diag_invrs ) & + !$acc data create( upper_1, upper_2, lower_1, lower_2, lower_diag_invrs ) & !$acc copyin( rhs, lhs ) & !$acc copyout( soln ) - - !$acc kernels + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lower_diag_invrs(i,1) = 1.0_core_rknd / lhs(0,i,1) upper_1(i,1) = lower_diag_invrs(i,1) * lhs(-1,i,1) @@ -147,38 +147,45 @@ subroutine penta_lu_solve_single_rhs_multiple_lhs( ndim, ngrdcol, lhs, rhs, & upper_1(i,2) = lower_diag_invrs(i,2) * ( lhs(-1,i,2) - lower_1(i,2) * upper_2(i,1) ) upper_2(i,2) = lower_diag_invrs(i,2) * lhs(-2,i,2) end do + !$acc end parallel loop + !$acc parallel loop gang vector default(present) + do i = 1, ngrdcol + do k = 3, ndim-2 + lower_2(i,k) = lhs(2,i,k) + lower_1(i,k) = lhs(1,i,k) - lower_2(i,k) * upper_1(i,k-2) - do k = 3, ndim-2 - do i = 1, ngrdcol - lower_1(i,k) = lhs(1,i,k) - lhs(2,i,k) * upper_1(i,k-2) - - lower_diag_invrs(i,k) = 1.0_core_rknd / ( lhs(0,i,k) - lhs(2,i,k) * upper_2(i,k-2) & + lower_diag_invrs(i,k) = 1.0_core_rknd / ( lhs(0,i,k) - lower_2(i,k) * upper_2(i,k-2) & - lower_1(i,k) * upper_1(i,k-1) ) - upper_1(i,k) = lower_diag_invrs(i,k) * ( lhs(-1,i,k) - lower_1(i,k) * upper_2(i,k-1) ) - upper_2(i,k) = lower_diag_invrs(i,k) * lhs(-2,i,k) + upper_1(i,k) = lower_diag_invrs(i,k) * ( lhs(-1,i,k) - lower_1(i,k) * upper_2(i,k-1) ) + upper_2(i,k) = lower_diag_invrs(i,k) * lhs(-2,i,k) end do end do + !$acc end parallel loop + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol - lower_1(i,ndim-1) = lhs(1,i,ndim-1) - lhs(2,i,ndim-1) * upper_1(i,ndim-3) + lower_2(i,ndim-1) = lhs(2,i,ndim-1) + lower_1(i,ndim-1) = lhs(1,i,ndim-1) - lower_2(i,ndim-1) * upper_1(i,ndim-3) lower_diag_invrs(i,ndim-1) = 1.0_core_rknd & - / ( lhs(0,i,ndim-1) - lhs(2,i,ndim-1) * upper_2(i,ndim-3) & + / ( lhs(0,i,ndim-1) - lower_2(i,ndim-1) * upper_2(i,ndim-3) & - lower_1(i,ndim-1) * upper_1(i,ndim-2) ) upper_1(i,ndim-1) = lower_diag_invrs(i,ndim-1) * ( lhs(-1,i,ndim-1) - lower_1(i,ndim-1) & - * upper_2(i,ndim-2) ) + * upper_2(i,ndim-2) ) - lower_1(i,ndim) = lhs(1,i,ndim) - lhs(2,i,ndim) * upper_1(i,ndim-2) + lower_2(i,ndim) = lhs(2,i,ndim) + lower_1(i,ndim) = lhs(1,i,ndim) - lower_2(i,ndim) * upper_1(i,ndim-2) lower_diag_invrs(i,ndim) = 1.0_core_rknd & - / ( lhs(0,i,ndim-1) - lhs(2,i,ndim) * upper_2(i,ndim-2) & + / ( lhs(0,i,ndim-1) - lower_2(i,ndim) * upper_2(i,ndim-2) & - lower_1(i,ndim) * upper_1(i,ndim-1) ) end do - + !$acc end parallel loop + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol soln(i,1) = lower_diag_invrs(i,1) * rhs(i,1) @@ -186,10 +193,14 @@ subroutine penta_lu_solve_single_rhs_multiple_lhs( ndim, ngrdcol, lhs, rhs, & soln(i,2) = lower_diag_invrs(i,2) * ( rhs(i,2) - lower_1(i,2) * soln(i,1) ) do k = 3, ndim - soln(i,k) = lower_diag_invrs(i,k) * ( rhs(i,k) - lhs(2,i,k) * soln(i,k-2) & + soln(i,k) = lower_diag_invrs(i,k) * ( rhs(i,k) - lower_2(i,k) * soln(i,k-2) & - lower_1(i,k) * soln(i,k-1) ) end do + end do + !$acc end parallel loop + !$acc parallel loop gang vector default(present) + do i = 1, ngrdcol soln(i,ndim-1) = soln(i,ndim-1) - upper_1(i,ndim-1) * soln(i,ndim) do k = ndim-2, 1, -1 @@ -197,8 +208,7 @@ subroutine penta_lu_solve_single_rhs_multiple_lhs( ndim, ngrdcol, lhs, rhs, & end do end do - - !$acc end kernels + !$acc end parallel loop !$acc end data @@ -237,18 +247,18 @@ subroutine penta_lu_solve_multiple_rhs_lhs( ndim, nrhs, ngrdcol, lhs, rhs, & upper_1, & ! 
First U band upper_2, & ! Second U band lower_diag_invrs, & ! Inverse of the diagonal of L - lower_1 ! First L band + lower_1, & ! First L band + lower_2 ! Second L band integer :: i, k, j ! Loop variables ! ----------------------- Begin Code ----------------------- - !$acc data create( upper_1, upper_2, lower_1, lower_diag_invrs ) & + !$acc data create( upper_1, upper_2, lower_1, lower_2, lower_diag_invrs ) & !$acc copyin( rhs, lhs ) & !$acc copyout( soln ) - !$acc kernels - + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lower_diag_invrs(i,1) = 1.0_core_rknd / lhs(0,i,1) upper_1(i,1) = lower_diag_invrs(i,1) * lhs(-1,i,1) @@ -259,38 +269,45 @@ subroutine penta_lu_solve_multiple_rhs_lhs( ndim, nrhs, ngrdcol, lhs, rhs, & upper_1(i,2) = lower_diag_invrs(i,2) * ( lhs(-1,i,2) - lower_1(i,2) * upper_2(i,1) ) upper_2(i,2) = lower_diag_invrs(i,2) * lhs(-2,i,2) end do + !$acc end parallel loop + !$acc parallel loop gang vector default(present) + do i = 1, ngrdcol + do k = 3, ndim-2 + lower_2(i,k) = lhs(2,i,k) + lower_1(i,k) = lhs(1,i,k) - lower_2(i,k) * upper_1(i,k-2) - do k = 3, ndim-2 - do i = 1, ngrdcol - lower_1(i,k) = lhs(1,i,k) - lhs(2,i,k) * upper_1(i,k-2) - - lower_diag_invrs(i,k) = 1.0_core_rknd / ( lhs(0,i,k) - lhs(2,i,k) * upper_2(i,k-2) & + lower_diag_invrs(i,k) = 1.0_core_rknd / ( lhs(0,i,k) - lower_2(i,k) * upper_2(i,k-2) & - lower_1(i,k) * upper_1(i,k-1) ) - upper_1(i,k) = lower_diag_invrs(i,k) * ( lhs(-1,i,k) - lower_1(i,k) * upper_2(i,k-1) ) - upper_2(i,k) = lower_diag_invrs(i,k) * lhs(-2,i,k) + upper_1(i,k) = lower_diag_invrs(i,k) * ( lhs(-1,i,k) - lower_1(i,k) * upper_2(i,k-1) ) + upper_2(i,k) = lower_diag_invrs(i,k) * lhs(-2,i,k) end do end do + !$acc end parallel loop + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol - lower_1(i,ndim-1) = lhs(1,i,ndim-1) - lhs(2,i,ndim-1) * upper_1(i,ndim-3) + lower_2(i,ndim-1) = lhs(2,i,ndim-1) + lower_1(i,ndim-1) = lhs(1,i,ndim-1) - lower_2(i,ndim-1) * upper_1(i,ndim-3) lower_diag_invrs(i,ndim-1) = 1.0_core_rknd & - / ( lhs(0,i,ndim-1) - lhs(2,i,ndim-1) * upper_2(i,ndim-3) & + / ( lhs(0,i,ndim-1) - lower_2(i,ndim-1) * upper_2(i,ndim-3) & - lower_1(i,ndim-1) * upper_1(i,ndim-2) ) upper_1(i,ndim-1) = lower_diag_invrs(i,ndim-1) * ( lhs(-1,i,ndim-1) - lower_1(i,ndim-1) & - * upper_2(i,ndim-2) ) + * upper_2(i,ndim-2) ) - lower_1(i,ndim) = lhs(1,i,ndim) - lhs(2,i,ndim) * upper_1(i,ndim-2) + lower_2(i,ndim) = lhs(2,i,ndim) + lower_1(i,ndim) = lhs(1,i,ndim) - lower_2(i,ndim) * upper_1(i,ndim-2) lower_diag_invrs(i,ndim) = 1.0_core_rknd & - / ( lhs(0,i,ndim-1) - lhs(2,i,ndim) * upper_2(i,ndim-2) & + / ( lhs(0,i,ndim-1) - lower_2(i,ndim) * upper_2(i,ndim-2) & - lower_1( i,ndim) * upper_1(i,ndim-1) ) end do + !$acc end parallel loop - + !$acc parallel loop gang vector collapse(2) default(present) do j = 1, nrhs do i = 1, ngrdcol @@ -299,10 +316,16 @@ subroutine penta_lu_solve_multiple_rhs_lhs( ndim, nrhs, ngrdcol, lhs, rhs, & soln(i,2,j) = lower_diag_invrs(i,2) * ( rhs(i,2,j) - lower_1(i,2) * soln(i,1,j) ) do k = 3, ndim - soln(i,k,j) = lower_diag_invrs(i,k) * ( rhs(i,k,j) - lhs(2,i,k) * soln(i,k-2,j) & + soln(i,k,j) = lower_diag_invrs(i,k) * ( rhs(i,k,j) - lower_2(i,k) * soln(i,k-2,j) & - lower_1(i,k) * soln(i,k-1,j) ) end do + end do + end do + !$acc end parallel loop + !$acc parallel loop gang vector collapse(2) default(present) + do j = 1, nrhs + do i = 1, ngrdcol soln(i,ndim-1,j) = soln(i,ndim-1,j) - upper_1(i,ndim-1) * soln(i,ndim,j) do k = ndim-2, 1, -1 @@ -311,8 +334,7 @@ subroutine 
penta_lu_solve_multiple_rhs_lhs( ndim, nrhs, ngrdcol, lhs, rhs, & end do end do - - !$acc end kernels + !$acc end parallel loop !$acc end data diff --git a/src/CLUBB_core/tridiag_lu_solver.F90 b/src/CLUBB_core/tridiag_lu_solver.F90 index 3aefdfaf3..6bcd094b2 100644 --- a/src/CLUBB_core/tridiag_lu_solver.F90 +++ b/src/CLUBB_core/tridiag_lu_solver.F90 @@ -123,8 +123,8 @@ subroutine tridiag_lu_solve_single_rhs_lhs( ndim, lhs, rhs, & upper(1) = lower_diag_invrs(1) * lhs(-1,1) do k = 2, ndim-1 - lower_diag_invrs(k) = 1.0_core_rknd / ( lhs(0,k) - lhs(1,k) * upper(k-1) ) - upper(k) = lower_diag_invrs(k) * lhs(-1,k) + lower_diag_invrs(k) = 1.0_core_rknd / ( lhs(0,k) - lhs(1,k) * upper(k-1) ) + upper(k) = lower_diag_invrs(k) * lhs(-1,k) end do lower_diag_invrs(ndim) = 1.0_core_rknd / ( lhs(0,ndim) - lhs(1,ndim) * upper(ndim-1) ) @@ -139,7 +139,6 @@ subroutine tridiag_lu_solve_single_rhs_lhs( ndim, lhs, rhs, & soln(k) = soln(k) - upper(k) * soln(k+1) end do - !$acc end kernels !$acc end data @@ -185,26 +184,29 @@ subroutine tridiag_lu_solve_single_rhs_multiple_lhs( ndim, ngrdcol, lhs, rhs, & !$acc copyin( rhs, lhs ) & !$acc copyout( soln ) - !$acc kernels - + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lower_diag_invrs(i,1) = 1.0_core_rknd / lhs(0,i,1) upper(i,1) = lower_diag_invrs(i,1) * lhs(-1,i,1) end do + !$acc end parallel loop - + !$acc parallel loop gang vector default(present) do k = 2, ndim-1 do i = 1, ngrdcol lower_diag_invrs(i,k) = 1.0_core_rknd / ( lhs(0,i,k) - lhs(1,i,k) * upper(i,k-1) ) upper(i,k) = lower_diag_invrs(i,k) * lhs(-1,i,k) end do end do + !$acc end parallel loop + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lower_diag_invrs(i,ndim) = 1.0_core_rknd / ( lhs(0,i,ndim) - lhs(1,i,ndim) * upper(i,ndim-1) ) end do + !$acc end parallel loop - + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol soln(i,1) = lower_diag_invrs(i,1) * rhs(i,1) @@ -212,14 +214,17 @@ subroutine tridiag_lu_solve_single_rhs_multiple_lhs( ndim, ngrdcol, lhs, rhs, & do k = 2, ndim soln(i,k) = lower_diag_invrs(i,k) * ( rhs(i,k) - lhs(1,i,k) * soln(i,k-1) ) end do + end do + !$acc end parallel loop + !$acc parallel loop gang vector default(present) + do i = 1, ngrdcol do k = ndim-1, 1, -1 soln(i,k) = soln(i,k) - upper(i,k) * soln(i,k+1) end do end do - - !$acc end kernels + !$acc end parallel loop !$acc end data @@ -266,26 +271,29 @@ subroutine tridiag_lu_solve_multiple_rhs_lhs( ndim, nrhs, ngrdcol, lhs, rhs, & !$acc copyin( rhs, lhs ) & !$acc copyout( soln ) - !$acc kernels - + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lower_diag_invrs(i,1) = 1.0_core_rknd / lhs(0,i,1) upper(i,1) = lower_diag_invrs(i,1) * lhs(-1,i,1) end do + !$acc end parallel loop - - do k = 2, ndim-1 - do i = 1, ngrdcol + !$acc parallel loop gang vector default(present) + do i = 1, ngrdcol + do k = 2, ndim-1 lower_diag_invrs(i,k) = 1.0_core_rknd / ( lhs(0,i,k) - lhs(1,i,k) * upper(i,k-1) ) upper(i,k) = lower_diag_invrs(i,k) * lhs(-1,i,k) end do end do + !$acc end parallel loop + !$acc parallel loop gang vector default(present) do i = 1, ngrdcol lower_diag_invrs(i,ndim) = 1.0_core_rknd / ( lhs(0,i,ndim) - lhs(1,i,ndim) * upper(i,ndim-1) ) end do + !$acc end parallel loop - + !$acc parallel loop gang vector collapse(2) default(present) do j = 1, nrhs do i = 1, ngrdcol @@ -294,15 +302,19 @@ subroutine tridiag_lu_solve_multiple_rhs_lhs( ndim, nrhs, ngrdcol, lhs, rhs, & do k = 2, ndim soln(i,k,j) = lower_diag_invrs(i,k) * ( rhs(i,k,j) - lhs(1,i,k) * 
soln(i,k-1,j) ) end do + end do + end do + !$acc end parallel loop + !$acc parallel loop gang vector collapse(2) default(present) + do j = 1, nrhs + do i = 1, ngrdcol do k = ndim-1, 1, -1 soln(i,k,j) = soln(i,k,j) - upper(i,k) * soln(i,k+1,j) end do - end do end do - - !$acc end kernels + !$acc end parallel loop !$acc end data diff --git a/src/CLUBB_core/turbulent_adv_pdf.F90 b/src/CLUBB_core/turbulent_adv_pdf.F90 index 46eb4241c..2d340b82d 100644 --- a/src/CLUBB_core/turbulent_adv_pdf.F90 +++ b/src/CLUBB_core/turbulent_adv_pdf.F90 @@ -30,7 +30,7 @@ module turbulent_adv_pdf contains !============================================================================= - pure subroutine xpyp_term_ta_pdf_lhs( nz, ngrdcol, gr, coef_wpxpyp_implicit, & ! In + subroutine xpyp_term_ta_pdf_lhs( nz, ngrdcol, gr, coef_wpxpyp_implicit, & ! In rho_ds_zt, rho_ds_zm, & ! In invrs_rho_ds_zm, & ! In l_upwind_xpyp_turbulent_adv, & ! In @@ -478,7 +478,7 @@ pure subroutine xpyp_term_ta_pdf_lhs( nz, ngrdcol, gr, coef_wpxpyp_implicit, & ! end subroutine xpyp_term_ta_pdf_lhs !============================================================================================= - pure subroutine xpyp_term_ta_pdf_lhs_godunov( nz, ngrdcol, gr, & ! Intent(in) + subroutine xpyp_term_ta_pdf_lhs_godunov( nz, ngrdcol, gr, & ! Intent(in) coef_wpxpyp_implicit, & ! Intent(in) invrs_rho_ds_zm, rho_ds_zm, & ! Intent(in) lhs_ta ) @@ -589,7 +589,7 @@ pure subroutine xpyp_term_ta_pdf_lhs_godunov( nz, ngrdcol, gr, & ! Intent(in) end subroutine xpyp_term_ta_pdf_lhs_godunov !============================================================================= - pure subroutine xpyp_term_ta_pdf_rhs( nz, ngrdcol, gr, term_wpxpyp_explicit, & ! In + subroutine xpyp_term_ta_pdf_rhs( nz, ngrdcol, gr, term_wpxpyp_explicit, & ! In rho_ds_zt, rho_ds_zm, & ! In invrs_rho_ds_zm, & ! In l_upwind_xpyp_turbulent_adv, & ! In @@ -960,7 +960,7 @@ pure subroutine xpyp_term_ta_pdf_rhs( nz, ngrdcol, gr, term_wpxpyp_explicit, & end subroutine xpyp_term_ta_pdf_rhs !============================================================================= - pure subroutine xpyp_term_ta_pdf_rhs_godunov( nz, ngrdcol, gr, & ! Intent(in) + subroutine xpyp_term_ta_pdf_rhs_godunov( nz, ngrdcol, gr, & ! Intent(in) term_wpxpyp_explicit_zm, & ! Intent(in) invrs_rho_ds_zm, & ! Intent(in) sgn_turbulent_vel, & ! Intent(in)