From 096e09d83f113767ec3f85bb03a1f7d24c612e64 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sat, 20 Jul 2024 00:33:56 +0530
Subject: [PATCH 01/36] Add luma_meter and tonemapper

---
 .../nbl/builtin/hlsl/luma_meter/luma_meter.hlsl  | 16 ++++++++++++++++
 .../nbl/builtin/hlsl/tonemapper/operators.hlsl   | 16 ++++++++++++++++
 src/nbl/builtin/CMakeLists.txt                   |  4 ++++
 3 files changed, 36 insertions(+)
 create mode 100644 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
 create mode 100644 include/nbl/builtin/hlsl/tonemapper/operators.hlsl

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
new file mode 100644
index 0000000000..4e18655852
--- /dev/null
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -0,0 +1,16 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
+#define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
+
+namespace nbl
+{
+namespace hls
+{
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
new file mode 100644
index 0000000000..5ebb5b2ffa
--- /dev/null
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -0,0 +1,16 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
+
+namespace nbl
+{
+namespace hls
+{
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 8f797b9454..9dd9ddfd42 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -34,6 +34,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/barycentric/utils.glsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl")
+# luma metering
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")
+# tonemapper
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl")
 # bump mapping
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/fragment.glsl") # TODO: rename to `frag.glsl`
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/utils.glsl")

From 4fd700fe69709ec127f7f42ec09b4f7f4ce0260c Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sat, 20 Jul 2024 00:34:17 +0530
Subject: [PATCH 02/36] Update submodule pointer

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index c6d5ee3498..87d4794dcc 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit c6d5ee349859ce0b5229bc62a2372fa1d4b6b17c
+Subproject commit 87d4794dcc5de8264528292c4a30b5284979754a

From 52e7ab24dedb16f6c94855d6f0037e7ea77fba81 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 1 Aug 2024 21:20:52 +0530
Subject: [PATCH 03/36] Convert morton.h to hlsl

---
 include/nbl/asset/utils/IMeshPacker.h     |   2 +-
 include/nbl/asset/utils/IVirtualTexture.h |   3 +-
 include/nbl/builtin/hlsl/math/morton.hlsl | 283 ++++++++++++++++++++++
 src/nbl/builtin/CMakeLists.txt            |   2 +
 4 files changed, 288 insertions(+), 2 deletions(-)
 create mode 100644 include/nbl/builtin/hlsl/math/morton.hlsl

diff --git a/include/nbl/asset/utils/IMeshPacker.h b/include/nbl/asset/utils/IMeshPacker.h
index 3f09062b18..355d792782 100644
--- a/include/nbl/asset/utils/IMeshPacker.h
+++ b/include/nbl/asset/utils/IMeshPacker.h
@@ -6,7 +6,7 @@
 #define __NBL_ASSET_I_MESH_PACKER_H_INCLUDED__
 
 #include "nbl/asset/utils/IMeshManipulator.h"
-#include "nbl/core/math/morton.h"
+#include "nbl/builtin/hlsl/math/morton.hlsl"
 
 namespace nbl
 {
diff --git a/include/nbl/asset/utils/IVirtualTexture.h b/include/nbl/asset/utils/IVirtualTexture.h
index ec26f56103..64ea49cbe7 100644
--- a/include/nbl/asset/utils/IVirtualTexture.h
+++ b/include/nbl/asset/utils/IVirtualTexture.h
@@ -7,7 +7,6 @@
 
 #include <functional>
 
-#include "nbl/core/math/morton.h"
 #include "nbl/core/memory/memory.h"
 #include "nbl/core/alloc/GeneralpurposeAddressAllocator.h"
 #include "nbl/core/alloc/PoolAddressAllocator.h"
@@ -19,6 +18,8 @@
 #include "nbl/asset/filters/CPaddedCopyImageFilter.h"
 #include "nbl/asset/filters/CFillImageFilter.h"
 
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+
 namespace nbl::asset
 {
 
diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
new file mode 100644
index 0000000000..64b0b66cb7
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -0,0 +1,283 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_MORTON_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MORTON_INCLUDED_
+
+#ifdef __HLSL_VERSION
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#else
+#include <cstdint>
+#endif
+
+namespace nbl
+{
+namespace core
+{
+
+namespace impl
+{
+
+#ifdef __HLSL_VERSION
+template <typename T>
+T morton2d_mask(uint16_t _n) const
+{
+    const static uint64_t mask[5] =
+    {
+        0x5555555555555555ull,
+        0x3333333333333333ull,
+        0x0F0F0F0F0F0F0F0Full,
+        0x00FF00FF00FF00FFull,
+        0x0000FFFF0000FFFFull
+    };
+    return static_cast<T>(mask[_n]);
+}
+
+template <typename T>
+T morton3d_mask(uint16_t _n) const
+{
+    const static uint64_t mask[5] =
+    {
+        0x1249249249249249ull, // [0] one bit kept out of every 3
+        0x10C30C30C30C30C3ull, // [1] 2-bit groups on a 6-bit pitch
+        0x100F00F00F00F00Full, // [2] 4-bit groups on a 12-bit pitch, topped by the lone bit 60 (was 0x010F..., which kept bit 56 and lost input bit 20)
+        0x001F0000FF0000FFull,
+        0x001F00000000FFFFull
+    };
+    return static_cast<T>(mask[_n]);
+}
+template <typename T>
+T morton4d_mask(uint16_t _n) const
+{
+    const static uint64_t mask[4] =
+    {
+        0x1111111111111111ull,
+        0x0303030303030303ull,
+        0x000F000F000F000Full,
+        0x000000FF000000FFull
+    };
+    return static_cast<T>(mask[_n]);
+}
+
+template <typename T, uint32_t bitDepth>
+inline T morton2d_decode(T x) // gathers the even-position bits of x into one contiguous low-order value
+{
+    x = x & morton2d_mask<T>(0); // keep only the even bits
+    x = (x | (x >> 1)) & morton2d_mask<T>(1); // -> 2-bit groups
+    x = (x | (x >> 2)) & morton2d_mask<T>(2); // -> 4-bit groups
+    if (bitDepth > 8u) // wider merges are only needed for wider codes
+    {
+        x = (x | (x >> 4)) & morton2d_mask<T>(3); // -> 8-bit groups
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x >> 8)) & morton2d_mask<T>(4); // -> 16-bit groups
+    }
+    if (bitDepth > 32u)
+    {
+        x = (x | (x >> 16)) & (T)0x00000000FFFFFFFFull; // final merge; the mask clears the stale copy left in bits 32..47
+    }
+    return x;
+}
+
+//! Puts bits on even positions filling gaps with 0s
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_2d(T x) // returns x with a zero bit inserted after each of its bits
+{
+    if (bitDepth > 32u) // widest split first; steps a smaller bitDepth does not need are skipped
+    {
+        x = (x | (x << 16)) & morton2d_mask<T>(4); // -> 16-bit groups, 16-bit gaps
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 8)) & morton2d_mask<T>(3); // -> 8-bit groups, 8-bit gaps
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 4)) & morton2d_mask<T>(2); // -> 4-bit groups, 4-bit gaps
+    }
+    x = (x | (x << 2)) & morton2d_mask<T>(1); // -> 2-bit groups, 2-bit gaps
+    x = (x | (x << 1)) & morton2d_mask<T>(0); // -> single bits on even positions
+
+    return x;
+}
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_3d(T x) // spreads the bits of x so that each ends up on every third position
+{
+    if (bitDepth > 32u) // widest split first; steps a smaller bitDepth does not need are skipped
+    {
+        x = (x | (x << 32)) & morton3d_mask<T>(4);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 16)) & morton3d_mask<T>(3);
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 8)) & morton3d_mask<T>(2);
+    }
+    x = (x | (x << 4)) & morton3d_mask<T>(1);
+    x = (x | (x << 2)) & morton3d_mask<T>(0); // final mask keeps one bit out of every 3
+
+    return x;
+}
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_4d(T x) // spreads the bits of x so that each ends up on every fourth position
+{
+    if (bitDepth > 32u) // widest split first; steps a smaller bitDepth does not need are skipped
+    {
+        x = (x | (x << 24)) & morton4d_mask<T>(3); // -> 8-bit groups on a 32-bit pitch
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 12)) & morton4d_mask<T>(2); // -> 4-bit groups on a 16-bit pitch
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 6)) & morton4d_mask<T>(1); // -> 2-bit groups on an 8-bit pitch
+    }
+    x = (x | (x << 3)) & morton4d_mask<T>(0); // -> single bits on a 4-bit pitch
+
+    return x;
+}
+#else
+template <typename T> // T: integer type the 64-bit table entry is narrowed to
+constexpr T morton2d_mask(uint8_t _n) // _n selects a table entry; it is not range-checked here
+{
+    constexpr uint64_t mask[5] =
+    {
+        0x5555555555555555ull, // [0] every other bit
+        0x3333333333333333ull, // [1] 2-bit groups, 2-bit gaps
+        0x0F0F0F0F0F0F0F0Full, // [2] 4-bit groups, 4-bit gaps
+        0x00FF00FF00FF00FFull, // [3] 8-bit groups, 8-bit gaps
+        0x0000FFFF0000FFFFull  // [4] 16-bit groups, 16-bit gaps
+    };
+    return static_cast<T>(mask[_n]); // keeps only the low bits when T is narrower than 64 bits
+}
+template <typename T> // T: integer type the 64-bit table entry is narrowed to
+constexpr T morton3d_mask(uint8_t _n) // _n selects a table entry; it is not range-checked here
+{
+    constexpr uint64_t mask[5] =
+    {
+        0x1249249249249249ull, // [0] one bit kept out of every 3
+        0x10C30C30C30C30C3ull, // [1] 2-bit groups on a 6-bit pitch
+        0x100F00F00F00F00Full, // [2] 4-bit groups on a 12-bit pitch, topped by the lone bit 60 (was 0x010F..., which kept bit 56 and lost input bit 20)
+        0x001F0000FF0000FFull, // [3] 8-bit groups on a 24-bit pitch, topped by bits 48..52
+        0x001F00000000FFFFull  // [4] low 16 bits plus bits 48..52
+    };
+    return static_cast<T>(mask[_n]); // keeps only the low bits when T is narrower than 64 bits
+}
+template <typename T> // T: integer type the 64-bit table entry is narrowed to
+constexpr T morton4d_mask(uint8_t _n) // _n selects a table entry; it is not range-checked here
+{
+    constexpr uint64_t mask[4] =
+    {
+        0x1111111111111111ull, // [0] one bit kept out of every 4
+        0x0303030303030303ull, // [1] 2-bit groups on an 8-bit pitch
+        0x000F000F000F000Full, // [2] 4-bit groups on a 16-bit pitch
+        0x000000FF000000FFull  // [3] 8-bit groups on a 32-bit pitch
+    };
+    return static_cast<T>(mask[_n]); // keeps only the low bits when T is narrower than 64 bits
+}
+
+template <typename T, uint32_t bitDepth> // T: integer type holding the code; bitDepth: how many of its bits are significant
+inline T morton2d_decode(T x) // gathers the even-position bits of x into one contiguous low-order value
+{
+    x = x & morton2d_mask<T>(0); // keep only the even bits
+    x = (x | (x >> 1)) & morton2d_mask<T>(1); // -> 2-bit groups
+    x = (x | (x >> 2)) & morton2d_mask<T>(2); // -> 4-bit groups
+    if constexpr (bitDepth > 8u) // wider merges are only compiled in for wider codes
+    {
+        x = (x | (x >> 4)) & morton2d_mask<T>(3); // -> 8-bit groups
+    }
+    if constexpr (bitDepth > 16u)
+    {
+        x = (x | (x >> 8)) & morton2d_mask<T>(4); // -> 16-bit groups
+    }
+    if constexpr (bitDepth > 32u)
+    {
+        x = (x | (x >> 16)) & static_cast<T>(0x00000000FFFFFFFFull); // final merge; the mask clears the stale copy left in bits 32..47
+    }
+    return x;
+}
+
+//! Puts bits on even positions filling gaps with 0s
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_2d(T x) // returns x with a zero bit inserted after each of its bits
+{
+    if constexpr (bitDepth > 32u) // widest split first; steps a smaller bitDepth does not need are compiled out
+    {
+        x = (x | (x << 16)) & morton2d_mask<T>(4); // -> 16-bit groups, 16-bit gaps
+    }
+    if constexpr (bitDepth > 16u)
+    {
+        x = (x | (x << 8)) & morton2d_mask<T>(3); // -> 8-bit groups, 8-bit gaps
+    }
+    if constexpr (bitDepth > 8u)
+    {
+        x = (x | (x << 4)) & morton2d_mask<T>(2); // -> 4-bit groups, 4-bit gaps
+    }
+    x = (x | (x << 2)) & morton2d_mask<T>(1); // -> 2-bit groups, 2-bit gaps
+    x = (x | (x << 1)) & morton2d_mask<T>(0); // -> single bits on even positions
+
+    return x;
+}
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_3d(T x) // spreads the bits of x so that each ends up on every third position
+{
+    if constexpr (bitDepth > 32u) // widest split first; steps a smaller bitDepth does not need are compiled out
+    {
+        x = (x | (x << 32)) & morton3d_mask<T>(4);
+    }
+    if constexpr (bitDepth > 16u)
+    {
+        x = (x | (x << 16)) & morton3d_mask<T>(3);
+    }
+    if constexpr (bitDepth > 8u)
+    {
+        x = (x | (x << 8)) & morton3d_mask<T>(2);
+    }
+    x = (x | (x << 4)) & morton3d_mask<T>(1);
+    x = (x | (x << 2)) & morton3d_mask<T>(0); // final mask keeps one bit out of every 3
+
+    return x;
+}
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_4d(T x) // spreads the bits of x so that each ends up on every fourth position
+{
+    if constexpr (bitDepth > 32u) // widest split first; steps a smaller bitDepth does not need are compiled out
+    {
+        x = (x | (x << 24)) & morton4d_mask<T>(3); // -> 8-bit groups on a 32-bit pitch
+    }
+    if constexpr (bitDepth > 16u)
+    {
+        x = (x | (x << 12)) & morton4d_mask<T>(2); // -> 4-bit groups on a 16-bit pitch
+    }
+    if constexpr (bitDepth > 8u)
+    {
+        x = (x | (x << 6)) & morton4d_mask<T>(1); // -> 2-bit groups on an 8-bit pitch
+    }
+    x = (x | (x << 3)) & morton4d_mask<T>(0); // -> single bits on a 4-bit pitch
+
+    return x;
+}
+#endif
+}
+
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u> // bitDepth defaults to the full width of T; x is read from the even bits
+T morton2d_decode_x(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton); }
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u> // y is read from the odd bits, shifted onto even positions before decoding
+T morton2d_decode_y(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton >> 1); }
+
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u> // interleaves two coordinates: x on even bits, y on odd bits
+T morton2d_encode(T x, T y) { return impl::separate_bits_2d<T, bitDepth>(x) | (impl::separate_bits_2d<T, bitDepth>(y) << 1); }
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u> // interleaves three coordinates: x, y, z offset by 0, 1, 2 bits
+T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d<T, bitDepth>(x) | (impl::separate_bits_3d<T, bitDepth>(y) << 1) | (impl::separate_bits_3d<T, bitDepth>(z) << 2); }
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u> // interleaves four coordinates: x, y, z, w offset by 0, 1, 2, 3 bits
+T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d<T, bitDepth>(x) | (impl::separate_bits_4d<T, bitDepth>(y) << 1) | (impl::separate_bits_4d<T, bitDepth>(z) << 2) | (impl::separate_bits_4d<T, bitDepth>(w) << 3); }
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 8a7775c7a5..df61293d4a 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -281,6 +281,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quartic.hlsl")
 #extra math
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/impl.hlsl")
+#morton
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/morton.hlsl")
 #acceleration structures
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/acceleration_structures.hlsl")
 #colorspace

From 1cc26bdcd583bbbc354c8c5e951f06e6cb1d3f28 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 2 Aug 2024 19:00:47 +0530
Subject: [PATCH 04/36] Fix HLSL morton code

---
 include/nbl/builtin/hlsl/math/morton.hlsl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
index 64b0b66cb7..4150af637a 100644
--- a/include/nbl/builtin/hlsl/math/morton.hlsl
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -21,7 +21,7 @@ namespace impl
 
 #ifdef __HLSL_VERSION
 template <typename T>
-T morton2d_mask(uint16_t _n) const
+T morton2d_mask(uint16_t _n)
 {
     const static uint64_t mask[5] =
     {
@@ -31,11 +31,11 @@ T morton2d_mask(uint16_t _n) const
         0x00FF00FF00FF00FFull,
         0x0000FFFF0000FFFFull
     };
-    return static_cast<T>(mask[_n]);
+    return mask[_n];
 }
 
 template <typename T>
-T morton3d_mask(uint16_t _n) const
+T morton3d_mask(uint16_t _n)
 {
     const static uint64_t mask[5] =
     {
@@ -45,10 +45,10 @@ T morton3d_mask(uint16_t _n) const
         0x001F0000FF0000FFull,
         0x001F00000000FFFFull
     };
-    return static_cast<T>(mask[_n]);
+    return mask[_n];
 }
 template <typename T>
-T morton4d_mask(uint16_t _n) const
+T morton4d_mask(uint16_t _n)
 {
     const static uint64_t mask[4] =
     {
@@ -57,7 +57,7 @@ T morton4d_mask(uint16_t _n) const
         0x000F000F000F000Full,
         0x000000FF000000FFull
     };
-    return static_cast<T>(mask[_n]);
+    return mask[_n];
 }
 
 template <typename T, uint32_t bitDepth>

From 6922d0c41b509a125be89d86627ba206d565b053 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Mon, 5 Aug 2024 19:02:04 +0530
Subject: [PATCH 05/36] Create geom_luma_meter and computeLuma

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 47 ++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 4e18655852..d2c33602c8 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -5,11 +5,56 @@
 #ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
 #define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
 
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+#include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
+#include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
+
 namespace nbl
 {
-namespace hls
+namespace hlsl
+{
+namespace luma_meter
+{
+
+struct LumaMeteringWindow
 {
+	float32_t2 meteringWindowScale;
+	float32_t2 meteringWindowOffset;
+};
+
+template<uint32_t SubgroupSize, uint32_t SubgroupCount, typename SharedAccessor, typename TexAccessor>
+struct geom_luma_meter {
+    using this_t = geom_luma_meter<SubgroupSize, SubgroupCount, SharedAccessor, TexAccessor>;
+
+    static this_t create(NBL_REF_ARG(LumaMeteringWindow) window)
+    {
+        this_t retval;
+        retval.window = window;
+        return retval;
+    }
 
+    float32_t computeLuma(NBL_REF_ARG(TexAccessor) tex, uint32_t2 sampleCount, uint32_t2 sampleIndex, float32_t2 viewportSize)
+    {
+        float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f));
+        float32_t2 samplePos = stride * sampleIndex;
+        float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize;
+        float32_t3 color = colorspace::eotf::sRGB(tex.get(uvPos));
+        float32_t luma = dot(colorspace::sRGBtoXYZ[1], color);
+
+        const float32_t minLuma = 1.0 / 4096.0;
+        const float32_t maxLuma = 32768.0;
+
+        luma = clamp(luma, minLuma, maxLuma);
+
+        return log2(luma / minLuma) / log2(maxLuma / minLuma);
+    }
+
+    LumaMeteringWindow window;
+};
+}
 }
 }
 

From 603a92f87a5831dc491ff4e4b53e99f5af9a57ce Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 7 Aug 2024 19:22:52 +0530
Subject: [PATCH 06/36] Add gatherLuma method

---
 include/nbl/asset/utils/IVirtualTexture.h     |  4 +-
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 54 +++++++++++++++++--
 include/nbl/builtin/hlsl/math/morton.hlsl     |  2 +-
 3 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/include/nbl/asset/utils/IVirtualTexture.h b/include/nbl/asset/utils/IVirtualTexture.h
index 64ea49cbe7..b715c40cfc 100644
--- a/include/nbl/asset/utils/IVirtualTexture.h
+++ b/include/nbl/asset/utils/IVirtualTexture.h
@@ -922,7 +922,7 @@ class IVirtualTexture : public core::IReferenceCounted, public IVirtualTextureBa
         storage->incrTileCounter(neededPhysPages);
 
         return offsetToTextureData(
-            page_tab_offset_t(core::morton2d_decode_x(addr), core::morton2d_decode_y(addr), pgtLayer),
+            page_tab_offset_t(hlsl::morton2d_decode_x(addr), hlsl::morton2d_decode_y(addr), pgtLayer),
             extent,
             _subres.levelCount,
             _wrapu,
@@ -934,7 +934,7 @@ class IVirtualTexture : public core::IReferenceCounted, public IVirtualTextureBa
     {
         uint32_t sz = computeSquareSz(_addr.origsize_x, _addr.origsize_y);
         sz *= sz;
-        const uint32_t addr = core::morton2d_encode(_addr.pgTab_x, _addr.pgTab_y);
+        const uint32_t addr = hlsl::morton2d_encode(_addr.pgTab_x, _addr.pgTab_y);
 
         core::address_allocator_traits<pg_tab_addr_alctr_t>::multi_free_addr(m_pageTableLayerAllocators[_addr.pgTab_layer], 1u, &addr, &sz);
 
diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index d2c33602c8..7ed9604c4f 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -7,6 +7,9 @@
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
 #include "nbl/builtin/hlsl/math/morton.hlsl"
 #include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
@@ -25,9 +28,9 @@ struct LumaMeteringWindow
 	float32_t2 meteringWindowOffset;
 };
 
-template<uint32_t SubgroupSize, uint32_t SubgroupCount, typename SharedAccessor, typename TexAccessor>
+template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
 struct geom_luma_meter {
-    using this_t = geom_luma_meter<SubgroupSize, SubgroupCount, SharedAccessor, TexAccessor>;
+    using this_t = geom_luma_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
     static this_t create(NBL_REF_ARG(LumaMeteringWindow) window)
     {
@@ -36,7 +39,18 @@ struct geom_luma_meter {
         return retval;
     }
 
-    float32_t computeLuma(NBL_REF_ARG(TexAccessor) tex, uint32_t2 sampleCount, uint32_t2 sampleIndex, float32_t2 viewportSize)
+    float32_t reduction(float32_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    {
+        return workgroup::reduction < plus < float32_t >, GroupSize >::
+            template __call <SharedAccessor>(value, sdata);
+    }
+
+    float32_t computeLuma(
+        NBL_REF_ARG(TexAccessor) tex,
+        uint32_t2 sampleCount,
+        uint32_t2 sampleIndex,
+        float32_t2 viewportSize
+    )
     {
         float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f));
         float32_t2 samplePos = stride * sampleIndex;
@@ -52,6 +66,40 @@ struct geom_luma_meter {
         return log2(luma / minLuma) / log2(maxLuma / minLuma);
     }
 
+    void gatherLuma(
+        NBL_REF_ARG(ValueAccessor) val,
+        NBL_REF_ARG(TexAccessor) tex,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        uint32_t2 sampleCount,
+        float32_t2 viewportSize
+    ) {
+        uint32_t2 coord = {
+            morton2d_decode_x(glsl::gl_LocalInvocationIndex()),
+            morton2d_decode_y(glsl::gl_LocalInvocationIndex())
+        };
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+
+        uint32_t2 sampleIndex = coord * GroupSize + float32_t2(glsl::gl_SubgroupID() + 1, glsl::gl_SubgroupInvocationID() + 1);
+        float32_t luma = 0.0f;
+
+        if (sampleIndex.x <= sampleCount.x && sampleIndex.y <= sampleCount.y) {
+            luma = computeLuma(tex, sampleCount, sampleIndex, viewportSize);
+            float32_t lumaSum = reduction(luma, sdata);
+
+            sdata.workgroupExecutionAndMemoryBarrier();
+
+            if (tid == GroupSize - 1) {
+                uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+                uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+                uint32_t lumaSumBitPattern = uint32_t(clamp(lumaSum, 0.f, float((1 << fixedPointBitsLeft) - 1)));
+                uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
+                uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
+
+                val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+            }
+        }
+    }
+
     LumaMeteringWindow window;
 };
 }
diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
index 4150af637a..1f35016cb6 100644
--- a/include/nbl/builtin/hlsl/math/morton.hlsl
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -13,7 +13,7 @@
 
 namespace nbl
 {
-namespace core
+namespace hlsl
 {
 
 namespace impl

From 810a6ac1cc2ff6662dca36edd0413288b4f1b1ea Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 8 Aug 2024 16:29:20 +0530
Subject: [PATCH 07/36] Add getGatheredLuma()

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 7ed9604c4f..21bd813439 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -32,10 +32,12 @@ template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, ty
 struct geom_luma_meter {
     using this_t = geom_luma_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(NBL_REF_ARG(LumaMeteringWindow) window)
+    static this_t create(NBL_REF_ARG(LumaMeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
     {
         this_t retval;
         retval.window = window;
+        retval.minLuma = lumaMinimum;
+        retval.maxLuma = lumaMaximum;
         return retval;
     }
 
@@ -58,9 +60,6 @@ struct geom_luma_meter {
         float32_t3 color = colorspace::eotf::sRGB(tex.get(uvPos));
         float32_t luma = dot(colorspace::sRGBtoXYZ[1], color);
 
-        const float32_t minLuma = 1.0 / 4096.0;
-        const float32_t maxLuma = 32768.0;
-
         luma = clamp(luma, minLuma, maxLuma);
 
         return log2(luma / minLuma) / log2(maxLuma / minLuma);
@@ -72,7 +71,8 @@ struct geom_luma_meter {
         NBL_REF_ARG(SharedAccessor) sdata,
         uint32_t2 sampleCount,
         float32_t2 viewportSize
-    ) {
+    )
+    {
         uint32_t2 coord = {
             morton2d_decode_x(glsl::gl_LocalInvocationIndex()),
             morton2d_decode_y(glsl::gl_LocalInvocationIndex())
@@ -91,7 +91,9 @@ struct geom_luma_meter {
             if (tid == GroupSize - 1) {
                 uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
                 uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-                uint32_t lumaSumBitPattern = uint32_t(clamp(lumaSum, 0.f, float((1 << fixedPointBitsLeft) - 1)));
+
+                uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(minLuma)) * (log2(maxLuma) - log2(minLuma)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+
                 uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
                 uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
 
@@ -100,7 +102,18 @@ struct geom_luma_meter {
         }
     }
 
+    float32_t getGatheredLuma(
+        NBL_REF_ARG(ValueAccessor) val,
+        uint32_t2 sampleCount
+    )
+    {
+        uint32_t lumaSumBitPattern = val.get(glsl::gl_SubgroupInvocationID());
+        float32_t lumaSumValue = float32_t(lumaSumBitPattern) / (log2(maxLuma) - log2(minLuma)) + log2(minLuma);
+        return glsl::subgroupAdd(lumaSumValue) / (sampleCount.x * sampleCount.y);
+    }
+
     LumaMeteringWindow window;
+    float32_t minLuma, maxLuma;
 };
 }
 }

From 69a73c1d90a0702894ecead0de1455d459d8b2ca Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 8 Aug 2024 16:59:59 +0530
Subject: [PATCH 08/36] Add reinhard and aces hlsl operators

---
 .../builtin/hlsl/tonemapper/operators.hlsl    | 59 ++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index 5ebb5b2ffa..cc5728e9ff 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -5,10 +5,67 @@
 #ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
 #define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
 
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
 namespace nbl
 {
-namespace hls
+namespace hlsl
+{
+
+struct ReinhardParams
+{
+	float32_t keyAndManualLinearExposure;
+	float32_t rcpWhite2;
+};
+
+struct ACESParams
+{
+	float32_t gamma; // 1.0
+	float32_t exposure; // actualExposure+midGrayLog2
+};
+
+
+float32_t3 reinhard(ReinhardParams params, float32_t3 rawCIEXYZcolor) // scales the whole colour by a curve evaluated on its .y component
+{
+	float32_t exposureFactors = params.keyAndManualLinearExposure; // linear gain applied before the curve
+	float32_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; // .y after the gain
+	float32_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * params.rcpWhite2) / (1.0 + exposedLuma)); // gain * (1 + L * rcpWhite2) / (1 + L)
+	return rawCIEXYZcolor * colorMultiplier; // the same scalar multiplies all three components
+}
+
+float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor)
 {
+	float32_t3 tonemapped = rawCIEXYZcolor;
+	if (tonemapped.y > 1.175494351e-38)
+		tonemapped *= exp2(log2(tonemapped.y) * (params.gamma - 1.0) + (params.exposure) * params.gamma);
+
+	// XYZ => RRT_SAT
+	// this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t)
+	const float32_t3x3 XYZ_RRT_Input = float32_t3x3(
+		float32_t3(1.594168310, -0.262608051, -0.231993079),
+		float32_t3(-0.6332771780, 1.5840380200, 0.0164147373),
+		float32_t3(0.00892840419, 0.03648501260, 0.87711471300)
+	);
+
+	// this is obviously fitted to some particular simulated sensor/film and display
+	float32_t3 v = mul(XYZ_RRT_Input, tonemapped);
+	float32_t3 a = v * (v + float32_t3(0.0245786)) - float32_t3(0.000090537);
+	float32_t3 b = v * (v * float32_t(0.983729) + float32_t3(0.4329510)) + float32_t3(0.238081);
+	v = a / b;
+
+	// ODT_SAT => XYZ
+	// this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB,BT2020 and ACEScc(t)
+	const float32_t3x3 ODT_XYZ_Output = float32_t3x3(
+		float32_t3(0.624798000, 0.164064825, 0.161605373),
+		float32_t3(0.268048108, 0.674283803, 0.057667464),
+		float32_t3(0.0157514643, 0.0526682511, 1.0204007600)
+	);
+	return mul(ODT_XYZ_Output, v);
+}
+
+// ideas for more operators https://web.archive.org/web/20191226154550/http://cs.columbia.edu/CAVE/software/softlib/dorf.php
+// or get proper ACES RRT and ODTs
+// https://partnerhelp.netflixstudios.com/hc/en-us/articles/360000622487-I-m-using-ACES-Which-Output-Transform-should-I-use-
 
 }
 }

From 4c70cf5bb919abab9c82e36320de45be88fe02ee Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 13 Aug 2024 21:47:49 +0530
Subject: [PATCH 09/36] cast mask values to correct type

---
 include/nbl/builtin/hlsl/math/morton.hlsl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
index 1f35016cb6..1cd2105dc5 100644
--- a/include/nbl/builtin/hlsl/math/morton.hlsl
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -31,7 +31,7 @@ T morton2d_mask(uint16_t _n)
         0x00FF00FF00FF00FFull,
         0x0000FFFF0000FFFFull
     };
-    return mask[_n];
+    return (T)mask[_n];
 }
 
 template <typename T>
@@ -45,7 +45,7 @@ T morton3d_mask(uint16_t _n)
         0x001F0000FF0000FFull,
         0x001F00000000FFFFull
     };
-    return mask[_n];
+    return (T)mask[_n];
 }
 template <typename T>
 T morton4d_mask(uint16_t _n)
@@ -57,7 +57,7 @@ T morton4d_mask(uint16_t _n)
         0x000F000F000F000Full,
         0x000000FF000000FFull
     };
-    return mask[_n];
+    return (T)mask[_n];
 }
 
 template <typename T, uint32_t bitDepth>

From d9d6dd8c19a1c896ea03dce1182791bfb2e1834b Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 16 Aug 2024 16:35:19 +0530
Subject: [PATCH 10/36] Add create methods to tonemapper params

---
 .../builtin/hlsl/tonemapper/operators.hlsl    | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index cc5728e9ff..daff652bbd 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -11,15 +11,34 @@ namespace nbl
 {
 namespace hlsl
 {
+namespace tonemapper
+{
 
 struct ReinhardParams
 {
+	using this_t = ReinhardParams;
+	static this_t create(float EV, float key = 0.18f, float WhitePointRelToEV = 16.f) // builds the two precomputed factors from user-facing settings
+	{
+		this_t retval;
+		retval.keyAndManualLinearExposure = key * exp2(EV); // EV is in stops (powers of two), folded together with the key
+		retval.rcpWhite2 = 1.f / (WhitePointRelToEV * WhitePointRelToEV); // reciprocal of the squared white point; a zero white point is not guarded against
+		return retval;
+	}
+
 	float32_t keyAndManualLinearExposure;
 	float32_t rcpWhite2;
 };
 
 struct ACESParams
 {
+	using this_t = ACESParams;
+	static this_t create(float EV, float key = 0.18f, float Contrast = 1.f) { // builds gamma/exposure from user-facing settings
+		this_t retval;
+		retval.gamma = Contrast; // stored unchanged
+		retval.exposure = EV + log2(key * 0.77321666f); // log2-domain exposure; NOTE(review): the origin of 0.77321666 is not shown here — confirm
+		return retval;
+	}
+
 	float32_t gamma; // 1.0
 	float32_t exposure; // actualExposure+midGrayLog2
 };
@@ -49,8 +68,8 @@ float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor)
 
 	// this is obviously fitted to some particular simulated sensor/film and display
 	float32_t3 v = mul(XYZ_RRT_Input, tonemapped);
-	float32_t3 a = v * (v + float32_t3(0.0245786)) - float32_t3(0.000090537);
-	float32_t3 b = v * (v * float32_t(0.983729) + float32_t3(0.4329510)) + float32_t3(0.238081);
+	float32_t3 a = v * (v + float32_t3(0.0245786, 0.0245786, 0.0245786)) - float32_t3(0.000090537, 0.000090537, 0.000090537);
+	float32_t3 b = v * (v * float32_t3(0.983729, 0.983729, 0.983729) + float32_t3(0.4329510, 0.4329510, 0.4329510)) + float32_t3(0.238081, 0.238081, 0.238081);
 	v = a / b;
 
 	// ODT_SAT => XYZ
@@ -67,6 +86,7 @@ float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor)
 // or get proper ACES RRT and ODTs
 // https://partnerhelp.netflixstudios.com/hc/en-us/articles/360000622487-I-m-using-ACES-Which-Output-Transform-should-I-use-
 
+}
 }
 }
 

From 305f7e7430077c72a9bbf0b814ed5a6bd9e691a6 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 16 Aug 2024 16:35:49 +0530
Subject: [PATCH 11/36] Remove getGatheredLuma from luma_meter

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 21bd813439..94b898670b 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -13,6 +13,7 @@
 #include "nbl/builtin/hlsl/type_traits.hlsl"
 #include "nbl/builtin/hlsl/math/morton.hlsl"
 #include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
+#include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
 #include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
 
 namespace nbl
@@ -57,7 +58,7 @@ struct geom_luma_meter {
         float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f));
         float32_t2 samplePos = stride * sampleIndex;
         float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize;
-        float32_t3 color = colorspace::eotf::sRGB(tex.get(uvPos));
+        float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos));
         float32_t luma = dot(colorspace::sRGBtoXYZ[1], color);
 
         luma = clamp(luma, minLuma, maxLuma);
@@ -102,16 +103,6 @@ struct geom_luma_meter {
         }
     }
 
-    float32_t getGatheredLuma(
-        NBL_REF_ARG(ValueAccessor) val,
-        uint32_t2 sampleCount
-    )
-    {
-        uint32_t lumaSumBitPattern = val.get(glsl::gl_SubgroupInvocationID());
-        float32_t lumaSumValue = float32_t(lumaSumBitPattern) / (log2(maxLuma) - log2(minLuma)) + log2(minLuma);
-        return glsl::subgroupAdd(lumaSumValue) / (sampleCount.x * sampleCount.y);
-    }
-
     LumaMeteringWindow window;
     float32_t minLuma, maxLuma;
 };

From 3f4f6e93163e5c0c1a67f88b8906a07916ddbe84 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 18:28:48 +0530
Subject: [PATCH 12/36] Separate LumaMeteringWindow into a common header

---
 .../nbl/builtin/hlsl/luma_meter/common.hlsl   | 27 +++++++++++++++++++
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 12 +++------
 src/nbl/builtin/CMakeLists.txt                |  1 +
 3 files changed, 31 insertions(+), 9 deletions(-)
 create mode 100644 include/nbl/builtin/hlsl/luma_meter/common.hlsl

diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
new file mode 100644
index 0000000000..210039390e
--- /dev/null
+++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
@@ -0,0 +1,27 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
+#define _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace luma_meter
+{
+
+struct MeteringWindow
+{
+	float32_t2 meteringWindowScale;
+	float32_t2 meteringWindowOffset;
+};
+
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 94b898670b..e865d61c0d 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -5,7 +5,6 @@
 #ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
 #define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
 
-#include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
@@ -15,6 +14,7 @@
 #include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
 #include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
 #include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
+#include "nbl/builtin/hlsl/luma_meter/common.hlsl"
 
 namespace nbl
 {
@@ -23,17 +23,11 @@ namespace hlsl
 namespace luma_meter
 {
 
-struct LumaMeteringWindow
-{
-	float32_t2 meteringWindowScale;
-	float32_t2 meteringWindowOffset;
-};
-
 template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
 struct geom_luma_meter {
     using this_t = geom_luma_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(NBL_REF_ARG(LumaMeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
+    static this_t create(NBL_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
     {
         this_t retval;
         retval.window = window;
@@ -103,7 +97,7 @@ struct geom_luma_meter {
         }
     }
 
-    LumaMeteringWindow window;
+    MeteringWindow window;
     float32_t minLuma, maxLuma;
 };
 }
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index df61293d4a..b4346c428e 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -35,6 +35,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl")
 # luma metering
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")
 # tonemapper
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl")

From 515512a9dc5287dd68acce86205c53b5b219ba54 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 18:32:27 +0530
Subject: [PATCH 13/36] Simplify luma_meter naming

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index e865d61c0d..fb07acb8f4 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -24,8 +24,8 @@ namespace luma_meter
 {
 
 template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
-struct geom_luma_meter {
-    using this_t = geom_luma_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
+struct geom_meter {
+    using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
     static this_t create(NBL_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
     {

From 1919e53ed6ecb319f7892005d0faad86706288a2 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 19:06:03 +0530
Subject: [PATCH 14/36] Simplify morton code

---
 include/nbl/builtin/hlsl/math/morton.hlsl | 135 +---------------------
 1 file changed, 6 insertions(+), 129 deletions(-)

diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
index 1cd2105dc5..c0769fc88b 100644
--- a/include/nbl/builtin/hlsl/math/morton.hlsl
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -19,9 +19,8 @@ namespace hlsl
 namespace impl
 {
 
-#ifdef __HLSL_VERSION
 template <typename T>
-T morton2d_mask(uint16_t _n)
+NBL_CONSTEXPR_FUNC T morton2d_mask(uint16_t _n)
 {
     const static uint64_t mask[5] =
     {
@@ -31,11 +30,11 @@ T morton2d_mask(uint16_t _n)
         0x00FF00FF00FF00FFull,
         0x0000FFFF0000FFFFull
     };
-    return (T)mask[_n];
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
 }
 
 template <typename T>
-T morton3d_mask(uint16_t _n)
+NBL_CONSTEXPR_FUNC T morton3d_mask(uint16_t _n)
 {
     const static uint64_t mask[5] =
     {
@@ -45,10 +44,10 @@ T morton3d_mask(uint16_t _n)
         0x001F0000FF0000FFull,
         0x001F00000000FFFFull
     };
-    return (T)mask[_n];
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
 }
 template <typename T>
-T morton4d_mask(uint16_t _n)
+NBL_CONSTEXPR_FUNC T morton4d_mask(uint16_t _n)
 {
     const static uint64_t mask[4] =
     {
@@ -57,7 +56,7 @@ T morton4d_mask(uint16_t _n)
         0x000F000F000F000Full,
         0x000000FF000000FFull
     };
-    return (T)mask[_n];
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
 }
 
 template <typename T, uint32_t bitDepth>
@@ -141,128 +140,6 @@ inline T separate_bits_4d(T x)
 
     return x;
 }
-#else
-template <typename T>
-constexpr T morton2d_mask(uint8_t _n)
-{
-    constexpr uint64_t mask[5] =
-    {
-        0x5555555555555555ull,
-        0x3333333333333333ull,
-        0x0F0F0F0F0F0F0F0Full,
-        0x00FF00FF00FF00FFull,
-        0x0000FFFF0000FFFFull
-    };
-    return static_cast<T>(mask[_n]);
-}
-template <typename T>
-constexpr T morton3d_mask(uint8_t _n)
-{
-    constexpr uint64_t mask[5] =
-    {
-        0x1249249249249249ull,
-        0x10C30C30C30C30C3ull,
-        0x010F00F00F00F00Full,
-        0x001F0000FF0000FFull,
-        0x001F00000000FFFFull
-    };
-    return static_cast<T>(mask[_n]);
-}
-template <typename T>
-constexpr T morton4d_mask(uint8_t _n)
-{
-    constexpr uint64_t mask[4] =
-    {
-        0x1111111111111111ull,
-        0x0303030303030303ull,
-        0x000F000F000F000Full,
-        0x000000FF000000FFull
-    };
-    return static_cast<T>(mask[_n]);
-}
-
-template <typename T, uint32_t bitDepth>
-inline T morton2d_decode(T x)
-{
-    x = x & morton2d_mask<T>(0);
-    x = (x | (x >> 1)) & morton2d_mask<T>(1);
-    x = (x | (x >> 2)) & morton2d_mask<T>(2);
-    if constexpr (bitDepth > 8u)
-    {
-        x = (x | (x >> 4)) & morton2d_mask<T>(3);
-    }
-    if constexpr (bitDepth > 16u)
-    {
-        x = (x | (x >> 8)) & morton2d_mask<T>(4);
-    }
-    if constexpr (bitDepth > 32u)
-    {
-        x = (x | (x >> 16));
-    }
-    return x;
-}
-
-//! Puts bits on even positions filling gaps with 0s
-template <typename T, uint32_t bitDepth>
-inline T separate_bits_2d(T x)
-{
-    if constexpr (bitDepth > 32u)
-    {
-        x = (x | (x << 16)) & morton2d_mask<T>(4);
-    }
-    if constexpr (bitDepth > 16u)
-    {
-        x = (x | (x << 8)) & morton2d_mask<T>(3);
-    }
-    if constexpr (bitDepth > 8u)
-    {
-        x = (x | (x << 4)) & morton2d_mask<T>(2);
-    }
-    x = (x | (x << 2)) & morton2d_mask<T>(1);
-    x = (x | (x << 1)) & morton2d_mask<T>(0);
-
-    return x;
-}
-template <typename T, uint32_t bitDepth>
-inline T separate_bits_3d(T x)
-{
-    if constexpr (bitDepth > 32u)
-    {
-        x = (x | (x << 32)) & morton3d_mask<T>(4);
-    }
-    if constexpr (bitDepth > 16u)
-    {
-        x = (x | (x << 16)) & morton3d_mask<T>(3);
-    }
-    if constexpr (bitDepth > 8u)
-    {
-        x = (x | (x << 8)) & morton3d_mask<T>(2);
-    }
-    x = (x | (x << 4)) & morton3d_mask<T>(1);
-    x = (x | (x << 2)) & morton3d_mask<T>(0);
-
-    return x;
-}
-template <typename T, uint32_t bitDepth>
-inline T separate_bits_4d(T x)
-{
-    if constexpr (bitDepth > 32u)
-    {
-        x = (x | (x << 24)) & morton4d_mask<T>(3);
-    }
-    if constexpr (bitDepth > 16u)
-    {
-        x = (x | (x << 12)) & morton4d_mask<T>(2);
-    }
-    if constexpr (bitDepth > 8u)
-    {
-        x = (x | (x << 6)) & morton4d_mask<T>(1);
-    }
-    x = (x | (x << 3)) & morton4d_mask<T>(0);
-
-    return x;
-}
-#endif
 }
 
 template<typename T, uint32_t bitDepth = sizeof(T) * 8u>

From 4c582382e8adca012b959577367138a8f1a92dfd Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 19:09:24 +0530
Subject: [PATCH 15/36] Add missing comment

---
 include/nbl/builtin/hlsl/tonemapper/operators.hlsl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index daff652bbd..1481fd92b2 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -35,7 +35,8 @@ struct ACESParams
 	static this_t create(float EV, float key = 0.18f, float Contrast = 1.f) {
 		this_t retval;
 		retval.gamma = Contrast;
-		retval.exposure = EV + log2(key * 0.77321666f);
+		const float reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key
+		retval.exposure = EV + log2(key * reinhardMatchCorrection);
 		return retval;
 	}
 

From 3c3f8b84025dfddb3464d4bc9ed5ca76f651b07c Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 20:09:02 +0530
Subject: [PATCH 16/36] Refactor tonemapping operators

---
 .../builtin/hlsl/tonemapper/operators.hlsl    | 106 +++++++++---------
 1 file changed, 56 insertions(+), 50 deletions(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index 1481fd92b2..854f78e302 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -6,6 +6,7 @@
 #define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
 
 namespace nbl
 {
@@ -14,10 +15,13 @@ namespace hlsl
 namespace tonemapper
 {
 
-struct ReinhardParams
+template<typename T = float32_t>
+struct Reinhard
 {
-	using this_t = ReinhardParams;
-	static this_t create(float EV, float key = 0.18f, float WhitePointRelToEV = 16.f)
+	using float_t = enable_if_t<is_floating_point<T>::value, T>;
+	using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+	using this_t = Reinhard<float_t>;
+	static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f)
 	{
 		this_t retval;
 		retval.keyAndManualLinearExposure = key * exp2(EV);
@@ -25,63 +29,65 @@ struct ReinhardParams
 		return retval;
 	}
 
-	float32_t keyAndManualLinearExposure;
-	float32_t rcpWhite2;
+	float_t3 operator()(float_t3 rawCIEXYZcolor) {
+		float_t exposureFactors = keyAndManualLinearExposure;
+		float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors;
+		float_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * rcpWhite2) / (1.0 + exposedLuma));
+		return rawCIEXYZcolor * colorMultiplier;
+	}
+
+	float_t3 keyAndManualLinearExposure;
+	float_t3 rcpWhite2;
 };
 
-struct ACESParams
+template<typename T = float32_t>
+struct ACES
 {
-	using this_t = ACESParams;
-	static this_t create(float EV, float key = 0.18f, float Contrast = 1.f) {
+	using float_t = enable_if_t<is_floating_point<T>::value, T>;
+	using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+	using float_t3x3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3x3, float16_t3x3>::type;
+
+	using this_t = ACES<T>;
+	static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) {
 		this_t retval;
 		retval.gamma = Contrast;
-		const float reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key
+		const float_t reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key
 		retval.exposure = EV + log2(key * reinhardMatchCorrection);
 		return retval;
 	}
 
-	float32_t gamma; // 1.0
-	float32_t exposure; // actualExposure+midGrayLog2
-};
-
-
-float32_t3 reinhard(ReinhardParams params, float32_t3 rawCIEXYZcolor)
-{
-	float32_t exposureFactors = params.keyAndManualLinearExposure;
-	float32_t exposedLuma = rawCIEXYZcolor.y * exposureFactors;
-	float32_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * params.rcpWhite2) / (1.0 + exposedLuma));
-	return rawCIEXYZcolor * colorMultiplier;
-}
+	float_t3 operator()(float_t3 rawCIEXYZcolor) {
+		float_t3 tonemapped = rawCIEXYZcolor;
+		if (tonemapped.y > 1.175494351e-38)
+			tonemapped *= exp2(log2(tonemapped.y) * (gamma - 1.0) + (exposure) * gamma);
+
+		// XYZ => RRT_SAT
+		// this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t)
+		const float_t3x3 XYZ_RRT_Input = float_t3x3(
+			float_t3(1.594168310, -0.262608051, -0.231993079),
+			float_t3(-0.6332771780, 1.5840380200, 0.0164147373),
+			float_t3(0.00892840419, 0.03648501260, 0.87711471300)
+		);
+
+		// this is obviously fitted to some particular simulated sensor/film and display
+		float_t3 v = mul(XYZ_RRT_Input, tonemapped);
+		float_t3 a = v * (v + float_t3(0.0245786, 0.0245786, 0.0245786)) - float_t3(0.000090537, 0.000090537, 0.000090537);
+		float_t3 b = v * (v * float_t3(0.983729, 0.983729, 0.983729) + float_t3(0.4329510, 0.4329510, 0.4329510)) + float_t3(0.238081, 0.238081, 0.238081);
+		v = a / b;
+
+		// ODT_SAT => XYZ
+		// this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB,BT2020 and ACEScc(t)
+		const float_t3x3 ODT_XYZ_Output = float_t3x3(
+			float_t3(0.624798000, 0.164064825, 0.161605373),
+			float_t3(0.268048108, 0.674283803, 0.057667464),
+			float_t3(0.0157514643, 0.0526682511, 1.0204007600)
+		);
+		return mul(ODT_XYZ_Output, v);
+	}
 
-float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor)
-{
-	float32_t3 tonemapped = rawCIEXYZcolor;
-	if (tonemapped.y > 1.175494351e-38)
-		tonemapped *= exp2(log2(tonemapped.y) * (params.gamma - 1.0) + (params.exposure) * params.gamma);
-
-	// XYZ => RRT_SAT
-	// this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t)
-	const float32_t3x3 XYZ_RRT_Input = float32_t3x3(
-		float32_t3(1.594168310, -0.262608051, -0.231993079),
-		float32_t3(-0.6332771780, 1.5840380200, 0.0164147373),
-		float32_t3(0.00892840419, 0.03648501260, 0.87711471300)
-	);
-
-	// this is obviously fitted to some particular simulated sensor/film and display
-	float32_t3 v = mul(XYZ_RRT_Input, tonemapped);
-	float32_t3 a = v * (v + float32_t3(0.0245786, 0.0245786, 0.0245786)) - float32_t3(0.000090537, 0.000090537, 0.000090537);
-	float32_t3 b = v * (v * float32_t3(0.983729, 0.983729, 0.983729) + float32_t3(0.4329510, 0.4329510, 0.4329510)) + float32_t3(0.238081, 0.238081, 0.238081);
-	v = a / b;
-
-	// ODT_SAT => XYZ
-	// this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB,BT2020 and ACEScc(t)
-	const float32_t3x3 ODT_XYZ_Output = float32_t3x3(
-		float32_t3(0.624798000, 0.164064825, 0.161605373),
-		float32_t3(0.268048108, 0.674283803, 0.057667464),
-		float32_t3(0.0157514643, 0.0526682511, 1.0204007600)
-	);
-	return mul(ODT_XYZ_Output, v);
-}
+	float_t gamma; // 1.0
+	float_t exposure; // actualExposure+midGrayLog2
+};
 
 // ideas for more operators https://web.archive.org/web/20191226154550/http://cs.columbia.edu/CAVE/software/softlib/dorf.php
 // or get proper ACES RRT and ODTs

From b0e07505a374d3e81e18e9e71c39152e4599051c Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 20:17:38 +0530
Subject: [PATCH 17/36] Small fixes

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index fb07acb8f4..af128b0f98 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -27,7 +27,7 @@ template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, ty
 struct geom_meter {
     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(NBL_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
+    static this_t create(NBL_CONST_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
     {
         this_t retval;
         retval.window = window;
@@ -68,11 +68,12 @@ struct geom_meter {
         float32_t2 viewportSize
     )
     {
+
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
         uint32_t2 coord = {
-            morton2d_decode_x(glsl::gl_LocalInvocationIndex()),
-            morton2d_decode_y(glsl::gl_LocalInvocationIndex())
+            morton2d_decode_x(tid),
+            morton2d_decode_y(tid)
         };
-        uint32_t tid = workgroup::SubgroupContiguousIndex();
 
         uint32_t2 sampleIndex = coord * GroupSize + float32_t2(glsl::gl_SubgroupID() + 1, glsl::gl_SubgroupInvocationID() + 1);
         float32_t luma = 0.0f;
@@ -81,8 +82,6 @@ struct geom_meter {
             luma = computeLuma(tex, sampleCount, sampleIndex, viewportSize);
             float32_t lumaSum = reduction(luma, sdata);
 
-            sdata.workgroupExecutionAndMemoryBarrier();
-
             if (tid == GroupSize - 1) {
                 uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
                 uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();

From e8e46c9d042e76adb3bfd449982fcff70986cfba Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 21 Aug 2024 16:20:32 +0530
Subject: [PATCH 18/36] Use promote to simplify code

---
 include/nbl/builtin/hlsl/tonemapper/operators.hlsl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index 854f78e302..e5e6a9a97c 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -71,8 +71,8 @@ struct ACES
 
 		// this is obviously fitted to some particular simulated sensor/film and display
 		float_t3 v = mul(XYZ_RRT_Input, tonemapped);
-		float_t3 a = v * (v + float_t3(0.0245786, 0.0245786, 0.0245786)) - float_t3(0.000090537, 0.000090537, 0.000090537);
-		float_t3 b = v * (v * float_t3(0.983729, 0.983729, 0.983729) + float_t3(0.4329510, 0.4329510, 0.4329510)) + float_t3(0.238081, 0.238081, 0.238081);
+		float_t3 a = v * (v + promote<float_t3>(0.0245786)) - promote<float_t3>(0.000090537);
+		float_t3 b = v * (v * promote<float_t3>(0.983729) + promote<float_t3>(0.4329510)) + promote<float_t3>(0.238081);
 		v = a / b;
 
 		// ODT_SAT => XYZ

From ee5affe6f20f25e1c7eb2675e07fe340be9204fb Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 21 Aug 2024 17:07:34 +0530
Subject: [PATCH 19/36] Add static create to MeteringWindow

---
 include/nbl/builtin/hlsl/luma_meter/common.hlsl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
index 210039390e..55d1713619 100644
--- a/include/nbl/builtin/hlsl/luma_meter/common.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
@@ -16,8 +16,16 @@ namespace luma_meter
 
 struct MeteringWindow
 {
+	using this_t = MeteringWindow;
 	float32_t2 meteringWindowScale;
 	float32_t2 meteringWindowOffset;
+
+	static this_t create(float32_t2 scale, float32_t2 offset) {
+		this_t retval;
+		retval.meteringWindowScale = scale;
+		retval.meteringWindowOffset = offset;
+		return retval;
+	}
 };
 
 }

From 56389f45a6f5689889d232fb051a15b0001e43f7 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 21 Aug 2024 18:31:28 +0530
Subject: [PATCH 20/36] Infer sample count from viewportSize

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 49 +++++++------------
 .../builtin/hlsl/tonemapper/operators.hlsl    |  4 +-
 2 files changed, 21 insertions(+), 32 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index af128b0f98..23deac8bbe 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -27,12 +27,10 @@ template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, ty
 struct geom_meter {
     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(NBL_CONST_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
+    static this_t create(float32_t2 lumaMinMax)
     {
         this_t retval;
-        retval.window = window;
-        retval.minLuma = lumaMinimum;
-        retval.maxLuma = lumaMaximum;
+        retval.lumaMinMax = lumaMinMax;
         return retval;
     }
 
@@ -43,61 +41,52 @@ struct geom_meter {
     }
 
     float32_t computeLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
-        uint32_t2 sampleCount,
-        uint32_t2 sampleIndex,
-        float32_t2 viewportSize
+        float32_t2 shiftedCoord
     )
     {
-        float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f));
-        float32_t2 samplePos = stride * sampleIndex;
-        float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize;
+        float32_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
         float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos));
         float32_t luma = dot(colorspace::sRGBtoXYZ[1], color);
 
-        luma = clamp(luma, minLuma, maxLuma);
+        luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
 
-        return log2(luma / minLuma) / log2(maxLuma / minLuma);
+        return log2(luma / lumaMinMax.x) / log2(lumaMinMax.y / lumaMinMax.x);
     }
 
     void gatherLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(ValueAccessor) val,
         NBL_REF_ARG(TexAccessor) tex,
         NBL_REF_ARG(SharedAccessor) sdata,
-        uint32_t2 sampleCount,
-        float32_t2 viewportSize
+        float32_t2 tileOffset
     )
     {
-
         uint32_t tid = workgroup::SubgroupContiguousIndex();
         uint32_t2 coord = {
             morton2d_decode_x(tid),
             morton2d_decode_y(tid)
         };
 
-        uint32_t2 sampleIndex = coord * GroupSize + float32_t2(glsl::gl_SubgroupID() + 1, glsl::gl_SubgroupInvocationID() + 1);
         float32_t luma = 0.0f;
+        luma = computeLuma(window, tex, tileOffset + (float32_t2)(coord));
+        float32_t lumaSum = reduction(luma, sdata);
 
-        if (sampleIndex.x <= sampleCount.x && sampleIndex.y <= sampleCount.y) {
-            luma = computeLuma(tex, sampleCount, sampleIndex, viewportSize);
-            float32_t lumaSum = reduction(luma, sdata);
-
-            if (tid == GroupSize - 1) {
-                uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-                uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+        if (tid == GroupSize - 1) {
+            uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+            uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
 
-                uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(minLuma)) * (log2(maxLuma) - log2(minLuma)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+            uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(lumaMinMax.x)) * (log2(lumaMinMax.y) - log2(lumaMinMax.x)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
 
-                uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
-                uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
+            uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
+            uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
 
-                val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
-            }
+            val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
         }
     }
 
-    MeteringWindow window;
-    float32_t minLuma, maxLuma;
+    float32_t2 lumaMinMax;
 };
 }
 }
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index e5e6a9a97c..824e31d68a 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -36,8 +36,8 @@ struct Reinhard
 		return rawCIEXYZcolor * colorMultiplier;
 	}
 
-	float_t3 keyAndManualLinearExposure;
-	float_t3 rcpWhite2;
+	float_t keyAndManualLinearExposure;
+	float_t rcpWhite2;
 };
 
 template<typename T = float32_t>

From 23771d1610b50e2af60b2f4661d11c06e50d854f Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 22 Aug 2024 23:02:11 +0530
Subject: [PATCH 21/36] Rename gatherLuma, add toXYZ method and templatize the
 float type

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 23deac8bbe..b0b19b3a82 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -11,9 +11,6 @@
 #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
 #include "nbl/builtin/hlsl/math/morton.hlsl"
-#include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
-#include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
-#include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
 #include "nbl/builtin/hlsl/luma_meter/common.hlsl"
 
 namespace nbl
@@ -25,42 +22,45 @@ namespace luma_meter
 
 template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
 struct geom_meter {
+    using float_t = typename SharedAccessor::type;
+    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
+    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(float32_t2 lumaMinMax)
+    static this_t create(float_t2 lumaMinMax)
     {
         this_t retval;
         retval.lumaMinMax = lumaMinMax;
         return retval;
     }
 
-    float32_t reduction(float32_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    float_t reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
     {
-        return workgroup::reduction < plus < float32_t >, GroupSize >::
+        return workgroup::reduction < plus < float_t >, GroupSize >::
             template __call <SharedAccessor>(value, sdata);
     }
 
-    float32_t computeLuma(
+    float_t computeLumaLog2(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
-        float32_t2 shiftedCoord
+        float_t2 shiftedCoord
     )
     {
-        float32_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
-        float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos));
-        float32_t luma = dot(colorspace::sRGBtoXYZ[1], color);
+        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+        float_t3 color = tex.get(uvPos);
+        float_t luma = TexAccessor::toXYZ(color);
 
         luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
 
-        return log2(luma / lumaMinMax.x) / log2(lumaMinMax.y / lumaMinMax.x);
+        return max(log2(luma), log2(lumaMinMax.x));
     }
 
-    void gatherLuma(
+    void sampleLuma(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(ValueAccessor) val,
         NBL_REF_ARG(TexAccessor) tex,
         NBL_REF_ARG(SharedAccessor) sdata,
-        float32_t2 tileOffset
+        float_t2 tileOffset
     )
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
@@ -69,9 +69,9 @@ struct geom_meter {
             morton2d_decode_y(tid)
         };
 
-        float32_t luma = 0.0f;
-        luma = computeLuma(window, tex, tileOffset + (float32_t2)(coord));
-        float32_t lumaSum = reduction(luma, sdata);
+        float_t luma = 0.0f;
+        luma = computeLumaLog2(window, tex, tileOffset + (float32_t2)(coord));
+        float_t lumaSum = reduction(luma, sdata);
 
         if (tid == GroupSize - 1) {
             uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
@@ -86,7 +86,7 @@ struct geom_meter {
         }
     }
 
-    float32_t2 lumaMinMax;
+    float_t2 lumaMinMax;
 };
 }
 }

From ac390393cca2c89237532b57f12d95cc5584f0be Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 27 Aug 2024 00:41:14 +0530
Subject: [PATCH 22/36] Add uploadFloat, downloadFloat and gatherLuma

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 63 ++++++++++++++++---
 1 file changed, 56 insertions(+), 7 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index b0b19b3a82..c39b2e3ab6 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -7,6 +7,7 @@
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl"
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
 #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
@@ -27,10 +28,11 @@ struct geom_meter {
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(float_t2 lumaMinMax)
+    static this_t create(float_t2 lumaMinMax, float_t sampleCount)
     {
         this_t retval;
         retval.lumaMinMax = lumaMinMax;
+        retval.sampleCount = sampleCount;
         return retval;
     }
 
@@ -55,6 +57,34 @@ struct geom_meter {
         return max(log2(luma), log2(lumaMinMax.x));
     }
 
+    void uploadFloat(
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        uint32_t index,
+        float_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    )
+    {
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+
+        val_accessor.atomicAdd(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+    }
+
+    float_t downloadFloat(
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        uint32_t index,
+        float_t minLog2,
+        float_t rangeLog2
+    )
+    {
+        float_t luma = (float_t)val.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
+        luma = luma / rangeLog2 + minLog2;
+        return luma;
+    }
+
     void sampleLuma(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(ValueAccessor) val,
@@ -74,18 +104,37 @@ struct geom_meter {
         float_t lumaSum = reduction(luma, sdata);
 
         if (tid == GroupSize - 1) {
-            uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-            uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-
-            uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(lumaMinMax.x)) * (log2(lumaMinMax.y) - log2(lumaMinMax.x)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
-
             uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
             uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
 
-            val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+            uploadFloat(
+                val,
+                workgroupIndex,
+                lumaSum,
+                log2(lumaMinMax.x),
+                log2(lumaMinMax.y / lumaMinMax.x)
+            );
         }
     }
 
+    void gatherLuma(
+        NBL_REF_ARG(ValueAccessor) val
+    )
+    {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+        float_t lumaSum = glsl::subgroupAdd(
+            downloadFloat(
+                val,
+                tid,
+                log2(lumaMinMax.x),
+                log2(lumaMinMax.y / lumaMinMax.x)
+            )
+        );
+
+        uploadFloat(val, 0, lumaSum, log2(lumaMinMax.x), log2(lumaMinMax.y / lumaMinMax.x));
+    }
+
+    float_t sampleCount;
     float_t2 lumaMinMax;
 };
 }

From 49a80499c4ee3c7b09ce20e1f7a995d63cc7a73d Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 27 Aug 2024 19:37:11 +0530
Subject: [PATCH 23/36] Normalize tileOffset and coord to uv before computing
 Luma

---
 .../nbl/builtin/hlsl/luma_meter/luma_meter.hlsl    | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index c39b2e3ab6..6804c1d631 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -50,7 +50,7 @@ struct geom_meter {
     {
         float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
         float_t3 color = tex.get(uvPos);
-        float_t luma = TexAccessor::toXYZ(color);
+        float_t luma = (float_t)TexAccessor::toXYZ(color);
 
         luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
 
@@ -80,7 +80,7 @@ struct geom_meter {
         float_t rangeLog2
     )
     {
-        float_t luma = (float_t)val.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
+        float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
         luma = luma / rangeLog2 + minLog2;
         return luma;
     }
@@ -90,7 +90,8 @@ struct geom_meter {
         NBL_REF_ARG(ValueAccessor) val,
         NBL_REF_ARG(TexAccessor) tex,
         NBL_REF_ARG(SharedAccessor) sdata,
-        float_t2 tileOffset
+        float_t2 tileOffset,
+        float_t2 viewportSize
     )
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
@@ -100,7 +101,8 @@ struct geom_meter {
         };
 
         float_t luma = 0.0f;
-        luma = computeLumaLog2(window, tex, tileOffset + (float32_t2)(coord));
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        luma = computeLumaLog2(window, tex, shiftedCoord);
         float_t lumaSum = reduction(luma, sdata);
 
         if (tid == GroupSize - 1) {
@@ -117,7 +119,7 @@ struct geom_meter {
         }
     }
 
-    void gatherLuma(
+    float_t gatherLuma(
         NBL_REF_ARG(ValueAccessor) val
     )
     {
@@ -131,7 +133,7 @@ struct geom_meter {
             )
         );
 
-        uploadFloat(val, 0, lumaSum, log2(lumaMinMax.x), log2(lumaMinMax.y / lumaMinMax.x));
+        return lumaSum;
     }
 
     float_t sampleCount;

From 8a10ae2e12f36d48f39ff3350920d800da1cc47e Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sun, 29 Sep 2024 18:16:56 +0100
Subject: [PATCH 24/36] Simplify return statement

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 6804c1d631..266d6e6a2a 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -81,8 +81,7 @@ struct geom_meter {
     )
     {
         float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
-        luma = luma / rangeLog2 + minLog2;
-        return luma;
+        return luma / rangeLog2 + minLog2;
     }
 
     void sampleLuma(

From 6b01b6ddd4e687684e6e7a5f8073f7e556ad6967 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 11 Dec 2024 00:26:02 +0000
Subject: [PATCH 25/36] Update submodule pointers

---
 3rdparty/dxc/dxc         | 2 +-
 3rdparty/libexpat        | 2 +-
 3rdparty/nbl_spirv_cross | 2 +-
 3rdparty/openexr         | 2 +-
 3rdparty/volk            | 2 +-
 examples_tests           | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc
index a08b6cbeb1..29a5e1258e 160000
--- a/3rdparty/dxc/dxc
+++ b/3rdparty/dxc/dxc
@@ -1 +1 @@
-Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb
+Subproject commit 29a5e1258e2f01dd15ef1f58e24a02337c96c8f7
diff --git a/3rdparty/libexpat b/3rdparty/libexpat
index e2004f9195..39e487da35 160000
--- a/3rdparty/libexpat
+++ b/3rdparty/libexpat
@@ -1 +1 @@
-Subproject commit e2004f9195700bb8248c8c954578f14fda58be27
+Subproject commit 39e487da353b20bb3a724311d179ba0fddffc65b
diff --git a/3rdparty/nbl_spirv_cross b/3rdparty/nbl_spirv_cross
index f4accc2a4b..b52e6a55ca 160000
--- a/3rdparty/nbl_spirv_cross
+++ b/3rdparty/nbl_spirv_cross
@@ -1 +1 @@
-Subproject commit f4accc2a4b478c42038c920aa0e43a8aab7d135c
+Subproject commit b52e6a55ca2d9805a18dccfc45c7a2e692c1d8e1
diff --git a/3rdparty/openexr b/3rdparty/openexr
index fca936a964..824ed557b3 160000
--- a/3rdparty/openexr
+++ b/3rdparty/openexr
@@ -1 +1 @@
-Subproject commit fca936a964da5983daecdbed7cd249934701b41a
+Subproject commit 824ed557b3c59288a685356c708e5806b1122fe1
diff --git a/3rdparty/volk b/3rdparty/volk
index b6be5ba0af..efb96f9031 160000
--- a/3rdparty/volk
+++ b/3rdparty/volk
@@ -1 +1 @@
-Subproject commit b6be5ba0af5567974cc8a0261471573418f0f34f
+Subproject commit efb96f90317e1c902d6b45ae95d14e67779a2241
diff --git a/examples_tests b/examples_tests
index 8b6675b3ba..36633f5c2c 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 8b6675b3ba9fe1ca00f2c6573a4888abb8477da7
+Subproject commit 36633f5c2cae3e8e870a837c86e71f3a50061a3e

From f95f1c1e7eb5fe5c930b1c0badba345f4e27033e Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 11 Dec 2024 00:54:41 +0000
Subject: [PATCH 26/36] Update submodule pointer

---
 3rdparty/imgui | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/imgui b/3rdparty/imgui
index e489e40a85..a29e9dba30 160000
--- a/3rdparty/imgui
+++ b/3rdparty/imgui
@@ -1 +1 @@
-Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e
+Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175

From 1a5827379821023273130a547b8ba50141cd85a9 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 13 Dec 2024 04:34:45 +0000
Subject: [PATCH 27/36] Update submodule pointer

---
 3rdparty/Vulkan-Headers | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers
index 2c823b7f27..31aa7f634b 160000
--- a/3rdparty/Vulkan-Headers
+++ b/3rdparty/Vulkan-Headers
@@ -1 +1 @@
-Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb
+Subproject commit 31aa7f634b052d87ede4664053e85f3f4d1d50d3

From b6e1f57110c4e34715bd6c15223a1db9224c47ff Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 13 Dec 2024 04:46:17 +0000
Subject: [PATCH 28/36] Update submodule pointer

---
 3rdparty/volk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/volk b/3rdparty/volk
index efb96f9031..b6be5ba0af 160000
--- a/3rdparty/volk
+++ b/3rdparty/volk
@@ -1 +1 @@
-Subproject commit efb96f90317e1c902d6b45ae95d14e67779a2241
+Subproject commit b6be5ba0af5567974cc8a0261471573418f0f34f

From 5239c29945cd2f609d13f40c66af3dcc4bd2f6a2 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 14 Jan 2025 00:42:26 +0000
Subject: [PATCH 29/36] Update submodule pointer

---
 3rdparty/Vulkan-Headers   | 2 +-
 3rdparty/dxc/dxc          | 2 +-
 3rdparty/imgui            | 2 +-
 3rdparty/libexpat         | 2 +-
 3rdparty/nbl_spirv_cross  | 2 +-
 3rdparty/openexr          | 2 +-
 3rdparty/parallel-hashmap | 2 +-
 examples_tests            | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers
index 31aa7f634b..2c823b7f27 160000
--- a/3rdparty/Vulkan-Headers
+++ b/3rdparty/Vulkan-Headers
@@ -1 +1 @@
-Subproject commit 31aa7f634b052d87ede4664053e85f3f4d1d50d3
+Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb
diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc
index 5adc27f9e4..a08b6cbeb1 160000
--- a/3rdparty/dxc/dxc
+++ b/3rdparty/dxc/dxc
@@ -1 +1 @@
-Subproject commit 5adc27f9e42de7681d65a98873048af661b9b367
+Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb
diff --git a/3rdparty/imgui b/3rdparty/imgui
index a29e9dba30..e489e40a85 160000
--- a/3rdparty/imgui
+++ b/3rdparty/imgui
@@ -1 +1 @@
-Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175
+Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e
diff --git a/3rdparty/libexpat b/3rdparty/libexpat
index 39e487da35..e2004f9195 160000
--- a/3rdparty/libexpat
+++ b/3rdparty/libexpat
@@ -1 +1 @@
-Subproject commit 39e487da353b20bb3a724311d179ba0fddffc65b
+Subproject commit e2004f9195700bb8248c8c954578f14fda58be27
diff --git a/3rdparty/nbl_spirv_cross b/3rdparty/nbl_spirv_cross
index b52e6a55ca..f4accc2a4b 160000
--- a/3rdparty/nbl_spirv_cross
+++ b/3rdparty/nbl_spirv_cross
@@ -1 +1 @@
-Subproject commit b52e6a55ca2d9805a18dccfc45c7a2e692c1d8e1
+Subproject commit f4accc2a4b478c42038c920aa0e43a8aab7d135c
diff --git a/3rdparty/openexr b/3rdparty/openexr
index c8a74d9ac9..fca936a964 160000
--- a/3rdparty/openexr
+++ b/3rdparty/openexr
@@ -1 +1 @@
-Subproject commit c8a74d9ac97dd579a47a7913f361a87349c0fffd
+Subproject commit fca936a964da5983daecdbed7cd249934701b41a
diff --git a/3rdparty/parallel-hashmap b/3rdparty/parallel-hashmap
index 7684faf186..fd7b8fb87d 160000
--- a/3rdparty/parallel-hashmap
+++ b/3rdparty/parallel-hashmap
@@ -1 +1 @@
-Subproject commit 7684faf186806e2c88554a78188c18185b21f127
+Subproject commit fd7b8fb87d74cc990591c3443b2ef21e9e137500
diff --git a/examples_tests b/examples_tests
index 36633f5c2c..f79caed8b5 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 36633f5c2cae3e8e870a837c86e71f3a50061a3e
+Subproject commit f79caed8b54499c1a4e848672dec38ce85d9a184

From 06c915e42162869f11ae951b7a081c722505d4e8 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 21 Jan 2025 16:11:27 +0100
Subject: [PATCH 30/36] stop rolling back my modules!

---
 3rdparty/Vulkan-Headers | 2 +-
 3rdparty/imgui          | 2 +-
 3rdparty/imguizmo       | 2 +-
 3rdparty/openexr        | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers
index 2c823b7f27..31aa7f634b 160000
--- a/3rdparty/Vulkan-Headers
+++ b/3rdparty/Vulkan-Headers
@@ -1 +1 @@
-Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb
+Subproject commit 31aa7f634b052d87ede4664053e85f3f4d1d50d3
diff --git a/3rdparty/imgui b/3rdparty/imgui
index e489e40a85..a29e9dba30 160000
--- a/3rdparty/imgui
+++ b/3rdparty/imgui
@@ -1 +1 @@
-Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e
+Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175
diff --git a/3rdparty/imguizmo b/3rdparty/imguizmo
index 6f4b2197ef..b10e91756d 160000
--- a/3rdparty/imguizmo
+++ b/3rdparty/imguizmo
@@ -1 +1 @@
-Subproject commit 6f4b2197efd715d16b19775b00f36c6c6f5aacb6
+Subproject commit b10e91756d32395f5c1fefd417899b657ed7cb88
diff --git a/3rdparty/openexr b/3rdparty/openexr
index fca936a964..c8a74d9ac9 160000
--- a/3rdparty/openexr
+++ b/3rdparty/openexr
@@ -1 +1 @@
-Subproject commit fca936a964da5983daecdbed7cd249934701b41a
+Subproject commit c8a74d9ac97dd579a47a7913f361a87349c0fffd

From 90d20c44783c9f3837f554ae8a05beb1ecd9f956 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 21 Jan 2025 16:49:29 +0100
Subject: [PATCH 31/36] point submodule at head

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index f79caed8b5..9e26a74aa1 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit f79caed8b54499c1a4e848672dec38ce85d9a184
+Subproject commit 9e26a74aa1bcbe5e26ee14a79d4f2ef9e2701e0d

From 4edd38c002531e3bbf55a8f0649af187223a1077 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 13 Mar 2025 11:57:14 +0000
Subject: [PATCH 32/36] Add capabilities for atomic ops

---
 include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
index 2ecb08cdb2..973a313e9c 100644
--- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
+++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
@@ -61,37 +61,45 @@ pointer_t<StorageClass,T> copyObject([[vk::ext_reference]] T v);
 // Here's the thing with atomics, it's not only the data type that dictates whether you can do an atomic or not.
 // It's the storage class that has the most effect (shared vs storage vs image) and we can't check that easily
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

From f1e3e9866682fc79fa830d4a1c888674e24f58f7 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 13 Mar 2025 11:58:01 +0000
Subject: [PATCH 33/36] Fix luma_meter

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 266d6e6a2a..9808b9e26d 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -105,8 +105,8 @@ struct geom_meter {
         float_t lumaSum = reduction(luma, sdata);
 
         if (tid == GroupSize - 1) {
-            uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
-            uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
+            uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();
+            uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64;
 
             uploadFloat(
                 val,
@@ -122,8 +122,8 @@ struct geom_meter {
         NBL_REF_ARG(ValueAccessor) val
     )
     {
-        uint32_t tid = workgroup::SubgroupContiguousIndex();
-        float_t lumaSum = glsl::subgroupAdd(
+        uint32_t tid = glsl::gl_SubgroupInvocationID();
+        float_t luma = glsl::subgroupAdd(
             downloadFloat(
                 val,
                 tid,
@@ -132,7 +132,10 @@ struct geom_meter {
             )
         );
 
-        return lumaSum;
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
     }
 
     float_t sampleCount;

From f1b7d170718d1ba0d48eef0b69af842be0463bea Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sun, 16 Mar 2025 11:07:47 +0000
Subject: [PATCH 34/36] Add median_luma_meter

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 9808b9e26d..c17a64c437 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -141,6 +141,151 @@ struct geom_meter {
     float_t sampleCount;
     float_t2 lumaMinMax;
 };
+
+template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
+struct median_meter {
+    using int_t = typename SharedAccessor::type;
+    using float_t  = float32_t;
+    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
+    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+    using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
+
+    static this_t create(float_t2 lumaMinMax, float_t sampleCount) {
+        this_t retval;
+        retval.lumaMinMax = lumaMinMax;
+        retval.sampleCount = sampleCount;
+        return retval;
+    }
+
+    int_t inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
+        return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
+            template __call <SharedAccessor>(value, sdata);
+    }
+
+    float_t computeLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(TexAccessor) tex,
+        float_t2 shiftedCoord
+    ) {
+        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+        float_t3 color = tex.get(uvPos);
+        float_t luma = (float_t)TexAccessor::toXYZ(color);
+
+        return clamp(luma, lumaMinMax.x, lumaMinMax.y);
+    }
+
+    int_t float2Int(
+        float_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    ) {
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+    }
+
+    float_t int2Float(
+        int_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    ) {
+        return val / rangeLog2 + minLog2;
+    }
+
+    void sampleLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(HistogramAccessor) histo,
+        NBL_REF_ARG(TexAccessor) tex,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        float_t2 tileOffset,
+        float_t2 viewportSize
+    ) {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+        
+        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+            sdata.set(vid, 0);
+        }
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        uint32_t2 coord = {
+            morton2d_decode_x(tid),
+            morton2d_decode_y(tid)
+        };
+
+        float_t luma = 0.0f;
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        luma = computeLuma(window, tex, shiftedCoord);
+
+        float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
+        uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);
+
+        sdata.atomicAdd(binIndex, float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        float_t histogram_value;
+        sdata.get(tid, histogram_value);
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        float_t sum = inclusive_scan(histogram_value, sdata);
+        histo.atomicAdd(tid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+
+        const bool is_last_wg_invocation = tid == (GroupSize - 1);
+        const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;
+
+        for (int i = 1; i < RoundedBinCount; i++) {
+            uint32_t keyBucketStart = GroupSize * i;
+            uint32_t vid = tid + keyBucketStart;
+
+            // no if statement about the last iteration needed
+            if (is_last_wg_invocation) {
+                float_t beforeSum;
+                sdata.get(keyBucketStart, beforeSum);
+                sdata.set(keyBucketStart, beforeSum + sum);
+            }
+
+            // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
+            sdata.workgroupExecutionAndMemoryBarrier();
+
+            // no aliasing anymore
+            float_t atVid;
+            sdata.get(vid, atVid);
+            sum = inclusive_scan(atVid, sdata);
+            if (vid < BinCount) {
+                histo.atomicAdd(vid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+            }
+        }
+    }
+
+    float_t gatherLuma(
+        NBL_REF_ARG(HistogramAccessor) histo,
+        NBL_REF_ARG(SharedAccessor) sdata
+    ) {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+
+        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+            sdata.set(
+                vid,
+                histo.get(vid & (BinCount - 1))
+            );
+        }
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        uint32_t percentile40, percentile60;
+        sdata.get(BinCount * 0.4, percentile40);
+        sdata.get(BinCount * 0.6, percentile60);
+
+        return (int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
+    }
+
+    float_t sampleCount;
+    float_t2 lumaMinMax;
+};
+
 }
 }
 }

From 83ac633896008509ea16f8d896e4048f98eb888d Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sun, 16 Mar 2025 11:49:58 +0000
Subject: [PATCH 35/36] Update submodule pointer

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 06dad8c118..498ffd21a0 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 06dad8c118027d6ebc8ee04e19340ba643079a63
+Subproject commit 498ffd21a06b9e9c74d20b37860421d17fe7cf49

From 2b5e502d23c14b8cba96cb8a7ff7a4b6d4d5b4e3 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Mon, 17 Mar 2025 16:11:48 +0000
Subject: [PATCH 36/36] Make changes to luma_meter

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 48 ++++++++-----------
 .../builtin/hlsl/tonemapper/operators.hlsl    | 20 +++++---
 2 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index c17a64c437..20af804603 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -36,13 +36,13 @@ struct geom_meter {
         return retval;
     }
 
-    float_t reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
     {
         return workgroup::reduction < plus < float_t >, GroupSize >::
             template __call <SharedAccessor>(value, sdata);
     }
 
-    float_t computeLumaLog2(
+    float_t __computeLumaLog2(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
         float_t2 shiftedCoord
@@ -54,26 +54,26 @@ struct geom_meter {
 
         luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
 
-        return max(log2(luma), log2(lumaMinMax.x));
+        return log2(luma);
     }
 
-    void uploadFloat(
+    void __uploadFloat(
         NBL_REF_ARG(ValueAccessor) val_accessor,
-        uint32_t index,
         float_t val,
         float_t minLog2,
         float_t rangeLog2
     )
     {
         uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
 
         uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
 
-        val_accessor.atomicAdd(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+        val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
     }
 
-    float_t downloadFloat(
+    float_t __downloadFloat(
         NBL_REF_ARG(ValueAccessor) val_accessor,
         uint32_t index,
         float_t minLog2,
@@ -101,17 +101,13 @@ struct geom_meter {
 
         float_t luma = 0.0f;
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
-        luma = computeLumaLog2(window, tex, shiftedCoord);
-        float_t lumaSum = reduction(luma, sdata);
-
-        if (tid == GroupSize - 1) {
-            uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();
-            uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64;
+        float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
+        float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
 
-            uploadFloat(
+        if (tid == 0) {
+            __uploadFloat(
                 val,
-                workgroupIndex,
-                lumaSum,
+                lumaLog2Sum,
                 log2(lumaMinMax.x),
                 log2(lumaMinMax.y / lumaMinMax.x)
             );
@@ -124,7 +120,7 @@ struct geom_meter {
     {
         uint32_t tid = glsl::gl_SubgroupInvocationID();
         float_t luma = glsl::subgroupAdd(
-            downloadFloat(
+            __downloadFloat(
                 val,
                 tid,
                 log2(lumaMinMax.x),
@@ -150,19 +146,18 @@ struct median_meter {
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
     using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(float_t2 lumaMinMax, float_t sampleCount) {
+    static this_t create(float_t2 lumaMinMax) {
         this_t retval;
         retval.lumaMinMax = lumaMinMax;
-        retval.sampleCount = sampleCount;
         return retval;
     }
 
-    int_t inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
+    int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
         return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
             template __call <SharedAccessor>(value, sdata);
     }
 
-    float_t computeLuma(
+    float_t __computeLuma(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
         float_t2 shiftedCoord
@@ -174,7 +169,7 @@ struct median_meter {
         return clamp(luma, lumaMinMax.x, lumaMinMax.y);
     }
 
-    int_t float2Int(
+    int_t __float2Int(
         float_t val,
         float_t minLog2,
         float_t rangeLog2
@@ -185,7 +180,7 @@ struct median_meter {
         return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
     }
 
-    float_t int2Float(
+    float_t __int2Float(
         int_t val,
         float_t minLog2,
         float_t rangeLog2
@@ -216,7 +211,7 @@ struct median_meter {
 
         float_t luma = 0.0f;
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
-        luma = computeLuma(window, tex, shiftedCoord);
+        luma = __computeLuma(window, tex, shiftedCoord);
 
         float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
         uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);
@@ -255,7 +250,7 @@ struct median_meter {
             sdata.get(vid, atVid);
-            sum = inclusive_scan(atVid, sdata);
+            sum = __inclusive_scan(atVid, sdata);
             if (vid < BinCount) {
-                histo.atomicAdd(vid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+                histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
             }
         }
     }
@@ -279,10 +274,9 @@ struct median_meter {
         sdata.get(BinCount * 0.4, percentile40);
         sdata.get(BinCount * 0.6, percentile60);
 
-        return (int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
+        return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
     }
 
-    float_t sampleCount;
     float_t2 lumaMinMax;
 };
 
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index 824e31d68a..46d241c76c 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -19,20 +19,25 @@ template<typename T = float32_t>
 struct Reinhard
 {
 	using float_t = enable_if_t<is_floating_point<T>::value, T>;
-	using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+	using float_t3 = vector<float_t, 3>;
 	using this_t = Reinhard<float_t>;
+
 	static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f)
 	{
 		this_t retval;
+
+		const float_t unit = 1.0;
 		retval.keyAndManualLinearExposure = key * exp2(EV);
-		retval.rcpWhite2 = 1.f / (WhitePointRelToEV * WhitePointRelToEV);
+		retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV);
+
 		return retval;
 	}
 
 	float_t3 operator()(float_t3 rawCIEXYZcolor) {
+		const float_t unit = 1.0;
 		float_t exposureFactors = keyAndManualLinearExposure;
 		float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors;
-		float_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * rcpWhite2) / (1.0 + exposedLuma));
+		float_t colorMultiplier = (exposureFactors * (unit + exposedLuma * rcpWhite2) / (unit + exposedLuma));
 		return rawCIEXYZcolor * colorMultiplier;
 	}
 
@@ -44,8 +49,8 @@ template<typename T = float32_t>
 struct ACES
 {
 	using float_t = enable_if_t<is_floating_point<T>::value, T>;
-	using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
-	using float_t3x3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3x3, float16_t3x3>::type;
+	using float_t3 = vector<float_t, 3>;
+	using float_t3x3 = matrix<float_t, 3, 3>;
 
 	using this_t = ACES<T>;
 	static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) {
@@ -57,9 +62,10 @@ struct ACES
 	}
 
 	float_t3 operator()(float_t3 rawCIEXYZcolor) {
+		const float_t unit = 1.0;
 		float_t3 tonemapped = rawCIEXYZcolor;
-		if (tonemapped.y > 1.175494351e-38)
-			tonemapped *= exp2(log2(tonemapped.y) * (gamma - 1.0) + (exposure) * gamma);
+		if (tonemapped.y > bit_cast<float_t>(numeric_limits<float_t>::min))
+			tonemapped *= exp2(log2(tonemapped.y) * (gamma - unit) + (exposure) * gamma);
 
 		// XYZ => RRT_SAT
 		// this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t)