Merge pull request #257 from Tom94/perf

Various performance improvements
Tom94 · Feb 24, 2025 · 8a89453 · 8a89453
2 parents 5e742b1 + 2a1bdb1
commit 8a89453
Show file tree

Hide file tree

Showing 10 changed files with 196 additions and 121 deletions.
diff --git a/dependencies/CMakeLists.txt b/dependencies/CMakeLists.txt
@@ -120,6 +120,9 @@ if (TEV_USE_LIBHEIF)
 
     add_library(lcms2 STATIC ${LCMS_SRCS} ${LCMS_HDRS})
     set_target_properties(lcms2 PROPERTIES PUBLIC_HEADER "${LCMS_HDRS}")
+
+    # We don't use the SSE2 components of CMS in tev; disable to simplify ARM compilation
+    target_compile_definitions(lcms2 PRIVATE -DCMS_DONT_USE_SSE2=1)
     target_include_directories(lcms2 PUBLIC
         "${CMAKE_CURRENT_SOURCE_DIR}/Little-CMS/include"
         # "${CMAKE_CURRENT_SOURCE_DIR}/Little-CMS/plugins/fast_float/include"

diff --git a/include/tev/Channel.h b/include/tev/Channel.h
@@ -25,45 +25,41 @@ class Channel {
 
     static nanogui::Color color(std::string fullChannel);
 
-    Channel(const std::string& name, const nanogui::Vector2i& size);
+    Channel(
+        const std::string& name,
+        const nanogui::Vector2i& size,
+        std::shared_ptr<std::vector<float>> data = nullptr, // existing buffer to view; when null the constructor allocates its own
+        size_t dataOffset = 0, // index of this channel's first float inside `data`
+        size_t dataStride = 1 // number of floats between consecutive pixels of this channel
+    );
 
     const std::string& name() const { return mName; }
 
-    const std::vector<float>& data() const { return mData; }
-
-    float eval(size_t index) const {
-        if (index >= mData.size()) {
-            return 0;
-        }
-
-        return mData[index];
-    }
-
     float eval(nanogui::Vector2i index) const {
         if (index.x() < 0 || index.x() >= mSize.x() || index.y() < 0 || index.y() >= mSize.y()) {
             return 0;
         }
 
-        return mData[index.x() + index.y() * (size_t)mSize.x()];
+        return at(index.x() + (size_t)index.y() * (size_t)mSize.x());
     }
 
-    float& at(size_t index) { return mData[index]; }
-
-    float at(size_t index) const { return mData[index]; }
-
     float& at(nanogui::Vector2i index) { return at(index.x() + index.y() * (size_t)mSize.x()); }
 
     float at(nanogui::Vector2i index) const { return at(index.x() + index.y() * (size_t)mSize.x()); }
 
-    size_t numPixels() const { return mData.size(); }
+    size_t numPixels() const { return (size_t)mSize.x() * mSize.y(); }
 
     const nanogui::Vector2i& size() const { return mSize; }
 
     std::tuple<float, float, float> minMaxMean() const {
         float min = std::numeric_limits<float>::infinity();
         float max = -std::numeric_limits<float>::infinity();
         float mean = 0;
-        for (float f : mData) {
+
+        const size_t nPixels = numPixels();
+        for (size_t i = 0; i < nPixels; ++i) {
+            const float f = at(i);
+
             mean += f;
             if (f < min) {
                 min = f;
@@ -74,21 +70,41 @@ class Channel {
             }
         }
 
-        return {min, max, mean / numPixels()};
+        return {min, max, mean / nPixels};
     }
 
     Task<void> divideByAsync(const Channel& other, int priority);
 
     Task<void> multiplyWithAsync(const Channel& other, int priority);
 
-    void setZero() { memset(mData.data(), 0, mData.size() * sizeof(float)); }
+    // Sets every pixel of this channel to zero.
+    void setZero() {
+        const size_t pixelCount = numPixels();
+        if (mDataStride != 1) {
+            // Strided layout: write pixel by pixel through at(), which applies the stride.
+            for (size_t px = 0; px < pixelCount; ++px) { at(px) = 0.0f; }
+            return;
+        }
+        memset(data(), 0, pixelCount * sizeof(float)); // stride 1: the pixels are contiguous
+    }
 
     void updateTile(int x, int y, int width, int height, const std::vector<float>& newData);
 
+    float& at(size_t index) { return data()[index * mDataStride]; } // mutable access to pixel `index`; no bounds check
+
+    float at(size_t index) const { return data()[index * mDataStride]; } // value of pixel `index`; no bounds check
+
+    float* data() const { return mData->data() + mDataOffset; } // pointer to this channel's first float inside mData
+
+    size_t offset() const { return mDataOffset; } // index of this channel's first float inside mData
+    size_t stride() const { return mDataStride; } // number of floats between consecutive pixels of this channel
+
 private:
     std::string mName;
     nanogui::Vector2i mSize;
-    std::vector<float> mData;
+    std::shared_ptr<std::vector<float>> mData;
+    size_t mDataOffset;
+    size_t mDataStride;
 };
 
 } // namespace tev
diff --git a/include/tev/Common.h b/include/tev/Common.h
@@ -46,7 +46,7 @@
 
 #define TEV_ASSERT(cond, description, ...) \
     if (UNLIKELY(!(cond)))                 \
-        throw std::runtime_error{fmt::format(description, ##__VA_ARGS__)};
+        throw std::runtime_error{fmt::format(description, ##__VA_ARGS__)}
 
 #ifndef TEV_VERSION
 #   define TEV_VERSION "undefined"

diff --git a/include/tev/Image.h b/include/tev/Image.h
@@ -111,6 +111,16 @@ class Image {
     bool hasChannel(const std::string& channelName) const { return mData.hasChannel(channelName); }
 
     const Channel* channel(const std::string& channelName) const { return mData.channel(channelName); }
+    // Resolves each name through channel(); result[i] is whatever channel() returns for channelNames[i].
+    std::vector<const Channel*> channels(const std::vector<std::string>& channelNames) const {
+        std::vector<const Channel*> result(channelNames.size()); // sized once up front; no regrowth while filling
+        for (size_t i = 0; i < channelNames.size(); ++i) {
+            result[i] = channel(channelNames[i]);
+        }
+        return result;
+    }
+
+    bool isInterleavedRgba(const std::vector<std::string>& channelNames) const;
 
     nanogui::Texture* texture(const std::string& channelGroupName);
     nanogui::Texture* texture(const std::vector<std::string>& channelNames);

diff --git a/src/Channel.cpp b/src/Channel.cpp
@@ -4,7 +4,7 @@
 #include <tev/Channel.h>
 #include <tev/ThreadPool.h>
 
-#include <numeric>
+#include <memory>
 
 using namespace nanogui;
 using namespace std;
@@ -42,8 +42,17 @@ Color Channel::color(string channel) {
     return Color(1.0f, 1.0f);
 }
 
-Channel::Channel(const std::string& name, const nanogui::Vector2i& size) : mName{name}, mSize{size} {
-    mData.resize((size_t)mSize.x() * mSize.y());
+// Views `data` through dataOffset/dataStride when a buffer is supplied. Otherwise allocates a zero-filled
+// buffer of size.x() * size.y() floats and uses offset 0 / stride 1, ignoring the passed offset and stride.
+Channel::Channel(const string& name, const nanogui::Vector2i& size, shared_ptr<vector<float>> data, size_t dataOffset, size_t dataStride) :
+    mName{name}, mSize{size} {
+    const bool ownsFreshBuffer = !data;
+    if (ownsFreshBuffer) {
+        data = make_shared<vector<float>>((size_t)size.x() * size.y());
+    }
+    mData = data;
+    mDataOffset = ownsFreshBuffer ? 0 : dataOffset;
+    mDataStride = ownsFreshBuffer ? 1 : dataStride;
+}
 
 Task<void> Channel::divideByAsync(const Channel& other, int priority) {

diff --git a/src/Image.cpp b/src/Image.cpp
@@ -232,6 +232,26 @@ string Image::shortName() const {
     return result;
 }
 
+// Returns true if the first named channel exists and every channel i up to the first missing one
+// (at most 4 are examined) has stride 4 and data starting i floats after the first channel's data.
+bool Image::isInterleavedRgba(const vector<string>& channelNames) const {
+    const float* interleavedData = nullptr;
+    // Bound the loop by the number of names as well: indexing past the end of the vector is undefined.
+    for (size_t i = 0; i < 4 && i < channelNames.size(); ++i) {
+        const auto* chan = channel(channelNames[i]);
+        if (!chan) {
+            break;
+        }
+        if (i == 0) {
+            interleavedData = chan->data();
+        }
+        if (interleavedData != chan->data() - i || chan->stride() != 4) {
+            return false;
+        }
+    }
+    return interleavedData != nullptr;
+}
+
 Texture* Image::texture(const string& channelGroupName) { return texture(channelsInGroup(channelGroupName)); }
 
 Texture* Image::texture(const vector<string>& channelNames) {
@@ -264,33 +284,37 @@ Texture* Image::texture(const vector<string>& channelNames) {
     );
     auto& texture = mTextures.at(lookup).nanoguiTexture;
 
-    auto numPixels = this->numPixels();
-    vector<float> data(numPixels * 4);
+    // Check if channel layout is already interleaved. If yes, can directly copy onto GPU!
+    if (isInterleavedRgba(channelNames)) {
+        texture->upload((uint8_t*)channel(channelNames[0])->data());
+    } else {
+        auto numPixels = this->numPixels();
+        vector<float> data = vector<float>(numPixels * 4);
 
-    vector<Task<void>> tasks;
-    for (size_t i = 0; i < 4; ++i) {
-        float defaultVal = i == 3 ? 1 : 0;
-        if (i < channelNames.size()) {
-            const auto* chan = channel(channelNames[i]);
-            if (!chan) {
-                tasks.emplace_back(ThreadPool::global().parallelForAsync<size_t>(
-                    0, numPixels, [&data, defaultVal, i](size_t j) { data[j * 4 + i] = defaultVal; }, std::numeric_limits<int>::max()
-                ));
+        vector<Task<void>> tasks;
+        for (size_t i = 0; i < 4; ++i) {
+            float defaultVal = i == 3 ? 1 : 0;
+            if (i < channelNames.size()) {
+                const auto* chan = channel(channelNames[i]);
+                if (!chan) {
+                    tasks.emplace_back(ThreadPool::global().parallelForAsync<size_t>(
+                        0, numPixels, [&data, defaultVal, i](size_t j) { data[j * 4 + i] = defaultVal; }, std::numeric_limits<int>::max()
+                    ));
+                } else {
+                    tasks.emplace_back(ThreadPool::global().parallelForAsync<size_t>(
+                        0, numPixels, [chan, &data, i](size_t j) { data[j * 4 + i] = chan->at(j); }, std::numeric_limits<int>::max()
+                    ));
+                }
             } else {
-                const auto& channelData = chan->data();
                 tasks.emplace_back(ThreadPool::global().parallelForAsync<size_t>(
-                    0, numPixels, [&channelData, &data, i](size_t j) { data[j * 4 + i] = channelData[j]; }, std::numeric_limits<int>::max()
+                    0, numPixels, [&data, defaultVal, i](size_t j) { data[j * 4 + i] = defaultVal; }, std::numeric_limits<int>::max()
                 ));
             }
-        } else {
-            tasks.emplace_back(ThreadPool::global().parallelForAsync<size_t>(
-                0, numPixels, [&data, defaultVal, i](size_t j) { data[j * 4 + i] = defaultVal; }, std::numeric_limits<int>::max()
-            ));
         }
+        waitAll(tasks);
+        texture->upload((uint8_t*)data.data());
     }
-    waitAll(tasks);
 
-    texture->upload((uint8_t*)data.data());
     texture->generate_mipmap();
     return texture.get();
 }