From 52ebdd42c6ee2f7b22dc2c7a4f23d50a2940b4f5 Mon Sep 17 00:00:00 2001
From: Liam <byteslice@airmail.cc>
Date: Tue, 5 Apr 2022 22:05:23 -0400
Subject: [PATCH] OpenGL: fix S8D24 to ABGR8 conversions

---
 src/video_core/host_shaders/CMakeLists.txt    |  1 +
 .../host_shaders/opengl_convert_s8d24.comp    | 18 ++++++++++++++
 .../renderer_opengl/gl_texture_cache.cpp      | 13 ++++++++--
 .../renderer_opengl/gl_texture_cache.h        |  3 ++-
 .../renderer_opengl/util_shaders.cpp          | 24 ++++++++++++++++++-
 src/video_core/renderer_opengl/util_shaders.h |  3 +++
 6 files changed, 58 insertions(+), 4 deletions(-)
 create mode 100644 src/video_core/host_shaders/opengl_convert_s8d24.comp

diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index af05d47d13..190fc6aeae 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -18,6 +18,7 @@ set(SHADER_FILES
     full_screen_triangle.vert
     fxaa.frag
     fxaa.vert
+    opengl_convert_s8d24.comp
     opengl_copy_bc4.comp
     opengl_present.frag
     opengl_present.vert
diff --git a/src/video_core/host_shaders/opengl_convert_s8d24.comp b/src/video_core/host_shaders/opengl_convert_s8d24.comp
new file mode 100644
index 0000000000..83e1ab1764
--- /dev/null
+++ b/src/video_core/host_shaders/opengl_convert_s8d24.comp
@@ -0,0 +1,18 @@
+// Copyright 2022 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430 core
+
+layout(local_size_x = 16, local_size_y = 8) in;
+
+layout(binding = 0, rgba8ui) restrict uniform uimage2D destination; // Read and written in place
+layout(location = 0) uniform uvec3 size; // Exclusive upper bound on gl_GlobalInvocationID
+// Rewrites each texel below `size` with its components rotated from xyzw to wxyz order.
+void main() {
+    if (any(greaterThanEqual(gl_GlobalInvocationID, size))) { // Skip invocations past the extent
+        return;
+    }
+    uvec4 components = imageLoad(destination, ivec2(gl_GlobalInvocationID.xy));
+    imageStore(destination, ivec2(gl_GlobalInvocationID.xy), components.wxyz);
+}
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 8f9a65beb4..d120763580 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -409,8 +409,8 @@ ImageBufferMap::~ImageBufferMap() {
 
 TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& program_manager,
                                          StateTracker& state_tracker_)
-    : device{device_}, state_tracker{state_tracker_},
-      util_shaders(program_manager), resolution{Settings::values.resolution_info} {
+    : device{device_}, state_tracker{state_tracker_}, util_shaders(program_manager),
+      format_conversion_pass{util_shaders}, resolution{Settings::values.resolution_info} {
     static constexpr std::array TARGETS{GL_TEXTURE_1D_ARRAY, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D};
     for (size_t i = 0; i < TARGETS.size(); ++i) {
         const GLenum target = TARGETS[i];
@@ -1325,6 +1325,9 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
 
 Framebuffer::~Framebuffer() = default;
 
+FormatConversionPass::FormatConversionPass(UtilShaders& util_shaders_)
+    : util_shaders{util_shaders_} {} // Bound by reference; ConvertImage calls into it later
+
 void FormatConversionPass::ConvertImage(Image& dst_image, Image& src_image,
                                         std::span<const VideoCommon::ImageCopy> copies) {
     const GLenum dst_target = ImageTarget(dst_image.info);
@@ -1357,6 +1360,12 @@ void FormatConversionPass::ConvertImage(Image& dst_image, Image& src_image,
                             dst_origin.z, region.width, region.height, region.depth,
                             dst_image.GlFormat(), dst_image.GlType(), nullptr);
     }
+
+    // Swap component order of S8D24 to ABGR8 reinterprets
+    if (src_image.info.format == PixelFormat::D24_UNORM_S8_UINT &&
+        dst_image.info.format == PixelFormat::A8B8G8R8_UNORM) {
+        util_shaders.ConvertS8D24(dst_image, copies);
+    }
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 53088b66ee..672fa8dde3 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -55,13 +55,14 @@ struct FormatProperties {
 
 class FormatConversionPass {
 public:
-    FormatConversionPass() = default;
+    explicit FormatConversionPass(UtilShaders& util_shaders);
     ~FormatConversionPass() = default;
 
     void ConvertImage(Image& dst_image, Image& src_image,
                       std::span<const VideoCommon::ImageCopy> copies);
 
 private:
+    UtilShaders& util_shaders; // Non-owning; supplies ConvertS8D24 for S8D24 -> ABGR8 copies
     OGLBuffer intermediate_pbo;
     size_t pbo_size{};
 };
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index 897c380b3b..04c482a093 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -13,6 +13,7 @@
 #include "video_core/host_shaders/astc_decoder_comp.h"
 #include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h"
 #include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h"
+#include "video_core/host_shaders/opengl_convert_s8d24_comp.h"
 #include "video_core/host_shaders/opengl_copy_bc4_comp.h"
 #include "video_core/host_shaders/pitch_unswizzle_comp.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
@@ -50,7 +51,8 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
       block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)),
       block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)),
       pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
-      copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
+      copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)),
+      convert_s8d24_program(MakeProgram(OPENGL_CONVERT_S8D24_COMP)) {
     const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
     swizzle_table_buffer.Create();
     glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
@@ -248,6 +250,26 @@ void UtilShaders::CopyBC4(Image& dst_image, Image& src_image, std::span<const Im
     program_manager.RestoreGuestCompute();
 }
 
+void UtilShaders::ConvertS8D24(Image& dst_image, std::span<const ImageCopy> copies) {
+    static constexpr GLuint BINDING_DESTINATION = 0; // Image unit of the shader's "destination"
+    static constexpr GLuint LOC_SIZE = 0;            // Uniform location of the shader's "size"
+    // Swizzles dst_image in place with the S8D24 conversion shader, one dispatch per copy
+    program_manager.BindComputeProgram(convert_s8d24_program.handle);
+    for (const ImageCopy& copy : copies) {
+        ASSERT(copy.src_subresource.base_layer == 0);
+        ASSERT(copy.src_subresource.num_layers == 1);
+        ASSERT(copy.dst_subresource.base_layer == 0);
+        ASSERT(copy.dst_subresource.num_layers == 1);
+        // NOTE(review): only the extent is uploaded; copy.dst_offset is never applied - confirm
+        glUniform3ui(LOC_SIZE, copy.extent.width, copy.extent.height, copy.extent.depth);
+        glBindImageTexture(BINDING_DESTINATION, dst_image.StorageHandle(),
+                           copy.dst_subresource.base_level, GL_TRUE, 0, GL_READ_WRITE, GL_RGBA8UI);
+        glDispatchCompute(Common::DivCeil(copy.extent.width, 16u), // 16x8: shader local size
+                          Common::DivCeil(copy.extent.height, 8u), copy.extent.depth);
+    }
+    program_manager.RestoreGuestCompute();
+}
+
 GLenum StoreFormat(u32 bytes_per_block) {
     switch (bytes_per_block) {
     case 1:
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index 5de95ea7ac..5c132e67f8 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -39,6 +39,8 @@ public:
     void CopyBC4(Image& dst_image, Image& src_image,
                  std::span<const VideoCommon::ImageCopy> copies);
 
+    void ConvertS8D24(Image& dst_image, std::span<const VideoCommon::ImageCopy> copies);
+
 private:
     ProgramManager& program_manager;
 
@@ -49,6 +51,7 @@ private:
     OGLProgram block_linear_unswizzle_3d_program;
     OGLProgram pitch_unswizzle_program;
     OGLProgram copy_bc4_program;
+    OGLProgram convert_s8d24_program;
 };
 
 GLenum StoreFormat(u32 bytes_per_block);