[video_core] Implement GPU-accelerated texture unswizzling and optimize sparse texture handling (#3246)

- [Added] a new compute shader to handle block-linear unswizzling on the GPU, reducing CPU overhead during texture uploads
- [Implemented] BlockLinearUnswizzle3DPass to take advantage of the new compute shader, unimplemented for OpenGL
- [Implemented] texture streaming and queue system for large sparse textures to prevent hitches
- [Implemented] aggressive garbage collection system to eject large sparse textures to save on memory (Unused)
- [Added] user settings to adjust the streaming unswizzle system for low-end machines
- [Improved] the ASTC GPU decoding system slightly

Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-authored-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: DraVee <dravee@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3246
Reviewed-by: Maufeat <sahyno1996@gmail.com>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: DraVee <dravee@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: Forrest Keller <forrestmarkx@outlook.com>
Co-committed-by: Forrest Keller <forrestmarkx@outlook.com>
This commit is contained in:
Forrest Keller 2026-01-13 19:18:08 +01:00 committed by crueter
parent f544004b5d
commit ecd01e13fd
No known key found for this signature in database
GPG Key ID: 425ACD2D4830EBC6
20 changed files with 1076 additions and 83 deletions

View File

@ -47,6 +47,9 @@ enum class IntSetting(override val key: String) : AbstractIntSetting {
FAST_CPU_TIME("fast_cpu_time"),
CPU_TICKS("cpu_ticks"),
FAST_GPU_TIME("fast_gpu_time"),
GPU_UNZWIZZLE_MAXTEXTURE_SIZE("gpu_unzwizzle_maxtexture_size"),
GPU_UNZWIZZLE_STREAM_SIZE("gpu_unzwizzle_stream_size"),
GPU_UNZWIZZLE_CHUNK_SIZE("gpu_unzwizzle_chunk_size"),
BAT_TEMPERATURE_UNIT("bat_temperature_unit"),
CABINET_APPLET("cabinet_applet_mode"),
CONTROLLER_APPLET("controller_applet_mode"),

View File

@ -655,6 +655,33 @@ abstract class SettingsItem(
valuesId = R.array.gpuValues
)
)
put(
SingleChoiceSetting(
IntSetting.GPU_UNZWIZZLE_MAXTEXTURE_SIZE,
titleId = R.string.gpu_unzwizzle_maxtexture_size,
descriptionId = R.string.gpu_unzwizzle_maxtexture_size_description,
choicesId = R.array.gpuTextureSizeSwizzleEntries,
valuesId = R.array.gpuTextureSizeSwizzleValues
)
)
put(
SingleChoiceSetting(
IntSetting.GPU_UNZWIZZLE_STREAM_SIZE,
titleId = R.string.gpu_unzwizzle_stream_size,
descriptionId = R.string.gpu_unzwizzle_stream_size_description,
choicesId = R.array.gpuSwizzleEntries,
valuesId = R.array.gpuSwizzleValues
)
)
put(
SingleChoiceSetting(
IntSetting.GPU_UNZWIZZLE_CHUNK_SIZE,
titleId = R.string.gpu_unzwizzle_chunk_size,
descriptionId = R.string.gpu_unzwizzle_chunk_size_description,
choicesId = R.array.gpuSwizzleChunkEntries,
valuesId = R.array.gpuSwizzleChunkValues
)
)
put(
SingleChoiceSetting(
IntSetting.FAST_CPU_TIME,

View File

@ -280,6 +280,9 @@ class SettingsFragmentPresenter(
add(IntSetting.FAST_GPU_TIME.key)
add(BooleanSetting.SKIP_CPU_INNER_INVALIDATION.key)
add(BooleanSetting.RENDERER_ASYNCHRONOUS_SHADERS.key)
add(IntSetting.GPU_UNZWIZZLE_MAXTEXTURE_SIZE.key)
add(IntSetting.GPU_UNZWIZZLE_STREAM_SIZE.key)
add(IntSetting.GPU_UNZWIZZLE_CHUNK_SIZE.key)
add(HeaderSetting(R.string.extensions))

View File

@ -564,6 +564,54 @@
<item>2</item>
</integer-array>
<string-array name="gpuTextureSizeSwizzleEntries">
<item>@string/gpu_texturesizeswizzle_verysmall</item>
<item>@string/gpu_texturesizeswizzle_small</item>
<item>@string/gpu_texturesizeswizzle_normal</item>
<item>@string/gpu_texturesizeswizzle_large</item>
<item>@string/gpu_texturesizeswizzle_verylarge</item>
</string-array>
<integer-array name="gpuTextureSizeSwizzleValues">
<item>0</item>
<item>1</item>
<item>2</item>
<item>3</item>
<item>4</item>
</integer-array>
<string-array name="gpuSwizzleEntries">
<item>@string/gpu_swizzle_verylow</item>
<item>@string/gpu_swizzle_low</item>
<item>@string/gpu_swizzle_normal</item>
<item>@string/gpu_swizzle_medium</item>
<item>@string/gpu_swizzle_high</item>
</string-array>
<integer-array name="gpuSwizzleValues">
<item>0</item>
<item>1</item>
<item>2</item>
<item>3</item>
<item>4</item>
</integer-array>
<string-array name="gpuSwizzleChunkEntries">
<item>@string/gpu_swizzlechunk_verylow</item>
<item>@string/gpu_swizzlechunk_low</item>
<item>@string/gpu_swizzlechunk_normal</item>
<item>@string/gpu_swizzlechunk_medium</item>
<item>@string/gpu_swizzlechunk_high</item>
</string-array>
<integer-array name="gpuSwizzleChunkValues">
<item>0</item>
<item>1</item>
<item>2</item>
<item>3</item>
<item>4</item>
</integer-array>
<string-array name="temperatureUnitEntries">
<item>@string/temperature_celsius</item>
<item>@string/temperature_fahrenheit</item>

View File

@ -504,6 +504,13 @@
<string name="skip_cpu_inner_invalidation_description">Skips certain CPU-side cache invalidations during memory updates, reducing CPU usage and improving its performance. This may cause glitches or crashes in some games.</string>
<string name="renderer_asynchronous_shaders">Use asynchronous shaders</string>
<string name="renderer_asynchronous_shaders_description">Compiles shaders asynchronously. This may reduce stutters but may also introduce glitches.</string>
<string name="gpu_unzwizzle_maxtexture_size">GPU Unswizzle Max Texture Size</string>
<string name="gpu_unzwizzle_maxtexture_size_description">Sets the maximum size (MB) for GPU-based texture unswizzling. While the GPU is faster for medium and large textures, the CPU may be more efficient for very small ones. Adjust this to find the balance between GPU acceleration and CPU overhead.</string>
<string name="gpu_unzwizzle_stream_size">GPU Unswizzle Stream Size</string>
<string name="gpu_unzwizzle_stream_size_description">Sets the data limit per frame for unswizzling large textures. Higher values speed up texture loading at the cost of higher frame latency; lower values reduce GPU overhead but may cause visible texture pop-in.</string>
<string name="gpu_unzwizzle_chunk_size">GPU Unswizzle Chunk Size</string>
<string name="gpu_unzwizzle_chunk_size_description">Defines the number of depth slices processed per batch for 3D textures. Increasing this improves throughput efficiency on powerful GPUs but may cause stuttering or driver timeouts on weaker hardware.</string>
<string name="extensions">Extensions</string>
@ -926,6 +933,27 @@
<string name="fast_gpu_medium">Medium (256)</string>
<string name="fast_gpu_high">High (512)</string>
<!-- GPU swizzle texture size -->
<string name="gpu_texturesizeswizzle_verysmall">Very Small (16 MB)</string>
<string name="gpu_texturesizeswizzle_small">Small (32 MB)</string>
<string name="gpu_texturesizeswizzle_normal">Normal (128 MB)</string>
<string name="gpu_texturesizeswizzle_large">Large (256 MB)</string>
<string name="gpu_texturesizeswizzle_verylarge">Very Large (512 MB)</string>
<!-- GPU swizzle streams -->
<string name="gpu_swizzle_verylow">Very Low (4 MB)</string>
<string name="gpu_swizzle_low">Low (8 MB)</string>
<string name="gpu_swizzle_normal">Normal (16 MB)</string>
<string name="gpu_swizzle_medium">Medium (32 MB)</string>
<string name="gpu_swizzle_high">High (64 MB)</string>
<!-- GPU swizzle chunks -->
<string name="gpu_swizzlechunk_verylow">Very Low (32)</string>
<string name="gpu_swizzlechunk_low">Low (64)</string>
<string name="gpu_swizzlechunk_normal">Normal (128)</string>
<string name="gpu_swizzlechunk_medium">Medium (256)</string>
<string name="gpu_swizzlechunk_high">High (512)</string>
<!-- Temperature Units -->
<string name="temperature_celsius">Celsius</string>
<string name="temperature_fahrenheit">Fahrenheit</string>

View File

@ -513,6 +513,24 @@ struct Values {
SwitchableSetting<bool> use_asynchronous_shaders{linkage, false, "use_asynchronous_shaders",
Category::RendererHacks};
SwitchableSetting<GpuUnswizzleSize> gpu_unzwizzle_texture_size{linkage,
GpuUnswizzleSize::Large,
"gpu_unzwizzle_texture_size",
Category::RendererHacks,
Specialization::Default};
SwitchableSetting<GpuUnswizzle> gpu_unzwizzle_stream_size{linkage,
GpuUnswizzle::Medium,
"gpu_unzwizzle_stream_size",
Category::RendererHacks,
Specialization::Default};
SwitchableSetting<GpuUnswizzleChunk> gpu_unzwizzle_chunk_size{linkage,
GpuUnswizzleChunk::Medium,
"gpu_unzwizzle_chunk_size",
Category::RendererHacks,
Specialization::Default};
SwitchableSetting<ExtendedDynamicState> dyna_state{linkage,
#if defined (_WIN32)
ExtendedDynamicState::EDS3,

View File

@ -150,6 +150,9 @@ ENUM(ConsoleMode, Handheld, Docked);
ENUM(AppletMode, HLE, LLE);
ENUM(SpirvOptimizeMode, Never, OnLoad, Always);
ENUM(GpuOverclock, Normal, Medium, High)
ENUM(GpuUnswizzleSize, VerySmall, Small, Normal, Large, VeryLarge)
ENUM(GpuUnswizzle, VeryLow, Low, Normal, Medium, High)
ENUM(GpuUnswizzleChunk, VeryLow, Low, Normal, Medium, High)
ENUM(TemperatureUnits, Celsius, Fahrenheit)
ENUM(ExtendedDynamicState, Disabled, EDS1, EDS2, EDS3);

View File

@ -288,6 +288,22 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QObject* parent)
tr("Fast GPU Time"),
tr("Overclocks the emulated GPU to increase dynamic resolution and render "
"distance.\nUse 256 for maximal performance and 512 for maximal graphics fidelity."));
INSERT(Settings,
gpu_unzwizzle_texture_size,
tr("GPU Unswizzle Max Texture Size"),
tr("Sets the maximum size (MiB) for GPU-based texture unswizzling.\n"
"While the GPU is faster for medium and large textures, the CPU may be more efficient for very small ones.\n"
"Adjust this to find the balance between GPU acceleration and CPU overhead."));
INSERT(Settings,
gpu_unzwizzle_stream_size,
tr("GPU Unswizzle Stream Size"),
tr("Sets the maximum amount of texture data (in MiB) processed per frame.\n"
"Higher values can reduce stutter during texture loading but may impact frame consistency."));
INSERT(Settings,
gpu_unzwizzle_chunk_size,
tr("GPU Unswizzle Chunk Size"),
tr("Determines the number of depth slices processed in a single dispatch.\n"
"Increasing this can improve throughput on high-end GPUs but may cause TDR or driver timeouts on weaker hardware."));
INSERT(Settings,
use_vulkan_driver_pipeline_cache,
@ -719,6 +735,30 @@ std::unique_ptr<ComboboxTranslationMap> ComboboxEnumeration(QObject* parent)
PAIR(GpuOverclock, Medium, tr("Medium (256)")),
PAIR(GpuOverclock, High, tr("High (512)")),
}});
translations->insert({Settings::EnumMetadata<Settings::GpuUnswizzleSize>::Index(),
{
PAIR(GpuUnswizzleSize, VerySmall, tr("Very Small (16 MB)")),
PAIR(GpuUnswizzleSize, Small, tr("Small (32 MB)")),
PAIR(GpuUnswizzleSize, Normal, tr("Normal (128 MB)")),
PAIR(GpuUnswizzleSize, Large, tr("Large (256 MB)")),
PAIR(GpuUnswizzleSize, VeryLarge, tr("Very Large (512 MB)")),
}});
translations->insert({Settings::EnumMetadata<Settings::GpuUnswizzle>::Index(),
{
PAIR(GpuUnswizzle, VeryLow, tr("Very Low (4 MB)")),
PAIR(GpuUnswizzle, Low, tr("Low (8 MB)")),
PAIR(GpuUnswizzle, Normal, tr("Normal (16 MB)")),
PAIR(GpuUnswizzle, Medium, tr("Medium (32 MB)")),
PAIR(GpuUnswizzle, High, tr("High (64 MB)")),
}});
translations->insert({Settings::EnumMetadata<Settings::GpuUnswizzleChunk>::Index(),
{
PAIR(GpuUnswizzleChunk, VeryLow, tr("Very Low (32)")),
PAIR(GpuUnswizzleChunk, Low, tr("Low (64)")),
PAIR(GpuUnswizzleChunk, Normal, tr("Normal (128)")),
PAIR(GpuUnswizzleChunk, Medium, tr("Medium (256)")),
PAIR(GpuUnswizzleChunk, High, tr("High (512)")),
}});
translations->insert({Settings::EnumMetadata<Settings::ExtendedDynamicState>::Index(),
{

View File

@ -18,6 +18,7 @@ set(SHADER_FILES
blit_color_float.frag
block_linear_unswizzle_2d.comp
block_linear_unswizzle_3d.comp
block_linear_unswizzle_3d_bcn.comp
convert_abgr8_srgb_to_d24s8.frag
convert_abgr8_to_d24s8.frag
convert_abgr8_to_d32f.frag

View File

@ -727,70 +727,35 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, ui
}
uint UnquantizeTexelWeight(EncodingData val) {
const uint encoding = Encoding(val);
const uint bitlen = NumBits(val);
const uint bitval = BitValue(val);
const uint A = ReplicateBitTo7((bitval & 1));
uint B = 0, C = 0, D = 0;
uint result = 0;
const uint bitlen_0_results[5] = {0, 16, 32, 48, 64};
switch (encoding) {
case JUST_BITS:
return FastReplicateTo6(bitval, bitlen);
case TRIT: {
uint encoding = Encoding(val), bitlen = NumBits(val), bitval = BitValue(val);
if (encoding == JUST_BITS) {
return (bitlen >= 1 && bitlen <= 5)
? uint(floor(0.5f + float(bitval) * 64.0f / float((1 << bitlen) - 1)))
: FastReplicateTo6(bitval, bitlen);
} else if (encoding == TRIT || encoding == QUINT) {
uint B = 0, C = 0, D = 0;
uint b_mask = (0x3100 >> (bitlen * 4)) & 0xf;
uint b = (bitval >> 1) & b_mask;
D = QuintTritValue(val);
switch (bitlen) {
case 0:
return bitlen_0_results[D * 2];
case 1: {
C = 50;
break;
if (encoding == TRIT) {
switch (bitlen) {
case 0: return D * 32; //0,32,64
case 1: C = 50; break;
case 2: C = 23; B = (b << 6) | (b << 2) | b; break;
case 3: C = 11; B = (b << 5) | b; break;
}
} else if (encoding == QUINT) {
switch (bitlen) {
case 0: return D * 16; //0, 16, 32, 48, 64
case 1: C = 28; break;
case 2: C = 13; B = (b << 6) | (b << 1); break;
}
}
case 2: {
C = 23;
const uint b = (bitval >> 1) & 1;
B = (b << 6) | (b << 2) | b;
break;
}
case 3: {
C = 11;
const uint cb = (bitval >> 1) & 3;
B = (cb << 5) | cb;
break;
}
default:
break;
}
break;
uint A = ReplicateBitTo7(bitval & 1);
uint res = (A & 0x20) | (((D * C + B) ^ A) >> 2);
return res + (res > 32 ? 1 : 0);
}
case QUINT: {
D = QuintTritValue(val);
switch (bitlen) {
case 0:
return bitlen_0_results[D];
case 1: {
C = 28;
break;
}
case 2: {
C = 13;
const uint b = (bitval >> 1) & 1;
B = (b << 6) | (b << 1);
break;
}
}
break;
}
}
if (encoding != JUST_BITS && bitlen > 0) {
result = D * C + B;
result ^= A;
result = (A & 0x20) | (result >> 2);
}
if (result > 32) {
result += 1;
}
return result;
return 0;
}
void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) {
@ -1159,10 +1124,11 @@ void DecompressBlock(ivec3 coord) {
}
uint SwizzleOffset(uvec2 pos) {
const uint x = pos.x;
const uint y = pos.y;
return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16);
return ((pos.x & 32u) << 3u) |
((pos.y & 6u) << 5u) |
((pos.x & 16u) << 1u) |
((pos.y & 1u) << 4u) |
(pos.x & 15u);
}
void main() {

View File

@ -0,0 +1,160 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 2
#else
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 0
#endif
// --- Push Constants / Uniforms ---
#ifdef VULKAN
layout(push_constant) uniform PushConstants {
uvec3 blocks_dim; // Offset 0
uint bytes_per_block_log2; // Offset 12
uvec3 origin; // Offset 16
uint slice_size; // Offset 28
uint block_size; // Offset 32
uint x_shift; // Offset 36
uint block_height; // Offset 40
uint block_height_mask; // Offset 44
uint block_depth; // Offset 48
uint block_depth_mask; // Offset 52
int _pad; // Offset 56
ivec3 destination; // Offset 60
} pc;
#else
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint slice_size;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
UNIFORM(8) uint block_depth;
UNIFORM(9) uint block_depth_mask;
UNIFORM(10) uvec3 blocks_dim;
END_PUSH_CONSTANTS
#define pc // Map pc prefix to nothing for OpenGL compatibility
#endif
// --- Buffers ---
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_BUFFER, std430) writeonly buffer OutputBuffer {
uint out_u32[];
};
// --- Constants ---
layout(local_size_x = 8, local_size_y = 8, local_size_z = 4) in;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
// --- Helpers ---
// Maps an (x, y) byte position inside a 64x8 GOB to its swizzled byte offset
// using the precomputed lookup table (one entry per byte of the GOB).
uint SwizzleOffset(uvec2 pos) {
pos &= SWIZZLE_MASK;
return swizzle_table[pos.y * 64u + pos.x];
}
// Reads one block payload (up to 16 bytes) from the swizzled input buffer at
// a byte offset, through the buffer alias matching the block size.
// pc.bytes_per_block_log2 selects the width: 0 = 8-bit, 1 = 16-bit,
// 2 = 32-bit, 3 = 64-bit, 4 = 128-bit. Unused high components are zero.
uvec4 ReadTexel(uint offset) {
uint bpl2 = pc.bytes_per_block_log2;
switch (bpl2) {
#if HAS_EXTENDED_TYPES
case 0u: return uvec4(u8data[offset], 0u, 0u, 0u);
case 1u: return uvec4(u16data[offset / 2u], 0u, 0u, 0u);
#else
// Without 8/16-bit storage support, extract the sub-word from a 32-bit load.
case 0u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 24u), 8), 0u, 0u, 0u);
case 1u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 16u), 16), 0u, 0u, 0u);
#endif
case 2u: return uvec4(u32data[offset / 4u], 0u, 0u, 0u);
case 3u: return uvec4(u64data[offset / 8u], 0u, 0u);
case 4u: return u128data[offset / 16u];
}
// Unreachable for valid push constants.
return uvec4(0u);
}
// Entry point: each invocation moves one compressed block from its
// block-linear (swizzled) location in the input buffer to its linear
// position in the output buffer.
void main() {
    uvec3 block_coord = gl_GlobalInvocationID;
    // Dispatch is rounded up to the workgroup size; discard the overhang.
    if (any(greaterThanEqual(block_coord, pc.blocks_dim))) {
        return;
    }
    uint bytes_per_block = 1u << pc.bytes_per_block_log2;
    // Origin is in pixels, divide by 4 for block-space (e.g. BCn formats)
    uvec3 pos;
    pos.x = (block_coord.x + (pc.origin.x >> 2u)) * bytes_per_block;
    pos.y = block_coord.y + (pc.origin.y >> 2u);
    pos.z = block_coord.z + pc.origin.z;
    uint swizzle = SwizzleOffset(pos.xy);
    uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
    uint offset = 0u;
    // Apply block-linear offsets: Z super-block, Z-in-block, Y super-block,
    // Y-in-block, X GOB column, then the intra-GOB swizzle.
    offset += (pos.z >> pc.block_depth) * pc.slice_size;
    offset += (pos.z & pc.block_depth_mask) << (GOB_SIZE_SHIFT + pc.block_height);
    offset += (block_y >> pc.block_height) * pc.block_size;
    offset += (block_y & pc.block_height_mask) << GOB_SIZE_SHIFT;
    offset += (pos.x >> GOB_SIZE_X_SHIFT) << pc.x_shift;
    offset += swizzle;
    uvec4 texel = ReadTexel(offset);
    // Calculate linear output index (row-major over blocks_dim).
    uint block_index = block_coord.x +
                       (block_coord.y * pc.blocks_dim.x) +
                       (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
    // Output stride in u32 words per block (2 for BC1/BC4, 4 for BC2/3/5/6/7).
    uint out_idx = block_index * (bytes_per_block >> 2u);
    out_u32[out_idx] = texel.x;
    // BUGFIX: only write texel.y for blocks of 8 bytes or more. The previous
    // unconditional write at out_idx + 1 stomped the next block's data for
    // 4-byte formats, whose output stride is a single u32 word.
    if (pc.bytes_per_block_log2 >= 3u) {
        out_u32[out_idx + 1u] = texel.y;
    }
    if (pc.bytes_per_block_log2 == 4u) {
        out_u32[out_idx + 2u] = texel.z;
        out_u32[out_idx + 3u] = texel.w;
    }
}

View File

@ -556,7 +556,7 @@ void TextureCacheRuntime::Finish() {
glFinish();
}
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) {
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.RequestUploadBuffer(size);
}
@ -651,7 +651,8 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
}
void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const SwizzleParameters> swizzles) {
std::span<const SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
switch (image.info.type) {
case ImageType::e2D:
if (IsPixelFormatASTC(image.info.format)) {

View File

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -72,7 +75,7 @@ public:
void Finish();
StagingBufferMap UploadStagingBuffer(size_t size);
StagingBufferMap UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferMap DownloadStagingBuffer(size_t size, bool deferred = false);
@ -116,7 +119,8 @@ public:
Tegra::Engines::Fermi2D::Operation operation);
void AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles);
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
void InsertUploadMemoryBarrier();
@ -223,6 +227,8 @@ public:
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
private:
void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);

View File

@ -24,6 +24,7 @@
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/host_shaders/block_linear_unswizzle_3d_bcn_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
@ -622,7 +623,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@ -637,9 +638,292 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier);
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier);
});
}
// Descriptor interface of block_linear_unswizzle_3d_bcn.comp: three std430
// storage buffers (swizzle lookup table, swizzled input, linear output), all
// visible to the compute stage only. Binding indices must match the shader's
// BINDING_* defines on the Vulkan path.
constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0;
constexpr u32 BL3D_BINDING_INPUT_BUFFER = 1;
constexpr u32 BL3D_BINDING_OUTPUT_BUFFER = 2;
constexpr std::array<VkDescriptorSetLayoutBinding, 3> BL3D_DESCRIPTOR_SET_BINDINGS{{
{
.binding = BL3D_BINDING_SWIZZLE_TABLE,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // swizzle_table[]
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = BL3D_BINDING_INPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // block-linear input
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = BL3D_BINDING_OUTPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
}};
// Pool accounting for the pass: 3 storage buffers, nothing else.
constexpr DescriptorBankInfo BL3D_BANK_INFO{
.uniform_buffers = 0,
.storage_buffers = 3,
.texture_buffers = 0,
.image_buffers = 0,
.textures = 0,
.images = 0,
.score = 3,
};
// Update-template entries mirror the bindings above, packed as consecutive
// DescriptorUpdateEntry records in the queue's payload.
constexpr std::array<VkDescriptorUpdateTemplateEntry, 3>
BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
{
.dstBinding = BL3D_BINDING_SWIZZLE_TABLE,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_SWIZZLE_TABLE * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = BL3D_BINDING_INPUT_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = BL3D_BINDING_OUTPUT_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_OUTPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
}
}};
// CPU-side mirror of the shader's push-constant block. Field order matches
// the Vulkan declaration in block_linear_unswizzle_3d_bcn.comp.
// NOTE(review): under std430 rules an ivec3 is 16-byte aligned, so the GLSL
// side would place `destination` at offset 64, not 60 as commented here.
// The shader's main() never reads pc.destination, so the mismatch is
// currently benign — confirm before using `destination` in the shader.
struct alignas(16) BlockLinearUnswizzle3DPushConstants {
u32 blocks_dim[3]; // Offset 0
u32 bytes_per_block_log2; // Offset 12
u32 origin[3]; // Offset 16
u32 slice_size; // Offset 28
u32 block_size; // Offset 32
u32 x_shift; // Offset 36
u32 block_height; // Offset 40
u32 block_height_mask; // Offset 44
u32 block_depth; // Offset 48
u32 block_depth_mask; // Offset 52
s32 _pad; // Offset 56
s32 destination[3]; // Offset 60
s32 _pad_end; // Offset 72
};
// Vulkan guarantees at least 128 bytes of push-constant space.
static_assert(sizeof(BlockLinearUnswizzle3DPushConstants) <= 128);
// Builds the compute pass around block_linear_unswizzle_3d_bcn.comp:
// three storage-buffer bindings (swizzle table, input, output) plus a
// push-constant range sized for BlockLinearUnswizzle3DPushConstants.
BlockLinearUnswizzle3DPass::BlockLinearUnswizzle3DPass(
const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
: ComputePass(
device_, descriptor_pool_,
BL3D_DESCRIPTOR_SET_BINDINGS,
BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY,
BL3D_BANK_INFO,
COMPUTE_PUSH_CONSTANT_RANGE<sizeof(BlockLinearUnswizzle3DPushConstants)>,
BLOCK_LINEAR_UNSWIZZLE_3D_BCN_COMP_SPV),
scheduler{scheduler_},
staging_buffer_pool{staging_buffer_pool_},
compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
BlockLinearUnswizzle3DPass::~BlockLinearUnswizzle3DPass() = default;
// Unswizzles a range of Z slices of a block-linear 3D image on the GPU.
// The [z_start, z_start + z_count) range is processed in batches of at most
// MAX_BATCH_SLICES slices so the intermediate linear buffer stays bounded.
// Expects exactly one SwizzleParameters entry (single-level upload).
void BlockLinearUnswizzle3DPass::Unswizzle(
Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count)
{
using namespace VideoCommon::Accelerated;
// Batch size is capped by the image depth; z_count may exceed one batch.
const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth);
if (!image.has_compute_unswizzle_buffer) {
// Allocate exactly what this batch needs
image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES);
}
ASSERT(swizzles.size() == 1);
const auto& sw = swizzles[0];
const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info);
// Block counts for 4x4 compressed (BCn) blocks, rounded up.
const u32 blocks_x = (image.info.size.width + 3) / 4;
const u32 blocks_y = (image.info.size.height + 3) / 4;
scheduler.RequestOutsideRenderPassOperationContext();
// Walk the Z range one batch at a time; the final batch may be smaller.
for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) {
const u32 current_chunk_slices = std::min(MAX_BATCH_SLICES, z_count - z_offset);
const u32 current_z_start = z_start + z_offset;
UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
current_z_start, current_chunk_slices);
}
}
// Dispatches the unswizzle shader for one Z chunk, then copies the linear
// result from the intermediate buffer into the destination image at z_start.
// Sequence: dispatch -> (buffer compute->transfer barrier + image layout
// transition) -> CopyBufferToImage -> transition image to GENERAL.
// NOTE(review): scheduler.Finish() at the end fully stalls the CPU per chunk,
// presumably so the staging/output buffers can be reused safely — confirm
// whether a lighter-weight fence would suffice.
void BlockLinearUnswizzle3DPass::UnswizzleChunk(
Image& image,
const StagingBufferRef& swizzled,
const VideoCommon::SwizzleParameters& sw,
const BlockLinearSwizzle3DParams& params,
u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count)
{
// Fill the push constants; blocks_dim.z limits the shader to this chunk.
BlockLinearUnswizzle3DPushConstants pc{};
pc.origin[0] = params.origin[0];
pc.origin[1] = params.origin[1];
pc.origin[2] = z_start; // Current chunk's Z start
pc.destination[0] = params.destination[0];
pc.destination[1] = params.destination[1];
pc.destination[2] = 0; // Shader writes to start of output buffer
pc.bytes_per_block_log2 = params.bytes_per_block_log2;
pc.slice_size = params.slice_size;
pc.block_size = params.block_size;
pc.x_shift = params.x_shift;
pc.block_height = params.block_height;
pc.block_height_mask = params.block_height_mask;
pc.block_depth = params.block_depth;
pc.block_depth_mask = params.block_depth_mask;
pc.blocks_dim[0] = blocks_x;
pc.blocks_dim[1] = blocks_y;
pc.blocks_dim[2] = z_count; // Only process the count
// Bind the three storage buffers in shader-binding order:
// swizzle table, swizzled source (offset into the staging upload), output.
compute_pass_descriptor_queue.Acquire();
compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0,
image.runtime->swizzle_table_size);
compute_pass_descriptor_queue.AddBuffer(swizzled.buffer,
sw.buffer_offset + swizzled.offset,
image.guest_size_bytes - sw.buffer_offset);
compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0,
image.compute_unswizzle_buffer_size);
const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
const VkDescriptorSet set = descriptor_allocator.Commit();
// Workgroup counts match the shader's local_size of 8x8x4.
const u32 gx = Common::DivCeil(blocks_x, 8u);
const u32 gy = Common::DivCeil(blocks_y, 8u);
const u32 gz = Common::DivCeil(z_count, 4u);
const u32 bytes_per_block = 1u << pc.bytes_per_block_log2;
// Bytes the shader wrote this chunk; used to scope the buffer barrier.
const VkDeviceSize output_slice_size =
static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
const VkDeviceSize barrier_size = output_slice_size * z_count;
// First chunk transitions the image from UNDEFINED; later chunks keep
// TRANSFER_DST_OPTIMAL and order against the previous chunk's copy.
const bool is_first_chunk = (z_start == 0);
// Capture plain handles by value; the lambda runs later on the worker.
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
const u32 image_width = image.info.size.width;
const u32 image_height = image.info.size.height;
scheduler.Record([this, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
barrier_size, is_first_chunk, out_buffer, dst_image, aspect,
image_width, image_height
](vk::CommandBuffer cmdbuf) {
if (dst_image == VK_NULL_HANDLE || out_buffer == VK_NULL_HANDLE) {
return;
}
device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
cmdbuf.Dispatch(gx, gy, gz);
// Single barrier for compute -> transfer (buffer ready, image transition)
const VkBufferMemoryBarrier buffer_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = out_buffer,
.offset = 0,
.size = barrier_size,
};
// Image layout transition
const VkImageMemoryBarrier pre_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = is_first_chunk ? VkAccessFlags{} :
static_cast<VkAccessFlags>(VK_ACCESS_TRANSFER_WRITE_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED :
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = dst_image,
// Mip 0, layer 0 only — 3D images have a single array layer.
.subresourceRange = {aspect, 0, 1, 0, 1},
};
// Single barrier handles both buffer and image
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0,
nullptr, buffer_barrier, pre_barrier
);
// Copy chunk to correct Z position in image
const VkBufferImageCopy copy{
.bufferOffset = 0, // Read from start of staging buffer
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource = {aspect, 0, 0, 1},
.imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z
.imageExtent = {image_width, image_height, z_count},
};
cmdbuf.CopyBufferToImage(out_buffer, dst_image,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
// Post-copy transition
const VkImageMemoryBarrier post_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
nullptr, nullptr, post_barrier
);
});
// Full CPU/GPU sync per chunk (see NOTE above).
scheduler.Finish();
}
MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_,

View File

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -14,6 +17,7 @@
#include "video_core/texture_cache/types.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/texture_cache/accelerated_swizzle.h"
namespace VideoCommon {
struct SwizzleParameters;
@ -21,6 +25,8 @@ struct SwizzleParameters;
namespace Vulkan {
using VideoCommon::Accelerated::BlockLinearSwizzle3DParams;
class Device;
class StagingBufferPool;
class Scheduler;
@ -131,6 +137,34 @@ private:
MemoryAllocator& memory_allocator;
};
// Compute pass that unswizzles block-linear 3D BCn texture data on the GPU,
// avoiding the CPU-side deswizzle during large texture uploads. Unimplemented
// for the OpenGL backend (Vulkan only).
class BlockLinearUnswizzle3DPass final : public ComputePass {
public:
explicit BlockLinearUnswizzle3DPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
~BlockLinearUnswizzle3DPass();
// Unswizzles the given swizzled staging data into `image`, restricted to the
// depth range [z_start, z_start + z_count). `swizzles` describes the
// block-linear layout of each upload region.
void Unswizzle(Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
// Processes one streamed chunk of the unswizzle: dispatches the compute
// shader for `z_count` slices starting at `z_start`, using precomputed
// block-linear parameters and block extents (blocks_x x blocks_y per slice).
void UnswizzleChunk(
Image& image,
const StagingBufferRef& swizzled,
const VideoCommon::SwizzleParameters& sw,
const BlockLinearSwizzle3DParams& params,
u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count);
private:
Scheduler& scheduler;
StagingBufferPool& staging_buffer_pool;
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
class MSAACopyPass final : public ComputePass {
public:
explicit MSAACopyPass(const Device& device_, Scheduler& scheduler_,

View File

@ -43,6 +43,16 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_)
: device{device_}, state_tracker{state_tracker_},
master_semaphore{std::make_unique<MasterSemaphore>(device)},
command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} {
/*// PRE-OPTIMIZATION: Warm up the pool to prevent mid-frame spikes
{
std::scoped_lock rl{reserve_mutex};
chunk_reserve.reserve(2048); // Prevent vector resizing
for (int i = 0; i < 1024; ++i) {
chunk_reserve.push_back(std::make_unique<CommandChunk>());
}
}*/
AcquireNewChunk();
AllocateWorkerCommandBuffer();
worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); });

View File

@ -24,12 +24,14 @@
#include "video_core/renderer_vulkan/vk_render_pass_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/util.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/textures/decoders.h"
namespace Vulkan {
@ -878,14 +880,51 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched
}
}
}
bl3d_unswizzle_pass.emplace(device, scheduler, descriptor_pool,
staging_buffer_pool, compute_pass_descriptor_queue);
// --- Create swizzle table buffer ---
{
auto table = Tegra::Texture::MakeSwizzleTable();
swizzle_table_size = static_cast<VkDeviceSize>(table.size() * sizeof(table[0]));
auto staging = staging_buffer_pool.Request(swizzle_table_size, MemoryUsage::Upload);
std::memcpy(staging.mapped_span.data(), table.data(), static_cast<size_t>(swizzle_table_size));
VkBufferCreateInfo ci{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = swizzle_table_size,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
};
swizzle_table_buffer = memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([staging_buf = staging.buffer,
dst_buf = *swizzle_table_buffer,
size = swizzle_table_size,
src_off = staging.offset](vk::CommandBuffer cmdbuf) {
const VkBufferCopy region{
.srcOffset = src_off,
.dstOffset = 0,
.size = size,
};
cmdbuf.CopyBuffer(staging_buf, dst_buf, region);
});
}
}
// Delegates to the scheduler to complete all work recorded so far.
void TextureCacheRuntime::Finish() {
scheduler.Finish();
}
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload);
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload, deferred);
}
StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) {
@ -1581,6 +1620,46 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
// Defaulted: the vk:: wrapper members release their own Vulkan handles.
Image::~Image() = default;
// Lazily allocates the device-local storage buffer that the GPU unswizzle
// compute pass writes into before the result is copied to the image.
// `max_slices` caps how many depth slices the buffer must hold at once
// (the streaming system processes the texture in slice batches).
// No-op if the buffer already exists.
void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) {
    if (has_compute_unswizzle_buffer) {
        return;
    }
    using VideoCore::Surface::BytesPerBlock;
    // BCn formats compress 4x4x1 texel blocks.
    constexpr u32 BCN_BLOCK_DIM = 4;
    const u32 bytes_per_block = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H
    const u32 num_blocks_x = (info.size.width + BCN_BLOCK_DIM - 1) / BCN_BLOCK_DIM;
    const u32 num_blocks_y = (info.size.height + BCN_BLOCK_DIM - 1) / BCN_BLOCK_DIM;
    const u32 num_blocks_z = std::min(max_slices, info.size.depth);
    // Accumulate in 64 bits: large 3D textures can exceed 32-bit byte counts.
    const u64 total_blocks =
        static_cast<u64>(num_blocks_x) *
        static_cast<u64>(num_blocks_y) *
        static_cast<u64>(num_blocks_z);
    compute_unswizzle_buffer_size = total_blocks * bytes_per_block;
    const VkBufferCreateInfo buffer_ci{
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .pNext = nullptr,
        .flags = 0,
        .size = compute_unswizzle_buffer_size,
        // Written by the compute shader, then copied out to the image.
        .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                 VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
        .queueFamilyIndexCount = 0,
        .pQueueFamilyIndices = nullptr,
    };
    compute_unswizzle_buffer =
        runtime->memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
    has_compute_unswizzle_buffer = true;
}
void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
// TODO: Move this to another API
@ -2397,10 +2476,22 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
void TextureCacheRuntime::AccelerateImageUpload(
Image& image, const StagingBufferRef& map,
std::span<const VideoCommon::SwizzleParameters> swizzles) {
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
if (IsPixelFormatASTC(image.info.format)) {
return astc_decoder_pass->Assemble(image, map, swizzles);
}
if (bl3d_unswizzle_pass &&
IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1) {
return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count);
}
ASSERT(false);
}

View File

@ -51,7 +51,7 @@ public:
void Finish();
StagingBufferRef UploadStagingBuffer(size_t size);
StagingBufferRef UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false);
@ -91,7 +91,8 @@ public:
}
void AccelerateImageUpload(Image&, const StagingBufferRef&,
std::span<const VideoCommon::SwizzleParameters>);
std::span<const VideoCommon::SwizzleParameters>,
u32 z_start, u32 z_count);
void InsertUploadMemoryBarrier() {}
@ -127,6 +128,11 @@ public:
BlitImageHelper& blit_image_helper;
RenderPassCache& render_pass_cache;
std::optional<ASTCDecoderPass> astc_decoder_pass;
std::optional<BlockLinearUnswizzle3DPass> bl3d_unswizzle_pass;
vk::Buffer swizzle_table_buffer;
VkDeviceSize swizzle_table_size = 0;
std::unique_ptr<MSAACopyPass> msaa_copy_pass;
const Settings::ResolutionScalingInfo& resolution;
std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats;
@ -164,6 +170,8 @@ public:
void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
void AllocateComputeUnswizzleImage();
[[nodiscard]] VkImage Handle() const noexcept {
return *(this->*current_image);
}
@ -189,6 +197,10 @@ public:
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
friend class BlockLinearUnswizzle3DPass;
private:
bool BlitScaleHelper(bool scale_up);
@ -200,6 +212,12 @@ private:
vk::Image original_image;
vk::Image scaled_image;
vk::Buffer compute_unswizzle_buffer;
VkDeviceSize compute_unswizzle_buffer_size = 0;
bool has_compute_unswizzle_buffer = false;
void AllocateComputeUnswizzleBuffer(u32 max_slices);
// Use a pointer to field because it is relative, so that the object can be
// moved without breaking the reference.
vk::Image Image::*current_image{};

View File

@ -8,6 +8,7 @@
#include <limits>
#include <optional>
#include <bit>
#include <unordered_set>
#include <boost/container/small_vector.hpp>
@ -22,6 +23,7 @@
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/texture_cache/util.h"
#include "video_core/textures/decoders.h"
namespace VideoCommon {
@ -68,10 +70,41 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
(std::max)((std::min)(device_local_memory - min_vacancy_critical, min_spacing_critical),
DEFAULT_CRITICAL_MEMORY));
minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
lowmemorydevice = false;
} else {
expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
minimum_memory = 0;
lowmemorydevice = true;
}
switch (Settings::values.gpu_unzwizzle_texture_size.GetValue()) {
case Settings::GpuUnswizzleSize::VerySmall: gpu_unswizzle_maxsize = 16_MiB; break;
case Settings::GpuUnswizzleSize::Small: gpu_unswizzle_maxsize = 32_MiB; break;
case Settings::GpuUnswizzleSize::Normal: gpu_unswizzle_maxsize = 128_MiB; break;
case Settings::GpuUnswizzleSize::Large: gpu_unswizzle_maxsize = 256_MiB; break;
case Settings::GpuUnswizzleSize::VeryLarge: gpu_unswizzle_maxsize = 512_MiB; break;
default: gpu_unswizzle_maxsize = 128_MiB; break;
}
switch (Settings::values.gpu_unzwizzle_stream_size.GetValue()) {
case Settings::GpuUnswizzle::VeryLow: swizzle_chunk_size = 4_MiB; break;
case Settings::GpuUnswizzle::Low: swizzle_chunk_size = 8_MiB; break;
case Settings::GpuUnswizzle::Normal: swizzle_chunk_size = 16_MiB; break;
case Settings::GpuUnswizzle::Medium: swizzle_chunk_size = 32_MiB; break;
case Settings::GpuUnswizzle::High: swizzle_chunk_size = 64_MiB; break;
default: swizzle_chunk_size = 16_MiB;
}
switch (Settings::values.gpu_unzwizzle_chunk_size.GetValue()) {
case Settings::GpuUnswizzleChunk::VeryLow: swizzle_slices_per_batch = 32; break;
case Settings::GpuUnswizzleChunk::Low: swizzle_slices_per_batch = 64; break;
case Settings::GpuUnswizzleChunk::Normal: swizzle_slices_per_batch = 128; break;
case Settings::GpuUnswizzleChunk::Medium: swizzle_slices_per_batch = 256; break;
case Settings::GpuUnswizzleChunk::High: swizzle_slices_per_batch = 512; break;
default: swizzle_slices_per_batch = 128;
}
}
@ -88,6 +121,7 @@ void TextureCache<P>::RunGarbageCollector() {
ticks_to_destroy = aggressive_mode ? 10ULL : high_priority_mode ? 25ULL : 50ULL;
num_iterations = aggressive_mode ? 40 : (high_priority_mode ? 20 : 10);
};
const auto Cleanup = [this, &num_iterations, &high_priority_mode,
&aggressive_mode](ImageId image_id) {
if (num_iterations == 0) {
@ -95,20 +129,36 @@ void TextureCache<P>::RunGarbageCollector() {
}
--num_iterations;
auto& image = slot_images[image_id];
// Never delete recently allocated sparse textures (within 3 frames)
const bool is_recently_allocated = image.allocation_tick >= frame_tick - 3;
if (is_recently_allocated && image.info.is_sparse) {
return false;
}
if (True(image.flags & ImageFlagBits::IsDecoding)) {
// This image is still being decoded, deleting it will invalidate the slot
// used by the async decoder thread.
return false;
}
if (!aggressive_mode && True(image.flags & ImageFlagBits::CostlyLoad)) {
// Prioritize large sparse textures for cleanup
const bool is_large_sparse = lowmemorydevice &&
image.info.is_sparse &&
image.guest_size_bytes >= 256_MiB;
if (!aggressive_mode && !is_large_sparse &&
True(image.flags & ImageFlagBits::CostlyLoad)) {
return false;
}
const bool must_download =
image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap);
if (!high_priority_mode && must_download) {
if (!high_priority_mode && !is_large_sparse && must_download) {
return false;
}
if (must_download) {
if (must_download && !is_large_sparse) {
auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
const auto copies = FixSmallVectorADL(FullDownloadCopies(image.info));
image.DownloadMemory(map, copies);
@ -116,11 +166,13 @@ void TextureCache<P>::RunGarbageCollector() {
SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span,
swizzle_data_buffer);
}
if (True(image.flags & ImageFlagBits::Tracked)) {
UntrackImage(image, image_id);
}
UnregisterImage(image_id);
DeleteImage(image_id, image.scale_tick > frame_tick + 5);
if (total_used_memory < critical_memory) {
if (aggressive_mode) {
// Sink the aggresiveness.
@ -136,7 +188,24 @@ void TextureCache<P>::RunGarbageCollector() {
return false;
};
// Try to remove anything old enough and not high priority.
// Aggressively clear massive sparse textures
if (total_used_memory >= expected_memory) {
lru_cache.ForEachItemBelow(frame_tick, [&](ImageId image_id) {
auto& image = slot_images[image_id];
// Only target sparse textures that are old enough
if (lowmemorydevice &&
image.info.is_sparse &&
image.guest_size_bytes >= 256_MiB &&
image.allocation_tick < frame_tick - 3) {
LOG_DEBUG(HW_GPU, "GC targeting old sparse texture at 0x{:X} ({} MiB, age: {} frames)",
image.gpu_addr, image.guest_size_bytes / (1024 * 1024),
frame_tick - image.allocation_tick);
return Cleanup(image_id);
}
return false;
});
}
Configure(false);
lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, Cleanup);
@ -160,6 +229,7 @@ void TextureCache<P>::TickFrame() {
sentenced_framebuffers.Tick();
sentenced_image_view.Tick();
TickAsyncDecode();
TickAsyncUnswizzle();
runtime.TickFrame();
++frame_tick;
@ -627,7 +697,6 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
UntrackImage(image, id);
}
}
if (True(image.flags & ImageFlagBits::Remapped)) {
continue;
}
@ -1055,7 +1124,12 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
// Only upload modified images
return;
}
image.flags &= ~ImageFlagBits::CpuModified;
if( lowmemorydevice && image.info.format == PixelFormat::BC1_RGBA_UNORM && MapSizeBytes(image) >= 256_MiB ) {
return;
}
TrackImage(image, image_id);
if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
@ -1067,6 +1141,16 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
QueueAsyncDecode(image, image_id);
return;
}
if (IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1 &&
MapSizeBytes(image) >= gpu_unswizzle_maxsize &&
False(image.flags & ImageFlagBits::GpuModified)) {
QueueAsyncUnswizzle(image, image_id);
return;
}
auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, staging);
runtime.InsertUploadMemoryBarrier();
@ -1082,7 +1166,7 @@ void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging)
gpu_memory->ReadBlock(gpu_addr, mapped_span.data(), mapped_span.size_bytes(),
VideoCommon::CacheType::NoTextureCache);
const auto uploads = FullUploadSwizzles(image.info);
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads));
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads), 0, 0);
return;
}
@ -1311,6 +1395,20 @@ void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
texture_decode_worker.QueueWork(std::move(func));
}
// Enqueues `image` for the streamed GPU unswizzle performed across frames by
// TickAsyncUnswizzle(). Marks the image as decoding so it is neither queued
// twice nor destroyed while the upload is in flight.
template <class P>
void TextureCache<P>::QueueAsyncUnswizzle(Image& image, ImageId image_id) {
    // Already queued or mid-decode: nothing to do.
    if (True(image.flags & ImageFlagBits::IsDecoding)) {
        return;
    }
    image.flags |= ImageFlagBits::IsDecoding;
    PendingUnswizzle task{};
    task.image_id = image_id;
    task.info = image.info;
    unswizzle_queue.push_back(std::move(task));
}
template <class P>
void TextureCache<P>::TickAsyncDecode() {
bool has_uploads{};
@ -1336,6 +1434,83 @@ void TextureCache<P>::TickAsyncDecode() {
}
}
// Advances the streamed GPU-unswizzle state machine by one frame. Each tick:
//  1. lazily sizes the task and requests a deferred staging buffer,
//  2. reads up to one slice-aligned chunk of guest memory into the staging
//     buffer,
//  3. once enough whole slices are buffered, submits a batch to the GPU
//     unswizzle pass,
//  4. on completion, frees the staging buffer, clears the decoding flag, and
//     throttles before starting the next queued texture.
// Processes only the front of the queue; one texture streams at a time.
template <class P>
void TextureCache<P>::TickAsyncUnswizzle() {
    if (unswizzle_queue.empty()) {
        return;
    }
    // Inter-texture cooldown: skip this frame while the counter drains.
    if (current_unswizzle_frame > 0) {
        current_unswizzle_frame--;
        return;
    }
    PendingUnswizzle& task = unswizzle_queue.front();
    Image& image = slot_images[task.image_id];
    if (!task.initialized) {
        task.total_size = MapSizeBytes(image);
        // Deferred staging buffer: freed explicitly once the task completes.
        task.staging_buffer = runtime.UploadStagingBuffer(task.total_size, true);
        const auto& info = image.info;
        // BCn: 4x4 texel blocks; bytes_per_slice is the unpadded pitch of one
        // depth slice in block units.
        const u32 bytes_per_block = BytesPerBlock(info.format);
        const u32 width_blocks = Common::DivCeil(info.size.width, 4u);
        const u32 height_blocks = Common::DivCeil(info.size.height, 4u);
        const u32 stride = width_blocks * bytes_per_block;
        const u32 aligned_height = height_blocks;
        task.bytes_per_slice = static_cast<size_t>(stride) * aligned_height;
        task.last_submitted_offset = 0;
        task.initialized = true;
    }
    // Read the next chunk of swizzled guest data.
    if (task.current_offset < task.total_size) {
        const size_t remaining = task.total_size - task.current_offset;
        size_t copy_amount = std::min(swizzle_chunk_size, remaining);
        if (remaining > swizzle_chunk_size) {
            // Round down to whole slices so batches stay slice-aligned.
            copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice;
            if (copy_amount == 0) {
                // A single slice is larger than the chunk size; take one slice.
                copy_amount = task.bytes_per_slice;
            }
            // Never read past the end of the staging buffer / guest range:
            // the one-slice fallback above may exceed what is left.
            copy_amount = std::min(copy_amount, remaining);
        }
        gpu_memory->ReadBlock(image.gpu_addr + task.current_offset,
                              task.staging_buffer.mapped_span.data() + task.current_offset,
                              copy_amount);
        task.current_offset += copy_amount;
    }
    const bool is_final_batch = task.current_offset >= task.total_size;
    const size_t bytes_ready = task.current_offset - task.last_submitted_offset;
    const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
    // Submit once a full batch is buffered, or whatever is left on the final read.
    if (complete_slices >= swizzle_slices_per_batch || (is_final_batch && complete_slices > 0)) {
        const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
        const u32 slices_to_process = std::min(complete_slices, swizzle_slices_per_batch);
        // Clamp to the image depth so the last batch cannot overrun.
        const u32 z_count = std::min(slices_to_process, image.info.size.depth - z_start);
        if (z_count > 0) {
            const auto uploads = FullUploadSwizzles(task.info);
            runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads),
                                          z_start, z_count);
            task.last_submitted_offset += static_cast<size_t>(z_count) * task.bytes_per_slice;
        }
    }
    // Retire the task once every slice has been read and submitted.
    const u32 slices_submitted = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
    const bool all_slices_submitted = slices_submitted >= image.info.size.depth;
    if (is_final_batch && all_slices_submitted) {
        runtime.FreeDeferredStagingBuffer(task.staging_buffer);
        image.flags &= ~ImageFlagBits::IsDecoding;
        unswizzle_queue.pop_front();
        // Wait 4 frames before processing the next queued texture.
        current_unswizzle_frame = 4u;
    }
}
template <class P>
bool TextureCache<P>::ScaleUp(Image& image) {
const bool has_copy = image.HasScaled();
@ -1374,6 +1549,39 @@ ImageId TextureCache<P>::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
}
}
ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr);
// For large sparse textures, aggressively clean up old allocations at same address
if (lowmemorydevice && info.is_sparse && CalculateGuestSizeInBytes(info) >= 256_MiB) {
const auto alloc_it = image_allocs_table.find(gpu_addr);
if (alloc_it != image_allocs_table.end()) {
const ImageAllocId alloc_id = alloc_it->second;
auto& alloc_images = slot_image_allocs[alloc_id].images;
// Collect old images at this address that were created more than 2 frames ago
boost::container::small_vector<ImageId, 4> to_delete;
for (ImageId old_image_id : alloc_images) {
Image& old_image = slot_images[old_image_id];
if (old_image.info.is_sparse &&
old_image.gpu_addr == gpu_addr &&
old_image.allocation_tick < frame_tick - 2) { // Try not to delete fresh textures
to_delete.push_back(old_image_id);
}
}
// Delete old images immediately
for (ImageId old_id : to_delete) {
Image& old_image = slot_images[old_id];
LOG_DEBUG(HW_GPU, "Immediately deleting old sparse texture at 0x{:X} ({} MiB)",
gpu_addr, old_image.guest_size_bytes / (1024 * 1024));
if (True(old_image.flags & ImageFlagBits::Tracked)) {
UntrackImage(old_image, old_id);
}
UnregisterImage(old_id);
DeleteImage(old_id, true);
}
}
}
const ImageId image_id = JoinImages(info, gpu_addr, *cpu_addr);
const Image& image = slot_images[image_id];
// Using "image.gpu_addr" instead of "gpu_addr" is important because it might be different
@ -1389,6 +1597,27 @@ template <class P>
ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DAddr cpu_addr) {
ImageInfo new_info = info;
const size_t size_bytes = CalculateGuestSizeInBytes(new_info);
// Proactive cleanup for large sparse texture allocations
if (lowmemorydevice && new_info.is_sparse && size_bytes >= 256_MiB) {
const u64 estimated_alloc_size = size_bytes;
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_DEBUG(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC. "
"Current memory: {} MiB, Critical: {} MiB",
size_bytes / (1024 * 1024),
total_used_memory / (1024 * 1024),
critical_memory / (1024 * 1024));
RunGarbageCollector();
// If still over threshold after GC, try one more aggressive pass
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_DEBUG(HW_GPU, "Still critically low on memory, running second GC pass");
RunGarbageCollector();
}
}
}
const bool broken_views = runtime.HasBrokenTextureViewFormats();
const bool native_bgr = runtime.HasNativeBgr();
join_overlap_ids.clear();
@ -1485,6 +1714,8 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA
const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
Image& new_image = slot_images[new_image_id];
new_image.allocation_tick = frame_tick;
if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes) &&
new_info.is_sparse) {
new_image.flags |= ImageFlagBits::Sparse;

View File

@ -129,6 +129,17 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI
using AsyncBuffer = typename P::AsyncBuffer;
using BufferType = typename P::BufferType;
struct PendingUnswizzle {
ImageId image_id;
VideoCommon::ImageInfo info;
size_t current_offset = 0;
size_t total_size = 0;
AsyncBuffer staging_buffer;
size_t last_submitted_offset = 0;
size_t bytes_per_slice;
bool initialized = false;
};
struct BlitImages {
ImageId dst_id;
ImageId src_id;
@ -433,6 +444,9 @@ private:
void TrimInactiveSamplers(size_t budget);
std::optional<size_t> QuerySamplerBudget() const;
void QueueAsyncUnswizzle(Image& image, ImageId image_id);
void TickAsyncUnswizzle();
Runtime& runtime;
Tegra::MaxwellDeviceMemoryManager& device_memory;
@ -453,6 +467,10 @@ private:
u64 minimum_memory;
u64 expected_memory;
u64 critical_memory;
bool lowmemorydevice = false;
size_t gpu_unswizzle_maxsize = 0;
size_t swizzle_chunk_size = 0;
u32 swizzle_slices_per_batch = 0;
struct BufferDownload {
GPUVAddr address;
@ -508,6 +526,9 @@ private:
Common::ThreadWorker texture_decode_worker{1, "TextureDecoder"};
std::vector<std::unique_ptr<AsyncDecodeContext>> async_decodes;
std::deque<PendingUnswizzle> unswizzle_queue;
u8 current_unswizzle_frame;
// Join caching
boost::container::small_vector<ImageId, 4> join_overlap_ids;
std::unordered_set<ImageId> join_overlaps_found;