[video_core] Implement GPU-accelerated texture unswizzling and optimize sparse texture handling (#3246)

- [Added] a new compute shader to handle block-linear unswizzling on the GPU, reducing CPU overhead during texture uploads
- [Implemented] BlockLinearUnswizzle3DPass to take advantage of the new compute shader, unimplemented for OpenGL
- [Implemented] texture streaming and queue system for large sparse textures to prevent hitches
- [Implemented] aggressive garbage collection system to eject large sparse textures to save on memory (Unused)
- [Added] user settings to adjust the streaming unswizzle system for low-end machines
- [Improved] the ASTC GPU decoding system slightly

Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-authored-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: DraVee <dravee@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3246
Reviewed-by: Maufeat <sahyno1996@gmail.com>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: DraVee <dravee@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: Forrest Keller <forrestmarkx@outlook.com>
Co-committed-by: Forrest Keller <forrestmarkx@outlook.com>
This commit is contained in:
Forrest Keller 2026-01-13 19:18:08 +01:00 committed by crueter
parent f544004b5d
commit ecd01e13fd
No known key found for this signature in database
GPG Key ID: 425ACD2D4830EBC6
20 changed files with 1076 additions and 83 deletions

View File

@ -47,6 +47,9 @@ enum class IntSetting(override val key: String) : AbstractIntSetting {
FAST_CPU_TIME("fast_cpu_time"),
CPU_TICKS("cpu_ticks"),
FAST_GPU_TIME("fast_gpu_time"),
GPU_UNZWIZZLE_MAXTEXTURE_SIZE("gpu_unzwizzle_maxtexture_size"),
GPU_UNZWIZZLE_STREAM_SIZE("gpu_unzwizzle_stream_size"),
GPU_UNZWIZZLE_CHUNK_SIZE("gpu_unzwizzle_chunk_size"),
BAT_TEMPERATURE_UNIT("bat_temperature_unit"),
CABINET_APPLET("cabinet_applet_mode"),
CONTROLLER_APPLET("controller_applet_mode"),

View File

@ -655,6 +655,33 @@ abstract class SettingsItem(
valuesId = R.array.gpuValues
)
)
put(
SingleChoiceSetting(
IntSetting.GPU_UNZWIZZLE_MAXTEXTURE_SIZE,
titleId = R.string.gpu_unzwizzle_maxtexture_size,
descriptionId = R.string.gpu_unzwizzle_maxtexture_size_description,
choicesId = R.array.gpuTextureSizeSwizzleEntries,
valuesId = R.array.gpuTextureSizeSwizzleValues
)
)
put(
SingleChoiceSetting(
IntSetting.GPU_UNZWIZZLE_STREAM_SIZE,
titleId = R.string.gpu_unzwizzle_stream_size,
descriptionId = R.string.gpu_unzwizzle_stream_size_description,
choicesId = R.array.gpuSwizzleEntries,
valuesId = R.array.gpuSwizzleValues
)
)
put(
SingleChoiceSetting(
IntSetting.GPU_UNZWIZZLE_CHUNK_SIZE,
titleId = R.string.gpu_unzwizzle_chunk_size,
descriptionId = R.string.gpu_unzwizzle_chunk_size_description,
choicesId = R.array.gpuSwizzleChunkEntries,
valuesId = R.array.gpuSwizzleChunkValues
)
)
put(
SingleChoiceSetting(
IntSetting.FAST_CPU_TIME,

View File

@ -280,6 +280,9 @@ class SettingsFragmentPresenter(
add(IntSetting.FAST_GPU_TIME.key)
add(BooleanSetting.SKIP_CPU_INNER_INVALIDATION.key)
add(BooleanSetting.RENDERER_ASYNCHRONOUS_SHADERS.key)
add(IntSetting.GPU_UNZWIZZLE_MAXTEXTURE_SIZE.key)
add(IntSetting.GPU_UNZWIZZLE_STREAM_SIZE.key)
add(IntSetting.GPU_UNZWIZZLE_CHUNK_SIZE.key)
add(HeaderSetting(R.string.extensions))

View File

@ -564,6 +564,54 @@
<item>2</item>
</integer-array>
<string-array name="gpuTextureSizeSwizzleEntries">
<item>@string/gpu_texturesizeswizzle_verysmall</item>
<item>@string/gpu_texturesizeswizzle_small</item>
<item>@string/gpu_texturesizeswizzle_normal</item>
<item>@string/gpu_texturesizeswizzle_large</item>
<item>@string/gpu_texturesizeswizzle_verylarge</item>
</string-array>
<integer-array name="gpuTextureSizeSwizzleValues">
<item>0</item>
<item>1</item>
<item>2</item>
<item>3</item>
<item>4</item>
</integer-array>
<string-array name="gpuSwizzleEntries">
<item>@string/gpu_swizzle_verylow</item>
<item>@string/gpu_swizzle_low</item>
<item>@string/gpu_swizzle_normal</item>
<item>@string/gpu_swizzle_medium</item>
<item>@string/gpu_swizzle_high</item>
</string-array>
<integer-array name="gpuSwizzleValues">
<item>0</item>
<item>1</item>
<item>2</item>
<item>3</item>
<item>4</item>
</integer-array>
<string-array name="gpuSwizzleChunkEntries">
<item>@string/gpu_swizzlechunk_verylow</item>
<item>@string/gpu_swizzlechunk_low</item>
<item>@string/gpu_swizzlechunk_normal</item>
<item>@string/gpu_swizzlechunk_medium</item>
<item>@string/gpu_swizzlechunk_high</item>
</string-array>
<integer-array name="gpuSwizzleChunkValues">
<item>0</item>
<item>1</item>
<item>2</item>
<item>3</item>
<item>4</item>
</integer-array>
<string-array name="temperatureUnitEntries">
<item>@string/temperature_celsius</item>
<item>@string/temperature_fahrenheit</item>

View File

@ -504,6 +504,13 @@
<string name="skip_cpu_inner_invalidation_description">Skips certain CPU-side cache invalidations during memory updates, reducing CPU usage and improving its performance. This may cause glitches or crashes in some games.</string>
<string name="renderer_asynchronous_shaders">Use asynchronous shaders</string>
<string name="renderer_asynchronous_shaders_description">Compiles shaders asynchronously. This may reduce stutters but may also introduce glitches.</string>
<string name="gpu_unzwizzle_maxtexture_size">GPU Unswizzle Max Texture Size</string>
<string name="gpu_unzwizzle_maxtexture_size_description">Sets the maximum size (MB) for GPU-based texture unswizzling. While the GPU is faster for medium and large textures, the CPU may be more efficient for very small ones. Adjust this to find the balance between GPU acceleration and CPU overhead.</string>
<string name="gpu_unzwizzle_stream_size">GPU Unswizzle Stream Size</string>
<string name="gpu_unzwizzle_stream_size_description">Sets the data limit per frame for unswizzling large textures. Higher values speed up texture loading at the cost of higher frame latency; lower values reduce GPU overhead but may cause visible texture pop-in.</string>
<string name="gpu_unzwizzle_chunk_size">GPU Unswizzle Chunk Size</string>
<string name="gpu_unzwizzle_chunk_size_description">Defines the number of depth slices processed per batch for 3D textures. Increasing this improves throughput efficiency on powerful GPUs but may cause stuttering or driver timeouts on weaker hardware.</string>
<string name="extensions">Extensions</string>
@ -926,6 +933,27 @@
<string name="fast_gpu_medium">Medium (256)</string>
<string name="fast_gpu_high">High (512)</string>
<!-- GPU swizzle texture size -->
<string name="gpu_texturesizeswizzle_verysmall">Very Small (16 MB)</string>
<string name="gpu_texturesizeswizzle_small">Small (32 MB)</string>
<string name="gpu_texturesizeswizzle_normal">Normal (128 MB)</string>
<string name="gpu_texturesizeswizzle_large">Large (256 MB)</string>
<string name="gpu_texturesizeswizzle_verylarge">Very Large (512 MB)</string>
<!-- GPU swizzle streams -->
<string name="gpu_swizzle_verylow">Very Low (4 MB)</string>
<string name="gpu_swizzle_low">Low (8 MB)</string>
<string name="gpu_swizzle_normal">Normal (16 MB)</string>
<string name="gpu_swizzle_medium">Medium (32 MB)</string>
<string name="gpu_swizzle_high">High (64 MB)</string>
<!-- GPU swizzle chunks -->
<string name="gpu_swizzlechunk_verylow">Very Low (32)</string>
<string name="gpu_swizzlechunk_low">Low (64)</string>
<string name="gpu_swizzlechunk_normal">Normal (128)</string>
<string name="gpu_swizzlechunk_medium">Medium (256)</string>
<string name="gpu_swizzlechunk_high">High (512)</string>
<!-- Temperature Units -->
<string name="temperature_celsius">Celsius</string>
<string name="temperature_fahrenheit">Fahrenheit</string>

View File

@ -513,6 +513,24 @@ struct Values {
SwitchableSetting<bool> use_asynchronous_shaders{linkage, false, "use_asynchronous_shaders",
Category::RendererHacks};
SwitchableSetting<GpuUnswizzleSize> gpu_unzwizzle_texture_size{linkage,
GpuUnswizzleSize::Large,
"gpu_unzwizzle_texture_size",
Category::RendererHacks,
Specialization::Default};
SwitchableSetting<GpuUnswizzle> gpu_unzwizzle_stream_size{linkage,
GpuUnswizzle::Medium,
"gpu_unzwizzle_stream_size",
Category::RendererHacks,
Specialization::Default};
SwitchableSetting<GpuUnswizzleChunk> gpu_unzwizzle_chunk_size{linkage,
GpuUnswizzleChunk::Medium,
"gpu_unzwizzle_chunk_size",
Category::RendererHacks,
Specialization::Default};
SwitchableSetting<ExtendedDynamicState> dyna_state{linkage,
#if defined (_WIN32)
ExtendedDynamicState::EDS3,

View File

@ -150,6 +150,9 @@ ENUM(ConsoleMode, Handheld, Docked);
ENUM(AppletMode, HLE, LLE);
ENUM(SpirvOptimizeMode, Never, OnLoad, Always);
ENUM(GpuOverclock, Normal, Medium, High)
ENUM(GpuUnswizzleSize, VerySmall, Small, Normal, Large, VeryLarge)
ENUM(GpuUnswizzle, VeryLow, Low, Normal, Medium, High)
ENUM(GpuUnswizzleChunk, VeryLow, Low, Normal, Medium, High)
ENUM(TemperatureUnits, Celsius, Fahrenheit)
ENUM(ExtendedDynamicState, Disabled, EDS1, EDS2, EDS3);

View File

@ -288,6 +288,22 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QObject* parent)
tr("Fast GPU Time"),
tr("Overclocks the emulated GPU to increase dynamic resolution and render "
"distance.\nUse 256 for maximal performance and 512 for maximal graphics fidelity."));
INSERT(Settings,
gpu_unzwizzle_texture_size,
tr("GPU Unswizzle Max Texture Size"),
tr("Sets the maximum size (MiB) for GPU-based texture unswizzling.\n"
"While the GPU is faster for medium and large textures, the CPU may be more efficient for very small ones.\n"
"Adjust this to find the balance between GPU acceleration and CPU overhead."));
INSERT(Settings,
gpu_unzwizzle_stream_size,
tr("GPU Unswizzle Stream Size"),
tr("Sets the maximum amount of texture data (in MiB) processed per frame.\n"
"Higher values can reduce stutter during texture loading but may impact frame consistency."));
INSERT(Settings,
gpu_unzwizzle_chunk_size,
tr("GPU Unswizzle Chunk Size"),
tr("Determines the number of depth slices processed in a single dispatch.\n"
"Increasing this can improve throughput on high-end GPUs but may cause TDR or driver timeouts on weaker hardware."));
INSERT(Settings,
use_vulkan_driver_pipeline_cache,
@ -719,6 +735,30 @@ std::unique_ptr<ComboboxTranslationMap> ComboboxEnumeration(QObject* parent)
PAIR(GpuOverclock, Medium, tr("Medium (256)")),
PAIR(GpuOverclock, High, tr("High (512)")),
}});
translations->insert({Settings::EnumMetadata<Settings::GpuUnswizzleSize>::Index(),
{
PAIR(GpuUnswizzleSize, VerySmall, tr("Very Small (16 MB)")),
PAIR(GpuUnswizzleSize, Small, tr("Small (32 MB)")),
PAIR(GpuUnswizzleSize, Normal, tr("Normal (128 MB)")),
PAIR(GpuUnswizzleSize, Large, tr("Large (256 MB)")),
PAIR(GpuUnswizzleSize, VeryLarge, tr("Very Large (512 MB)")),
}});
translations->insert({Settings::EnumMetadata<Settings::GpuUnswizzle>::Index(),
{
PAIR(GpuUnswizzle, VeryLow, tr("Very Low (4 MB)")),
PAIR(GpuUnswizzle, Low, tr("Low (8 MB)")),
PAIR(GpuUnswizzle, Normal, tr("Normal (16 MB)")),
PAIR(GpuUnswizzle, Medium, tr("Medium (32 MB)")),
PAIR(GpuUnswizzle, High, tr("High (64 MB)")),
}});
translations->insert({Settings::EnumMetadata<Settings::GpuUnswizzleChunk>::Index(),
{
PAIR(GpuUnswizzleChunk, VeryLow, tr("Very Low (32)")),
PAIR(GpuUnswizzleChunk, Low, tr("Low (64)")),
PAIR(GpuUnswizzleChunk, Normal, tr("Normal (128)")),
PAIR(GpuUnswizzleChunk, Medium, tr("Medium (256)")),
PAIR(GpuUnswizzleChunk, High, tr("High (512)")),
}});
translations->insert({Settings::EnumMetadata<Settings::ExtendedDynamicState>::Index(),
{

View File

@ -18,6 +18,7 @@ set(SHADER_FILES
blit_color_float.frag
block_linear_unswizzle_2d.comp
block_linear_unswizzle_3d.comp
block_linear_unswizzle_3d_bcn.comp
convert_abgr8_srgb_to_d24s8.frag
convert_abgr8_to_d24s8.frag
convert_abgr8_to_d32f.frag

View File

@ -727,70 +727,35 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, ui
}
uint UnquantizeTexelWeight(EncodingData val) {
const uint encoding = Encoding(val);
const uint bitlen = NumBits(val);
const uint bitval = BitValue(val);
const uint A = ReplicateBitTo7((bitval & 1));
uint B = 0, C = 0, D = 0;
uint result = 0;
const uint bitlen_0_results[5] = {0, 16, 32, 48, 64};
switch (encoding) {
case JUST_BITS:
return FastReplicateTo6(bitval, bitlen);
case TRIT: {
uint encoding = Encoding(val), bitlen = NumBits(val), bitval = BitValue(val);
if (encoding == JUST_BITS) {
return (bitlen >= 1 && bitlen <= 5)
? uint(floor(0.5f + float(bitval) * 64.0f / float((1 << bitlen) - 1)))
: FastReplicateTo6(bitval, bitlen);
} else if (encoding == TRIT || encoding == QUINT) {
uint B = 0, C = 0, D = 0;
uint b_mask = (0x3100 >> (bitlen * 4)) & 0xf;
uint b = (bitval >> 1) & b_mask;
D = QuintTritValue(val);
switch (bitlen) {
case 0:
return bitlen_0_results[D * 2];
case 1: {
C = 50;
break;
if (encoding == TRIT) {
switch (bitlen) {
case 0: return D * 32; //0,32,64
case 1: C = 50; break;
case 2: C = 23; B = (b << 6) | (b << 2) | b; break;
case 3: C = 11; B = (b << 5) | b; break;
}
} else if (encoding == QUINT) {
switch (bitlen) {
case 0: return D * 16; //0, 16, 32, 48, 64
case 1: C = 28; break;
case 2: C = 13; B = (b << 6) | (b << 1); break;
}
}
case 2: {
C = 23;
const uint b = (bitval >> 1) & 1;
B = (b << 6) | (b << 2) | b;
break;
}
case 3: {
C = 11;
const uint cb = (bitval >> 1) & 3;
B = (cb << 5) | cb;
break;
}
default:
break;
}
break;
uint A = ReplicateBitTo7(bitval & 1);
uint res = (A & 0x20) | (((D * C + B) ^ A) >> 2);
return res + (res > 32 ? 1 : 0);
}
case QUINT: {
D = QuintTritValue(val);
switch (bitlen) {
case 0:
return bitlen_0_results[D];
case 1: {
C = 28;
break;
}
case 2: {
C = 13;
const uint b = (bitval >> 1) & 1;
B = (b << 6) | (b << 1);
break;
}
}
break;
}
}
if (encoding != JUST_BITS && bitlen > 0) {
result = D * C + B;
result ^= A;
result = (A & 0x20) | (result >> 2);
}
if (result > 32) {
result += 1;
}
return result;
return 0;
}
void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) {
@ -1159,10 +1124,11 @@ void DecompressBlock(ivec3 coord) {
}
uint SwizzleOffset(uvec2 pos) {
const uint x = pos.x;
const uint y = pos.y;
return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16);
return ((pos.x & 32u) << 3u) |
((pos.y & 6u) << 5u) |
((pos.x & 16u) << 1u) |
((pos.y & 1u) << 4u) |
(pos.x & 15u);
}
void main() {

View File

@ -0,0 +1,160 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 2
#else
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 0
#endif
// --- Push Constants / Uniforms ---
#ifdef VULKAN
layout(push_constant) uniform PushConstants {
uvec3 blocks_dim; // Offset 0
uint bytes_per_block_log2; // Offset 12
uvec3 origin; // Offset 16
uint slice_size; // Offset 28
uint block_size; // Offset 32
uint x_shift; // Offset 36
uint block_height; // Offset 40
uint block_height_mask; // Offset 44
uint block_depth; // Offset 48
uint block_depth_mask; // Offset 52
int _pad; // Offset 56
ivec3 destination; // Offset 60
} pc;
#else
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint slice_size;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
UNIFORM(8) uint block_depth;
UNIFORM(9) uint block_depth_mask;
UNIFORM(10) uvec3 blocks_dim;
END_PUSH_CONSTANTS
#define pc // Map pc prefix to nothing for OpenGL compatibility
#endif
// --- Buffers ---
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_BUFFER, std430) writeonly buffer OutputBuffer {
uint out_u32[];
};
// --- Constants ---
layout(local_size_x = 8, local_size_y = 8, local_size_z = 4) in;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
// --- Helpers ---
// Maps an (x, y) byte position inside a 64x8 GOB to its swizzled byte offset
// using the precomputed lookup table (one entry per byte of the GOB).
uint SwizzleOffset(uvec2 pos) {
pos &= SWIZZLE_MASK;
return swizzle_table[pos.y * 64u + pos.x];
}
// Reads one block payload (up to 16 bytes) from the swizzled input buffer at
// a byte offset, through the buffer alias matching the block size.
// pc.bytes_per_block_log2 selects the width: 0 = 8-bit, 1 = 16-bit,
// 2 = 32-bit, 3 = 64-bit, 4 = 128-bit. Unused high components are zero.
uvec4 ReadTexel(uint offset) {
uint bpl2 = pc.bytes_per_block_log2;
switch (bpl2) {
#if HAS_EXTENDED_TYPES
case 0u: return uvec4(u8data[offset], 0u, 0u, 0u);
case 1u: return uvec4(u16data[offset / 2u], 0u, 0u, 0u);
#else
// Without 8/16-bit storage support, extract the sub-word from a 32-bit load.
case 0u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 24u), 8), 0u, 0u, 0u);
case 1u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 16u), 16), 0u, 0u, 0u);
#endif
case 2u: return uvec4(u32data[offset / 4u], 0u, 0u, 0u);
case 3u: return uvec4(u64data[offset / 8u], 0u, 0u);
case 4u: return u128data[offset / 16u];
}
// Unreachable for valid push constants.
return uvec4(0u);
}
// Entry point: each invocation moves one compressed block from its
// block-linear (swizzled) location in the input buffer to its linear
// position in the output buffer.
void main() {
    uvec3 block_coord = gl_GlobalInvocationID;
    // Dispatch is rounded up to the workgroup size; discard the overhang.
    if (any(greaterThanEqual(block_coord, pc.blocks_dim))) {
        return;
    }
    uint bytes_per_block = 1u << pc.bytes_per_block_log2;
    // Origin is in pixels, divide by 4 for block-space (e.g. BCn formats)
    uvec3 pos;
    pos.x = (block_coord.x + (pc.origin.x >> 2u)) * bytes_per_block;
    pos.y = block_coord.y + (pc.origin.y >> 2u);
    pos.z = block_coord.z + pc.origin.z;
    uint swizzle = SwizzleOffset(pos.xy);
    uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
    uint offset = 0u;
    // Apply block-linear offsets: Z super-block, Z-in-block, Y super-block,
    // Y-in-block, X GOB column, then the intra-GOB swizzle.
    offset += (pos.z >> pc.block_depth) * pc.slice_size;
    offset += (pos.z & pc.block_depth_mask) << (GOB_SIZE_SHIFT + pc.block_height);
    offset += (block_y >> pc.block_height) * pc.block_size;
    offset += (block_y & pc.block_height_mask) << GOB_SIZE_SHIFT;
    offset += (pos.x >> GOB_SIZE_X_SHIFT) << pc.x_shift;
    offset += swizzle;
    uvec4 texel = ReadTexel(offset);
    // Calculate linear output index (row-major over blocks_dim).
    uint block_index = block_coord.x +
                       (block_coord.y * pc.blocks_dim.x) +
                       (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
    // Output stride in u32 words per block (2 for BC1/BC4, 4 for BC2/3/5/6/7).
    uint out_idx = block_index * (bytes_per_block >> 2u);
    out_u32[out_idx] = texel.x;
    // BUGFIX: only write texel.y for blocks of 8 bytes or more. The previous
    // unconditional write at out_idx + 1 stomped the next block's data for
    // 4-byte formats, whose output stride is a single u32 word.
    if (pc.bytes_per_block_log2 >= 3u) {
        out_u32[out_idx + 1u] = texel.y;
    }
    if (pc.bytes_per_block_log2 == 4u) {
        out_u32[out_idx + 2u] = texel.z;
        out_u32[out_idx + 3u] = texel.w;
    }
}

View File

@ -556,7 +556,7 @@ void TextureCacheRuntime::Finish() {
glFinish();
}
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) {
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.RequestUploadBuffer(size);
}
@ -651,7 +651,8 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
}
void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const SwizzleParameters> swizzles) {
std::span<const SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
switch (image.info.type) {
case ImageType::e2D:
if (IsPixelFormatASTC(image.info.format)) {

View File

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -72,7 +75,7 @@ public:
void Finish();
StagingBufferMap UploadStagingBuffer(size_t size);
StagingBufferMap UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferMap DownloadStagingBuffer(size_t size, bool deferred = false);
@ -116,7 +119,8 @@ public:
Tegra::Engines::Fermi2D::Operation operation);
void AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles);
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
void InsertUploadMemoryBarrier();
@ -223,6 +227,8 @@ public:
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
private:
void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);

View File

@ -24,6 +24,7 @@
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/host_shaders/block_linear_unswizzle_3d_bcn_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
@ -622,7 +623,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@ -637,9 +638,292 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier);
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier);
});
}
// Descriptor interface of block_linear_unswizzle_3d_bcn.comp: three std430
// storage buffers (swizzle lookup table, swizzled input, linear output), all
// visible to the compute stage only. Binding indices must match the shader's
// BINDING_* defines on the Vulkan path.
constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0;
constexpr u32 BL3D_BINDING_INPUT_BUFFER = 1;
constexpr u32 BL3D_BINDING_OUTPUT_BUFFER = 2;
constexpr std::array<VkDescriptorSetLayoutBinding, 3> BL3D_DESCRIPTOR_SET_BINDINGS{{
{
.binding = BL3D_BINDING_SWIZZLE_TABLE,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // swizzle_table[]
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = BL3D_BINDING_INPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // block-linear input
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = BL3D_BINDING_OUTPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
}};
// Pool accounting for the pass: 3 storage buffers, nothing else.
constexpr DescriptorBankInfo BL3D_BANK_INFO{
.uniform_buffers = 0,
.storage_buffers = 3,
.texture_buffers = 0,
.image_buffers = 0,
.textures = 0,
.images = 0,
.score = 3,
};
// Update-template entries mirror the bindings above, packed as consecutive
// DescriptorUpdateEntry records in the queue's payload.
constexpr std::array<VkDescriptorUpdateTemplateEntry, 3>
BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
{
.dstBinding = BL3D_BINDING_SWIZZLE_TABLE,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_SWIZZLE_TABLE * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = BL3D_BINDING_INPUT_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = BL3D_BINDING_OUTPUT_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_OUTPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
}
}};
// CPU-side mirror of the shader's push-constant block. Field order matches
// the Vulkan declaration in block_linear_unswizzle_3d_bcn.comp.
// NOTE(review): under std430 rules an ivec3 is 16-byte aligned, so the GLSL
// side would place `destination` at offset 64, not 60 as commented here.
// The shader's main() never reads pc.destination, so the mismatch is
// currently benign — confirm before using `destination` in the shader.
struct alignas(16) BlockLinearUnswizzle3DPushConstants {
u32 blocks_dim[3]; // Offset 0
u32 bytes_per_block_log2; // Offset 12
u32 origin[3]; // Offset 16
u32 slice_size; // Offset 28
u32 block_size; // Offset 32
u32 x_shift; // Offset 36
u32 block_height; // Offset 40
u32 block_height_mask; // Offset 44
u32 block_depth; // Offset 48
u32 block_depth_mask; // Offset 52
s32 _pad; // Offset 56
s32 destination[3]; // Offset 60
s32 _pad_end; // Offset 72
};
// Vulkan guarantees at least 128 bytes of push-constant space.
static_assert(sizeof(BlockLinearUnswizzle3DPushConstants) <= 128);
// Builds the compute pass around block_linear_unswizzle_3d_bcn.comp:
// three storage-buffer bindings (swizzle table, input, output) plus a
// push-constant range sized for BlockLinearUnswizzle3DPushConstants.
BlockLinearUnswizzle3DPass::BlockLinearUnswizzle3DPass(
const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
: ComputePass(
device_, descriptor_pool_,
BL3D_DESCRIPTOR_SET_BINDINGS,
BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY,
BL3D_BANK_INFO,
COMPUTE_PUSH_CONSTANT_RANGE<sizeof(BlockLinearUnswizzle3DPushConstants)>,
BLOCK_LINEAR_UNSWIZZLE_3D_BCN_COMP_SPV),
scheduler{scheduler_},
staging_buffer_pool{staging_buffer_pool_},
compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
BlockLinearUnswizzle3DPass::~BlockLinearUnswizzle3DPass() = default;
// Unswizzles a range of Z slices of a block-linear 3D image on the GPU.
// The [z_start, z_start + z_count) range is processed in batches of at most
// MAX_BATCH_SLICES slices so the intermediate linear buffer stays bounded.
// Expects exactly one SwizzleParameters entry (single-level upload).
void BlockLinearUnswizzle3DPass::Unswizzle(
Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count)
{
using namespace VideoCommon::Accelerated;
// Batch size is capped by the image depth; z_count may exceed one batch.
const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth);
if (!image.has_compute_unswizzle_buffer) {
// Allocate exactly what this batch needs
image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES);
}
ASSERT(swizzles.size() == 1);
const auto& sw = swizzles[0];
const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info);
// Block counts for 4x4 compressed (BCn) blocks, rounded up.
const u32 blocks_x = (image.info.size.width + 3) / 4;
const u32 blocks_y = (image.info.size.height + 3) / 4;
scheduler.RequestOutsideRenderPassOperationContext();
// Walk the Z range one batch at a time; the final batch may be smaller.
for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) {
const u32 current_chunk_slices = std::min(MAX_BATCH_SLICES, z_count - z_offset);
const u32 current_z_start = z_start + z_offset;
UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
current_z_start, current_chunk_slices);
}
}
// Dispatches the unswizzle shader for one Z chunk, then copies the linear
// result from the intermediate buffer into the destination image at z_start.
// Sequence: dispatch -> (buffer compute->transfer barrier + image layout
// transition) -> CopyBufferToImage -> transition image to GENERAL.
// NOTE(review): scheduler.Finish() at the end fully stalls the CPU per chunk,
// presumably so the staging/output buffers can be reused safely — confirm
// whether a lighter-weight fence would suffice.
void BlockLinearUnswizzle3DPass::UnswizzleChunk(
Image& image,
const StagingBufferRef& swizzled,
const VideoCommon::SwizzleParameters& sw,
const BlockLinearSwizzle3DParams& params,
u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count)
{
// Fill the push constants; blocks_dim.z limits the shader to this chunk.
BlockLinearUnswizzle3DPushConstants pc{};
pc.origin[0] = params.origin[0];
pc.origin[1] = params.origin[1];
pc.origin[2] = z_start; // Current chunk's Z start
pc.destination[0] = params.destination[0];
pc.destination[1] = params.destination[1];
pc.destination[2] = 0; // Shader writes to start of output buffer
pc.bytes_per_block_log2 = params.bytes_per_block_log2;
pc.slice_size = params.slice_size;
pc.block_size = params.block_size;
pc.x_shift = params.x_shift;
pc.block_height = params.block_height;
pc.block_height_mask = params.block_height_mask;
pc.block_depth = params.block_depth;
pc.block_depth_mask = params.block_depth_mask;
pc.blocks_dim[0] = blocks_x;
pc.blocks_dim[1] = blocks_y;
pc.blocks_dim[2] = z_count; // Only process the count
// Bind the three storage buffers in shader-binding order:
// swizzle table, swizzled source (offset into the staging upload), output.
compute_pass_descriptor_queue.Acquire();
compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0,
image.runtime->swizzle_table_size);
compute_pass_descriptor_queue.AddBuffer(swizzled.buffer,
sw.buffer_offset + swizzled.offset,
image.guest_size_bytes - sw.buffer_offset);
compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0,
image.compute_unswizzle_buffer_size);
const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
const VkDescriptorSet set = descriptor_allocator.Commit();
// Workgroup counts match the shader's local_size of 8x8x4.
const u32 gx = Common::DivCeil(blocks_x, 8u);
const u32 gy = Common::DivCeil(blocks_y, 8u);
const u32 gz = Common::DivCeil(z_count, 4u);
const u32 bytes_per_block = 1u << pc.bytes_per_block_log2;
// Bytes the shader wrote this chunk; used to scope the buffer barrier.
const VkDeviceSize output_slice_size =
static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
const VkDeviceSize barrier_size = output_slice_size * z_count;
// First chunk transitions the image from UNDEFINED; later chunks keep
// TRANSFER_DST_OPTIMAL and order against the previous chunk's copy.
const bool is_first_chunk = (z_start == 0);
// Capture plain handles by value; the lambda runs later on the worker.
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
const u32 image_width = image.info.size.width;
const u32 image_height = image.info.size.height;
scheduler.Record([this, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
barrier_size, is_first_chunk, out_buffer, dst_image, aspect,
image_width, image_height
](vk::CommandBuffer cmdbuf) {
if (dst_image == VK_NULL_HANDLE || out_buffer == VK_NULL_HANDLE) {
return;
}
device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
cmdbuf.Dispatch(gx, gy, gz);
// Single barrier for compute -> transfer (buffer ready, image transition)
const VkBufferMemoryBarrier buffer_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = out_buffer,
.offset = 0,
.size = barrier_size,
};
// Image layout transition
const VkImageMemoryBarrier pre_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = is_first_chunk ? VkAccessFlags{} :
static_cast<VkAccessFlags>(VK_ACCESS_TRANSFER_WRITE_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED :
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = dst_image,
// Mip 0, layer 0 only — 3D images have a single array layer.
.subresourceRange = {aspect, 0, 1, 0, 1},
};
// Single barrier handles both buffer and image
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0,
nullptr, buffer_barrier, pre_barrier
);
// Copy chunk to correct Z position in image
const VkBufferImageCopy copy{
.bufferOffset = 0, // Read from start of staging buffer
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource = {aspect, 0, 0, 1},
.imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z
.imageExtent = {image_width, image_height, z_count},
};
cmdbuf.CopyBufferToImage(out_buffer, dst_image,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
// Post-copy transition
const VkImageMemoryBarrier post_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
nullptr, nullptr, post_barrier
);
});
// Full CPU/GPU sync per chunk (see NOTE above).
scheduler.Finish();
}
MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_,

View File

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -14,6 +17,7 @@
#include "video_core/texture_cache/types.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/texture_cache/accelerated_swizzle.h"
namespace VideoCommon {
struct SwizzleParameters;
@ -21,6 +25,8 @@ struct SwizzleParameters;
namespace Vulkan {
using VideoCommon::Accelerated::BlockLinearSwizzle3DParams;
class Device;
class StagingBufferPool;
class Scheduler;
@ -131,6 +137,34 @@ private:
MemoryAllocator& memory_allocator;
};
// Compute pass that unswizzles block-linear 3D BCn texture data on the GPU,
// avoiding the CPU-side deswizzle during large texture uploads. Unimplemented
// for the OpenGL backend (Vulkan only).
class BlockLinearUnswizzle3DPass final : public ComputePass {
public:
explicit BlockLinearUnswizzle3DPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
~BlockLinearUnswizzle3DPass();
// Unswizzles the given swizzled staging data into `image`, restricted to the
// depth range [z_start, z_start + z_count). `swizzles` describes the
// block-linear layout of each upload region.
void Unswizzle(Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
// Processes one streamed chunk of the unswizzle: dispatches the compute
// shader for `z_count` slices starting at `z_start`, using precomputed
// block-linear parameters and block extents (blocks_x x blocks_y per slice).
void UnswizzleChunk(
Image& image,
const StagingBufferRef& swizzled,
const VideoCommon::SwizzleParameters& sw,
const BlockLinearSwizzle3DParams& params,
u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count);
private:
Scheduler& scheduler;
StagingBufferPool& staging_buffer_pool;
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
class MSAACopyPass final : public ComputePass {
public:
explicit MSAACopyPass(const Device& device_, Scheduler& scheduler_,

View File

@ -43,6 +43,16 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_)
: device{device_}, state_tracker{state_tracker_},
master_semaphore{std::make_unique<MasterSemaphore>(device)},
command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} {
/*// PRE-OPTIMIZATION: Warm up the pool to prevent mid-frame spikes
{
std::scoped_lock rl{reserve_mutex};
chunk_reserve.reserve(2048); // Prevent vector resizing
for (int i = 0; i < 1024; ++i) {
chunk_reserve.push_back(std::make_unique<CommandChunk>());
}
}*/
AcquireNewChunk();
AllocateWorkerCommandBuffer();
worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); });

View File

@ -24,12 +24,14 @@
#include "video_core/renderer_vulkan/vk_render_pass_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/util.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/textures/decoders.h"
namespace Vulkan {
@ -878,14 +880,51 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched
}
}
}
bl3d_unswizzle_pass.emplace(device, scheduler, descriptor_pool,
staging_buffer_pool, compute_pass_descriptor_queue);
// --- Create swizzle table buffer ---
{
auto table = Tegra::Texture::MakeSwizzleTable();
swizzle_table_size = static_cast<VkDeviceSize>(table.size() * sizeof(table[0]));
auto staging = staging_buffer_pool.Request(swizzle_table_size, MemoryUsage::Upload);
std::memcpy(staging.mapped_span.data(), table.data(), static_cast<size_t>(swizzle_table_size));
VkBufferCreateInfo ci{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = swizzle_table_size,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
};
swizzle_table_buffer = memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([staging_buf = staging.buffer,
dst_buf = *swizzle_table_buffer,
size = swizzle_table_size,
src_off = staging.offset](vk::CommandBuffer cmdbuf) {
const VkBufferCopy region{
.srcOffset = src_off,
.dstOffset = 0,
.size = size,
};
cmdbuf.CopyBuffer(staging_buf, dst_buf, region);
});
}
}
// Delegates to the scheduler to complete all work recorded so far.
void TextureCacheRuntime::Finish() {
scheduler.Finish();
}
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload);
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload, deferred);
}
StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) {
@ -1581,6 +1620,46 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
// Defaulted: the vk:: wrapper members release their own Vulkan handles.
Image::~Image() = default;
// Lazily allocates the device-local storage buffer that the GPU unswizzle
// compute pass writes into before the result is copied to the image.
// `max_slices` caps how many depth slices the buffer must hold at once
// (the streaming system processes the texture in slice batches).
// No-op if the buffer already exists.
void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) {
    if (has_compute_unswizzle_buffer) {
        return;
    }
    using VideoCore::Surface::BytesPerBlock;
    // BCn formats compress 4x4x1 texel blocks.
    constexpr u32 BCN_BLOCK_DIM = 4;
    const u32 bytes_per_block = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H
    const u32 num_blocks_x = (info.size.width + BCN_BLOCK_DIM - 1) / BCN_BLOCK_DIM;
    const u32 num_blocks_y = (info.size.height + BCN_BLOCK_DIM - 1) / BCN_BLOCK_DIM;
    const u32 num_blocks_z = std::min(max_slices, info.size.depth);
    // Accumulate in 64 bits: large 3D textures can exceed 32-bit byte counts.
    const u64 total_blocks =
        static_cast<u64>(num_blocks_x) *
        static_cast<u64>(num_blocks_y) *
        static_cast<u64>(num_blocks_z);
    compute_unswizzle_buffer_size = total_blocks * bytes_per_block;
    const VkBufferCreateInfo buffer_ci{
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .pNext = nullptr,
        .flags = 0,
        .size = compute_unswizzle_buffer_size,
        // Written by the compute shader, then copied out to the image.
        .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                 VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
        .queueFamilyIndexCount = 0,
        .pQueueFamilyIndices = nullptr,
    };
    compute_unswizzle_buffer =
        runtime->memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
    has_compute_unswizzle_buffer = true;
}
void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
// TODO: Move this to another API
@ -2397,10 +2476,22 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
void TextureCacheRuntime::AccelerateImageUpload(
Image& image, const StagingBufferRef& map,
std::span<const VideoCommon::SwizzleParameters> swizzles) {
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
if (IsPixelFormatASTC(image.info.format)) {
return astc_decoder_pass->Assemble(image, map, swizzles);
}
if (bl3d_unswizzle_pass &&
IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1) {
return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count);
}
ASSERT(false);
}

View File

@ -51,7 +51,7 @@ public:
void Finish();
StagingBufferRef UploadStagingBuffer(size_t size);
StagingBufferRef UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false);
@ -91,7 +91,8 @@ public:
}
void AccelerateImageUpload(Image&, const StagingBufferRef&,
std::span<const VideoCommon::SwizzleParameters>);
std::span<const VideoCommon::SwizzleParameters>,
u32 z_start, u32 z_count);
void InsertUploadMemoryBarrier() {}
@ -127,6 +128,11 @@ public:
BlitImageHelper& blit_image_helper;
RenderPassCache& render_pass_cache;
std::optional<ASTCDecoderPass> astc_decoder_pass;
std::optional<BlockLinearUnswizzle3DPass> bl3d_unswizzle_pass;
vk::Buffer swizzle_table_buffer;
VkDeviceSize swizzle_table_size = 0;
std::unique_ptr<MSAACopyPass> msaa_copy_pass;
const Settings::ResolutionScalingInfo& resolution;
std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats;
@ -164,6 +170,8 @@ public:
void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
void AllocateComputeUnswizzleImage();
[[nodiscard]] VkImage Handle() const noexcept {
return *(this->*current_image);
}
@ -189,6 +197,10 @@ public:
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
friend class BlockLinearUnswizzle3DPass;
private:
bool BlitScaleHelper(bool scale_up);
@ -200,6 +212,12 @@ private:
vk::Image original_image;
vk::Image scaled_image;
vk::Buffer compute_unswizzle_buffer;
VkDeviceSize compute_unswizzle_buffer_size = 0;
bool has_compute_unswizzle_buffer = false;
void AllocateComputeUnswizzleBuffer(u32 max_slices);
// Use a pointer to field because it is relative, so that the object can be
// moved without breaking the reference.
vk::Image Image::*current_image{};

View File

@ -8,6 +8,7 @@
#include <limits>
#include <optional>
#include <bit>
#include <unordered_set>
#include <boost/container/small_vector.hpp>
@ -22,6 +23,7 @@
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/texture_cache/util.h"
#include "video_core/textures/decoders.h"
namespace VideoCommon {
@ -68,10 +70,41 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
(std::max)((std::min)(device_local_memory - min_vacancy_critical, min_spacing_critical),
DEFAULT_CRITICAL_MEMORY));
minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
lowmemorydevice = false;
} else {
expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
minimum_memory = 0;
lowmemorydevice = true;
}
switch (Settings::values.gpu_unzwizzle_texture_size.GetValue()) {
case Settings::GpuUnswizzleSize::VerySmall: gpu_unswizzle_maxsize = 16_MiB; break;
case Settings::GpuUnswizzleSize::Small: gpu_unswizzle_maxsize = 32_MiB; break;
case Settings::GpuUnswizzleSize::Normal: gpu_unswizzle_maxsize = 128_MiB; break;
case Settings::GpuUnswizzleSize::Large: gpu_unswizzle_maxsize = 256_MiB; break;
case Settings::GpuUnswizzleSize::VeryLarge: gpu_unswizzle_maxsize = 512_MiB; break;
default: gpu_unswizzle_maxsize = 128_MiB; break;
}
switch (Settings::values.gpu_unzwizzle_stream_size.GetValue()) {
case Settings::GpuUnswizzle::VeryLow: swizzle_chunk_size = 4_MiB; break;
case Settings::GpuUnswizzle::Low: swizzle_chunk_size = 8_MiB; break;
case Settings::GpuUnswizzle::Normal: swizzle_chunk_size = 16_MiB; break;
case Settings::GpuUnswizzle::Medium: swizzle_chunk_size = 32_MiB; break;
case Settings::GpuUnswizzle::High: swizzle_chunk_size = 64_MiB; break;
default: swizzle_chunk_size = 16_MiB;
}
switch (Settings::values.gpu_unzwizzle_chunk_size.GetValue()) {
case Settings::GpuUnswizzleChunk::VeryLow: swizzle_slices_per_batch = 32; break;
case Settings::GpuUnswizzleChunk::Low: swizzle_slices_per_batch = 64; break;
case Settings::GpuUnswizzleChunk::Normal: swizzle_slices_per_batch = 128; break;
case Settings::GpuUnswizzleChunk::Medium: swizzle_slices_per_batch = 256; break;
case Settings::GpuUnswizzleChunk::High: swizzle_slices_per_batch = 512; break;
default: swizzle_slices_per_batch = 128;
}
}
@ -88,6 +121,7 @@ void TextureCache<P>::RunGarbageCollector() {
ticks_to_destroy = aggressive_mode ? 10ULL : high_priority_mode ? 25ULL : 50ULL;
num_iterations = aggressive_mode ? 40 : (high_priority_mode ? 20 : 10);
};
const auto Cleanup = [this, &num_iterations, &high_priority_mode,
&aggressive_mode](ImageId image_id) {
if (num_iterations == 0) {
@ -95,20 +129,36 @@ void TextureCache<P>::RunGarbageCollector() {
}
--num_iterations;
auto& image = slot_images[image_id];
// Never delete recently allocated sparse textures (within 3 frames)
const bool is_recently_allocated = image.allocation_tick >= frame_tick - 3;
if (is_recently_allocated && image.info.is_sparse) {
return false;
}
if (True(image.flags & ImageFlagBits::IsDecoding)) {
// This image is still being decoded, deleting it will invalidate the slot
// used by the async decoder thread.
return false;
}
if (!aggressive_mode && True(image.flags & ImageFlagBits::CostlyLoad)) {
// Prioritize large sparse textures for cleanup
const bool is_large_sparse = lowmemorydevice &&
image.info.is_sparse &&
image.guest_size_bytes >= 256_MiB;
if (!aggressive_mode && !is_large_sparse &&
True(image.flags & ImageFlagBits::CostlyLoad)) {
return false;
}
const bool must_download =
image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap);
if (!high_priority_mode && must_download) {
if (!high_priority_mode && !is_large_sparse && must_download) {
return false;
}
if (must_download) {
if (must_download && !is_large_sparse) {
auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
const auto copies = FixSmallVectorADL(FullDownloadCopies(image.info));
image.DownloadMemory(map, copies);
@ -116,11 +166,13 @@ void TextureCache<P>::RunGarbageCollector() {
SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span,
swizzle_data_buffer);
}
if (True(image.flags & ImageFlagBits::Tracked)) {
UntrackImage(image, image_id);
}
UnregisterImage(image_id);
DeleteImage(image_id, image.scale_tick > frame_tick + 5);
if (total_used_memory < critical_memory) {
if (aggressive_mode) {
// Sink the aggresiveness.
@ -136,7 +188,24 @@ void TextureCache<P>::RunGarbageCollector() {
return false;
};
// Try to remove anything old enough and not high priority.
// Aggressively clear massive sparse textures
if (total_used_memory >= expected_memory) {
lru_cache.ForEachItemBelow(frame_tick, [&](ImageId image_id) {
auto& image = slot_images[image_id];
// Only target sparse textures that are old enough
if (lowmemorydevice &&
image.info.is_sparse &&
image.guest_size_bytes >= 256_MiB &&
image.allocation_tick < frame_tick - 3) {
LOG_DEBUG(HW_GPU, "GC targeting old sparse texture at 0x{:X} ({} MiB, age: {} frames)",
image.gpu_addr, image.guest_size_bytes / (1024 * 1024),
frame_tick - image.allocation_tick);
return Cleanup(image_id);
}
return false;
});
}
Configure(false);
lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, Cleanup);
@ -160,6 +229,7 @@ void TextureCache<P>::TickFrame() {
sentenced_framebuffers.Tick();
sentenced_image_view.Tick();
TickAsyncDecode();
TickAsyncUnswizzle();
runtime.TickFrame();
++frame_tick;
@ -627,7 +697,6 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
UntrackImage(image, id);
}
}
if (True(image.flags & ImageFlagBits::Remapped)) {
continue;
}
@ -1055,7 +1124,12 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
// Only upload modified images
return;
}
image.flags &= ~ImageFlagBits::CpuModified;
if( lowmemorydevice && image.info.format == PixelFormat::BC1_RGBA_UNORM && MapSizeBytes(image) >= 256_MiB ) {
return;
}
TrackImage(image, image_id);
if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
@ -1067,6 +1141,16 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
QueueAsyncDecode(image, image_id);
return;
}
if (IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1 &&
MapSizeBytes(image) >= gpu_unswizzle_maxsize &&
False(image.flags & ImageFlagBits::GpuModified)) {
QueueAsyncUnswizzle(image, image_id);
return;
}
auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, staging);
runtime.InsertUploadMemoryBarrier();
@ -1082,7 +1166,7 @@ void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging)
gpu_memory->ReadBlock(gpu_addr, mapped_span.data(), mapped_span.size_bytes(),
VideoCommon::CacheType::NoTextureCache);
const auto uploads = FullUploadSwizzles(image.info);
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads));
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads), 0, 0);
return;
}
@ -1311,6 +1395,20 @@ void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
texture_decode_worker.QueueWork(std::move(func));
}
// Enqueues `image` for the streamed GPU unswizzle performed across frames by
// TickAsyncUnswizzle(). Marks the image as decoding so it is neither queued
// twice nor destroyed while the upload is in flight.
template <class P>
void TextureCache<P>::QueueAsyncUnswizzle(Image& image, ImageId image_id) {
    // Already queued or mid-decode: nothing to do.
    if (True(image.flags & ImageFlagBits::IsDecoding)) {
        return;
    }
    image.flags |= ImageFlagBits::IsDecoding;
    PendingUnswizzle task{};
    task.image_id = image_id;
    task.info = image.info;
    unswizzle_queue.push_back(std::move(task));
}
template <class P>
void TextureCache<P>::TickAsyncDecode() {
bool has_uploads{};
@ -1336,6 +1434,83 @@ void TextureCache<P>::TickAsyncDecode() {
}
}
// Advances the streamed GPU-unswizzle state machine by one frame. Each tick:
//  1. lazily sizes the task and requests a deferred staging buffer,
//  2. reads up to one slice-aligned chunk of guest memory into the staging
//     buffer,
//  3. once enough whole slices are buffered, submits a batch to the GPU
//     unswizzle pass,
//  4. on completion, frees the staging buffer, clears the decoding flag, and
//     throttles before starting the next queued texture.
// Processes only the front of the queue; one texture streams at a time.
template <class P>
void TextureCache<P>::TickAsyncUnswizzle() {
    if (unswizzle_queue.empty()) {
        return;
    }
    // Inter-texture cooldown: skip this frame while the counter drains.
    if (current_unswizzle_frame > 0) {
        current_unswizzle_frame--;
        return;
    }
    PendingUnswizzle& task = unswizzle_queue.front();
    Image& image = slot_images[task.image_id];
    if (!task.initialized) {
        task.total_size = MapSizeBytes(image);
        // Deferred staging buffer: freed explicitly once the task completes.
        task.staging_buffer = runtime.UploadStagingBuffer(task.total_size, true);
        const auto& info = image.info;
        // BCn: 4x4 texel blocks; bytes_per_slice is the unpadded pitch of one
        // depth slice in block units.
        const u32 bytes_per_block = BytesPerBlock(info.format);
        const u32 width_blocks = Common::DivCeil(info.size.width, 4u);
        const u32 height_blocks = Common::DivCeil(info.size.height, 4u);
        const u32 stride = width_blocks * bytes_per_block;
        const u32 aligned_height = height_blocks;
        task.bytes_per_slice = static_cast<size_t>(stride) * aligned_height;
        task.last_submitted_offset = 0;
        task.initialized = true;
    }
    // Read the next chunk of swizzled guest data.
    if (task.current_offset < task.total_size) {
        const size_t remaining = task.total_size - task.current_offset;
        size_t copy_amount = std::min(swizzle_chunk_size, remaining);
        if (remaining > swizzle_chunk_size) {
            // Round down to whole slices so batches stay slice-aligned.
            copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice;
            if (copy_amount == 0) {
                // A single slice is larger than the chunk size; take one slice.
                copy_amount = task.bytes_per_slice;
            }
            // Never read past the end of the staging buffer / guest range:
            // the one-slice fallback above may exceed what is left.
            copy_amount = std::min(copy_amount, remaining);
        }
        gpu_memory->ReadBlock(image.gpu_addr + task.current_offset,
                              task.staging_buffer.mapped_span.data() + task.current_offset,
                              copy_amount);
        task.current_offset += copy_amount;
    }
    const bool is_final_batch = task.current_offset >= task.total_size;
    const size_t bytes_ready = task.current_offset - task.last_submitted_offset;
    const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
    // Submit once a full batch is buffered, or whatever is left on the final read.
    if (complete_slices >= swizzle_slices_per_batch || (is_final_batch && complete_slices > 0)) {
        const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
        const u32 slices_to_process = std::min(complete_slices, swizzle_slices_per_batch);
        // Clamp to the image depth so the last batch cannot overrun.
        const u32 z_count = std::min(slices_to_process, image.info.size.depth - z_start);
        if (z_count > 0) {
            const auto uploads = FullUploadSwizzles(task.info);
            runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads),
                                          z_start, z_count);
            task.last_submitted_offset += static_cast<size_t>(z_count) * task.bytes_per_slice;
        }
    }
    // Retire the task once every slice has been read and submitted.
    const u32 slices_submitted = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
    const bool all_slices_submitted = slices_submitted >= image.info.size.depth;
    if (is_final_batch && all_slices_submitted) {
        runtime.FreeDeferredStagingBuffer(task.staging_buffer);
        image.flags &= ~ImageFlagBits::IsDecoding;
        unswizzle_queue.pop_front();
        // Wait 4 frames before processing the next queued texture.
        current_unswizzle_frame = 4u;
    }
}
template <class P>
bool TextureCache<P>::ScaleUp(Image& image) {
const bool has_copy = image.HasScaled();
@ -1374,6 +1549,39 @@ ImageId TextureCache<P>::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
}
}
ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr);
// For large sparse textures, aggressively clean up old allocations at same address
if (lowmemorydevice && info.is_sparse && CalculateGuestSizeInBytes(info) >= 256_MiB) {
const auto alloc_it = image_allocs_table.find(gpu_addr);
if (alloc_it != image_allocs_table.end()) {
const ImageAllocId alloc_id = alloc_it->second;
auto& alloc_images = slot_image_allocs[alloc_id].images;
// Collect old images at this address that were created more than 2 frames ago
boost::container::small_vector<ImageId, 4> to_delete;
for (ImageId old_image_id : alloc_images) {
Image& old_image = slot_images[old_image_id];
if (old_image.info.is_sparse &&
old_image.gpu_addr == gpu_addr &&
old_image.allocation_tick < frame_tick - 2) { // Try not to delete fresh textures
to_delete.push_back(old_image_id);
}
}
// Delete old images immediately
for (ImageId old_id : to_delete) {
Image& old_image = slot_images[old_id];
LOG_DEBUG(HW_GPU, "Immediately deleting old sparse texture at 0x{:X} ({} MiB)",
gpu_addr, old_image.guest_size_bytes / (1024 * 1024));
if (True(old_image.flags & ImageFlagBits::Tracked)) {
UntrackImage(old_image, old_id);
}
UnregisterImage(old_id);
DeleteImage(old_id, true);
}
}
}
const ImageId image_id = JoinImages(info, gpu_addr, *cpu_addr);
const Image& image = slot_images[image_id];
// Using "image.gpu_addr" instead of "gpu_addr" is important because it might be different
@ -1389,6 +1597,27 @@ template <class P>
ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DAddr cpu_addr) {
ImageInfo new_info = info;
const size_t size_bytes = CalculateGuestSizeInBytes(new_info);
// Proactive cleanup for large sparse texture allocations
if (lowmemorydevice && new_info.is_sparse && size_bytes >= 256_MiB) {
const u64 estimated_alloc_size = size_bytes;
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_DEBUG(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC. "
"Current memory: {} MiB, Critical: {} MiB",
size_bytes / (1024 * 1024),
total_used_memory / (1024 * 1024),
critical_memory / (1024 * 1024));
RunGarbageCollector();
// If still over threshold after GC, try one more aggressive pass
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_DEBUG(HW_GPU, "Still critically low on memory, running second GC pass");
RunGarbageCollector();
}
}
}
const bool broken_views = runtime.HasBrokenTextureViewFormats();
const bool native_bgr = runtime.HasNativeBgr();
join_overlap_ids.clear();
@ -1485,6 +1714,8 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA
const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
Image& new_image = slot_images[new_image_id];
new_image.allocation_tick = frame_tick;
if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes) &&
new_info.is_sparse) {
new_image.flags |= ImageFlagBits::Sparse;

View File

@ -129,6 +129,17 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI
using AsyncBuffer = typename P::AsyncBuffer;
using BufferType = typename P::BufferType;
struct PendingUnswizzle {
ImageId image_id;
VideoCommon::ImageInfo info;
size_t current_offset = 0;
size_t total_size = 0;
AsyncBuffer staging_buffer;
size_t last_submitted_offset = 0;
size_t bytes_per_slice;
bool initialized = false;
};
struct BlitImages {
ImageId dst_id;
ImageId src_id;
@ -433,6 +444,9 @@ private:
void TrimInactiveSamplers(size_t budget);
std::optional<size_t> QuerySamplerBudget() const;
void QueueAsyncUnswizzle(Image& image, ImageId image_id);
void TickAsyncUnswizzle();
Runtime& runtime;
Tegra::MaxwellDeviceMemoryManager& device_memory;
@ -453,6 +467,10 @@ private:
u64 minimum_memory;
u64 expected_memory;
u64 critical_memory;
bool lowmemorydevice = false;
size_t gpu_unswizzle_maxsize = 0;
size_t swizzle_chunk_size = 0;
u32 swizzle_slices_per_batch = 0;
struct BufferDownload {
GPUVAddr address;
@ -508,6 +526,9 @@ private:
Common::ThreadWorker texture_decode_worker{1, "TextureDecoder"};
std::vector<std::unique_ptr<AsyncDecodeContext>> async_decodes;
std::deque<PendingUnswizzle> unswizzle_queue;
u8 current_unswizzle_frame;
// Join caching
boost::container::small_vector<ImageId, 4> join_overlap_ids;
std::unordered_set<ImageId> join_overlaps_found;