From 670764b535c558f049a855d9116ba2c32999c470 Mon Sep 17 00:00:00 2001 From: wildcard Date: Mon, 2 Feb 2026 00:12:32 +0100 Subject: [PATCH] [astc] add support for workgroup in astc --- src/video_core/host_shaders/astc_decoder.comp | 688 ++++++++++-------- .../renderer_vulkan/vk_compute_pass.cpp | 4 +- 2 files changed, 374 insertions(+), 318 deletions(-) diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index da21b4bde8..3e3d2dcacf 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -36,6 +36,12 @@ struct EncodingData { uint data; }; +struct PartitionTable { + uint s1, s2, s3, s4, s5, s6, s7, s8; + uint rnum; + bool small_block; +}; + layout(binding = BINDING_INPUT_BUFFER, std430) readonly restrict buffer InputBufferU32 { uvec4 astc_data[]; }; @@ -62,26 +68,40 @@ const uint encoding_values[22] = uint[]( (QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)), (TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u))); -// Input ASTC texture globals -int total_bitsread = 0; -uvec4 local_buff; +// Shared memory for workgroup processing +shared uvec4 local_buff; +shared int total_bitsread; // Color data globals -uvec4 color_endpoint_data; -int color_bitsread = 0; +shared uvec4 color_endpoint_data; +shared int color_bitsread; // Global "vector" to be pushed into when decoding -// At most will require BLOCK_WIDTH x BLOCK_HEIGHT in single plane mode -// At most will require BLOCK_WIDTH x BLOCK_HEIGHT x 2 in dual plane mode -// So the maximum would be 144 (12 x 12) elements, x 2 for two planes #define DIVCEIL(number, divisor) (number + divisor - 1) / divisor #define ARRAY_NUM_ELEMENTS 144 #define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4) -uint result_vector[ARRAY_NUM_ELEMENTS * 2]; +shared uint result_vector[ARRAY_NUM_ELEMENTS * 2]; -int result_index = 0; -uint result_vector_max_index; -bool result_limit_reached = false; +shared int result_index; +shared uint result_vector_max_index; +shared bool result_limit_reached; + +// avoid intermediate result_vector storage during color decode phase +shared bool write_color_values; +shared uint color_values_direct[32]; +shared uint color_out_index; +shared uint color_num_values; + +// Shared variables for DecompressBlock interthread communication +shared uvec4 endpoints0[4]; +shared uvec4 endpoints1[4]; +shared PartitionTable pt; +shared uvec2 size_params; +shared uint num_partitions; +shared uint partition_index; +shared uint plane_index; +shared bool dual_plane; +shared vec4 fill_color; // EncodingData helpers uint Encoding(EncodingData val) { @@ -114,9 +134,110 @@ EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint return EncodingData(((encoding) << 0u) | ((num_bits) << 8u) | ((bit_val) << 16u) | ((quint_trit_val) << 24u)); } +uint ReplicateBitTo9(uint bit); +uint FastReplicateTo8(uint value, uint num_bits); + +void EmitColorValue(EncodingData val) { + // write directly to color_values_direct[] + const uint encoding = Encoding(val); + const uint bitlen = NumBits(val); + const uint bitval = BitValue(val); + + if (encoding == JUST_BITS) { + color_values_direct[++color_out_index] = FastReplicateTo8(bitval, bitlen); + return; + } + + uint A = ReplicateBitTo9((bitval & 1)); + uint B = 0, C = 0, D = QuintTritValue(val); + + if (encoding == TRIT) { + switch (bitlen) { + case 1: + C = 204; + break; + case 2: { + C = 93; + const uint b = (bitval >> 1) & 1; + B = (b << 8) | (b << 4) | (b << 2) | (b << 1); + break; + } + case 3: { + C = 44; + const uint cb = (bitval >> 1) & 3; + B = (cb << 7) | (cb << 2) | cb; + break; + } + case 4: { + C = 22; + const uint dcb = (bitval >> 1) & 7; + B = (dcb << 6) | dcb; + break; + } + case 5: { + C = 11; + const uint edcb = (bitval >> 1) & 0xF; + B = (edcb << 5) | (edcb >> 2); + break; + } + case 6: { + C = 5; + const uint fedcb = (bitval >> 1) & 0x1F; + B = (fedcb << 4) | (fedcb >> 4); + break; + } + } + } else { // QUINT + switch (bitlen) { + case 1: + C = 113; + break; + case 2: { + C = 54; + const uint b = (bitval >> 1) & 1; + B = (b << 8) | (b << 3) | (b << 2); + break; + } + case 3: { + C = 26; + const uint cb = (bitval >> 1) & 3; + B = (cb << 7) | (cb << 1) | (cb >> 1); + break; + } + case 4: { + C = 13; + const uint dcb = (bitval >> 1) & 7; + B = (dcb << 6) | (dcb >> 1); + break; + } + case 5: { + C = 6; + const uint edcb = (bitval >> 1) & 0xF; + B = (edcb << 5) | (edcb >> 3); + break; + } + } + } + + uint T = (D * C) + B; + T ^= A; + T = (A & 0x80) | (T >> 2); + color_values_direct[++color_out_index] = T; +} + void ResultEmplaceBack(EncodingData val) { + if (write_color_values) { + if (color_out_index >= color_num_values) { + // avoid decoding more than needed by this phase + result_limit_reached = true; + return; + } + EmitColorValue(val); + return; + } + if (result_index >= result_vector_max_index) { // Alert callers to avoid decoding more than needed by this phase result_limit_reached = true; @@ -197,32 +318,31 @@ uint Hash52(uint p) { return p; } -uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) { - if ((block_dims.y * block_dims.x) < 32) { - x <<= 1; - y <<= 1; - } + +PartitionTable GetPartitionTable(uint seed, uint partition_count) { + PartitionTable pt; + pt.small_block = (block_dims.y * block_dims.x) < 32; seed += (partition_count - 1) * 1024; + uint rnum = Hash52(uint(seed)); + pt.rnum = rnum; - const uint rnum = Hash52(uint(seed)); - uint seed1 = uint(rnum & 0xF); - uint seed2 = uint((rnum >> 4) & 0xF); - uint seed3 = uint((rnum >> 8) & 0xF); - uint seed4 = uint((rnum >> 12) & 0xF); - uint seed5 = uint((rnum >> 16) & 0xF); - uint seed6 = uint((rnum >> 20) & 0xF); - uint seed7 = uint((rnum >> 24) & 0xF); - uint seed8 = uint((rnum >> 28) & 0xF); - - seed1 = (seed1 * seed1); - seed2 = (seed2 * seed2); - seed3 = (seed3 * seed3); - seed4 = (seed4 * seed4); - seed5 = (seed5 * seed5); - seed6 = (seed6 * seed6); - seed7 = (seed7 * seed7); - seed8 = (seed8 * seed8); + uint seed1 = (rnum & 0xF); + seed1 *= seed1; + uint seed2 = (rnum >> 4) & 0xF; + seed2 *= seed2; + uint seed3 = (rnum >> 8) & 0xF; + seed3 *= seed3; + uint seed4 = (rnum >> 12) & 0xF; + seed4 *= seed4; + uint seed5 = (rnum >> 16) & 0xF; + seed5 *= seed5; + uint seed6 = (rnum >> 20) & 0xF; + seed6 *= seed6; + uint seed7 = (rnum >> 24) & 0xF; + seed7 *= seed7; + uint seed8 = (rnum >> 28) & 0xF; + seed8 *= seed8; uint sh1, sh2; if ((seed & 1) > 0) { @@ -232,31 +352,37 @@ uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) { sh1 = (partition_count == 3) ? 6 : 5; sh2 = (seed & 2) > 0 ? 4 : 5; } - seed1 >>= sh1; - seed2 >>= sh2; - seed3 >>= sh1; - seed4 >>= sh2; - seed5 >>= sh1; - seed6 >>= sh2; - seed7 >>= sh1; - seed8 >>= sh2; - uint a = seed1 * x + seed2 * y + (rnum >> 14); - uint b = seed3 * x + seed4 * y + (rnum >> 10); - uint c = seed5 * x + seed6 * y + (rnum >> 6); - uint d = seed7 * x + seed8 * y + (rnum >> 2); + pt.s1 = seed1 >> sh1; + pt.s2 = seed2 >> sh2; + pt.s3 = seed3 >> sh1; + pt.s4 = seed4 >> sh2; + pt.s5 = seed5 >> sh1; + pt.s6 = seed6 >> sh2; + pt.s7 = seed7 >> sh1; + pt.s8 = seed8 >> sh2; + + return pt; + } + +uint SelectPartition(PartitionTable pt, uint x, uint y, uint partition_count) { + if (pt.small_block) { + x <<= 1; + y <<= 1; + } + + uint a = pt.s1 * x + pt.s2 * y + (pt.rnum >> 14); + uint b = pt.s3 * x + pt.s4 * y + (pt.rnum >> 10); + uint c = pt.s5 * x + pt.s6 * y + (pt.rnum >> 6); + uint d = pt.s7 * x + pt.s8 * y + (pt.rnum >> 2); a &= 0x3F; b &= 0x3F; c &= 0x3F; d &= 0x3F; - if (partition_count < 4) { - d = 0; - } - if (partition_count < 3) { - c = 0; - } + if (partition_count < 4) d = 0; + if (partition_count < 3) c = 0; if (a >= b && a >= c && a >= d) { return 0; @@ -457,7 +583,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) { } } -void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, out uint color_values[32]) { +void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { uint num_values = 0; for (uint i = 0; i < num_partitions; i++) { num_values += ((modes[i] >> 2) + 1) << 1; @@ -471,104 +597,21 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, o break; } } - DecodeIntegerSequence(range - 1, num_values); - uint out_index = 0; - for (int itr = 0; itr < result_index; ++itr) { - if (out_index >= num_values) { - break; - } - const EncodingData val = GetEncodingFromVector(itr); - const uint encoding = Encoding(val); - const uint bitlen = NumBits(val); - const uint bitval = BitValue(val); - uint A = 0, B = 0, C = 0, D = 0; - A = ReplicateBitTo9((bitval & 1)); - switch (encoding) { - case JUST_BITS: - color_values[++out_index] = FastReplicateTo8(bitval, bitlen); - break; - case TRIT: { - D = QuintTritValue(val); - switch (bitlen) { - case 1: - C = 204; - break; - case 2: { - C = 93; - const uint b = (bitval >> 1) & 1; - B = (b << 8) | (b << 4) | (b << 2) | (b << 1); - break; - } - case 3: { - C = 44; - const uint cb = (bitval >> 1) & 3; - B = (cb << 7) | (cb << 2) | cb; - break; - } - case 4: { - C = 22; - const uint dcb = (bitval >> 1) & 7; - B = (dcb << 6) | dcb; - break; - } - case 5: { - C = 11; - const uint edcb = (bitval >> 1) & 0xF; - B = (edcb << 5) | (edcb >> 2); - break; - } - case 6: { - C = 5; - const uint fedcb = (bitval >> 1) & 0x1F; - B = (fedcb << 4) | (fedcb >> 4); - break; - } - } - break; - } - case QUINT: { - D = QuintTritValue(val); - switch (bitlen) { - case 1: - C = 113; - break; - case 2: { - C = 54; - const uint b = (bitval >> 1) & 1; - B = (b << 8) | (b << 3) | (b << 2); - break; - } - case 3: { - C = 26; - const uint cb = (bitval >> 1) & 3; - B = (cb << 7) | (cb << 1) | (cb >> 1); - break; - } - case 4: { - C = 13; - const uint dcb = (bitval >> 1) & 7; - B = (dcb << 6) | (dcb >> 1); - break; - } - case 5: { - C = 6; - const uint edcb = (bitval >> 1) & 0xF; - B = (edcb << 5) | (edcb >> 3); - break; - } - } - break; - } - } - if (encoding != JUST_BITS) { - uint T = (D * C) + B; - T ^= A; - T = (A & 0x80) | (T >> 2); - color_values[++out_index] = T; - } + // Decode directly into color_values_direct[] + write_color_values = true; + color_out_index = 0; + color_num_values = num_values; + for (uint i = 0; i < 32; ++i) { + color_values_direct[i] = 0; } + + DecodeIntegerSequence(range - 1, num_values); + + write_color_values = false; } + + ivec2 BitTransferSigned(int a, int b) { ivec2 transferred; transferred.y = b >> 1; @@ -730,7 +773,7 @@ uint UnquantizeTexelWeight(EncodingData val) { uint encoding = Encoding(val), bitlen = NumBits(val), bitval = BitValue(val); if (encoding == JUST_BITS) { return (bitlen >= 1 && bitlen <= 5) - ? uint(floor(0.5f + float(bitval) * 64.0f / float((1 << bitlen) - 1))) + ? ((bitval * 64) + ((1 << bitlen) - 1) / 2) / ((1 << bitlen) - 1) : FastReplicateTo6(bitval, bitlen); } else if (encoding == TRIT || encoding == QUINT) { uint B = 0, C = 0, D = 0; @@ -864,27 +907,32 @@ int FindLayout(uint mode) { void FillError(ivec3 coord) { - for (uint j = 0; j < block_dims.y; j++) { - for (uint i = 0; i < block_dims.x; i++) { - imageStore(dest_image, coord + ivec3(i, j, 0), vec4(0.0, 0.0, 0.0, 0.0)); - } + const uint total_texels = block_dims.x * block_dims.y; + for (uint tid = gl_LocalInvocationIndex; tid < total_texels; tid += gl_WorkGroupSize.x * gl_WorkGroupSize.y) { + uint x = tid % block_dims.x; + uint y = tid / block_dims.x; + imageStore(dest_image, coord + ivec3(x, y, 0), vec4(0.0, 0.0, 0.0, 0.0)); } } void FillVoidExtentLDR(ivec3 coord) { - SkipBits(52); - const uint r_u = StreamBits(16); - const uint g_u = StreamBits(16); - const uint b_u = StreamBits(16); - const uint a_u = StreamBits(16); - const float a = float(a_u) / 65535.0f; - const float r = float(r_u) / 65535.0f; - const float g = float(g_u) / 65535.0f; - const float b = float(b_u) / 65535.0f; - for (uint j = 0; j < block_dims.y; j++) { - for (uint i = 0; i < block_dims.x; i++) { - imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a)); - } + // Thread 0 decodes color + + if (gl_LocalInvocationIndex == 0) { + SkipBits(52); + const uint r_u = StreamBits(16); + const uint g_u = StreamBits(16); + const uint b_u = StreamBits(16); + const uint a_u = StreamBits(16); + fill_color = vec4(float(r_u) / 65535.0f, float(g_u) / 65535.0f, float(b_u) / 65535.0f, float(a_u) / 65535.0f); + } + barrier(); + + const uint total_texels = block_dims.x * block_dims.y; + for (uint tid = gl_LocalInvocationIndex; tid < total_texels; tid += gl_WorkGroupSize.x * gl_WorkGroupSize.y) { + uint x = tid % block_dims.x; + uint y = tid / block_dims.x; + imageStore(dest_image, coord + ivec3(x, y, 0), fill_color); } } @@ -966,160 +1014,156 @@ uint DecodeMaxWeight(uint mode) { } void DecompressBlock(ivec3 coord) { - uint mode = StreamBits(11); - if (IsError(mode)) { + if (gl_LocalInvocationIndex == 0) { + uint mode = StreamBits(11); + bool early_exit = false; + if (IsError(mode)) { + size_params = uvec2(0); + early_exit = true; + } else if ((mode & 0x1ff) == 0x1fc) { + size_params = uvec2(0xFFFFFFFF); + early_exit = true; + } else { + size_params = DecodeBlockSize(mode); + if ((size_params.x > block_dims.x) || (size_params.y > block_dims.y)) { + size_params = uvec2(0); + early_exit = true; + } + } + + if (!early_exit) { + num_partitions = StreamBits(2) + 1; + uint mode_layout = FindLayout(mode); + dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); + if (num_partitions > 4 || (num_partitions == 4 && dual_plane)) { + size_params = uvec2(0); + early_exit = true; + } + } + + if (!early_exit) { + uint partition_index_local = 1; + uvec4 color_endpoint_mode = uvec4(0); + uint ced_pointer = 0; + uint base_cem = 0; + if (num_partitions == 1) { + color_endpoint_mode.x = StreamBits(4); + partition_index_local = 0; + } else { + partition_index_local = StreamBits(10); + base_cem = StreamBits(6); + } + partition_index = partition_index_local; // Store to shared + const uint base_mode = base_cem & 3; + const uint max_weight = DecodeMaxWeight(mode); + const uint weight_bits = GetPackedBitSize(size_params, dual_plane, max_weight); + uint remaining_bits = 128 - weight_bits - total_bitsread; + uint extra_cem_bits = 0; + if (base_mode > 0) { + switch (num_partitions) { + case 2: extra_cem_bits += 2; break; + case 3: extra_cem_bits += 5; break; + case 4: extra_cem_bits += 8; break; + } + } + remaining_bits -= extra_cem_bits; + const uint plane_selector_bits = dual_plane ? 2 : 0; + remaining_bits -= plane_selector_bits; + if (remaining_bits > 128) { + size_params = uvec2(0); // Error + } else { + const uint color_data_bits = remaining_bits; + while (remaining_bits > 0) { + const int nb = int(min(remaining_bits, 32U)); + const uint b = StreamBits(nb); + color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); + ++ced_pointer; + remaining_bits -= nb; + } + plane_index = uint(StreamBits(plane_selector_bits)); + if (base_mode > 0) { + const uint extra_cem = StreamBits(extra_cem_bits); + uint cem = (extra_cem << 6) | base_cem; + cem >>= 2; + uvec4 C = uvec4(0); + for (uint i = 0; i < num_partitions; i++) { + C[i] = (cem & 1); cem >>= 1; + } + uvec4 M = uvec4(0); + for (uint i = 0; i < num_partitions; i++) { + M[i] = cem & 3; cem >>= 2; + } + for (uint i = 0; i < num_partitions; i++) { + color_endpoint_mode[i] = base_mode; + if (C[i] == 0) --color_endpoint_mode[i]; + color_endpoint_mode[i] <<= 2; + color_endpoint_mode[i] |= M[i]; + } + } else if (num_partitions > 1) { + const uint cem = base_cem >> 2; + for (uint i = 0; i < num_partitions; i++) { + color_endpoint_mode[i] = cem; + } + } + + result_limit_reached = false; + uint colvals_index = 0; + DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits); + for (uint i = 0; i < num_partitions; i++) { + ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values_direct, colvals_index); + } + + color_endpoint_data = local_buff; + color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx; + const uint clear_byte_start = (weight_bits >> 3) + 1; + const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) & uint(((1 << (weight_bits % 8)) - 1)); + const uint vec_index = (clear_byte_start - 1) >> 2; + color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8); + for (uint i = clear_byte_start; i < 16; ++i) { + const uint idx = i >> 2; + color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8); + } + + result_index = 0; + color_bitsread = 0; + result_limit_reached = false; + result_vector_max_index = size_params.x * size_params.y; + if (dual_plane) result_vector_max_index *= 2; + DecodeIntegerSequence(max_weight, GetNumWeightValues(size_params, dual_plane)); + UnquantizeTexelWeights(size_params, dual_plane); + + if (num_partitions > 1) { + pt = GetPartitionTable(partition_index, num_partitions); + } + } + } + } + barrier(); + + if (size_params.x == 0) { FillError(coord); return; } - if ((mode & 0x1ff) == 0x1fc) { - // params.void_extent_ldr = true; + if (size_params.x == 0xFFFFFFFF) { FillVoidExtentLDR(coord); return; } - const uvec2 size_params = DecodeBlockSize(mode); - if ((size_params.x > block_dims.x) || (size_params.y > block_dims.y)) { - FillError(coord); - return; - } - const uint num_partitions = StreamBits(2) + 1; - const uint mode_layout = FindLayout(mode); - const bool dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); - if (num_partitions > 4 || (num_partitions == 4 && dual_plane)) { - FillError(coord); - return; - } - uint partition_index = 1; - uvec4 color_endpoint_mode = uvec4(0); - uint ced_pointer = 0; - uint base_cem = 0; - if (num_partitions == 1) { - color_endpoint_mode.x = StreamBits(4); - partition_index = 0; - } else { - partition_index = StreamBits(10); - base_cem = StreamBits(6); - } - const uint base_mode = base_cem & 3; - const uint max_weight = DecodeMaxWeight(mode); - const uint weight_bits = GetPackedBitSize(size_params, dual_plane, max_weight); - uint remaining_bits = 128 - weight_bits - total_bitsread; - uint extra_cem_bits = 0; - if (base_mode > 0) { - switch (num_partitions) { - case 2: - extra_cem_bits += 2; - break; - case 3: - extra_cem_bits += 5; - break; - case 4: - extra_cem_bits += 8; - break; - default: - return; - } - } - remaining_bits -= extra_cem_bits; - const uint plane_selector_bits = dual_plane ? 2 : 0; - remaining_bits -= plane_selector_bits; - if (remaining_bits > 128) { - // Bad data, more remaining bits than 4 bytes - // return early - return; - } - // Read color data... - const uint color_data_bits = remaining_bits; - while (remaining_bits > 0) { - const int nb = int(min(remaining_bits, 32U)); - const uint b = StreamBits(nb); - color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); - ++ced_pointer; - remaining_bits -= nb; - } - const uint plane_index = uint(StreamBits(plane_selector_bits)); - if (base_mode > 0) { - const uint extra_cem = StreamBits(extra_cem_bits); - uint cem = (extra_cem << 6) | base_cem; - cem >>= 2; - uvec4 C = uvec4(0); - for (uint i = 0; i < num_partitions; i++) { - C[i] = (cem & 1); - cem >>= 1; - } - uvec4 M = uvec4(0); - for (uint i = 0; i < num_partitions; i++) { - M[i] = cem & 3; - cem >>= 2; - } - for (uint i = 0; i < num_partitions; i++) { - color_endpoint_mode[i] = base_mode; - if (C[i] == 0) { - --color_endpoint_mode[i]; - } - color_endpoint_mode[i] <<= 2; - color_endpoint_mode[i] |= M[i]; - } - } else if (num_partitions > 1) { - const uint cem = base_cem >> 2; - for (uint i = 0; i < num_partitions; i++) { - color_endpoint_mode[i] = cem; - } - } - uvec4 endpoints0[4]; - uvec4 endpoints1[4]; - { - // This decode phase should at most push 32 elements into the vector - result_vector_max_index = 32; - uint color_values[32]; - uint colvals_index = 0; - DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits, color_values); - for (uint i = 0; i < num_partitions; i++) { - ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values, - colvals_index); - } - } - color_endpoint_data = local_buff; - color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx; - const uint clear_byte_start = (weight_bits >> 3) + 1; - - const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) & - uint(((1 << (weight_bits % 8)) - 1)); - const uint vec_index = (clear_byte_start - 1) >> 2; - color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert, - int((clear_byte_start - 1) % 4) * 8, 8); - for (uint i = clear_byte_start; i < 16; ++i) { - const uint idx = i >> 2; - color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8); - } - - // Re-init vector variables for next decode phase - result_index = 0; - color_bitsread = 0; - result_limit_reached = false; - - // The limit for the Unquantize phase, avoids decoding more data than needed. - result_vector_max_index = size_params.x * size_params.y; - if (dual_plane) { - result_vector_max_index *= 2; - } - DecodeIntegerSequence(max_weight, GetNumWeightValues(size_params, dual_plane)); - - UnquantizeTexelWeights(size_params, dual_plane); - for (uint j = 0; j < block_dims.y; j++) { - for (uint i = 0; i < block_dims.x; i++) { - uint local_partition = 0; - if (num_partitions > 1) { - local_partition = Select2DPartition(partition_index, i, j, num_partitions); - } - const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]); - const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]); - const uvec4 weight_vec = GetUnquantizedWeightVector(j, i, size_params, plane_index, dual_plane); - const vec4 Cf = - vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64); - const vec4 p = (Cf / 65535.0f); - imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar); + const uint total_texels = block_dims.x * block_dims.y; + for (uint tid = gl_LocalInvocationIndex; tid < total_texels; tid += gl_WorkGroupSize.x * gl_WorkGroupSize.y) { + uint x = tid % block_dims.x; + uint y = tid / block_dims.x; + + uint local_partition = 0; + if (num_partitions > 1) { + local_partition = SelectPartition(pt, x, y, num_partitions); } + const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]); + const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]); + const uvec4 weight_vec = GetUnquantizedWeightVector(y, x, size_params, plane_index, dual_plane); + const vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64); + const vec4 p = (Cf / 65535.0f); + imageStore(dest_image, coord + ivec3(x, y, 0), p.gbar); } } @@ -1132,7 +1176,8 @@ uint SwizzleOffset(uvec2 pos) { } void main() { - uvec3 pos = gl_GlobalInvocationID; + uvec3 block_id = gl_WorkGroupID; + uvec3 pos = block_id; pos.x <<= BYTES_PER_BLOCK_LOG2; const uint swizzle = SwizzleOffset(pos.xy); const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; @@ -1144,10 +1189,21 @@ void main() { offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; offset += swizzle; - const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1)); + if (gl_LocalInvocationIndex == 0) { + total_bitsread = 0; + result_index = 0; + color_bitsread = 0; + write_color_values = false; + result_limit_reached = false; + color_out_index = 0; + color_num_values = 0; + local_buff = astc_data[offset / 16]; + } + barrier(); + + ivec3 coord = ivec3(block_id * uvec3(block_dims, 1)); if (any(greaterThanEqual(coord, imageSize(dest_image)))) { return; } - local_buff = astc_data[offset / 16]; DecompressBlock(coord); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 0ae81af0fb..e3d041fe6d 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -586,8 +586,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, }); for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { const size_t input_offset = swizzle.buffer_offset + map.offset; - const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U); - const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U); + const u32 num_dispatches_x = swizzle.num_tiles.width; + const u32 num_dispatches_y = swizzle.num_tiles.height; const u32 num_dispatches_z = image.info.resources.layers; compute_pass_descriptor_queue.Acquire();