diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index da21b4bde8..2ca3bfcdeb 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -83,6 +83,12 @@ int result_index = 0; uint result_vector_max_index; bool result_limit_reached = false; +// Color-decode state: while write_color_values is set, ResultEmplaceBack() routes each value through EmitColorValue() into color_values_direct[], avoiding intermediate result_vector storage; color_out_index counts values written and color_num_values is the cap +bool write_color_values = false; +uint color_values_direct[32]; +uint color_out_index = 0; +uint color_num_values = 0; + // EncodingData helpers uint Encoding(EncodingData val) { return bitfieldExtract(val.data, 0, 8); @@ -114,9 +120,110 @@ EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint return EncodingData(((encoding) << 0u) | ((num_bits) << 8u) | ((bit_val) << 16u) | ((quint_trit_val) << 24u)); } +uint ReplicateBitTo9(uint bit); +uint FastReplicateTo8(uint value, uint num_bits); + +void EmitColorValue(EncodingData val) { + // Convert one decoded value and write it directly to color_values_direct[++color_out_index]; the pre-increment leaves slot 0 unwritten. NOTE(review): if color_num_values can reach 32 the last write would index [32], past the 32-element array - confirm callers cap it + const uint encoding = Encoding(val); + const uint bitlen = NumBits(val); + const uint bitval = BitValue(val); + + if (encoding == JUST_BITS) { + color_values_direct[++color_out_index] = FastReplicateTo8(bitval, bitlen); + return; + } + + uint A = ReplicateBitTo9((bitval & 1)); + uint B = 0, C = 0, D = QuintTritValue(val); + + if (encoding == TRIT) { + switch (bitlen) { + case 1: + C = 204; + break; + case 2: { + C = 93; + const uint b = (bitval >> 1) & 1; + B = (b << 8) | (b << 4) | (b << 2) | (b << 1); + break; + } + case 3: { + C = 44; + const uint cb = (bitval >> 1) & 3; + B = (cb << 7) | (cb << 2) | cb; + break; + } + case 4: { + C = 22; + const uint dcb = (bitval >> 1) & 7; + B = (dcb << 6) | dcb; + break; + } + case 5: { + C = 11; + const uint edcb = (bitval >> 1) & 0xF; + B = (edcb << 5) | (edcb >> 2); + break; + } + case 6: { + C = 5; + const uint fedcb = (bitval >> 1) & 0x1F; + B = (fedcb << 4) | (fedcb >> 4); + break; + } + } + } else { // 
QUINT + switch (bitlen) { + case 1: + C = 113; + break; + case 2: { + C = 54; + const uint b = (bitval >> 1) & 1; + B = (b << 8) | (b << 3) | (b << 2); + break; + } + case 3: { + C = 26; + const uint cb = (bitval >> 1) & 3; + B = (cb << 7) | (cb << 1) | (cb >> 1); + break; + } + case 4: { + C = 13; + const uint dcb = (bitval >> 1) & 7; + B = (dcb << 6) | (dcb >> 1); + break; + } + case 5: { + C = 6; + const uint edcb = (bitval >> 1) & 0xF; + B = (edcb << 5) | (edcb >> 3); + break; + } + } + } + + uint T = (D * C) + B; + T ^= A; + T = (A & 0x80) | (T >> 2); + color_values_direct[++color_out_index] = T; +} + void ResultEmplaceBack(EncodingData val) { + if (write_color_values) { + if (color_out_index >= color_num_values) { + // color_num_values values already emitted: avoid decoding more than needed by this phase + result_limit_reached = true; + return; + } + EmitColorValue(val); + return; + } + if (result_index >= result_vector_max_index) { // Alert callers to avoid decoding more than needed by this phase result_limit_reached = true; @@ -457,7 +564,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) { } } -void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, out uint color_values[32]) { +void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { uint num_values = 0; for (uint i = 0; i < num_partitions; i++) { num_values += ((modes[i] >> 2) + 1) << 1; @@ -471,104 +578,21 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, o break; } } - DecodeIntegerSequence(range - 1, num_values); - uint out_index = 0; - for (int itr = 0; itr < result_index; ++itr) { - if (out_index >= num_values) { - break; - } - const EncodingData val = GetEncodingFromVector(itr); - const uint encoding = Encoding(val); - const uint bitlen = NumBits(val); - const uint bitval = BitValue(val); - uint A = 0, B = 0, C = 0, D = 0; - A = ReplicateBitTo9((bitval & 1)); - switch (encoding) { - case JUST_BITS: - color_values[++out_index] = 
FastReplicateTo8(bitval, bitlen); - break; - case TRIT: { - D = QuintTritValue(val); - switch (bitlen) { - case 1: - C = 204; - break; - case 2: { - C = 93; - const uint b = (bitval >> 1) & 1; - B = (b << 8) | (b << 4) | (b << 2) | (b << 1); - break; - } - case 3: { - C = 44; - const uint cb = (bitval >> 1) & 3; - B = (cb << 7) | (cb << 2) | cb; - break; - } - case 4: { - C = 22; - const uint dcb = (bitval >> 1) & 7; - B = (dcb << 6) | dcb; - break; - } - case 5: { - C = 11; - const uint edcb = (bitval >> 1) & 0xF; - B = (edcb << 5) | (edcb >> 2); - break; - } - case 6: { - C = 5; - const uint fedcb = (bitval >> 1) & 0x1F; - B = (fedcb << 4) | (fedcb >> 4); - break; - } - } - break; - } - case QUINT: { - D = QuintTritValue(val); - switch (bitlen) { - case 1: - C = 113; - break; - case 2: { - C = 54; - const uint b = (bitval >> 1) & 1; - B = (b << 8) | (b << 3) | (b << 2); - break; - } - case 3: { - C = 26; - const uint cb = (bitval >> 1) & 3; - B = (cb << 7) | (cb << 1) | (cb >> 1); - break; - } - case 4: { - C = 13; - const uint dcb = (bitval >> 1) & 7; - B = (dcb << 6) | (dcb >> 1); - break; - } - case 5: { - C = 6; - const uint edcb = (bitval >> 1) & 0xF; - B = (edcb << 5) | (edcb >> 3); - break; - } - } - break; - } - } - if (encoding != JUST_BITS) { - uint T = (D * C) + B; - T ^= A; - T = (A & 0x80) | (T >> 2); - color_values[++out_index] = T; - } + // Decode directly into color_values_direct[]: enable the direct-write path, restart the output index, cap output at num_values and zero the array before decoding + write_color_values = true; + color_out_index = 0; + color_num_values = num_values; + for (uint i = 0; i < 32; ++i) { + color_values_direct[i] = 0; } + + DecodeIntegerSequence(range - 1, num_values); + + write_color_values = false; } + + ivec2 BitTransferSigned(int a, int b) { ivec2 transferred; transferred.y = b >> 1; @@ -1069,13 +1093,12 @@ void DecompressBlock(ivec3 coord) { uvec4 endpoints0[4]; uvec4 endpoints1[4]; { - // This decode phase should at most push 32 elements into the vector - result_vector_max_index = 32; - uint color_values[32]; + // Clear result_limit_reached, then decode directly 
into color_values_direct[] (no intermediate result_vector storage) + result_limit_reached = false; uint colvals_index = 0; - DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits, color_values); + DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits); for (uint i = 0; i < num_partitions; i++) { - ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values, + ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values_direct, colvals_index); } }