| // SPDX-License-Identifier: Apache-2.0 |
| // ---------------------------------------------------------------------------- |
| // Copyright 2011-2022 Arm Limited |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| // use this file except in compliance with the License. You may obtain a copy |
| // of the License at: |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| // License for the specific language governing permissions and limitations |
| // under the License. |
| // ---------------------------------------------------------------------------- |
| |
| /** |
| * @brief Functions to generate block size descriptor and decimation tables. |
| */ |
| |
| #include "astcenc_internal.h" |
| |
| /** |
| * @brief Decode the properties of an encoded 2D block mode. |
| * |
| * @param block_mode The encoded block mode. |
| * @param[out] x_weights The number of weights in the X dimension. |
| * @param[out] y_weights The number of weights in the Y dimension. |
| * @param[out] is_dual_plane True if this block mode has two weight planes. |
| * @param[out] quant_mode The quantization level for the weights. |
| * @param[out] weight_bits The storage bit count for the weights. |
| * |
| * @return Returns true if a valid mode, false otherwise. |
| */ |
| static bool decode_block_mode_2d( |
| unsigned int block_mode, |
| unsigned int& x_weights, |
| unsigned int& y_weights, |
| bool& is_dual_plane, |
| unsigned int& quant_mode, |
| unsigned int& weight_bits |
| ) { |
| unsigned int base_quant_mode = (block_mode >> 4) & 1; |
| unsigned int H = (block_mode >> 9) & 1; |
| unsigned int D = (block_mode >> 10) & 1; |
| unsigned int A = (block_mode >> 5) & 0x3; |
| |
| x_weights = 0; |
| y_weights = 0; |
| |
| if ((block_mode & 3) != 0) |
| { |
| base_quant_mode |= (block_mode & 3) << 1; |
| unsigned int B = (block_mode >> 7) & 3; |
| switch ((block_mode >> 2) & 3) |
| { |
| case 0: |
| x_weights = B + 4; |
| y_weights = A + 2; |
| break; |
| case 1: |
| x_weights = B + 8; |
| y_weights = A + 2; |
| break; |
| case 2: |
| x_weights = A + 2; |
| y_weights = B + 8; |
| break; |
| case 3: |
| B &= 1; |
| if (block_mode & 0x100) |
| { |
| x_weights = B + 2; |
| y_weights = A + 2; |
| } |
| else |
| { |
| x_weights = A + 2; |
| y_weights = B + 6; |
| } |
| break; |
| } |
| } |
| else |
| { |
| base_quant_mode |= ((block_mode >> 2) & 3) << 1; |
| if (((block_mode >> 2) & 3) == 0) |
| { |
| return false; |
| } |
| |
| unsigned int B = (block_mode >> 9) & 3; |
| switch ((block_mode >> 7) & 3) |
| { |
| case 0: |
| x_weights = 12; |
| y_weights = A + 2; |
| break; |
| case 1: |
| x_weights = A + 2; |
| y_weights = 12; |
| break; |
| case 2: |
| x_weights = A + 6; |
| y_weights = B + 6; |
| D = 0; |
| H = 0; |
| break; |
| case 3: |
| switch ((block_mode >> 5) & 3) |
| { |
| case 0: |
| x_weights = 6; |
| y_weights = 10; |
| break; |
| case 1: |
| x_weights = 10; |
| y_weights = 6; |
| break; |
| case 2: |
| case 3: |
| return false; |
| } |
| break; |
| } |
| } |
| |
| unsigned int weight_count = x_weights * y_weights * (D + 1); |
| quant_mode = (base_quant_mode - 2) + 6 * H; |
| is_dual_plane = D != 0; |
| |
| weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode)); |
| return (weight_count <= BLOCK_MAX_WEIGHTS && |
| weight_bits >= BLOCK_MIN_WEIGHT_BITS && |
| weight_bits <= BLOCK_MAX_WEIGHT_BITS); |
| } |
| |
| /** |
| * @brief Decode the properties of an encoded 3D block mode. |
| * |
| * @param block_mode The encoded block mode. |
| * @param[out] x_weights The number of weights in the X dimension. |
| * @param[out] y_weights The number of weights in the Y dimension. |
| * @param[out] z_weights The number of weights in the Z dimension. |
| * @param[out] is_dual_plane True if this block mode has two weight planes. |
| * @param[out] quant_mode The quantization level for the weights. |
| * @param[out] weight_bits The storage bit count for the weights. |
| * |
| * @return Returns true if a valid mode, false otherwise. |
| */ |
| static bool decode_block_mode_3d( |
| unsigned int block_mode, |
| unsigned int& x_weights, |
| unsigned int& y_weights, |
| unsigned int& z_weights, |
| bool& is_dual_plane, |
| unsigned int& quant_mode, |
| unsigned int& weight_bits |
| ) { |
| unsigned int base_quant_mode = (block_mode >> 4) & 1; |
| unsigned int H = (block_mode >> 9) & 1; |
| unsigned int D = (block_mode >> 10) & 1; |
| unsigned int A = (block_mode >> 5) & 0x3; |
| |
| x_weights = 0; |
| y_weights = 0; |
| z_weights = 0; |
| |
| if ((block_mode & 3) != 0) |
| { |
| base_quant_mode |= (block_mode & 3) << 1; |
| unsigned int B = (block_mode >> 7) & 3; |
| unsigned int C = (block_mode >> 2) & 0x3; |
| x_weights = A + 2; |
| y_weights = B + 2; |
| z_weights = C + 2; |
| } |
| else |
| { |
| base_quant_mode |= ((block_mode >> 2) & 3) << 1; |
| if (((block_mode >> 2) & 3) == 0) |
| { |
| return false; |
| } |
| |
| int B = (block_mode >> 9) & 3; |
| if (((block_mode >> 7) & 3) != 3) |
| { |
| D = 0; |
| H = 0; |
| } |
| switch ((block_mode >> 7) & 3) |
| { |
| case 0: |
| x_weights = 6; |
| y_weights = B + 2; |
| z_weights = A + 2; |
| break; |
| case 1: |
| x_weights = A + 2; |
| y_weights = 6; |
| z_weights = B + 2; |
| break; |
| case 2: |
| x_weights = A + 2; |
| y_weights = B + 2; |
| z_weights = 6; |
| break; |
| case 3: |
| x_weights = 2; |
| y_weights = 2; |
| z_weights = 2; |
| switch ((block_mode >> 5) & 3) |
| { |
| case 0: |
| x_weights = 6; |
| break; |
| case 1: |
| y_weights = 6; |
| break; |
| case 2: |
| z_weights = 6; |
| break; |
| case 3: |
| return false; |
| } |
| break; |
| } |
| } |
| |
| unsigned int weight_count = x_weights * y_weights * z_weights * (D + 1); |
| quant_mode = (base_quant_mode - 2) + 6 * H; |
| is_dual_plane = D != 0; |
| |
| weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode)); |
| return (weight_count <= BLOCK_MAX_WEIGHTS && |
| weight_bits >= BLOCK_MIN_WEIGHT_BITS && |
| weight_bits <= BLOCK_MAX_WEIGHT_BITS); |
| } |
| |
| /** |
| * @brief Create a 2D decimation entry for a block-size and weight-decimation pair. |
| * |
| * @param x_texels The number of texels in the X dimension. |
| * @param y_texels The number of texels in the Y dimension. |
| * @param x_weights The number of weights in the X dimension. |
| * @param y_weights The number of weights in the Y dimension. |
| * @param[out] di The decimation info structure to populate. |
| * @param[out] wb The decimation table init scratch working buffers. |
| */ |
| static void init_decimation_info_2d( |
| unsigned int x_texels, |
| unsigned int y_texels, |
| unsigned int x_weights, |
| unsigned int y_weights, |
| decimation_info& di, |
| dt_init_working_buffers& wb |
| ) { |
| unsigned int texels_per_block = x_texels * y_texels; |
| unsigned int weights_per_block = x_weights * y_weights; |
| |
| uint8_t max_texel_count_of_weight = 0; |
| |
| promise(weights_per_block > 0); |
| promise(texels_per_block > 0); |
| promise(x_texels > 0); |
| promise(y_texels > 0); |
| |
| for (unsigned int i = 0; i < weights_per_block; i++) |
| { |
| wb.texel_count_of_weight[i] = 0; |
| } |
| |
| for (unsigned int i = 0; i < texels_per_block; i++) |
| { |
| wb.weight_count_of_texel[i] = 0; |
| } |
| |
| for (unsigned int y = 0; y < y_texels; y++) |
| { |
| for (unsigned int x = 0; x < x_texels; x++) |
| { |
| unsigned int texel = y * x_texels + x; |
| |
| unsigned int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6; |
| unsigned int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6; |
| |
| unsigned int x_weight_frac = x_weight & 0xF; |
| unsigned int y_weight_frac = y_weight & 0xF; |
| unsigned int x_weight_int = x_weight >> 4; |
| unsigned int y_weight_int = y_weight >> 4; |
| |
| unsigned int qweight[4]; |
| qweight[0] = x_weight_int + y_weight_int * x_weights; |
| qweight[1] = qweight[0] + 1; |
| qweight[2] = qweight[0] + x_weights; |
| qweight[3] = qweight[2] + 1; |
| |
| // Truncated-precision bilinear interpolation |
| unsigned int prod = x_weight_frac * y_weight_frac; |
| |
| unsigned int weight[4]; |
| weight[3] = (prod + 8) >> 4; |
| weight[1] = x_weight_frac - weight[3]; |
| weight[2] = y_weight_frac - weight[3]; |
| weight[0] = 16 - x_weight_frac - y_weight_frac + weight[3]; |
| |
| for (unsigned int i = 0; i < 4; i++) |
| { |
| if (weight[i] != 0) |
| { |
| wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]); |
| wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]); |
| wb.weight_count_of_texel[texel]++; |
| wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel); |
| wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]); |
| wb.texel_count_of_weight[qweight[i]]++; |
| max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]); |
| } |
| } |
| } |
| } |
| |
| uint8_t max_texel_weight_count = 0; |
| for (unsigned int i = 0; i < texels_per_block; i++) |
| { |
| di.texel_weight_count[i] = wb.weight_count_of_texel[i]; |
| max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]); |
| |
| for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++) |
| { |
| di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j]; |
| di.texel_weights_float_4t[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM); |
| di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j]; |
| } |
| |
| // Init all 4 entries so we can rely on zeros for vectorization |
| for (unsigned int j = wb.weight_count_of_texel[i]; j < 4; j++) |
| { |
| di.texel_weights_int_4t[j][i] = 0; |
| di.texel_weights_float_4t[j][i] = 0.0f; |
| di.texel_weights_4t[j][i] = 0; |
| } |
| } |
| |
| di.max_texel_weight_count = max_texel_weight_count; |
| |
| for (unsigned int i = 0; i < weights_per_block; i++) |
| { |
| unsigned int texel_count_wt = wb.texel_count_of_weight[i]; |
| di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt); |
| |
| for (unsigned int j = 0; j < texel_count_wt; j++) |
| { |
| uint8_t texel = wb.texels_of_weight[i][j]; |
| |
| // Create transposed versions of these for better vectorization |
| di.weight_texel[j][i] = texel; |
| di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]); |
| |
| // perform a layer of array unrolling. An aspect of this unrolling is that |
| // one of the texel-weight indexes is an identity-mapped index; we will use this |
| // fact to reorder the indexes so that the first one is the identity index. |
| int swap_idx = -1; |
| for (unsigned int k = 0; k < 4; k++) |
| { |
| uint8_t dttw = di.texel_weights_4t[k][texel]; |
| float dttwf = di.texel_weights_float_4t[k][texel]; |
| if (dttw == i && dttwf != 0.0f) |
| { |
| swap_idx = k; |
| } |
| di.texel_weights_texel[i][j][k] = dttw; |
| di.texel_weights_float_texel[i][j][k] = dttwf; |
| } |
| |
| if (swap_idx != 0) |
| { |
| uint8_t vi = di.texel_weights_texel[i][j][0]; |
| float vf = di.texel_weights_float_texel[i][j][0]; |
| di.texel_weights_texel[i][j][0] = di.texel_weights_texel[i][j][swap_idx]; |
| di.texel_weights_float_texel[i][j][0] = di.texel_weights_float_texel[i][j][swap_idx]; |
| di.texel_weights_texel[i][j][swap_idx] = vi; |
| di.texel_weights_float_texel[i][j][swap_idx] = vf; |
| } |
| } |
| |
| // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails |
| // Match last texel in active lane in SIMD group, for better gathers |
| uint8_t last_texel = di.weight_texel[texel_count_wt - 1][i]; |
| for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++) |
| { |
| di.weight_texel[j][i] = last_texel; |
| di.weights_flt[j][i] = 0.0f; |
| } |
| } |
| |
| // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails |
| unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block); |
| for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++) |
| { |
| di.texel_weight_count[i] = 0; |
| |
| for (unsigned int j = 0; j < 4; j++) |
| { |
| di.texel_weights_float_4t[j][i] = 0; |
| di.texel_weights_4t[j][i] = 0; |
| di.texel_weights_int_4t[j][i] = 0; |
| } |
| } |
| |
| // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails |
| // Match last texel in active lane in SIMD group, for better gathers |
| unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1]; |
| uint8_t last_texel = di.weight_texel[last_texel_count_wt - 1][weights_per_block - 1]; |
| |
| unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block); |
| for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++) |
| { |
| di.weight_texel_count[i] = 0; |
| |
| for (unsigned int j = 0; j < max_texel_count_of_weight; j++) |
| { |
| di.weight_texel[j][i] = last_texel; |
| di.weights_flt[j][i] = 0.0f; |
| } |
| } |
| |
| di.texel_count = static_cast<uint8_t>(texels_per_block); |
| di.weight_count = static_cast<uint8_t>(weights_per_block); |
| di.weight_x = static_cast<uint8_t>(x_weights); |
| di.weight_y = static_cast<uint8_t>(y_weights); |
| di.weight_z = 1; |
| } |
| |
| /** |
| * @brief Create a 3D decimation entry for a block-size and weight-decimation pair. |
| * |
| * @param x_texels The number of texels in the X dimension. |
| * @param y_texels The number of texels in the Y dimension. |
| * @param z_texels The number of texels in the Z dimension. |
| * @param x_weights The number of weights in the X dimension. |
| * @param y_weights The number of weights in the Y dimension. |
| * @param z_weights The number of weights in the Z dimension. |
| * @param[out] di The decimation info structure to populate. |
| @param[out] wb The decimation table init scratch working buffers. |
| */ |
| static void init_decimation_info_3d( |
| unsigned int x_texels, |
| unsigned int y_texels, |
| unsigned int z_texels, |
| unsigned int x_weights, |
| unsigned int y_weights, |
| unsigned int z_weights, |
| decimation_info& di, |
| dt_init_working_buffers& wb |
| ) { |
| unsigned int texels_per_block = x_texels * y_texels * z_texels; |
| unsigned int weights_per_block = x_weights * y_weights * z_weights; |
| |
| uint8_t max_texel_count_of_weight = 0; |
| |
| promise(weights_per_block > 0); |
| promise(texels_per_block > 0); |
| |
| for (unsigned int i = 0; i < weights_per_block; i++) |
| { |
| wb.texel_count_of_weight[i] = 0; |
| } |
| |
| for (unsigned int i = 0; i < texels_per_block; i++) |
| { |
| wb.weight_count_of_texel[i] = 0; |
| } |
| |
| for (unsigned int z = 0; z < z_texels; z++) |
| { |
| for (unsigned int y = 0; y < y_texels; y++) |
| { |
| for (unsigned int x = 0; x < x_texels; x++) |
| { |
| int texel = (z * y_texels + y) * x_texels + x; |
| |
| int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6; |
| int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6; |
| int z_weight = (((1024 + z_texels / 2) / (z_texels - 1)) * z * (z_weights - 1) + 32) >> 6; |
| |
| int x_weight_frac = x_weight & 0xF; |
| int y_weight_frac = y_weight & 0xF; |
| int z_weight_frac = z_weight & 0xF; |
| int x_weight_int = x_weight >> 4; |
| int y_weight_int = y_weight >> 4; |
| int z_weight_int = z_weight >> 4; |
| int qweight[4]; |
| int weight[4]; |
| qweight[0] = (z_weight_int * y_weights + y_weight_int) * x_weights + x_weight_int; |
| qweight[3] = ((z_weight_int + 1) * y_weights + (y_weight_int + 1)) * x_weights + (x_weight_int + 1); |
| |
| // simplex interpolation |
| int fs = x_weight_frac; |
| int ft = y_weight_frac; |
| int fp = z_weight_frac; |
| |
| int cas = ((fs > ft) << 2) + ((ft > fp) << 1) + ((fs > fp)); |
| int N = x_weights; |
| int NM = x_weights * y_weights; |
| |
| int s1, s2, w0, w1, w2, w3; |
| switch (cas) |
| { |
| case 7: |
| s1 = 1; |
| s2 = N; |
| w0 = 16 - fs; |
| w1 = fs - ft; |
| w2 = ft - fp; |
| w3 = fp; |
| break; |
| case 3: |
| s1 = N; |
| s2 = 1; |
| w0 = 16 - ft; |
| w1 = ft - fs; |
| w2 = fs - fp; |
| w3 = fp; |
| break; |
| case 5: |
| s1 = 1; |
| s2 = NM; |
| w0 = 16 - fs; |
| w1 = fs - fp; |
| w2 = fp - ft; |
| w3 = ft; |
| break; |
| case 4: |
| s1 = NM; |
| s2 = 1; |
| w0 = 16 - fp; |
| w1 = fp - fs; |
| w2 = fs - ft; |
| w3 = ft; |
| break; |
| case 2: |
| s1 = N; |
| s2 = NM; |
| w0 = 16 - ft; |
| w1 = ft - fp; |
| w2 = fp - fs; |
| w3 = fs; |
| break; |
| case 0: |
| s1 = NM; |
| s2 = N; |
| w0 = 16 - fp; |
| w1 = fp - ft; |
| w2 = ft - fs; |
| w3 = fs; |
| break; |
| default: |
| s1 = NM; |
| s2 = N; |
| w0 = 16 - fp; |
| w1 = fp - ft; |
| w2 = ft - fs; |
| w3 = fs; |
| break; |
| } |
| |
| qweight[1] = qweight[0] + s1; |
| qweight[2] = qweight[1] + s2; |
| weight[0] = w0; |
| weight[1] = w1; |
| weight[2] = w2; |
| weight[3] = w3; |
| |
| for (unsigned int i = 0; i < 4; i++) |
| { |
| if (weight[i] != 0) |
| { |
| wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]); |
| wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]); |
| wb.weight_count_of_texel[texel]++; |
| wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel); |
| wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]); |
| wb.texel_count_of_weight[qweight[i]]++; |
| max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]); |
| } |
| } |
| } |
| } |
| } |
| |
| uint8_t max_texel_weight_count = 0; |
| for (unsigned int i = 0; i < texels_per_block; i++) |
| { |
| di.texel_weight_count[i] = wb.weight_count_of_texel[i]; |
| max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]); |
| |
| // Init all 4 entries so we can rely on zeros for vectorization |
| for (unsigned int j = 0; j < 4; j++) |
| { |
| di.texel_weights_int_4t[j][i] = 0; |
| di.texel_weights_float_4t[j][i] = 0.0f; |
| di.texel_weights_4t[j][i] = 0; |
| } |
| |
| for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++) |
| { |
| di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j]; |
| di.texel_weights_float_4t[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM); |
| di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j]; |
| } |
| } |
| |
| di.max_texel_weight_count = max_texel_weight_count; |
| |
| for (unsigned int i = 0; i < weights_per_block; i++) |
| { |
| unsigned int texel_count_wt = wb.texel_count_of_weight[i]; |
| di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt); |
| |
| for (unsigned int j = 0; j < texel_count_wt; j++) |
| { |
| unsigned int texel = wb.texels_of_weight[i][j]; |
| |
| // Create transposed versions of these for better vectorization |
| di.weight_texel[j][i] = static_cast<uint8_t>(texel); |
| di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]); |
| |
| // perform a layer of array unrolling. An aspect of this unrolling is that |
| // one of the texel-weight indexes is an identity-mapped index; we will use this |
| // fact to reorder the indexes so that the first one is the identity index. |
| int swap_idx = -1; |
| for (unsigned int k = 0; k < 4; k++) |
| { |
| uint8_t dttw = di.texel_weights_4t[k][texel]; |
| float dttwf = di.texel_weights_float_4t[k][texel]; |
| if (dttw == i && dttwf != 0.0f) |
| { |
| swap_idx = k; |
| } |
| di.texel_weights_texel[i][j][k] = dttw; |
| di.texel_weights_float_texel[i][j][k] = dttwf; |
| } |
| |
| if (swap_idx != 0) |
| { |
| uint8_t vi = di.texel_weights_texel[i][j][0]; |
| float vf = di.texel_weights_float_texel[i][j][0]; |
| di.texel_weights_texel[i][j][0] = di.texel_weights_texel[i][j][swap_idx]; |
| di.texel_weights_float_texel[i][j][0] = di.texel_weights_float_texel[i][j][swap_idx]; |
| di.texel_weights_texel[i][j][swap_idx] = vi; |
| di.texel_weights_float_texel[i][j][swap_idx] = vf; |
| } |
| } |
| |
| // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails |
| // Match last texel in active lane in SIMD group, for better gathers |
| uint8_t last_texel = di.weight_texel[texel_count_wt - 1][i]; |
| for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++) |
| { |
| di.weight_texel[j][i] = last_texel; |
| di.weights_flt[j][i] = 0.0f; |
| } |
| } |
| |
| // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails |
| unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block); |
| for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++) |
| { |
| di.texel_weight_count[i] = 0; |
| |
| for (unsigned int j = 0; j < 4; j++) |
| { |
| di.texel_weights_float_4t[j][i] = 0; |
| di.texel_weights_4t[j][i] = 0; |
| di.texel_weights_int_4t[j][i] = 0; |
| } |
| } |
| |
| // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails |
| // Match last texel in active lane in SIMD group, for better gathers |
| int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1]; |
| uint8_t last_texel = di.weight_texel[last_texel_count_wt - 1][weights_per_block - 1]; |
| |
| unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block); |
| for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++) |
| { |
| di.weight_texel_count[i] = 0; |
| |
| for (int j = 0; j < max_texel_count_of_weight; j++) |
| { |
| di.weight_texel[j][i] = last_texel; |
| di.weights_flt[j][i] = 0.0f; |
| } |
| } |
| |
| di.texel_count = static_cast<uint8_t>(texels_per_block); |
| di.weight_count = static_cast<uint8_t>(weights_per_block); |
| di.weight_x = static_cast<uint8_t>(x_weights); |
| di.weight_y = static_cast<uint8_t>(y_weights); |
| di.weight_z = static_cast<uint8_t>(z_weights); |
| } |
| |
| /** |
| * @brief Assign the texels to use for kmeans clustering. |
| * |
| * The max limit is @c BLOCK_MAX_KMEANS_TEXELS; above this a random selection is used. |
| * The @c bsd.texel_count is an input and must be populated beforehand. |
| * |
| * @param[in,out] bsd The block size descriptor to populate. |
| */ |
| static void assign_kmeans_texels( |
| block_size_descriptor& bsd |
| ) { |
| // Use all texels for kmeans on a small block |
| if (bsd.texel_count <= BLOCK_MAX_KMEANS_TEXELS) |
| { |
| for (uint8_t i = 0; i < bsd.texel_count; i++) |
| { |
| bsd.kmeans_texels[i] = i; |
| } |
| |
| return; |
| } |
| |
| // Select a random subset of BLOCK_MAX_KMEANS_TEXELS for kmeans on a large block |
| uint64_t rng_state[2]; |
| astc::rand_init(rng_state); |
| |
| // Initialize array used for tracking used indices |
| bool seen[BLOCK_MAX_TEXELS]; |
| for (uint8_t i = 0; i < bsd.texel_count; i++) |
| { |
| seen[i] = false; |
| } |
| |
| // Assign 64 random indices, retrying if we see repeats |
| unsigned int arr_elements_set = 0; |
| while (arr_elements_set < BLOCK_MAX_KMEANS_TEXELS) |
| { |
| uint8_t texel = static_cast<uint8_t>(astc::rand(rng_state)); |
| texel = texel % bsd.texel_count; |
| if (!seen[texel]) |
| { |
| bsd.kmeans_texels[arr_elements_set++] = texel; |
| seen[texel] = true; |
| } |
| } |
| } |
| |
| /** |
| * @brief Allocate a single 2D decimation table entry. |
| * |
| * @param x_texels The number of texels in the X dimension. |
| * @param y_texels The number of texels in the Y dimension. |
| * @param x_weights The number of weights in the X dimension. |
| * @param y_weights The number of weights in the Y dimension. |
| * @param bsd The block size descriptor we are populating. |
| * @param wb The decimation table init scratch working buffers. |
| * @param index The packed array index to populate. |
| */ |
| static void construct_dt_entry_2d( |
| unsigned int x_texels, |
| unsigned int y_texels, |
| unsigned int x_weights, |
| unsigned int y_weights, |
| block_size_descriptor& bsd, |
| dt_init_working_buffers& wb, |
| unsigned int index |
| ) { |
| unsigned int weight_count = x_weights * y_weights; |
| assert(weight_count <= BLOCK_MAX_WEIGHTS); |
| |
| bool try_2planes = (2 * weight_count) <= BLOCK_MAX_WEIGHTS; |
| |
| decimation_info& di = bsd.decimation_tables[index]; |
| init_decimation_info_2d(x_texels, y_texels, x_weights, y_weights, di, wb); |
| |
| int maxprec_1plane = -1; |
| int maxprec_2planes = -1; |
| for (int i = 0; i < 12; i++) |
| { |
| unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i)); |
| if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS) |
| { |
| maxprec_1plane = i; |
| } |
| |
| if (try_2planes) |
| { |
| unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i)); |
| if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS) |
| { |
| maxprec_2planes = i; |
| } |
| } |
| } |
| |
| // At least one of the two should be valid ... |
| assert(maxprec_1plane >= 0 || maxprec_2planes >= 0); |
| bsd.decimation_modes[index].maxprec_1plane = static_cast<int8_t>(maxprec_1plane); |
| bsd.decimation_modes[index].maxprec_2planes = static_cast<int8_t>(maxprec_2planes); |
| bsd.decimation_modes[index].refprec_1_plane = 0; |
| bsd.decimation_modes[index].refprec_2_planes = 0; |
| } |
| |
| /** |
| * @brief Allocate block modes and decimation tables for a single 2D block size. |
| * |
| * @param x_texels The number of texels in the X dimension. |
| * @param y_texels The number of texels in the Y dimension. |
| * @param can_omit_modes Can we discard modes that astcenc won't use, even if legal? |
| * @param mode_cutoff Percentile cutoff in range [0,1]. Low values more likely to be used. |
| * @param[out] bsd The block size descriptor to populate. |
| */ |
| static void construct_block_size_descriptor_2d( |
| unsigned int x_texels, |
| unsigned int y_texels, |
| bool can_omit_modes, |
| float mode_cutoff, |
| block_size_descriptor& bsd |
| ) { |
| // Store a remap table for storing packed decimation modes. |
| // Indexing uses [Y * 16 + X] and max size for each axis is 12. |
| static const unsigned int MAX_DMI = 12 * 16 + 12; |
| int decimation_mode_index[MAX_DMI]; |
| |
| dt_init_working_buffers* wb = new dt_init_working_buffers; |
| |
| bsd.xdim = static_cast<uint8_t>(x_texels); |
| bsd.ydim = static_cast<uint8_t>(y_texels); |
| bsd.zdim = 1; |
| bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels); |
| |
| for (unsigned int i = 0; i < MAX_DMI; i++) |
| { |
| decimation_mode_index[i] = -1; |
| } |
| |
| // Gather all the decimation grids that can be used with the current block |
| #if !defined(ASTCENC_DECOMPRESS_ONLY) |
| const float *percentiles = get_2d_percentile_table(x_texels, y_texels); |
| float always_cutoff = 0.0f; |
| #else |
| // Unused in decompress-only builds |
| (void)can_omit_modes; |
| (void)mode_cutoff; |
| #endif |
| |
| // Construct the list of block formats referencing the decimation tables |
| unsigned int packed_bm_idx = 0; |
| unsigned int packed_dm_idx = 0; |
| |
| // Trackers |
| unsigned int bm_counts[4] { 0 }; |
| unsigned int dm_counts[4] { 0 }; |
| |
| // Clear the list to a known-bad value |
| for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) |
| { |
| bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE; |
| } |
| |
| // Iterate four times to build a usefully ordered list: |
| // - Pass 0 - keep selected single plane "always" block modes |
| // - Pass 1 - keep selected single plane "non-always" block modes |
| // - Pass 2 - keep select dual plane block modes |
| // - Pass 3 - keep everything else that's legal |
| unsigned int limit = can_omit_modes ? 3 : 4; |
| for (unsigned int j = 0; j < limit; j ++) |
| { |
| for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) |
| { |
| // Skip modes we've already included in a previous pass |
| if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE) |
| { |
| continue; |
| } |
| |
| // Decode parameters |
| unsigned int x_weights; |
| unsigned int y_weights; |
| bool is_dual_plane; |
| unsigned int quant_mode; |
| unsigned int weight_bits; |
| bool valid = decode_block_mode_2d(i, x_weights, y_weights, is_dual_plane, quant_mode, weight_bits); |
| |
| // Always skip invalid encodings for the current block size |
| if (!valid || (x_weights > x_texels) || (y_weights > y_texels)) |
| { |
| continue; |
| } |
| |
| // Selectively skip dual plane encodings |
| if (((j <= 1) && is_dual_plane) || (j == 2 && !is_dual_plane)) |
| { |
| continue; |
| } |
| |
| // Always skip encodings we can't physically encode based on |
| // generic encoding bit availability |
| if (is_dual_plane) |
| { |
| // This is the only check we need as only support 1 partition |
| if ((109 - weight_bits) <= 0) |
| { |
| continue; |
| } |
| } |
| else |
| { |
| // This is conservative - fewer bits may be available for > 1 partition |
| if ((111 - weight_bits) <= 0) |
| { |
| continue; |
| } |
| } |
| |
| // Selectively skip encodings based on percentile |
| bool percentile_hit = false; |
| #if !defined(ASTCENC_DECOMPRESS_ONLY) |
| if (j == 0) |
| { |
| percentile_hit = percentiles[i] <= always_cutoff; |
| } |
| else |
| { |
| percentile_hit = percentiles[i] <= mode_cutoff; |
| } |
| #endif |
| |
| if (j != 3 && !percentile_hit) |
| { |
| continue; |
| } |
| |
| // Allocate and initialize the decimation table entry if we've not used it yet |
| int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights]; |
| if (decimation_mode < 0) |
| { |
| construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd, *wb, packed_dm_idx); |
| decimation_mode_index[y_weights * 16 + x_weights] = packed_dm_idx; |
| decimation_mode = packed_dm_idx; |
| |
| dm_counts[j]++; |
| packed_dm_idx++; |
| } |
| |
| auto& bm = bsd.block_modes[packed_bm_idx]; |
| |
| bm.decimation_mode = static_cast<uint8_t>(decimation_mode); |
| bm.quant_mode = static_cast<uint8_t>(quant_mode); |
| bm.is_dual_plane = static_cast<uint8_t>(is_dual_plane); |
| bm.weight_bits = static_cast<uint8_t>(weight_bits); |
| bm.mode_index = static_cast<uint16_t>(i); |
| |
| auto& dm = bsd.decimation_modes[decimation_mode]; |
| |
| if (is_dual_plane) |
| { |
| dm.set_ref_2_plane(bm.get_weight_quant_mode()); |
| } |
| else |
| { |
| dm.set_ref_1_plane(bm.get_weight_quant_mode()); |
| } |
| |
| bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_bm_idx); |
| |
| packed_bm_idx++; |
| bm_counts[j]++; |
| } |
| } |
| |
| bsd.block_mode_count_1plane_always = bm_counts[0]; |
| bsd.block_mode_count_1plane_selected = bm_counts[0] + bm_counts[1]; |
| bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1] + bm_counts[2]; |
| bsd.block_mode_count_all = bm_counts[0] + bm_counts[1] + bm_counts[2] + bm_counts[3]; |
| |
| bsd.decimation_mode_count_always = dm_counts[0]; |
| bsd.decimation_mode_count_selected = dm_counts[0] + dm_counts[1] + dm_counts[2]; |
| bsd.decimation_mode_count_all = dm_counts[0] + dm_counts[1] + dm_counts[2] + dm_counts[3]; |
| |
| #if !defined(ASTCENC_DECOMPRESS_ONLY) |
| assert(bsd.block_mode_count_1plane_always > 0); |
| assert(bsd.decimation_mode_count_always > 0); |
| |
| delete[] percentiles; |
| #endif |
| |
| // Ensure the end of the array contains valid data (should never get read) |
| for (unsigned int i = bsd.decimation_mode_count_all; i < WEIGHTS_MAX_DECIMATION_MODES; i++) |
| { |
| bsd.decimation_modes[i].maxprec_1plane = -1; |
| bsd.decimation_modes[i].maxprec_2planes = -1; |
| bsd.decimation_modes[i].refprec_1_plane = 0; |
| bsd.decimation_modes[i].refprec_2_planes = 0; |
| } |
| |
| // Determine the texels to use for kmeans clustering. |
| assign_kmeans_texels(bsd); |
| |
| delete wb; |
| } |
| |
| /** |
| * @brief Allocate block modes and decimation tables for a single £D block size. |
| * |
| * TODO: This function doesn't include all of the heuristics that we use for 2D block sizes such as |
| * the percentile mode cutoffs. If 3D becomes more widely used we should look at this. |
| * |
| * @param x_texels The number of texels in the X dimension. |
| * @param y_texels The number of texels in the Y dimension. |
| * @param z_texels The number of texels in the Z dimension. |
| * @param[out] bsd The block size descriptor to populate. |
| */ |
| static void construct_block_size_descriptor_3d( |
| unsigned int x_texels, |
| unsigned int y_texels, |
| unsigned int z_texels, |
| block_size_descriptor& bsd |
| ) { |
| // Store a remap table for storing packed decimation modes. |
| // Indexing uses [Z * 64 + Y * 8 + X] and max size for each axis is 6. |
| static constexpr unsigned int MAX_DMI = 6 * 64 + 6 * 8 + 6; |
| int decimation_mode_index[MAX_DMI]; |
| unsigned int decimation_mode_count = 0; |
| |
| dt_init_working_buffers* wb = new dt_init_working_buffers; |
| |
| bsd.xdim = static_cast<uint8_t>(x_texels); |
| bsd.ydim = static_cast<uint8_t>(y_texels); |
| bsd.zdim = static_cast<uint8_t>(z_texels); |
| bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels * z_texels); |
| |
| for (unsigned int i = 0; i < MAX_DMI; i++) |
| { |
| decimation_mode_index[i] = -1; |
| } |
| |
| // gather all the infill-modes that can be used with the current block size |
| for (unsigned int x_weights = 2; x_weights <= x_texels; x_weights++) |
| { |
| for (unsigned int y_weights = 2; y_weights <= y_texels; y_weights++) |
| { |
| for (unsigned int z_weights = 2; z_weights <= z_texels; z_weights++) |
| { |
| unsigned int weight_count = x_weights * y_weights * z_weights; |
| if (weight_count > BLOCK_MAX_WEIGHTS) |
| { |
| continue; |
| } |
| |
| decimation_info& di = bsd.decimation_tables[decimation_mode_count]; |
| decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights] = decimation_mode_count; |
| init_decimation_info_3d(x_texels, y_texels, z_texels, x_weights, y_weights, z_weights, di, *wb); |
| |
| int maxprec_1plane = -1; |
| int maxprec_2planes = -1; |
| for (unsigned int i = 0; i < 12; i++) |
| { |
| unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i)); |
| if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS) |
| { |
| maxprec_1plane = i; |
| } |
| |
| unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i)); |
| if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS) |
| { |
| maxprec_2planes = i; |
| } |
| } |
| |
| if ((2 * weight_count) > BLOCK_MAX_WEIGHTS) |
| { |
| maxprec_2planes = -1; |
| } |
| |
| bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast<int8_t>(maxprec_1plane); |
| bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast<int8_t>(maxprec_2planes); |
| bsd.decimation_modes[decimation_mode_count].refprec_1_plane = maxprec_1plane == -1 ? 0 : 0xFFFF; |
| bsd.decimation_modes[decimation_mode_count].refprec_2_planes = maxprec_2planes == -1 ? 0 : 0xFFFF; |
| decimation_mode_count++; |
| } |
| } |
| } |
| |
| // Ensure the end of the array contains valid data (should never get read) |
| for (unsigned int i = decimation_mode_count; i < WEIGHTS_MAX_DECIMATION_MODES; i++) |
| { |
| bsd.decimation_modes[i].maxprec_1plane = -1; |
| bsd.decimation_modes[i].maxprec_2planes = -1; |
| bsd.decimation_modes[i].refprec_1_plane = 0; |
| bsd.decimation_modes[i].refprec_2_planes = 0; |
| } |
| |
| bsd.decimation_mode_count_always = 0; // Skipped for 3D modes |
| bsd.decimation_mode_count_selected = decimation_mode_count; |
| bsd.decimation_mode_count_all = decimation_mode_count; |
| |
| // Construct the list of block formats referencing the decimation tables |
| |
| // Clear the list to a known-bad value |
| for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) |
| { |
| bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE; |
| } |
| |
| unsigned int packed_idx = 0; |
| unsigned int bm_counts[2] { 0 }; |
| |
| // Iterate two times to build a usefully ordered list: |
| // - Pass 0 - keep valid single plane block modes |
| // - Pass 1 - keep valid dual plane block modes |
| for (unsigned int j = 0; j < 2; j++) |
| { |
| for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) |
| { |
| // Skip modes we've already included in a previous pass |
| if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE) |
| { |
| continue; |
| } |
| |
| unsigned int x_weights; |
| unsigned int y_weights; |
| unsigned int z_weights; |
| bool is_dual_plane; |
| unsigned int quant_mode; |
| unsigned int weight_bits; |
| |
| bool valid = decode_block_mode_3d(i, x_weights, y_weights, z_weights, is_dual_plane, quant_mode, weight_bits); |
| // Skip invalid encodings |
| if (!valid || x_weights > x_texels || y_weights > y_texels || z_weights > z_texels) |
| { |
| continue; |
| } |
| |
| // Skip encodings in the wrong iteration |
| if ((j == 0 && is_dual_plane) || (j == 1 && !is_dual_plane)) |
| { |
| continue; |
| } |
| |
| // Always skip encodings we can't physically encode based on bit availability |
| if (is_dual_plane) |
| { |
| // This is the only check we need as only support 1 partition |
| if ((109 - weight_bits) <= 0) |
| { |
| continue; |
| } |
| } |
| else |
| { |
| // This is conservative - fewer bits may be available for > 1 partition |
| if ((111 - weight_bits) <= 0) |
| { |
| continue; |
| } |
| } |
| |
| int decimation_mode = decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights]; |
| bsd.block_modes[packed_idx].decimation_mode = static_cast<uint8_t>(decimation_mode); |
| bsd.block_modes[packed_idx].quant_mode = static_cast<uint8_t>(quant_mode); |
| bsd.block_modes[packed_idx].weight_bits = static_cast<uint8_t>(weight_bits); |
| bsd.block_modes[packed_idx].is_dual_plane = static_cast<uint8_t>(is_dual_plane); |
| bsd.block_modes[packed_idx].mode_index = static_cast<uint16_t>(i); |
| |
| bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_idx); |
| bm_counts[j]++; |
| packed_idx++; |
| } |
| } |
| |
| bsd.block_mode_count_1plane_always = 0; // Skipped for 3D modes |
| bsd.block_mode_count_1plane_selected = bm_counts[0]; |
| bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1]; |
| bsd.block_mode_count_all = bm_counts[0] + bm_counts[1]; |
| |
| // Determine the texels to use for kmeans clustering. |
| assign_kmeans_texels(bsd); |
| |
| delete wb; |
| } |
| |
| /* See header for documentation. */ |
| void init_block_size_descriptor( |
| unsigned int x_texels, |
| unsigned int y_texels, |
| unsigned int z_texels, |
| bool can_omit_modes, |
| unsigned int partition_count_cutoff, |
| float mode_cutoff, |
| block_size_descriptor& bsd |
| ) { |
| if (z_texels > 1) |
| { |
| construct_block_size_descriptor_3d(x_texels, y_texels, z_texels, bsd); |
| } |
| else |
| { |
| construct_block_size_descriptor_2d(x_texels, y_texels, can_omit_modes, mode_cutoff, bsd); |
| } |
| |
| init_partition_tables(bsd, can_omit_modes, partition_count_cutoff); |
| } |