/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// # What is "packing"?
//
// Before feeding data to the gemm kernels (the parts of Ruy that do lots
// of multiply-add operations), Ruy first performs a data transformation (which
// we call "packing") on the input matrices. This transformation has two main
// goals:
// - rearrange data into blocks that are a convenient size/layout for the gemm
//   kernels to consume. This helps make the memory access pattern of the gemm
//   kernel simpler and more contiguous, and puts the data in a layout most
//   convenient for specific arithmetic instructions in the gemm kernel.
// - compute row/column sums needed for handling quantization with
//   non-symmetric zero points.
//
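// As background on the second point (this is standard quantization
// arithmetic, not anything specific to this file): with per-matrix zero
// points, each accumulator expands as
//
//   sum_k (lhs[k] - lhs_zero_point) * (rhs[k] - rhs_zero_point)
//     = sum_k lhs[k] * rhs[k]
//       - rhs_zero_point * sum_k lhs[k]
//       - lhs_zero_point * sum_k rhs[k]
//       + depth * lhs_zero_point * rhs_zero_point
//
// so precomputing the sums over the depth dimension during packing lets the
// kernels handle non-symmetric zero points with cheap per-row/column
// corrections.
//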
// # Simplified algorithmic analysis of packing
//
// Packing is a relatively simple transformation which does a small constant
// amount of work on each element of an input matrix, and hence for an NxM
// matrix performs O(N*M) work. If N and M are of the same order, then this is
// O(N^2) work.
//
// An NxKxM matrix multiplication requires N*K*M multiply-accumulate
// operations. Note that if N, K, and M are all the same order, then the
// number of multiply-accumulate operations is O(N^3).
//
// Thus, the O(N^2) cost of packing is small compared to the O(N^3) work, in
// the case of all dimensions being roughly the same order.
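//
// For a concrete sense of scale (illustrative numbers only): with
// N = K = M = 512, packing both inputs touches about 2 * 512 * 512 ~ 5.2e5
// elements, while the kernels perform 512^3 ~ 1.3e8 multiply-accumulates, so
// packing is well under 1% of the total work.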
//
// # Packing cost can be significant
//
// When matrix * matrix multiplications begin to look more like matrix * vector
// multiplications, packing cost can become significant. We sometimes call
// these cases "gemv-like".
//
// Continuing the algorithmic analysis above, if we consider a case where an
// NxKxM matrix multiplication has either N = O(1) or M = O(1), then the
// situation is different. In this case, the multiply-accumulate work is only
// quadratic, so the quadratic cost of packing can become significant.
//
// Another way to say this is that the cost of packing an input matrix (either
// the LHS or RHS) is amortized across the non-depth dimension of the opposite
// input matrix. Thus, when the LHS has very few rows or the RHS has very few
// columns, the cost of packing the opposite input matrix can become
// significant.
//
// As a rough rule of thumb, the cost of packing starts to become significant
// when either N or M is below 32 (and other dimensions are hundreds), with
// very significant packing costs at 8 or below. This varies by data type,
// Path, and tuning, so these numbers are only rough guides.
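//
// Revisiting the illustrative numbers above: with N = K = 512 but M = 4,
// packing the LHS still touches 512 * 512 ~ 2.6e5 elements, while the
// kernels now perform only 512 * 512 * 4 ~ 1e6 multiply-accumulates, so LHS
// packing alone amounts to a quarter of the multiply-accumulate work.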
//
// One practical use case that is affected by this is inference of
// fully connected neural network layers with a low batch size. The weight
// matrix (which is a constant for inference) is the one affected by
// significant packing cost.
//
// Ruy provides an API in ruy_advanced.h for advanced users to pre-pack
// input matrices that are affected by significant packing costs.
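// (The gist, with the exact entry points and signatures in ruy_advanced.h:
// a constant matrix can be packed once up front, and subsequent
// multiplications then reuse the cached packed representation instead of
// repacking on every call.)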
//
// # Implementation notes
//
// Ruy's packing routines always operate on a range of columns and can be
// applied to either the LHS or RHS. This is possible because Ruy internally
// implements a TrMul, so the accumulation along depth is done along columns of
// both the LHS and RHS (whereas for a normal Mul the accumulation along depth
// for the LHS is along rows). As another example, we are always computing
// column sums for quantization (and never row sums, since the LHS is
// transposed).
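//
// To spell that out (a sketch of the TrMul formulation, not code from this
// file): for Mul(A, B) with A of shape NxK and B of shape KxM, Ruy stores the
// LHS as L = transpose(A), of shape KxN. In column-major storage, each column
// of L and each column of B is then a contiguous depth-K slice, so a single
// column-oriented packing routine serves both sides.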

#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_PACK_ARM_H_
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_PACK_ARM_H_

#include <cstdint>
#include <cstring>
#include <type_traits>

#include "ruy/check_macros.h"
#include "ruy/common.h"
#include "ruy/internal_matrix.h"
#include "ruy/matrix.h"
#include "ruy/opt_set.h"
#include "ruy/pack_common.h"
#include "ruy/path.h"
#include "ruy/platform.h"
#include "ruy/profiler/instrumentation.h"
#include "ruy/tune.h"

namespace ruy {

#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
void Pack8bitNeonOutOfOrder(const void* src_ptr0, const void* src_ptr1,
                            const void* src_ptr2, const void* src_ptr3,
                            int src_inc0, int src_inc1, int src_inc2,
                            int src_inc3, int src_rows, int src_zero_point,
                            std::int8_t* packed_ptr, int start_col, int end_col,
                            std::int32_t* sums_ptr, int input_xor);
void Pack8bitNeonInOrder(const void* src_ptr0, const void* src_ptr1,
                         const void* src_ptr2, const void* src_ptr3,
                         int src_inc0, int src_inc1, int src_inc2, int src_inc3,
                         int src_rows, int src_zero_point,
                         std::int8_t* packed_ptr, int start_col, int end_col,
                         std::int32_t* sums_ptr, int input_xor);
void Pack8bitNeonDotprodOutOfOrder(const void* src_ptr0, const void* src_ptr1,
                                   const void* src_ptr2, const void* src_ptr3,
                                   int src_inc0, int src_inc1, int src_inc2,
                                   int src_inc3, int src_rows,
                                   int src_zero_point, std::int8_t* packed_ptr,
                                   int start_col, int end_col,
                                   std::int32_t* sums_ptr, int input_xor);
void Pack8bitNeonDotprodInOrder(const void* src_ptr0, const void* src_ptr1,
                                const void* src_ptr2, const void* src_ptr3,
                                int src_inc0, int src_inc1, int src_inc2,
                                int src_inc3, int src_rows, int src_zero_point,
                                std::int8_t* packed_ptr, int start_col,
                                int end_col, std::int32_t* sums_ptr,
                                int input_xor);

#elif RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
void Pack8bitNeonOutOfOrder4Cols(const PackParams8bit& params);
void Pack8bitNeonOutOfOrder2Cols(const PackParams8bit& params);
#endif  // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)

#if (RUY_PLATFORM(NEON_32) || RUY_PLATFORM(NEON_64)) && \
    RUY_OPT_ENABLED(RUY_OPT_ASM)

template <typename Scalar>
struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kColMajor, 16, 4>, Scalar,
                std::int8_t, std::int32_t> {
  static_assert(std::is_same<Scalar, std::int8_t>::value ||
                    std::is_same<Scalar, std::uint8_t>::value,
                "");
  static constexpr int kInputXor =
      std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;

  static void Run(Tuning tuning, const Matrix<Scalar>& src_matrix,
                  PackedMatrix<std::int8_t>* packed_matrix, int start_col,
                  int end_col) {
    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ(start_col % 4, 0);
    std::int32_t* sums = packed_matrix->sums;
    Scalar zerobuf[16];
    memset(zerobuf, src_matrix.zero_point, sizeof(zerobuf));
    for (int block_col = start_col; block_col < end_col; block_col += 4) {
      int src_stride = src_matrix.layout.stride;
      const Scalar* src_ptr0 = src_matrix.data.get() + src_stride * block_col;
      const Scalar* src_ptr1 = src_ptr0 + src_stride;
      const Scalar* src_ptr2 = src_ptr1 + src_stride;
      const Scalar* src_ptr3 = src_ptr2 + src_stride;
      int src_inc0 = 16;
      int src_inc1 = 16;
      int src_inc2 = 16;
      int src_inc3 = 16;
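      // Past the last source column, read from zerobuf (filled with the
      // zero_point) with a zero pointer increment, so this fixed 4-column
      // loop always has valid memory to load from.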
      if (block_col >= src_matrix.layout.cols - 3) {
        if (block_col >= src_matrix.layout.cols - 0) {
          src_ptr0 = zerobuf;
          src_inc0 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 1) {
          src_ptr1 = zerobuf;
          src_inc1 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 2) {
          src_ptr2 = zerobuf;
          src_inc2 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 3) {
          src_ptr3 = zerobuf;
          src_inc3 = 0;
        }
      }
      std::int8_t* packed_ptr =
          packed_matrix->data + packed_matrix->layout.stride * block_col;
      std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
#if RUY_PLATFORM(NEON_64)
      if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
        Pack8bitNeonInOrder(
            src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1,
            src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point,
            packed_ptr, start_col, end_col, sums_ptr, kInputXor);
      } else {
        Pack8bitNeonOutOfOrder(
            src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1,
            src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point,
            packed_ptr, start_col, end_col, sums_ptr, kInputXor);
      }
#else
      // We have a more limited set of general purpose registers in ARMv7, so
      // we use the "params" struct technique from the kernel code to save
      // registers.
      PackParams8bit params;
      MakePackParams8bit(src_ptr0, src_ptr1, src_ptr2, src_ptr3, sums_ptr,
                         packed_ptr, src_inc0, src_inc1, src_inc2, src_inc3,
                         src_matrix.layout.rows, src_matrix.zero_point,
                         kInputXor, &params);
      Pack8bitNeonOutOfOrder4Cols(params);
#endif  // RUY_PLATFORM(NEON_64)
    }
  }
};

#endif  // (RUY_PLATFORM(NEON_32) || RUY_PLATFORM(NEON_64)) &&
        // RUY_OPT_ENABLED(RUY_OPT_ASM)

#if RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
// On 32-bit ARM, the 8-bit kernel is 4 rows X 2 columns, so we need an
// additional partial specialization for the RHS, which has a
// FixedKernelLayout with 2 columns.
template <typename Scalar>
struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kColMajor, 16, 2>, Scalar,
                std::int8_t, std::int32_t> {
  static_assert(std::is_same<Scalar, std::int8_t>::value ||
                    std::is_same<Scalar, std::uint8_t>::value,
                "");
  static constexpr int kInputXor =
      std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;
  static void Run(Tuning tuning, const Matrix<Scalar>& src_matrix,
                  PackedMatrix<std::int8_t>* packed_matrix, int start_col,
                  int end_col) {
    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ(start_col % 2, 0);
    std::int32_t* sums = packed_matrix->sums;
    Scalar zerobuf[16];
    memset(zerobuf, src_matrix.zero_point, sizeof(zerobuf));
    for (int block_col = start_col; block_col < end_col; block_col += 2) {
      int src_stride = src_matrix.layout.stride;
      const Scalar* src_ptr0 = src_matrix.data.get() + src_stride * block_col;
      const Scalar* src_ptr1 = src_ptr0 + src_stride;
      int src_inc0 = 16;
      int src_inc1 = 16;
      if (block_col >= src_matrix.layout.cols - 2) {
        if (block_col >= src_matrix.layout.cols - 0) {
          src_ptr0 = zerobuf;
          src_inc0 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 1) {
          src_ptr1 = zerobuf;
          src_inc1 = 0;
        }
      }
      std::int8_t* packed_ptr =
          packed_matrix->data + packed_matrix->layout.stride * block_col;
      std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
      PackParams8bit params;
      MakePackParams8bit(src_ptr0, src_ptr1, nullptr, nullptr, sums_ptr,
                         packed_ptr, src_inc0, src_inc1, -1, -1,
                         src_matrix.layout.rows, src_matrix.zero_point,
                         kInputXor, &params);
      Pack8bitNeonOutOfOrder2Cols(params);
    }
  }
};
#endif  // RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)

#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
template <typename Scalar>
struct PackImpl<Path::kNeonDotprod, FixedKernelLayout<Order::kColMajor, 4, 8>,
                Scalar, std::int8_t, std::int32_t> {
  static_assert(std::is_same<Scalar, std::int8_t>::value ||
                    std::is_same<Scalar, std::uint8_t>::value,
                "");
  static constexpr int kInputXor =
      std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;

  static void Run(Tuning tuning, const Matrix<Scalar>& src_matrix,
                  PackedMatrix<std::int8_t>* packed_matrix, int start_col,
                  int end_col) {
    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ(start_col % 8, 0);
    std::int32_t* sums = packed_matrix->sums;
    Scalar zerobuf[16];
    memset(zerobuf, src_matrix.zero_point, sizeof(zerobuf));
    for (int block_col = start_col; block_col < end_col; block_col += 4) {
      int src_stride = src_matrix.layout.stride;
      const Scalar* src_ptr0 = src_matrix.data.get() + src_stride * block_col;
      const Scalar* src_ptr1 = src_ptr0 + src_stride;
      const Scalar* src_ptr2 = src_ptr1 + src_stride;
      const Scalar* src_ptr3 = src_ptr2 + src_stride;
      std::int64_t src_inc0 = 16;
      std::int64_t src_inc1 = 16;
      std::int64_t src_inc2 = 16;
      std::int64_t src_inc3 = 16;
      if (block_col >= src_matrix.layout.cols - 3) {
        if (block_col >= src_matrix.layout.cols - 0) {
          src_ptr0 = zerobuf;
          src_inc0 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 1) {
          src_ptr1 = zerobuf;
          src_inc1 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 2) {
          src_ptr2 = zerobuf;
          src_inc2 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 3) {
          src_ptr3 = zerobuf;
          src_inc3 = 0;
        }
      }
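      // The packed layout groups 8 columns per kernel block while this loop
      // packs 4 columns per iteration: each 4x8 cell stores its first four
      // columns in 16 contiguous bytes, then its last four in the next 16.
      // (block_col & ~7) selects the 8-column block and (block_col & 4) * 4
      // selects which 16-byte half this iteration writes.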
      std::int8_t* packed_ptr =
          packed_matrix->data +
          packed_matrix->layout.stride * (block_col & ~7) +
          ((block_col & 4) * 4);
      std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
      if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
        Pack8bitNeonDotprodInOrder(
            src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1,
            src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point,
            packed_ptr, start_col, end_col, sums_ptr, kInputXor);
      } else {
        Pack8bitNeonDotprodOutOfOrder(
            src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1,
            src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point,
            packed_ptr, start_col, end_col, sums_ptr, kInputXor);
      }
    }
  }
};
#endif  // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)

#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
                             const float* src_ptr2, const float* src_ptr3,
                             int src_inc0, int src_inc1, int src_inc2,
                             int src_inc3, int src_rows, int src_zero_point,
                             float* packed_ptr, int start_col, int end_col);
void PackFloatNeonInOrder(const float* src_ptr0, const float* src_ptr1,
                          const float* src_ptr2, const float* src_ptr3,
                          int src_inc0, int src_inc1, int src_inc2,
                          int src_inc3, int src_rows, int src_zero_point,
                          float* packed_ptr, int start_col, int end_col);

#elif RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
                             const float* src_ptr2, const float* src_ptr3,
                             int src_inc, int src_rows, int src_zero_point,
                             float* packed_ptr, int start_col, int end_col,
                             int stride);
#endif  // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)

#if (RUY_PLATFORM(NEON_32) || RUY_PLATFORM(NEON_64)) && \
    RUY_OPT_ENABLED(RUY_OPT_ASM)

template <>
struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
                float, float> {
  static void Run(Tuning tuning, const Matrix<float>& src_matrix,
                  PackedMatrix<float>* packed_matrix, int start_col,
                  int end_col) {
    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ(start_col % 8, 0);
    const float zerobuf[4] = {0};
    for (int block_col = start_col; block_col < end_col; block_col += 4) {
      int src_stride = src_matrix.layout.stride;
      const float* src_ptr0 = src_matrix.data.get() + src_stride * block_col;
      const float* src_ptr1 = src_ptr0 + src_stride;
      const float* src_ptr2 = src_ptr1 + src_stride;
      const float* src_ptr3 = src_ptr2 + src_stride;
      std::int64_t src_inc0 = 16;
      std::int64_t src_inc1 = 16;
      std::int64_t src_inc2 = 16;
      std::int64_t src_inc3 = 16;
      if (block_col >= src_matrix.layout.cols - 3) {
        if (block_col >= src_matrix.layout.cols - 0) {
          src_ptr0 = zerobuf;
          src_inc0 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 1) {
          src_ptr1 = zerobuf;
          src_inc1 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 2) {
          src_ptr2 = zerobuf;
          src_inc2 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 3) {
          src_ptr3 = zerobuf;
          src_inc3 = 0;
        }
      }
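      // As in the 8-bit dotprod case above, packed blocks hold 8 columns
      // while this loop packs 4 per iteration; here each depth level stores
      // 8 floats, so (block_col & 4) selects the 4-float half within the
      // (block_col & ~7) block.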
      float* packed_ptr = packed_matrix->data +
                          packed_matrix->layout.stride * (block_col & ~7) +
                          ((block_col & 4));
#if RUY_PLATFORM(NEON_64)
      if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
        PackFloatNeonInOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0,
                             src_inc1, src_inc2, src_inc3,
                             src_matrix.layout.rows, src_matrix.zero_point,
                             packed_ptr, start_col, end_col);
      } else {
        PackFloatNeonOutOfOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3,
                                src_inc0, src_inc1, src_inc2, src_inc3,
                                src_matrix.layout.rows, src_matrix.zero_point,
                                packed_ptr, start_col, end_col);
      }
#else
      // Encode each of src_inc0, ..., src_inc3 in lowest 4 bits of src_inc
      // to save on registers (we have fewer general purpose registers in
      // 32-bit ARM than in 64-bit ARM). For the 64-bit case, we pass four
      // values that are each either 16 or 0 and use them directly. For the
      // 32-bit case, bits 0, 1, 2, and 3 are used to determine if we should
      // use the value 16 (bit is set) or 0 (bit is not set) for the
      // respective increment value.
      std::int64_t src_inc = 0;
      src_inc += src_inc0 == 16 ? 1 : 0;
      src_inc += src_inc1 == 16 ? 2 : 0;
      src_inc += src_inc2 == 16 ? 4 : 0;
      src_inc += src_inc3 == 16 ? 8 : 0;
      const int kOutputStride = 32;
      PackFloatNeonOutOfOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc,
                              src_matrix.layout.rows, src_matrix.zero_point,
                              packed_ptr, start_col, end_col, kOutputStride);
#endif  // RUY_PLATFORM(NEON_64)
    }
  }
};

#if RUY_PLATFORM(NEON_32)
// On 32-bit ARM, the float kernel is 8 rows X 4 columns, so we need an
// additional specialization for a FixedKernelLayout with 4 columns.
template <>
struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 4>, float,
                float, float> {
  static void Run(Tuning tuning, const Matrix<float>& src_matrix,
                  PackedMatrix<float>* packed_matrix, int start_col,
                  int end_col) {
    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ(start_col % 4, 0);
    const float zerobuf[4] = {0};
    for (int block_col = start_col; block_col < end_col; block_col += 4) {
      int src_stride = src_matrix.layout.stride;
      const float* src_ptr0 = src_matrix.data.get() + src_stride * block_col;
      const float* src_ptr1 = src_ptr0 + src_stride;
      const float* src_ptr2 = src_ptr1 + src_stride;
      const float* src_ptr3 = src_ptr2 + src_stride;
      std::int64_t src_inc0 = 16;
      std::int64_t src_inc1 = 16;
      std::int64_t src_inc2 = 16;
      std::int64_t src_inc3 = 16;
      if (block_col >= src_matrix.layout.cols - 3) {
        if (block_col >= src_matrix.layout.cols - 0) {
          src_ptr0 = zerobuf;
          src_inc0 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 1) {
          src_ptr1 = zerobuf;
          src_inc1 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 2) {
          src_ptr2 = zerobuf;
          src_inc2 = 0;
        }
        if (block_col >= src_matrix.layout.cols - 3) {
          src_ptr3 = zerobuf;
          src_inc3 = 0;
        }
      }
      float* packed_ptr =
          packed_matrix->data + packed_matrix->layout.stride * (block_col);
      // Encode each of src_inc0, ..., src_inc3 in lowest 4 bits of src_inc
      // to save registers.
      std::int64_t src_inc = 0;
      src_inc += src_inc0 == 16 ? 1 : 0;
      src_inc += src_inc1 == 16 ? 2 : 0;
      src_inc += src_inc2 == 16 ? 4 : 0;
      src_inc += src_inc3 == 16 ? 8 : 0;
      const int kOutputStride = 16;
      PackFloatNeonOutOfOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc,
                              src_matrix.layout.rows, src_matrix.zero_point,
                              packed_ptr, start_col, end_col, kOutputStride);
    }
  }
};
#endif  // RUY_PLATFORM(NEON_32)
#endif  // (RUY_PLATFORM(NEON_32) || RUY_PLATFORM(NEON_64)) &&
        // RUY_OPT_ENABLED(RUY_OPT_ASM)

}  // namespace ruy

#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_PACK_ARM_H_