| /* |
| * Copyright (c) 2020, Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included |
| * in all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
| * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR |
| * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
| * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| * OTHER DEALINGS IN THE SOFTWARE. |
| */ |
| //! |
| //! \file cm_mem_sse2_impl.cpp |
| //! \brief Contains CM memory function implementations |
| //! |
| |
| #include "cm_mem.h" |
| #include "cm_mem_sse2_impl.h" |
| |
| #if defined(__SSE2__) || !(defined(LINUX) || defined(ANDROID)) |
| |
| #include <mmintrin.h> |
| |
| void FastMemCopy_SSE2_movntdq_movdqa( |
| void* dst, |
| void* src, |
| const size_t doubleQuadWords ) |
| { |
| CM_ASSERT( IsAligned( dst, sizeof(DQWORD) ) ); |
| CM_ASSERT( IsAligned( src, sizeof(DQWORD) ) ); |
| |
| const size_t doubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD); |
| |
| // Prefetch the src data |
| Prefetch( (uint8_t*)src ); |
| Prefetch( (uint8_t*)src + sizeof(PREFETCH) ); |
| |
| // Convert to SSE2 registers |
| __m128i* dst128i = (__m128i*)dst; |
| __m128i* src128i = (__m128i*)src; |
| |
| size_t count = doubleQuadWords; |
| |
| // Copies a cacheline per loop iteration |
| while( count >= doubleQuadWordsPerPrefetch ) |
| { |
| Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) ); |
| |
| count -= doubleQuadWordsPerPrefetch; |
| |
| // Copy cacheline of data |
| for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ ) |
| { |
| _mm_stream_si128( dst128i++, |
| _mm_load_si128( src128i++ ) ); |
| } |
| } |
| |
| // Copy DQWORD if not cacheline multiple |
| while( count-- ) |
| { |
| _mm_stream_si128( dst128i++, |
| _mm_load_si128( src128i++ ) ); |
| } |
| } |
| |
| void FastMemCopy_SSE2_movdqu_movdqa( |
| void* dst, |
| void* src, |
| const size_t doubleQuadWords ) |
| { |
| CM_ASSERT( IsAligned( src, sizeof(DQWORD) ) ); |
| |
| const size_t doubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD); |
| |
| // Prefetch the src data |
| Prefetch( (uint8_t*)src ); |
| Prefetch( (uint8_t*)src + sizeof(PREFETCH) ); |
| |
| // Convert to SSE2 registers |
| __m128i* dst128i = (__m128i*)dst; |
| __m128i* src128i = (__m128i*)src; |
| |
| size_t count = doubleQuadWords; |
| |
| // Copies a cacheline per loop iteration |
| while( count >= doubleQuadWordsPerPrefetch ) |
| { |
| Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) ); |
| |
| count -= doubleQuadWordsPerPrefetch; |
| |
| // Copy cacheline of data |
| for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ ) |
| { |
| _mm_storeu_si128( dst128i++, |
| _mm_load_si128( src128i++ ) ); |
| } |
| } |
| |
| // Copy DQWORD if not cacheline multiple |
| while( count-- ) |
| { |
| _mm_storeu_si128( dst128i++, |
| _mm_load_si128( src128i++ ) ); |
| } |
| } |
| |
| void FastMemCopy_SSE2_movntdq_movdqu( |
| void* dst, |
| const void* src, |
| const size_t doubleQuadWords ) |
| { |
| CM_ASSERT( IsAligned( dst, sizeof(DQWORD) ) ); |
| |
| const size_t doubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD); |
| |
| // Prefetch the src data |
| Prefetch( (uint8_t*)src ); |
| Prefetch( (uint8_t*)src + sizeof(PREFETCH) ); |
| |
| // Convert to SSE2 registers |
| __m128i* dst128i = (__m128i*)dst; |
| __m128i* src128i = (__m128i*)src; |
| |
| size_t count = doubleQuadWords; |
| |
| // Copies a cacheline per loop iteration |
| while( count >= doubleQuadWordsPerPrefetch ) |
| { |
| Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) ); |
| |
| count -= doubleQuadWordsPerPrefetch; |
| |
| // Copy cacheline of data |
| for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ ) |
| { |
| _mm_stream_si128( dst128i++, |
| _mm_loadu_si128( src128i++ ) ); |
| } |
| } |
| |
| // Copy DQWORD if not cacheline multiple |
| while( count-- ) |
| { |
| _mm_stream_si128( dst128i++, |
| _mm_loadu_si128( src128i++ ) ); |
| } |
| } |
| |
| void FastMemCopy_SSE2_movdqu_movdqu( |
| void* dst, |
| const void* src, |
| const size_t doubleQuadWords ) |
| { |
| const size_t doubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD); |
| |
| // Prefetch the src data |
| Prefetch( (uint8_t*)src ); |
| Prefetch( (uint8_t*)src + sizeof(PREFETCH) ); |
| |
| // Convert to SSE2 registers |
| __m128i* dst128i = (__m128i*)dst; |
| __m128i* src128i = (__m128i*)src; |
| |
| size_t count = doubleQuadWords; |
| |
| // Copies a cacheline per loop iteration |
| while( count >= doubleQuadWordsPerPrefetch ) |
| { |
| Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) ); |
| |
| count -= doubleQuadWordsPerPrefetch; |
| |
| // Copy cacheline of data |
| for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ ) |
| { |
| _mm_storeu_si128( dst128i++, |
| _mm_loadu_si128( src128i++ ) ); |
| } |
| } |
| |
| // Copy DQWORD if not cacheline multiple |
| while( count-- ) |
| { |
| _mm_storeu_si128( dst128i++, |
| _mm_loadu_si128( src128i++ ) ); |
| } |
| } |
| |
| void FastMemCopy_SSE2( |
| void* dst, |
| void* src, |
| const size_t doubleQuadWords ) |
| { |
| // Determine if the source and destination addresses are 128-bit aligned |
| const bool isDstDoubleQuadWordAligned = IsAligned( dst, sizeof(DQWORD) ); |
| const bool isSrcDoubleQuadWordAligned = IsAligned( src, sizeof(DQWORD) ); |
| |
| if( isSrcDoubleQuadWordAligned && isDstDoubleQuadWordAligned ) |
| { |
| FastMemCopy_SSE2_movntdq_movdqa( dst, src, doubleQuadWords ); |
| } |
| else if( isDstDoubleQuadWordAligned ) |
| { |
| FastMemCopy_SSE2_movntdq_movdqu( dst, src, doubleQuadWords ); |
| } |
| else if( isSrcDoubleQuadWordAligned ) |
| { |
| FastMemCopy_SSE2_movdqu_movdqa( dst, src, doubleQuadWords ); |
| } |
| else // if( !isSrcDoubleQuadWordAligned && !isDstDoubleQuadWordAligned ) |
| { |
| FastMemCopy_SSE2_movdqu_movdqu( dst, src, doubleQuadWords ); |
| } |
| } |
| |
| void CmFastMemCopy_SSE2( void* dst, const void* src, const size_t bytes ) |
| { |
| // Cache pointers to memory |
| uint8_t *cacheDst = (uint8_t*)dst; |
| uint8_t *cacheSrc = (uint8_t*)src; |
| |
| size_t count = bytes; |
| |
| // Get the number of DQWORDs to be copied |
| const size_t doubleQuadWords = count / sizeof(DQWORD); |
| |
| if( count >= CM_CPU_FASTCOPY_THRESHOLD && doubleQuadWords ) |
| { |
| FastMemCopy_SSE2( cacheDst, cacheSrc, doubleQuadWords ); |
| |
| cacheDst += doubleQuadWords * sizeof(DQWORD); |
| cacheSrc += doubleQuadWords * sizeof(DQWORD); |
| count -= doubleQuadWords * sizeof(DQWORD); |
| } |
| |
| // Copy remaining uint8_t(s) |
| if( count ) |
| { |
| MOS_SecureMemcpy( cacheDst, count, cacheSrc, count ); |
| } |
| } |
| |
| void CmFastMemCopyWC_SSE2( void* dst, const void* src, const size_t bytes ) |
| { |
| // Cache pointers to memory |
| uint8_t *cacheDst = (uint8_t*)dst; |
| uint8_t *cacheSrc = (uint8_t*)src; |
| |
| size_t count = bytes; |
| |
| if( count >= CM_CPU_FASTCOPY_THRESHOLD ) |
| { |
| const size_t doubleQuadwordAlignBytes = |
| GetAlignmentOffset( cacheDst, sizeof(DQWORD) ); |
| |
| // The destination pointer should be 128-bit aligned |
| if( doubleQuadwordAlignBytes ) |
| { |
| MOS_SecureMemcpy( cacheDst, doubleQuadwordAlignBytes,cacheSrc, doubleQuadwordAlignBytes ); |
| |
| cacheDst += doubleQuadwordAlignBytes; |
| cacheSrc += doubleQuadwordAlignBytes; |
| count -= doubleQuadwordAlignBytes; |
| } |
| |
| // Get the number of DQWORDs to be copied |
| const size_t doubleQuadWords = count / sizeof(DQWORD); |
| |
| if( doubleQuadWords && count >= sizeof(PREFETCH)) |
| { |
| // Determine if the source and destination addresses are |
| // 128-bit aligned |
| CM_ASSERT( IsAligned( cacheDst, sizeof(DQWORD) ) ); |
| |
| const bool isSrcDoubleQuadWordAligned = |
| IsAligned( cacheSrc, sizeof(DQWORD) ); |
| |
| if( isSrcDoubleQuadWordAligned ) |
| { |
| FastMemCopy_SSE2_movntdq_movdqa( cacheDst, cacheSrc, |
| doubleQuadWords ); |
| } |
| else |
| { |
| FastMemCopy_SSE2_movntdq_movdqu( cacheDst, cacheSrc, |
| doubleQuadWords ); |
| } |
| |
| cacheDst += doubleQuadWords * sizeof(DQWORD); |
| cacheSrc += doubleQuadWords * sizeof(DQWORD); |
| count -= doubleQuadWords * sizeof(DQWORD); |
| } |
| } |
| |
| // Copy remaining uint8_t(s) |
| if( count ) |
| { |
| MOS_SecureMemcpy( cacheDst, count, cacheSrc, count ); |
| } |
| } |
| |
| #endif // __SSE2__ || !(LINUX || ANDROID) |