blob: 2551e36d7bea91e5ebe8215e79bc0b6384f75aa1 [file] [log] [blame]
// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// x86 SSE intrinsics for cumulative sum and conversion to pixels
#include <stdint.h>
#include <string.h>
#include <tmmintrin.h>
// Processes one group of four floats.
//
// Adds the running total held in *carry to the inclusive prefix sum of the
// four lanes of x, stores the new running total (broadcast to all lanes) back
// into *carry, and returns the four resulting pixels packed into one 32-bit
// value with lane 0 in the least significant byte. Each pixel is
// min(|sum|, 1.0) * 255, truncated toward zero.
static inline uint32_t accumulate_sse_quad(__m128 x, __m128 *carry) {
  // In-register prefix sum: add the vector shifted up by one lane, then the
  // partial result shifted up by two lanes (zeros enter at the bottom).
  x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4)));
  x = _mm_add_ps(x, _mm_shuffle_ps(_mm_setzero_ps(), x, 0x40));
  x = _mm_add_ps(x, *carry);
  // Lane 3 now holds the total so far; broadcast it for the next group.
  *carry = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 3, 3, 3));

  __m128 y = _mm_andnot_ps(_mm_set1_ps(-0.f), x);  // clear sign bit: fabs(x)
  y = _mm_min_ps(y, _mm_set1_ps(1.0f));
  y = _mm_mul_ps(y, _mm_set1_ps(255.0f));
  __m128i z = _mm_cvttps_epi32(y);
  // Every lane is already in 0..255, so the saturating packs only narrow
  // 32 -> 16 -> 8 bits without changing any value. This needs SSE2 only.
  z = _mm_packs_epi32(z, z);
  z = _mm_packus_epi16(z, z);
  return (uint32_t)_mm_cvtsi128_si32(z);
}

// Converts a buffer of float deltas into 8-bit pixels.
//
// For every index k in [0, n), out[k] receives min(|s_k|, 1.0) * 255 truncated
// toward zero, where s_k is the running sum in[0] + ... + in[k].
//
// in  : n input floats; no particular alignment is required.
// out : n output bytes; no particular alignment is required.
// n   : element count; need not be a multiple of four. Exactly n floats are
//       read and exactly n bytes are written.
void accumulate_sse(const float *in, uint8_t *out, uint32_t n) {
  __m128 carry = _mm_setzero_ps();
  // Largest multiple of four not exceeding n; bounding the loop by it also
  // keeps the index from wrapping for n close to UINT32_MAX.
  const uint32_t full = n & ~(uint32_t)3;

  for (uint32_t i = 0; i < full; i += 4) {
    const uint32_t px = accumulate_sse_quad(_mm_loadu_ps(&in[i]), &carry);
    // memcpy is the aliasing-safe, alignment-agnostic way to store 4 bytes.
    memcpy(&out[i], &px, sizeof px);
  }

  const uint32_t rest = n - full;
  if (rest != 0) {
    // Run the final partial group through the same vector path using a
    // zero-padded copy, then emit only the bytes that belong to the caller.
    float padded[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    memcpy(padded, &in[full], rest * sizeof padded[0]);
    const uint32_t px = accumulate_sse_quad(_mm_loadu_ps(padded), &carry);
    memcpy(&out[full], &px, rest);
  }
}