fix: round position for vr{add,sub}hn_{s,u}{16,32,64}

ut:
    int64_t r[12];
    int8x8_t   a0 = vraddhn_s16(vdupq_n_s16(1UL<< 7), vdupq_n_s16(0)); r[ 0] = vget_lane_s8 (a0, 0);
    int16x4_t  a1 = vraddhn_s32(vdupq_n_s32(1UL<<15), vdupq_n_s32(0)); r[ 1] = vget_lane_s16(a1, 0);
    int32x2_t  a2 = vraddhn_s64(vdupq_n_s64(1UL<<31), vdupq_n_s64(0)); r[ 2] = vget_lane_s32(a2, 0);
    uint8x8_t  a3 = vraddhn_u16(vdupq_n_u16(1UL<< 7), vdupq_n_u16(0)); r[ 3] = vget_lane_u8 (a3, 0);
    uint16x4_t a4 = vraddhn_u32(vdupq_n_u32(1UL<<15), vdupq_n_u32(0)); r[ 4] = vget_lane_u16(a4, 0);
    uint32x2_t a5 = vraddhn_u64(vdupq_n_u64(1UL<<31), vdupq_n_u64(0)); r[ 5] = vget_lane_u32(a5, 0);
    int8x8_t   s0 = vrsubhn_s16(vdupq_n_s16(1UL<< 7), vdupq_n_s16(0)); r[ 6] = vget_lane_s8 (s0, 0);
    int16x4_t  s1 = vrsubhn_s32(vdupq_n_s32(1UL<<15), vdupq_n_s32(0)); r[ 7] = vget_lane_s16(s1, 0);
    int32x2_t  s2 = vrsubhn_s64(vdupq_n_s64(1UL<<31), vdupq_n_s64(0)); r[ 8] = vget_lane_s32(s2, 0);
    uint8x8_t  s3 = vrsubhn_u16(vdupq_n_u16(1UL<< 7), vdupq_n_u16(0)); r[ 9] = vget_lane_u8 (s3, 0);
    uint16x4_t s4 = vrsubhn_u32(vdupq_n_u32(1UL<<15), vdupq_n_u32(0)); r[10] = vget_lane_u16(s4, 0);
    uint32x2_t s5 = vrsubhn_u64(vdupq_n_u64(1UL<<31), vdupq_n_u64(0)); r[11] = vget_lane_u32(s5, 0);
    for(unsigned i = 0; i < 12; i++) {
        assert(r[i] == 1);
    }
diff --git a/NEON_2_SSE.h b/NEON_2_SSE.h
index eda9e0a..4cc0617 100644
--- a/NEON_2_SSE.h
+++ b/NEON_2_SSE.h
@@ -3452,7 +3452,7 @@
     int8x8_t res64;
     __m128i sum, mask1;
     sum = _mm_add_epi16 (a, b);
-    mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
+    mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
     sum = _mm_srai_epi16 (sum, 8); //get high half
     sum = _mm_add_epi16 (sum, mask1); //actual rounding
@@ -3467,7 +3467,7 @@
     int16x4_t res64;
     __m128i sum, mask1;
     sum = _mm_add_epi32 (a, b);
-    mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
+    mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
     sum = _mm_srai_epi32 (sum, 16); //get high half
     sum = _mm_add_epi32 (sum, mask1); //actual rounding
@@ -3482,9 +3482,9 @@
     int32x2_t res64;
     __m128i sum, mask1;
     sum = _mm_add_epi64 (a, b);
-    mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
-    mask1 = _mm_srli_epi64(mask1,32); //get  31-th bit 1 or zero
-    sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
+    mask1 = _mm_slli_epi64(sum, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
+    sum = _mm_add_epi32 (sum, mask1); //actual high half rounding
     sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (1 << 4) | (3 << 6));
     return64(sum);
 }
@@ -3495,7 +3495,7 @@
     uint8x8_t res64;
     __m128i sum, mask1;
     sum = _mm_add_epi16 (a, b);
-    mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
+    mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
     sum = _mm_srai_epi16 (sum, 8); //get high half
     sum = _mm_add_epi16 (sum, mask1); //actual rounding
@@ -3510,7 +3510,7 @@
     uint16x4_t res64;
     __m128i sum, mask1;
     sum = _mm_add_epi32 (a, b);
-    mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
+    mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
     sum = _mm_srai_epi32 (sum, 16); //get high half
     sum = _mm_add_epi32 (sum, mask1); //actual rounding
@@ -4919,7 +4919,7 @@
     int8x8_t res64;
     __m128i sub, mask1;
     sub = _mm_sub_epi16 (a, b);
-    mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
+    mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
     sub = _mm_srai_epi16 (sub, 8); //get high half
     sub = _mm_add_epi16 (sub, mask1); //actual rounding
@@ -4934,7 +4934,7 @@
     int16x4_t res64;
     __m128i sub, mask1;
     sub = _mm_sub_epi32 (a, b);
-    mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
+    mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
     sub = _mm_srai_epi32 (sub, 16); //get high half
     sub = _mm_add_epi32 (sub, mask1); //actual rounding
@@ -4949,8 +4949,8 @@
     int32x2_t res64;
     __m128i sub, mask1;
     sub = _mm_sub_epi64 (a, b);
-    mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
-    mask1 = _mm_srli_epi64(mask1,32); //get  31-th bit 1 or zero
-    sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
+    mask1 = _mm_slli_epi64(sub, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
+    sub = _mm_add_epi32 (sub, mask1); //actual high half rounding; epi32 so garbage in mask1's low dword cannot carry into the high dword
     sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
     return64(sub);
@@ -4962,7 +4962,7 @@
     uint8x8_t res64;
     __m128i sub, mask1;
     sub = _mm_sub_epi16 (a, b);
-    mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
+    mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
     sub = _mm_srai_epi16 (sub, 8); //get high half
     sub = _mm_add_epi16 (sub, mask1); //actual rounding
@@ -4977,7 +4977,7 @@
     uint16x4_t res64;
     __m128i sub, mask1;
     sub = _mm_sub_epi32 (a, b);
-    mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
+    mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
     sub = _mm_srai_epi32 (sub, 16); //get high half
     sub = _mm_add_epi32 (sub, mask1); //actual rounding