/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
 * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifdef PPC_VMX_ADLER32
#include <altivec.h>
#include "zbuild.h"
#include "adler32_p.h"
#define vmx_zero() (vec_splat_u32(0))
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
    unsigned int i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}
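/* Accumulate `len` 16-byte blocks of `buf` into the component sums. `s`
 * points at a 16-byte-aligned pair {adler, sum2} whose two following
 * elements must be zero; the (unreduced) sums are written back to s[0]
 * and s[1]. This contract is inferred from the call sites below. */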
static void vmx_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
    /* Different taps for the separable components of sums */
    const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
    const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
    const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
    const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
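    /* Why the taps run 64..1: over a 64-byte block, the byte at offset j is
     * added into the low sum at step j and then carried into the high sum on
     * each of the remaining steps, so it contributes (64 - j) times:
     *     B += 64 * A_prev + sum_{j=0..63} (64 - j) * d[j]
     * t0..t3 hold those per-byte weights, and vec_msum applies them 16
     * bytes at a time. */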
    /* As silly and inefficient as it seems, creating one permutation vector to permute
     * a two-element vector from a single load + a subsequent shift is just barely faster
     * than doing two indexed insertions into zero-initialized vectors from unaligned memory. */
    const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
    vector unsigned int adacc, s2acc;
    vector unsigned int pair_vec = vec_ld(0, s);
    adacc = vec_perm(pair_vec, pair_vec, s0_perm);
    s2acc = vec_slo(pair_vec, shift_vec);
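    /* A worked note on the two lines above (derived from the constants, not
     * from upstream docs): pair_vec loads {A, B, 0, 0}. s0_perm keeps bytes
     * 0-3 (A) in element 0 and broadcasts the zeroed byte 8 elsewhere, so
     * adacc = {A, 0, 0, 0}. shift_vec's bytes all hold 8 << 2 == 32, which
     * vec_slo reads as a 4-byte left shift, so s2acc = {B, 0, 0, 0}. */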
    vector unsigned int zero = vmx_zero();
    vector unsigned int s3acc = zero;
    vector unsigned int s3acc_0 = zero;
    vector unsigned int adacc_prev = adacc;
    vector unsigned int adacc_prev_0 = zero;
    vector unsigned int s2acc_0 = zero;
    vector unsigned int s2acc_1 = zero;
    vector unsigned int s2acc_2 = zero;
    /* Maintain a running sum for the second half; this might help us break
     * yet another data dependency bubble in the sum */
    vector unsigned int adacc_0 = zero;
    int num_iter = len / 4;
    int rem = len & 3;
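    /* Note: len here counts 16-byte blocks, not bytes. The main loop below
     * consumes four blocks (64 bytes) per iteration, matching the 64..1
     * weights in t0..t3, and rem is the number of leftover blocks. */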
    for (int i = 0; i < num_iter; ++i) {
        vector unsigned char d0 = vec_ld(0, buf);
        vector unsigned char d1 = vec_ld(16, buf);
        vector unsigned char d2 = vec_ld(32, buf);
        vector unsigned char d3 = vec_ld(48, buf);
        /* The core operation of the loop, basically
         * what is being unrolled below */
        adacc = vec_sum4s(d0, adacc);
        s3acc = vec_add(s3acc, adacc_prev);
        s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
        s2acc = vec_msum(t0, d0, s2acc);
        /* interleave dependent sums in here */
        adacc_0 = vec_sum4s(d1, adacc_0);
        s2acc_0 = vec_msum(t1, d1, s2acc_0);
        adacc = vec_sum4s(d2, adacc);
        s2acc_1 = vec_msum(t2, d2, s2acc_1);
        s2acc_2 = vec_msum(t3, d3, s2acc_2);
        adacc_0 = vec_sum4s(d3, adacc_0);
        adacc_prev = adacc;
        adacc_prev_0 = adacc_0;
        buf += 64;
    }
    adacc = vec_add(adacc, adacc_0);
    s3acc = vec_add(s3acc, s3acc_0);
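    /* s3acc collected one snapshot of the running byte sum per 64-byte
     * iteration; each snapshot feeds the second sum 64 times over the
     * following block, hence the scale by 64 (the << 6 below). */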
    s3acc = vec_sl(s3acc, vec_splat_u32(6));
    if (rem) {
        adacc_prev = vec_add(adacc_prev_0, adacc_prev);
        adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
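        /* From here on we advance 16 bytes at a time, so each snapshot of
         * the running byte sum recurs only 16 times in the second sum;
         * that is why the shift drops from 6 (x64) to 4 (x16). */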
        while (rem--) {
            vector unsigned char d0 = vec_ld(0, buf);
            adacc = vec_sum4s(d0, adacc);
            s3acc = vec_add(s3acc, adacc_prev);
            s2acc = vec_msum(t3, d0, s2acc);
            adacc_prev = vec_sl(adacc, vec_splat_u32(4));
            buf += 16;
        }
    }
    /* Sum up independent second sums */
    s2acc = vec_add(s2acc, s2acc_0);
    s2acc_2 = vec_add(s2acc_1, s2acc_2);
    s2acc = vec_add(s2acc, s2acc_2);
    s2acc = vec_add(s2acc, s3acc);
    adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
    adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
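    /* The two rotate-and-add passes above fold the four 32-bit lanes of
     * each accumulator together, leaving the full sum in every lane, so
     * storing any single element below yields the scalar result. */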
    vec_ste(adacc, 0, s);
    vec_ste(s2acc, 0, s+1);
}
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, uint64_t len) {
    uint32_t sum2;
    uint32_t pair[16] ALIGNED_(16);
    /* Zero the 14 remaining elements; memset counts bytes, not uint32_ts,
     * and vmx_accum32 relies on pair[2] and pair[3] starting out zero. */
    memset(&pair[2], 0, 14 * sizeof(uint32_t));
    int n = NMAX;
    unsigned int done = 0, i;
    /* Split Adler-32 into its component sums; a previously computed value
     * can be supplied by caller sites (e.g. when checksumming a PNG file). */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;
    pair[0] = adler;
    pair[1] = sum2;
    /* in case the user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);
    /* return the initial Adler-32 value (this check is deferred until
     * after the len == 1 fast path for speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;
    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);
    /* Align buffer */
    unsigned int al = 0;
    if ((uintptr_t)buf & 0xf) {
        al = 16 - ((uintptr_t)buf & 0xf);
        if (al > len) {
            al = len;
        }
        vmx_handle_head_or_tail(pair, buf, al);
        done += al;
        /* Rather than rebasing, we can reduce the max sums for the
         * first round only */
        n -= al;
    }
    for (i = al; i < len; i += n) {
        int remaining = (int)(len - i);
        n = MIN(remaining, (i == al) ? n : NMAX);
        if (n < 16)
            break;
        vmx_accum32(pair, buf + i, n / 16);
        pair[0] %= BASE;
        pair[1] %= BASE;
        done += (n / 16) * 16;
    }
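    /* Reducing mod BASE once per round is safe because NMAX is the largest
     * byte count for which the running 32-bit sums cannot overflow. */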
    /* Handle the tail elements. */
    if (done < len) {
        vmx_handle_head_or_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }
    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}
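/* A minimal usage sketch (hypothetical caller; assumes this file is built
 * with PPC_VMX_ADLER32 defined and AltiVec enabled). Adler-32 composes
 * across chunks, so streaming callers can feed data incrementally:
 *
 *     uint32_t adler = 1;                        // Adler-32 initial value
 *     adler = adler32_vmx(adler, chunk1, len1);  // chunk1/len1 and
 *     adler = adler32_vmx(adler, chunk2, len2);  // chunk2/len2 are
 *                                                // hypothetical buffers
 */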
#endif