linker/arch/arm_neon/linker_gnu_hash_neon.cpp - third_party/android.googlesource.com/platform/bionic - Git at Google

 /*
  * Copyright (C) 2019 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *  * Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *  * Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */

 // A Neon vectorized implementation of the GNU symbol hash function.

 // This function generally accesses beyond the bounds of the name string. Specifically, it reads
 // each aligned 8-byte chunk containing a byte of the string, including the final NUL byte. This
 // should be acceptable for use with MTE, which uses 16-byte granules. Typically, the function is
 // used to hash strings in an ELF file's string table, where MTE is presumably unaware of the
 // bounds of each symbol, but the linker also hashes the symbol name passed to dlsym.

 #include "linker_gnu_hash_neon.h"

 #include <arm_neon.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>

 struct __attribute__((aligned(8))) GnuHashInitEntry {
   uint64_t ignore_mask;
   uint32_t accum;
 };

 constexpr uint32_t kStep0 = 1;
 constexpr uint32_t kStep1 = kStep0 * 33;
 constexpr uint32_t kStep2 = kStep1 * 33;
 constexpr uint32_t kStep3 = kStep2 * 33;
 constexpr uint32_t kStep4 = kStep3 * 33;
 constexpr uint32_t kStep5 = kStep4 * 33;
 constexpr uint32_t kStep6 = kStep5 * 33;
 constexpr uint32_t kStep7 = kStep6 * 33;
 constexpr uint32_t kStep8 = kStep7 * 33;
 constexpr uint32_t kStep9 = kStep8 * 33;
 constexpr uint32_t kStep10 = kStep9 * 33;
 constexpr uint32_t kStep11 = kStep10 * 33;

 // Step by -1 through -7:  33 * 0x3e0f83e1 == 1 (mod 2**32)
 constexpr uint32_t kStepN1 = kStep0 * 0x3e0f83e1;
 constexpr uint32_t kStepN2 = kStepN1 * 0x3e0f83e1;
 constexpr uint32_t kStepN3 = kStepN2 * 0x3e0f83e1;
 constexpr uint32_t kStepN4 = kStepN3 * 0x3e0f83e1;
 constexpr uint32_t kStepN5 = kStepN4 * 0x3e0f83e1;
 constexpr uint32_t kStepN6 = kStepN5 * 0x3e0f83e1;
 constexpr uint32_t kStepN7 = kStepN6 * 0x3e0f83e1;

 // Calculate the GNU hash and string length of the symbol name.
 //
 // The hash calculation is an optimized version of this function:
 //
 //    uint32_t calculate_gnu_hash(const uint8_t* name) {
 //      uint32_t h = 5381;
 //      for (; *name != '\0'; ++name) {
 //        h *= 33;
 //        h += *name;
 //      }
 //      return h;
 //    }
 //
 // This does an within-alignment out-of-bounds read for performance reasons.
 __attribute__((no_sanitize("hwaddress")))
 std::pair<uint32_t, uint32_t> calculate_gnu_hash_neon(const char* name) {

   // The input string may be misaligned by 0-7 bytes (K). This function loads the first aligned
   // 8-byte chunk, then counteracts the misalignment:
   //  - The initial K bytes are set to 0xff in the working chunk vector.
   //  - The accumulator is initialized to 5381 * modinv(33)**K.
   //  - The accumulator also cancels out each initial 0xff byte.
   // If we could set bytes to NUL instead, then the accumulator wouldn't need to cancel out the
   // 0xff values, but this would break the NUL check.

   static const struct GnuHashInitEntry kInitTable[] = {
     { // (addr&7) == 0
       0ull,
       5381u*kStep0,
     }, { // (addr&7) == 1
       0xffull,
       5381u*kStepN1 - 0xffu*kStepN1,
     }, { // (addr&7) == 2
       0xffffull,
       5381u*kStepN2 - 0xffu*kStepN1 - 0xffu*kStepN2,
     }, { // (addr&7) == 3
       0xffffffull,
       5381u*kStepN3 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3,
     }, { // (addr&7) == 4
       0xffffffffull,
       5381u*kStepN4 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4,
     }, { // (addr&7) == 5
       0xffffffffffull,
       5381u*kStepN5 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5,
     }, { // (addr&7) == 6
       0xffffffffffffull,
       5381u*kStepN6 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5 - 0xffu*kStepN6,
     }, { // (addr&7) == 7
       0xffffffffffffffull,
       5381u*kStepN7 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5 - 0xffu*kStepN6 - 0xffu*kStepN7,
     },
   };

   uint8_t offset = reinterpret_cast<uintptr_t>(name) & 7;
   const uint64_t* chunk_ptr = reinterpret_cast<const uint64_t*>(reinterpret_cast<uintptr_t>(name) & ~7);
   const struct GnuHashInitEntry* entry = &kInitTable[offset];

   uint8x8_t chunk = vld1_u8(reinterpret_cast<const uint8_t*>(chunk_ptr));
   chunk |= vld1_u8(reinterpret_cast<const uint8_t*>(&entry->ignore_mask));

   uint32x4_t accum_lo = { 0 };
   uint32x4_t accum_hi = { entry->accum, 0, 0, 0 };
   const uint16x4_t kInclineVec = { kStep3, kStep2, kStep1, kStep0 };
   const uint32x4_t kStep8Vec = vdupq_n_u32(kStep8);
   uint8x8_t is_nul;
   uint16x8_t expand;

   while (1) {
     // Exit the loop if any of the 8 bytes is NUL.
     is_nul = vceq_u8(chunk, (uint8x8_t){ 0 });
     expand = vmovl_u8(chunk);
     uint64x1_t is_nul_64 = vreinterpret_u64_u8(is_nul);
     if (vget_lane_u64(is_nul_64, 0)) break;

     // Multiply both accumulators by 33**8.
     accum_lo = vmulq_u32(accum_lo, kStep8Vec);
     accum_hi = vmulq_u32(accum_hi, kStep8Vec);

     // Multiply each 4-piece subchunk by (33**3, 33**2, 33*1, 1), then accumulate the result. The lo
     // accumulator will be behind by 33**4 until the very end of the computation.
     accum_lo = vmlal_u16(accum_lo, vget_low_u16(expand), kInclineVec);
     accum_hi = vmlal_u16(accum_hi, vget_high_u16(expand), kInclineVec);

     // Load the next chunk.
     chunk = vld1_u8(reinterpret_cast<const uint8_t*>(++chunk_ptr));
   }

   // Reverse the is-NUL vector so we can use clz to count the number of remaining bytes.
   is_nul = vrev64_u8(is_nul);
   const uint64_t is_nul_u64 = vget_lane_u64(vreinterpret_u64_u8(is_nul), 0);
   const uint32_t num_valid_bits = __builtin_clzll(is_nul_u64);

   const uint32_t name_len = reinterpret_cast<const char*>(chunk_ptr) - name + (num_valid_bits >> 3);

   static const uint32_t kFinalStepTable[] = {
     kStep4, kStep0,   // 0 remaining bytes
     kStep5, kStep1,   // 1 remaining byte
     kStep6, kStep2,   // 2 remaining bytes
     kStep7, kStep3,   // 3 remaining bytes
     kStep8, kStep4,   // 4 remaining bytes
     kStep9, kStep5,   // 5 remaining bytes
     kStep10, kStep6,  // 6 remaining bytes
     kStep11, kStep7,  // 7 remaining bytes
   };

   // Advance the lo/hi accumulators appropriately for the number of remaining bytes. Multiply 33**4
   // into the lo accumulator to catch it up with the hi accumulator.
   const uint32_t* final_step = &kFinalStepTable[num_valid_bits >> 2];
   accum_lo = vmulq_u32(accum_lo, vdupq_n_u32(final_step[0]));
   accum_lo = vmlaq_u32(accum_lo, accum_hi, vdupq_n_u32(final_step[1]));

   static const uint32_t kFinalInclineTable[] = {
     0,      kStep6, kStep5, kStep4, kStep3, kStep2, kStep1, kStep0,
     0,      0,      0,      0,      0,      0,      0,      0,
   };

   // Prepare a vector to multiply powers of 33 into each of the remaining bytes.
   const uint32_t* const incline = &kFinalInclineTable[8 - (num_valid_bits >> 3)];
   const uint32x4_t incline_lo = vld1q_u32(incline);
   const uint32x4_t incline_hi = vld1q_u32(incline + 4);

   // Multiply 33 into each of the remaining 4-piece vectors, then accumulate everything into
   // accum_lo. Combine everything into a single 32-bit result.
   accum_lo = vmlaq_u32(accum_lo, vmovl_u16(vget_low_u16(expand)), incline_lo);
   accum_lo = vmlaq_u32(accum_lo, vmovl_u16(vget_high_u16(expand)), incline_hi);

   uint32x2_t sum = vadd_u32(vget_low_u32(accum_lo), vget_high_u32(accum_lo));
   const uint32_t hash = sum[0] + sum[1];

   return { hash, name_len };
 }
	/*
	* Copyright (C) 2019 The Android Open Source Project
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*/

	// A Neon vectorized implementation of the GNU symbol hash function.

	// This function generally accesses beyond the bounds of the name string. Specifically, it reads
	// each aligned 8-byte chunk containing a byte of the string, including the final NUL byte. This
	// should be acceptable for use with MTE, which uses 16-byte granules. Typically, the function is
	// used to hash strings in an ELF file's string table, where MTE is presumably unaware of the
	// bounds of each symbol, but the linker also hashes the symbol name passed to dlsym.

	#include "linker_gnu_hash_neon.h"

	#include <arm_neon.h>
	#include <stdio.h>
	#include <stdint.h>
	#include <stdlib.h>

	struct __attribute__((aligned(8))) GnuHashInitEntry {
	uint64_t ignore_mask;
	uint32_t accum;
	};

	constexpr uint32_t kStep0 = 1;
	constexpr uint32_t kStep1 = kStep0 * 33;
	constexpr uint32_t kStep2 = kStep1 * 33;
	constexpr uint32_t kStep3 = kStep2 * 33;
	constexpr uint32_t kStep4 = kStep3 * 33;
	constexpr uint32_t kStep5 = kStep4 * 33;
	constexpr uint32_t kStep6 = kStep5 * 33;
	constexpr uint32_t kStep7 = kStep6 * 33;
	constexpr uint32_t kStep8 = kStep7 * 33;
	constexpr uint32_t kStep9 = kStep8 * 33;
	constexpr uint32_t kStep10 = kStep9 * 33;
	constexpr uint32_t kStep11 = kStep10 * 33;

	// Step by -1 through -7: 33 * 0x3e0f83e1 == 1 (mod 2**32)
	constexpr uint32_t kStepN1 = kStep0 * 0x3e0f83e1;
	constexpr uint32_t kStepN2 = kStepN1 * 0x3e0f83e1;
	constexpr uint32_t kStepN3 = kStepN2 * 0x3e0f83e1;
	constexpr uint32_t kStepN4 = kStepN3 * 0x3e0f83e1;
	constexpr uint32_t kStepN5 = kStepN4 * 0x3e0f83e1;
	constexpr uint32_t kStepN6 = kStepN5 * 0x3e0f83e1;
	constexpr uint32_t kStepN7 = kStepN6 * 0x3e0f83e1;

	// Calculate the GNU hash and string length of the symbol name.
	//
	// The hash calculation is an optimized version of this function:
	//
	// uint32_t calculate_gnu_hash(const uint8_t* name) {
	// uint32_t h = 5381;
	// for (; *name != '\0'; ++name) {
	// h *= 33;
	// h += *name;
	// }
	// return h;
	// }
	//
	// This does an within-alignment out-of-bounds read for performance reasons.
	__attribute__((no_sanitize("hwaddress")))
	std::pair<uint32_t, uint32_t> calculate_gnu_hash_neon(const char* name) {

	// The input string may be misaligned by 0-7 bytes (K). This function loads the first aligned
	// 8-byte chunk, then counteracts the misalignment:
	// - The initial K bytes are set to 0xff in the working chunk vector.
	// - The accumulator is initialized to 5381 * modinv(33)**K.
	// - The accumulator also cancels out each initial 0xff byte.
	// If we could set bytes to NUL instead, then the accumulator wouldn't need to cancel out the
	// 0xff values, but this would break the NUL check.

	static const struct GnuHashInitEntry kInitTable[] = {
	{ // (addr&7) == 0
	0ull,
	5381u*kStep0,
	}, { // (addr&7) == 1
	0xffull,
	5381ukStepN1 - 0xffukStepN1,
	}, { // (addr&7) == 2
	0xffffull,
	5381ukStepN2 - 0xffukStepN1 - 0xffu*kStepN2,
	}, { // (addr&7) == 3
	0xffffffull,
	5381ukStepN3 - 0xffukStepN1 - 0xffukStepN2 - 0xffukStepN3,
	}, { // (addr&7) == 4
	0xffffffffull,
	5381ukStepN4 - 0xffukStepN1 - 0xffukStepN2 - 0xffukStepN3 - 0xffu*kStepN4,
	}, { // (addr&7) == 5
	0xffffffffffull,
	5381ukStepN5 - 0xffukStepN1 - 0xffukStepN2 - 0xffukStepN3 - 0xffukStepN4 - 0xffukStepN5,
	}, { // (addr&7) == 6
	0xffffffffffffull,
	5381ukStepN6 - 0xffukStepN1 - 0xffukStepN2 - 0xffukStepN3 - 0xffukStepN4 - 0xffukStepN5 - 0xffu*kStepN6,
	}, { // (addr&7) == 7
	0xffffffffffffffull,
	5381ukStepN7 - 0xffukStepN1 - 0xffukStepN2 - 0xffukStepN3 - 0xffukStepN4 - 0xffukStepN5 - 0xffukStepN6 - 0xffukStepN7,
	},
	};

	uint8_t offset = reinterpret_cast<uintptr_t>(name) & 7;
	const uint64_t* chunk_ptr = reinterpret_cast<const uint64_t*>(reinterpret_cast<uintptr_t>(name) & ~7);
	const struct GnuHashInitEntry* entry = &kInitTable[offset];

	uint8x8_t chunk = vld1_u8(reinterpret_cast<const uint8_t*>(chunk_ptr));
	chunk \|= vld1_u8(reinterpret_cast<const uint8_t*>(&entry->ignore_mask));

	uint32x4_t accum_lo = { 0 };
	uint32x4_t accum_hi = { entry->accum, 0, 0, 0 };
	const uint16x4_t kInclineVec = { kStep3, kStep2, kStep1, kStep0 };
	const uint32x4_t kStep8Vec = vdupq_n_u32(kStep8);
	uint8x8_t is_nul;
	uint16x8_t expand;

	while (1) {
	// Exit the loop if any of the 8 bytes is NUL.
	is_nul = vceq_u8(chunk, (uint8x8_t){ 0 });
	expand = vmovl_u8(chunk);
	uint64x1_t is_nul_64 = vreinterpret_u64_u8(is_nul);
	if (vget_lane_u64(is_nul_64, 0)) break;

	// Multiply both accumulators by 33**8.
	accum_lo = vmulq_u32(accum_lo, kStep8Vec);
	accum_hi = vmulq_u32(accum_hi, kStep8Vec);

	// Multiply each 4-piece subchunk by (333, 332, 33*1, 1), then accumulate the result. The lo
	// accumulator will be behind by 33**4 until the very end of the computation.
	accum_lo = vmlal_u16(accum_lo, vget_low_u16(expand), kInclineVec);
	accum_hi = vmlal_u16(accum_hi, vget_high_u16(expand), kInclineVec);

	// Load the next chunk.
	chunk = vld1_u8(reinterpret_cast<const uint8_t*>(++chunk_ptr));
	}

	// Reverse the is-NUL vector so we can use clz to count the number of remaining bytes.
	is_nul = vrev64_u8(is_nul);
	const uint64_t is_nul_u64 = vget_lane_u64(vreinterpret_u64_u8(is_nul), 0);
	const uint32_t num_valid_bits = __builtin_clzll(is_nul_u64);

	const uint32_t name_len = reinterpret_cast<const char*>(chunk_ptr) - name + (num_valid_bits >> 3);

	static const uint32_t kFinalStepTable[] = {
	kStep4, kStep0, // 0 remaining bytes
	kStep5, kStep1, // 1 remaining byte
	kStep6, kStep2, // 2 remaining bytes
	kStep7, kStep3, // 3 remaining bytes
	kStep8, kStep4, // 4 remaining bytes
	kStep9, kStep5, // 5 remaining bytes
	kStep10, kStep6, // 6 remaining bytes
	kStep11, kStep7, // 7 remaining bytes
	};

	// Advance the lo/hi accumulators appropriately for the number of remaining bytes. Multiply 33**4
	// into the lo accumulator to catch it up with the hi accumulator.
	const uint32_t* final_step = &kFinalStepTable[num_valid_bits >> 2];
	accum_lo = vmulq_u32(accum_lo, vdupq_n_u32(final_step[0]));
	accum_lo = vmlaq_u32(accum_lo, accum_hi, vdupq_n_u32(final_step[1]));

	static const uint32_t kFinalInclineTable[] = {
	0, kStep6, kStep5, kStep4, kStep3, kStep2, kStep1, kStep0,
	0, 0, 0, 0, 0, 0, 0, 0,
	};

	// Prepare a vector to multiply powers of 33 into each of the remaining bytes.
	const uint32_t* const incline = &kFinalInclineTable[8 - (num_valid_bits >> 3)];
	const uint32x4_t incline_lo = vld1q_u32(incline);
	const uint32x4_t incline_hi = vld1q_u32(incline + 4);

	// Multiply 33 into each of the remaining 4-piece vectors, then accumulate everything into
	// accum_lo. Combine everything into a single 32-bit result.
	accum_lo = vmlaq_u32(accum_lo, vmovl_u16(vget_low_u16(expand)), incline_lo);
	accum_lo = vmlaq_u32(accum_lo, vmovl_u16(vget_high_u16(expand)), incline_hi);

	uint32x2_t sum = vadd_u32(vget_low_u32(accum_lo), vget_high_u32(accum_lo));
	const uint32_t hash = sum[0] + sum[1];

	return { hash, name_len };
	}