| /************************************************************************** |
| * |
| * Copyright 2010 VMware, Inc. |
| * All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the |
| * "Software"), to deal in the Software without restriction, including |
| * without limitation the rights to use, copy, modify, merge, publish, |
| * distribute, sub license, and/or sell copies of the Software, and to |
| * permit persons to whom the Software is furnished to do so, subject to |
| * the following conditions: |
| * |
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
| * |
| **************************************************************************/ |
| |
| |
| #include "util/u_debug.h" |
| #include "util/u_cpu_detect.h" |
| #include "util/u_math.h" |
| #include "lp_bld_debug.h" |
| #include "lp_bld_const.h" |
| #include "lp_bld_format.h" |
| #include "lp_bld_gather.h" |
| #include "lp_bld_swizzle.h" |
| #include "lp_bld_type.h" |
| #include "lp_bld_init.h" |
| #include "lp_bld_intr.h" |
| #include "lp_bld_pack.h" |
| |
| |
| /** |
| * Get the pointer to one element from scatter positions in memory. |
| * |
| * @sa lp_build_gather() |
| */ |
| LLVMValueRef |
| lp_build_gather_elem_ptr(struct gallivm_state *gallivm, |
| unsigned length, |
| LLVMValueRef base_ptr, |
| LLVMValueRef offsets, |
| unsigned i) |
| { |
| LLVMValueRef offset; |
| LLVMValueRef ptr; |
| |
| assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); |
| |
| if (length == 1) { |
| assert(i == 0); |
| offset = offsets; |
| } else { |
| LLVMValueRef index = lp_build_const_int32(gallivm, i); |
| offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, ""); |
| } |
| |
| ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, ""); |
| |
| return ptr; |
| } |
| |
| |
| /** |
| * Gather one element from scatter positions in memory. |
| * |
| * @sa lp_build_gather() |
| */ |
| LLVMValueRef |
| lp_build_gather_elem(struct gallivm_state *gallivm, |
| unsigned length, |
| unsigned src_width, |
| unsigned dst_width, |
| boolean aligned, |
| LLVMValueRef base_ptr, |
| LLVMValueRef offsets, |
| unsigned i, |
| boolean vector_justify) |
| { |
| LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width); |
| LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); |
| LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width); |
| LLVMValueRef ptr; |
| LLVMValueRef res; |
| |
| assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); |
| |
| ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); |
| ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); |
| res = LLVMBuildLoad(gallivm->builder, ptr, ""); |
| |
   /* XXX
    * On some archs we probably really want to avoid having to deal
    * with alignments lower than 4 bytes (if fetch size is a power of
    * two >= 32). On x86 it doesn't matter, however.
    * We should be able to guarantee full alignment for any kind of texture
    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends,
    * but I don't think that's quite what we want).
    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
    * looks like a good fit, but it seems this cap bit (and OpenGL) don't
    * enforce what we want (which is what d3d10 does: the offset must be
    * aligned to the element size, whereas GL specifies bytes regardless of
    * element size, which would only leave us with a minimum alignment
    * restriction of 16, and that doesn't make much sense if the type isn't
    * 4x32bit). Due to the translation of offsets to first_elem in
    * sampler_views it actually seems gallium could not do anything else
    * except 16 no matter what...
    */
| if (!aligned) { |
| LLVMSetAlignment(res, 1); |
| } else if (!util_is_power_of_two_or_zero(src_width)) { |
| /* |
| * Full alignment is impossible, assume the caller really meant |
| * the individual elements were aligned (e.g. 3x32bit format). |
| * And yes the generated code may otherwise crash, llvm will |
| * really assume 128bit alignment with a 96bit fetch (I suppose |
| * that makes sense as it can just assume the upper 32bit to be |
| * whatever). |
| * Maybe the caller should be able to explicitly set this, but |
| * this should cover all the 3-channel formats. |
| */ |
| if (((src_width / 24) * 24 == src_width) && |
| util_is_power_of_two_or_zero(src_width / 24)) { |
| LLVMSetAlignment(res, src_width / 24); |
| } else { |
| LLVMSetAlignment(res, 1); |
| } |
| } |
| |
| assert(src_width <= dst_width); |
| if (src_width < dst_width) { |
| res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); |
| if (vector_justify) { |
| #ifdef PIPE_ARCH_BIG_ENDIAN |
| res = LLVMBuildShl(gallivm->builder, res, |
| LLVMConstInt(dst_elem_type, dst_width - src_width, 0), ""); |
| #endif |
| } |
| } |
| |
| return res; |
| } |
| |
| |
| /** |
| * Gather one element from scatter positions in memory. |
 * Nearly the same as above, but the individual elements may themselves
 * be vectors, and the fetch type may be float.
 * Can also pad the vector instead of using ZExt.
| * |
| * @sa lp_build_gather() |
| */ |
| static LLVMValueRef |
| lp_build_gather_elem_vec(struct gallivm_state *gallivm, |
| unsigned length, |
| unsigned src_width, |
| LLVMTypeRef src_type, |
| struct lp_type dst_type, |
| boolean aligned, |
| LLVMValueRef base_ptr, |
| LLVMValueRef offsets, |
| unsigned i, |
| boolean vector_justify) |
| { |
| LLVMValueRef ptr, res; |
| LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); |
| assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); |
| |
| ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); |
| ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); |
| res = LLVMBuildLoad(gallivm->builder, ptr, ""); |
| |
   /*
    * See the lengthy XXX comment about alignment restrictions in
    * lp_build_gather_elem() above; the same considerations apply here.
    */
| if (!aligned) { |
| LLVMSetAlignment(res, 1); |
| } else if (!util_is_power_of_two_or_zero(src_width)) { |
| /* |
| * Full alignment is impossible, assume the caller really meant |
| * the individual elements were aligned (e.g. 3x32bit format). |
| * And yes the generated code may otherwise crash, llvm will |
| * really assume 128bit alignment with a 96bit fetch (I suppose |
| * that makes sense as it can just assume the upper 32bit to be |
| * whatever). |
| * Maybe the caller should be able to explicitly set this, but |
| * this should cover all the 3-channel formats. |
| */ |
| if (((src_width / 24) * 24 == src_width) && |
| util_is_power_of_two_or_zero(src_width / 24)) { |
| LLVMSetAlignment(res, src_width / 24); |
| } else { |
| LLVMSetAlignment(res, 1); |
| } |
| } |
| |
| assert(src_width <= dst_type.width * dst_type.length); |
| if (src_width < dst_type.width * dst_type.length) { |
| if (dst_type.length > 1) { |
| res = lp_build_pad_vector(gallivm, res, dst_type.length); |
| /* |
| * vector_justify hopefully a non-issue since we only deal |
| * with src_width >= 32 here? |
| */ |
| } else { |
| LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type); |
| |
| /* |
| * Only valid if src_ptr_type is int type... |
| */ |
| res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); |
| |
| #ifdef PIPE_ARCH_BIG_ENDIAN |
| if (vector_justify) { |
| res = LLVMBuildShl(gallivm->builder, res, |
| LLVMConstInt(dst_elem_type, |
| dst_type.width - src_width, 0), ""); |
| } |
| if (src_width == 48) { |
| /* Load 3x16 bit vector. |
| * The sequence of loads on big-endian hardware proceeds as follows. |
| * 16-bit fields are denoted by X, Y, Z, and 0. In memory, the sequence |
| * of three fields appears in the order X, Y, Z. |
| * |
| * Load 32-bit word: 0.0.X.Y |
| * Load 16-bit halfword: 0.0.0.Z |
| * Rotate left: 0.X.Y.0 |
| * Bitwise OR: 0.X.Y.Z |
| * |
| * The order in which we need the fields in the result is 0.Z.Y.X, |
| * the same as on little-endian; permute 16-bit fields accordingly |
| * within 64-bit register: |
| */ |
| LLVMValueRef shuffles[4] = { |
| lp_build_const_int32(gallivm, 2), |
| lp_build_const_int32(gallivm, 1), |
| lp_build_const_int32(gallivm, 0), |
| lp_build_const_int32(gallivm, 3), |
| }; |
| res = LLVMBuildBitCast(gallivm->builder, res, |
| lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), ""); |
| res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), ""); |
| res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, ""); |
| } |
| #endif |
| } |
| } |
| return res; |
| } |
| |
| |
| |
| |
| static LLVMValueRef |
| lp_build_gather_avx2(struct gallivm_state *gallivm, |
| unsigned length, |
| unsigned src_width, |
| struct lp_type dst_type, |
| LLVMValueRef base_ptr, |
| LLVMValueRef offsets) |
| { |
| LLVMBuilderRef builder = gallivm->builder; |
| LLVMTypeRef src_type, src_vec_type; |
| LLVMValueRef res; |
| struct lp_type res_type = dst_type; |
| res_type.length *= length; |
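   /* one vector type covering all gathered elements, e.g. gathering
    * eight 32-bit floats yields an 8x32f result type */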
| |
| if (dst_type.floating) { |
| src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) : |
| LLVMFloatTypeInContext(gallivm->context); |
| } else { |
| src_type = LLVMIntTypeInContext(gallivm->context, src_width); |
| } |
| src_vec_type = LLVMVectorType(src_type, length); |
| |
| /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */ |
| assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); |
| |
| if (0) { |
| /* |
| * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but |
       * will not use the AVX2 gather intrinsics (even with llvm 4.0), at
| * least with Haswell. See |
| * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html |
| * And the generated code doing the emulation is quite a bit worse |
| * than what we get by doing it ourselves too. |
| */ |
| LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32); |
| LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length); |
| LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1); |
| LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length); |
| LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); |
| LLVMValueRef src_ptr; |
| |
| base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, ""); |
| |
| /* Rescale offsets from bytes to elements */ |
| LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0); |
| scale = lp_build_broadcast(gallivm, i32_vec_type, scale); |
| assert(LLVMTypeOf(offsets) == i32_vec_type); |
| offsets = LLVMBuildSDiv(builder, offsets, scale, ""); |
| |
| src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep"); |
| |
| char intrinsic[64]; |
| util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u", |
| length, dst_type.floating ? "f" : "i", src_width); |
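      /* e.g. "llvm.masked.gather.v8f32" for an 8-wide 32-bit float gather */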
| LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0); |
| LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type); |
| LLVMValueRef passthru = LLVMGetUndef(src_vec_type); |
| |
| LLVMValueRef args[] = { src_ptr, alignment, mask, passthru }; |
| |
| res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0); |
| } else { |
| LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8); |
| const char *intrinsic = NULL; |
| unsigned l_idx = 0; |
| |
| assert(src_width == 32 || src_width == 64); |
| if (src_width == 32) { |
| assert(length == 4 || length == 8); |
| } else { |
| assert(length == 2 || length == 4); |
| } |
| |
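      /* indexed as [dst_type.floating][src_width == 64][256-bit variant] */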
| static const char *intrinsics[2][2][2] = { |
| |
| {{"llvm.x86.avx2.gather.d.d", |
| "llvm.x86.avx2.gather.d.d.256"}, |
| {"llvm.x86.avx2.gather.d.q", |
| "llvm.x86.avx2.gather.d.q.256"}}, |
| |
| {{"llvm.x86.avx2.gather.d.ps", |
| "llvm.x86.avx2.gather.d.ps.256"}, |
| {"llvm.x86.avx2.gather.d.pd", |
| "llvm.x86.avx2.gather.d.pd.256"}}, |
| }; |
| |
| if ((src_width == 32 && length == 8) || |
| (src_width == 64 && length == 4)) { |
| l_idx = 1; |
| } |
| intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx]; |
| |
| LLVMValueRef passthru = LLVMGetUndef(src_vec_type); |
      LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
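      /*
       * The gather instructions only test the sign bit of each mask
       * element, so an all-ones constant (which LLVMConstAllOnes also
       * produces for float vectors, as an all-ones bit pattern) enables
       * every lane. Offsets are in bytes, hence scale = 1.
       */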
| LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0); |
| |
| LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale }; |
| |
| res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0); |
| } |
| res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), ""); |
| |
| return res; |
| } |
| |
| |
| /** |
| * Gather elements from scatter positions in memory into a single vector. |
| * Use for fetching texels from a texture. |
| * For SSE, typical values are length=4, src_width=32, dst_width=32. |
| * |
| * When src_width < dst_width, the return value can be justified in |
| * one of two ways: |
| * "integer justification" is used when the caller treats the destination |
| * as a packed integer bitmask, as described by the channels' "shift" and |
| * "width" fields; |
| * "vector justification" is used when the caller casts the destination |
| * to a vector and needs channel X to be in vector element 0. |
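 *
 * For example, when gathering an 8-bit value into a 32-bit element,
 * integer justification leaves the value in the low byte (bits 7..0,
 * matching the format's shift/width description), while vector
 * justification on big-endian targets shifts it into the high byte,
 * so that bitcasting the element to a byte vector puts it at index 0.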
| * |
 * @param length number of elements to gather (the length of the offsets vector)
| * @param src_width src element width in bits |
| * @param dst_type result element type (src will be expanded to fit, |
| * but truncation is not allowed) |
| * (this may be a vector, must be pot sized) |
| * @param aligned whether the data is guaranteed to be aligned (to src_width) |
| * @param base_ptr base pointer, needs to be a i8 pointer type. |
| * @param offsets vector with offsets |
| * @param vector_justify select vector rather than integer justification |
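 *
 * Usage sketch (hypothetical values): gather four 32-bit texels, given
 * byte offsets in a <4 x i32>, into one 4x32 uint vector:
 *
 *    LLVMValueRef packed =
 *       lp_build_gather(gallivm, 4, 32, lp_type_uint(32),
 *                       TRUE, base_ptr, offsets, FALSE);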
| */ |
| LLVMValueRef |
| lp_build_gather(struct gallivm_state *gallivm, |
| unsigned length, |
| unsigned src_width, |
| struct lp_type dst_type, |
| boolean aligned, |
| LLVMValueRef base_ptr, |
| LLVMValueRef offsets, |
| boolean vector_justify) |
| { |
| LLVMValueRef res; |
| boolean need_expansion = src_width < dst_type.width * dst_type.length; |
| boolean vec_fetch; |
| struct lp_type fetch_type, fetch_dst_type; |
| LLVMTypeRef src_type; |
| |
| assert(src_width <= dst_type.width * dst_type.length); |
| |
| /* |
| * This is quite a mess... |
| * Figure out if the fetch should be done as: |
| * a) scalar or vector |
| * b) float or int |
| * |
| * As an example, for a 96bit fetch expanded into 4x32bit, it is better |
| * to use (3x32bit) vector type (then pad the vector). Otherwise, the |
| * zext will cause extra instructions. |
    * However, the same isn't true for 3x16bit (the codegen for that is
    * completely worthless on x86 simd, and for 3x8bit it is way worse
    * still, don't try that... (To get really good code out of llvm for
    * these cases, the only way is to decompose the fetches manually
    * into 1x32bit/1x16bit, or 1x16bit/1x8bit respectively, although the
    * latter case requires sse41, otherwise simple scalar zext is way
    * better. But probably not important enough, so don't bother.)
    * Also, we try to honor the floating bit of the destination (but this
    * isn't possible if the caller asks for instance for a 2x32bit
    * dst_type with a 48bit fetch: the idea would be to use a 3x16bit
    * fetch, pad and cast to a 2x32f type, so the fetch is always int and
    * on top of that we avoid the vec pad and use scalar zext due to the
    * above mentioned issue).
| * Note this is optimized for x86 sse2 and up backend. Could be tweaked |
| * for other archs if necessary... |
| */ |
| if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) && |
| (dst_type.length > 1)) { |
| /* use vector fetch (if dst_type is vector) */ |
| vec_fetch = TRUE; |
| if (dst_type.floating) { |
| fetch_type = lp_type_float_vec(dst_type.width, src_width); |
| } else { |
| fetch_type = lp_type_int_vec(dst_type.width, src_width); |
| } |
| /* intentionally not using lp_build_vec_type here */ |
| src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type), |
| fetch_type.length); |
| fetch_dst_type = fetch_type; |
| fetch_dst_type.length = dst_type.length; |
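      /* e.g. a 96bit fetch with a 4x32f dst: fetch <3 x float>, pad to 4 */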
| } else { |
| /* use scalar fetch */ |
| vec_fetch = FALSE; |
| if (dst_type.floating && ((src_width == 32) || (src_width == 64))) { |
| fetch_type = lp_type_float(src_width); |
| } else { |
| fetch_type = lp_type_int(src_width); |
| } |
| src_type = lp_build_vec_type(gallivm, fetch_type); |
| fetch_dst_type = fetch_type; |
| fetch_dst_type.width = dst_type.width * dst_type.length; |
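      /* e.g. a 16bit fetch with a 1x32 dst: fetch i16, zext to i32 later */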
| } |
| |
| if (length == 1) { |
| /* Scalar */ |
| res = lp_build_gather_elem_vec(gallivm, length, |
| src_width, src_type, fetch_dst_type, |
| aligned, base_ptr, offsets, 0, |
| vector_justify); |
| return LLVMBuildBitCast(gallivm->builder, res, |
| lp_build_vec_type(gallivm, dst_type), ""); |
| /* |
| * Excluding expansion from these paths because if you need it for |
| * 32bit/64bit fetches you're doing it wrong (this is gather, not |
| * conversion) and it would be awkward for floats. |
| */ |
| } else if (util_cpu_caps.has_avx2 && !need_expansion && |
| src_width == 32 && (length == 4 || length == 8)) { |
| return lp_build_gather_avx2(gallivm, length, src_width, dst_type, |
| base_ptr, offsets); |
| /* |
    * This looks bad on paper wrt throughput/latency on Haswell.
| * Even on Broadwell it doesn't look stellar. |
| * Albeit no measurements were done (but tested to work). |
| * Should definitely enable on Skylake. |
| * (In general, should be more of a win if the fetch is 256bit wide - |
| * this is true for the 32bit case above too.) |
| */ |
| } else if (0 && util_cpu_caps.has_avx2 && !need_expansion && |
| src_width == 64 && (length == 2 || length == 4)) { |
| return lp_build_gather_avx2(gallivm, length, src_width, dst_type, |
| base_ptr, offsets); |
| } else { |
| /* Vector */ |
| |
| LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8]; |
| unsigned i; |
| boolean vec_zext = FALSE; |
| struct lp_type res_type, gather_res_type; |
| LLVMTypeRef res_t, gather_res_t; |
| |
| res_type = fetch_dst_type; |
| res_type.length *= length; |
| gather_res_type = res_type; |
| |
| if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) { |
| /* |
       * Note that llvm is never able to optimize zext/insert combos
       * into the obvious code (i.e. zero the simd reg, then place the
       * elements directly into the appropriate spots). (I think this has
       * to do with the scalar/vector transition.) And scalar 16->32bit
       * zext simd loads aren't possible (the value gets loaded to a
       * scalar reg first instead).
| * No idea about other archs... |
| * We could do this manually, but instead we just use a vector |
| * zext, which is simple enough (and, in fact, llvm might optimize |
| * this away). |
| * (We're not trying that with other bit widths as that might not be |
| * easier, in particular with 8 bit values at least with only sse2.) |
| */ |
| assert(vec_fetch == FALSE); |
| gather_res_type.width /= 2; |
| fetch_dst_type = fetch_type; |
| src_type = lp_build_vec_type(gallivm, fetch_type); |
| vec_zext = TRUE; |
| } |
| res_t = lp_build_vec_type(gallivm, res_type); |
| gather_res_t = lp_build_vec_type(gallivm, gather_res_type); |
| res = LLVMGetUndef(gather_res_t); |
| for (i = 0; i < length; ++i) { |
| LLVMValueRef index = lp_build_const_int32(gallivm, i); |
| elems[i] = lp_build_gather_elem_vec(gallivm, length, |
| src_width, src_type, fetch_dst_type, |
| aligned, base_ptr, offsets, i, |
| vector_justify); |
| if (!vec_fetch) { |
| res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, ""); |
| } |
| } |
| if (vec_zext) { |
| res = LLVMBuildZExt(gallivm->builder, res, res_t, ""); |
| if (vector_justify) { |
| #ifdef PIPE_ARCH_BIG_ENDIAN |
| unsigned sv = dst_type.width - src_width; |
| res = LLVMBuildShl(gallivm->builder, res, |
| lp_build_const_int_vec(gallivm, res_type, sv), ""); |
| #endif |
| } |
| } |
| if (vec_fetch) { |
| /* |
| * Do bitcast now otherwise llvm might get some funny ideas wrt |
| * float/int types... |
| */ |
| for (i = 0; i < length; i++) { |
| elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i], |
| lp_build_vec_type(gallivm, dst_type), ""); |
| } |
| res = lp_build_concat(gallivm, elems, dst_type, length); |
| } else { |
| struct lp_type really_final_type = dst_type; |
| assert(res_type.length * res_type.width == |
| dst_type.length * dst_type.width * length); |
| really_final_type.length *= length; |
| res = LLVMBuildBitCast(gallivm->builder, res, |
| lp_build_vec_type(gallivm, really_final_type), ""); |
| } |
| } |
| |
| return res; |
| } |
| |
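/**
 * Build a vector from an array of scalar values.
 * Note despite the name this is not a memory gather; each value is
 * simply inserted into the corresponding element of the result vector.
 * All values must have the same LLVM type.
 */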
| LLVMValueRef |
lp_build_gather_values(struct gallivm_state *gallivm,
                       LLVMValueRef *values,
                       unsigned value_count)
| { |
| LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count); |
| LLVMBuilderRef builder = gallivm->builder; |
| LLVMValueRef vec = LLVMGetUndef(vec_type); |
| unsigned i; |
| |
| for (i = 0; i < value_count; i++) { |
| LLVMValueRef index = lp_build_const_int32(gallivm, i); |
| vec = LLVMBuildInsertElement(builder, vec, values[i], index, ""); |
| } |
| return vec; |
| } |