llvm/lib/Target/VE/VEISelLowering.cpp - third_party/github.com/llvm/llvm-project - Git at Google

 //===-- VEISelLowering.cpp - VE DAG Lowering Implementation ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements the interfaces that VE uses to lower LLVM code into a
 // selection DAG.
 //
 //===----------------------------------------------------------------------===//

 #include "VEISelLowering.h"
 #include "MCTargetDesc/VEMCExpr.h"
 #include "VEMachineFunctionInfo.h"
 #include "VERegisterInfo.h"
 #include "VETargetMachine.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
 using namespace llvm;

 #define DEBUG_TYPE "ve-lower"

 //===----------------------------------------------------------------------===//
 // Calling Convention Implementation
 //===----------------------------------------------------------------------===//

 #include "VEGenCallingConv.inc"

 /// Run CCState::CheckReturn over \p Outs with the RetCC_VE assignment
 /// function and hand its verdict back to the caller.
 bool VETargetLowering::CanLowerReturn(
     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
   // Scratch storage that the calling-convention state fills in.
   SmallVector<CCValAssign, 16> ReturnLocs;
   CCState ReturnState(CallConv, IsVarArg, MF, ReturnLocs, Context);
   return ReturnState.CheckReturn(Outs, RetCC_VE);
 }

 /// Lower a function return.
 ///
 /// Each value in \p OutVals is converted as requested by the location that
 /// RetCC_VE assigned to it, copied into that location's register, and the
 /// copies are glued to a final VEISD::RET_FLAG node, which is returned.
 SDValue
 VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool IsVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &DL, SelectionDAG &DAG) const {
   // Locations assigned to the individual return values.
   SmallVector<CCValAssign, 16> RetLocs;

   // Calling-convention state describing registers and stack slots.
   CCState RetState(CallConv, IsVarArg, DAG.getMachineFunction(), RetLocs,
                    *DAG.getContext());
   RetState.AnalyzeReturn(Outs, RetCC_VE);

   // Operand 0 holds the chain; it is refreshed once all copies exist.
   SmallVector<SDValue, 4> Operands(1, Chain);
   SDValue Glue;

   // Move every result value into its output register.
   for (unsigned Idx = 0, NumLocs = RetLocs.size(); Idx != NumLocs; ++Idx) {
     CCValAssign &Loc = RetLocs[Idx];
     assert(Loc.isRegLoc() && "Can only return in registers!");
     SDValue Val = OutVals[Idx];

     // The callee is responsible for sign or zero extending integer results.
     const CCValAssign::LocInfo Info = Loc.getLocInfo();
     if (Info == CCValAssign::SExt) {
       Val = DAG.getNode(ISD::SIGN_EXTEND, DL, Loc.getLocVT(), Val);
     } else if (Info == CCValAssign::ZExt) {
       Val = DAG.getNode(ISD::ZERO_EXTEND, DL, Loc.getLocVT(), Val);
     } else if (Info == CCValAssign::AExt) {
       Val = DAG.getNode(ISD::ANY_EXTEND, DL, Loc.getLocVT(), Val);
     } else if (Info == CCValAssign::BCvt) {
       // A float result travels in an i64 with padding:
       //     63     31   0
       //    +------+------+
       //    | float|   0  |
       //    +------+------+
       assert(Loc.getLocVT() == MVT::i64);
       assert(Loc.getValVT() == MVT::f32);
       SDValue Wide = SDValue(
           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
       SDValue SubIdx = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
       MachineSDNode *Insert = DAG.getMachineNode(
           TargetOpcode::INSERT_SUBREG, DL, MVT::i64, Wide, Val, SubIdx);
       Val = SDValue(Insert, 0);
     } else if (Info != CCValAssign::Full) {
       llvm_unreachable("Unknown loc info!");
     }

     assert(!Loc.needsCustom() && "Unexpected custom lowering");

     Chain = DAG.getCopyToReg(Chain, DL, Loc.getLocReg(), Val, Glue);

     // Thread the glue through so that all copies stay stuck together.
     Glue = Chain.getValue(1);
     Operands.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
   }

   // Install the final chain as operand 0.
   Operands[0] = Chain;

   // Append the glue only when at least one copy produced it.
   if (Glue.getNode())
     Operands.push_back(Glue);

   return DAG.getNode(VEISD::RET_FLAG, DL, MVT::Other, Operands);
 }

 /// Lower the incoming formal arguments described by \p Ins.
 ///
 /// Locations are assigned by CC_VE.  Register arguments are read through
 /// live-in virtual registers and stack arguments are loaded from fixed frame
 /// objects; the resulting values are appended to \p InVals in location order.
 /// For variadic functions the frame offset handed to
 /// VEMachineFunctionInfo::setVarArgsFrameOffset is recorded as well.
 /// The incoming \p Chain is returned unchanged.
 SDValue VETargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();

   // Base offset of the incoming arguments stack space.
   const unsigned ArgsBaseOffset = 176;
   // Size of the preserved arguments area.
   const unsigned ArgsPreserved = 64;

   // Assign a location to every argument according to CC_VE.  The preserved
   // area is allocated up front so that the stack offsets CC_VE computes
   // already account for it.
   SmallVector<CCValAssign, 16> Locs;
   CCState State(CallConv, IsVarArg, MF, Locs, *DAG.getContext());
   State.AllocateStack(ArgsPreserved, Align(8));
   State.AnalyzeFormalArguments(Ins, CC_VE);

   for (CCValAssign &Loc : Locs) {
     if (!Loc.isRegLoc()) {
       // The registers are exhausted. This argument was passed on the stack.
       assert(Loc.isMemLoc());
       // Stack offsets from the calling convention are relative to the
       // beginning of the arguments area at %fp+176.
       unsigned FrameOffset = Loc.getLocMemOffset() + ArgsBaseOffset;
       const unsigned NumBytes = Loc.getValVT().getSizeInBits() / 8;

       // A float argument lives in the second half of its 8 byte slot, as
       // shown below.  LLVM generates a 4 byte load, so the offset has to be
       // bumped by 4.  This adjustment is needed only here: LowerCall
       // converts a float argument to i64 first and stores all 8 bytes, as
       // the ABI requires.
       //    0      4
       //    +------+------+
       //    | empty| float|
       //    +------+------+
       if (Loc.getValVT() == MVT::f32)
         FrameOffset += 4;

       int FrameIdx =
           MF.getFrameInfo().CreateFixedObject(NumBytes, FrameOffset, true);
       SDValue SlotAddr =
           DAG.getFrameIndex(FrameIdx, getPointerTy(MF.getDataLayout()));
       SDValue Loaded =
           DAG.getLoad(Loc.getValVT(), DL, Chain, SlotAddr,
                       MachinePointerInfo::getFixedStack(MF, FrameIdx));
       InVals.push_back(Loaded);
       continue;
     }

     // This argument is passed in a register.
     // All integer register arguments are promoted by the caller to i64.

     // Create a virtual register for the promoted live-in value.
     unsigned LiveInReg =
         MF.addLiveIn(Loc.getLocReg(), getRegClassFor(Loc.getLocVT()));
     SDValue Val = DAG.getCopyFromReg(Chain, DL, LiveInReg, Loc.getLocVT());

     // Get the high bits for i32 struct elements.
     if (Loc.getValVT() == MVT::i32 && Loc.needsCustom()) {
       SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i32);
       Val = DAG.getNode(ISD::SRL, DL, Loc.getLocVT(), Val, ShiftAmt);
     }

     // The caller promoted the argument, so insert an Assert?ext SDNode so we
     // won't promote the value again in this function.
     const CCValAssign::LocInfo Info = Loc.getLocInfo();
     if (Info == CCValAssign::SExt) {
       Val = DAG.getNode(ISD::AssertSext, DL, Loc.getLocVT(), Val,
                         DAG.getValueType(Loc.getValVT()));
     } else if (Info == CCValAssign::ZExt) {
       Val = DAG.getNode(ISD::AssertZext, DL, Loc.getLocVT(), Val,
                         DAG.getValueType(Loc.getValVT()));
     } else if (Info == CCValAssign::BCvt) {
       // Extract a float argument from i64 with padding.
       //     63     31   0
       //    +------+------+
       //    | float|   0  |
       //    +------+------+
       assert(Loc.getLocVT() == MVT::i64);
       assert(Loc.getValVT() == MVT::f32);
       SDValue SubIdx = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
       MachineSDNode *Extract = DAG.getMachineNode(
           TargetOpcode::EXTRACT_SUBREG, DL, MVT::f32, Val, SubIdx);
       Val = SDValue(Extract, 0);
     }

     // Truncate the register down to the argument type.
     if (Loc.isExtInLoc())
       Val = DAG.getNode(ISD::TRUNCATE, DL, Loc.getValVT(), Val);

     InVals.push_back(Val);
   }

   if (IsVarArg) {
     // This function takes variable arguments, some of which may have been
     // passed in registers.
     //
     // The va_start intrinsic needs to know the offset to the first variable
     // argument.
     // TODO: need to calculate offset correctly once we support f128.
     unsigned FirstVarArgOffset = Locs.size() * 8;
     VEMachineFunctionInfo *VEInfo = MF.getInfo<VEMachineFunctionInfo>();
     // Skip the 176 bytes of register save area.
     VEInfo->setVarArgsFrameOffset(FirstVarArgOffset + ArgsBaseOffset);
   }

   return Chain;
 }

 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
 /// Translate a textual register name into the corresponding VE register.
 /// Names missing from the table below abort via report_fatal_error.
 Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Found =
       StringSwitch<Register>(RegName)
           .Case("sp", VE::SX11)    // Stack pointer
           .Case("fp", VE::SX9)     // Frame pointer
           .Case("sl", VE::SX8)     // Stack limit
           .Case("lr", VE::SX10)    // Link register
           .Case("tp", VE::SX14)    // Thread pointer
           .Case("outer", VE::SX12) // Outer register
           .Case("info", VE::SX17)  // Info area register
           .Case("got", VE::SX15)   // Global offset table register
           .Case("plt", VE::SX16)   // Procedure linkage table register
           .Default(0);

   // The default of 0 marks a name that is not in the table.
   if (!Found)
     report_fatal_error("Invalid register name global variable");

   return Found;
 }

 //===----------------------------------------------------------------------===//
 // TargetLowering Implementation
 //===----------------------------------------------------------------------===//

 /// Lower an outgoing call described by \p CLI.
 ///
 /// Argument locations are assigned with CC_VE (and additionally with CC_VE2
 /// for variadic calls, whose register arguments are also stored to the
 /// stack).  The callee address is copied into SX12, the argument copies are
 /// glued to a VEISD::CALL node bracketed by CALLSEQ_START/CALLSEQ_END, and
 /// the values returned according to RetCC_VE are appended to \p InVals.
 /// Tail calls are always disabled here.
 ///
 /// \returns the chain that follows the last node emitted for the call.
 SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG = CLI.DAG;
   SDLoc DL = CLI.DL;
   SDValue Chain = CLI.Chain;
   auto PtrVT = getPointerTy(DAG.getDataLayout());

   // VE target does not yet support tail call optimization.
   CLI.IsTailCall = false;

   // Get the base offset of the outgoing arguments stack space.
   unsigned ArgsBaseOffset = 176;
   // Get the size of the preserved arguments area
   unsigned ArgsPreserved = 8 * 8u;

   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());
   // Allocate the preserved area first.
   CCInfo.AllocateStack(ArgsPreserved, Align(8));
   // We already allocated the preserved area, so the stack offset computed
   // by CC_VE would be correct now.
   CCInfo.AnalyzeCallOperands(CLI.Outs, CC_VE);

   // VE requires to use both register and stack for varargs or no-prototyped
   // functions.
   bool UseBoth = CLI.IsVarArg;

   // Analyze operands again if it is required to store BOTH.
   // ArgLocs2 stays empty unless UseBoth is set; it is only indexed under
   // that same condition in the argument loop below.
   SmallVector<CCValAssign, 16> ArgLocs2;
   CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
                   ArgLocs2, *DAG.getContext());
   if (UseBoth)
     CCInfo2.AnalyzeCallOperands(CLI.Outs, CC_VE2);

   // Get the size of the outgoing arguments stack space requirement.
   unsigned ArgsSize = CCInfo.getNextStackOffset();

   // Keep stack frames 16-byte aligned.
   ArgsSize = alignTo(ArgsSize, 16);

   // Adjust the stack pointer to make room for the arguments.
   // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
   // with more than 6 arguments.
   Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);

   // Collect the set of registers to pass to the function and their values.
   // This will be emitted as a sequence of CopyToReg nodes glued to the call
   // instruction.
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

   // Collect chains from all the memory operations that copy arguments to the
   // stack. They must follow the stack pointer adjustment above and precede the
   // call instruction itself.
   SmallVector<SDValue, 8> MemOpChains;

   // VE needs to get address of callee function in a register
   // So, prepare to copy it to SX12 here.

   // If the callee is a GlobalAddress node (quite common, every direct call is)
   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
   // Likewise ExternalSymbol -> TargetExternalSymbol.
   SDValue Callee = CLI.Callee;

   bool IsPICCall = isPositionIndependent();

   // PC-relative references to external symbols should go through $stub.
   // If so, we need to prepare GlobalBaseReg first.
   const TargetMachine &TM = DAG.getTarget();
   const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
   // GV stays null when the callee is not a GlobalAddress node.
   const GlobalValue *GV = nullptr;
   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
   if (CalleeG)
     GV = CalleeG->getGlobal();
   bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
   bool UsePlt = !Local;
   MachineFunction &MF = DAG.getMachineFunction();

   // Turn GlobalAddress/ExternalSymbol node into a value node
   // containing the address of them here.
   // Any other kind of callee is passed on unchanged.
   if (CalleeG) {
     if (IsPICCall) {
       if (UsePlt)
         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
     } else {
       Callee =
           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
     }
   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     if (IsPICCall) {
       if (UsePlt)
         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
     } else {
       Callee =
           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
     }
   }

   // The callee address is always the first register copy that is emitted.
   RegsToPass.push_back(std::make_pair(VE::SX12, Callee));

   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
     SDValue Arg = CLI.OutVals[i];

     // Promote the value if needed.
     switch (VA.getLocInfo()) {
     default:
       llvm_unreachable("Unknown location info!");
     case CCValAssign::Full:
       break;
     case CCValAssign::SExt:
       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::ZExt:
       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::AExt:
       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::BCvt: {
       // Convert a float argument to i64 with padding.
       //     63     31   0
       //    +------+------+
       //    | float|   0  |
       //    +------+------+
       assert(VA.getLocVT() == MVT::i64);
       assert(VA.getValVT() == MVT::f32);
       SDValue Undef = SDValue(
           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
       Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
                                        MVT::i64, Undef, Arg, Sub_f32),
                     0);
       break;
     }
     }

     if (VA.isRegLoc()) {
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
       if (!UseBoth)
         continue;
       // Also store the value at the location computed by CC_VE2.  VA is a
       // reference, so this assignment overwrites ArgLocs[i] as well.
       VA = ArgLocs2[i];
     }

     assert(VA.isMemLoc());

     // Create a store off the stack pointer for this argument.
     SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
     // The argument area starts at %fp+176 in the callee frame,
     // %sp+176 in ours.
     SDValue PtrOff =
         DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
     PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
     MemOpChains.push_back(
         DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
   }

   // Emit all stores, make sure they occur before the call.
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

   // Build a sequence of CopyToReg nodes glued together with token chain and
   // glue operands which copy the outgoing args into registers. The InGlue is
   // necessary since all emitted instructions must be stuck together in order
   // to pass the live physical registers.
   SDValue InGlue;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
                              RegsToPass[i].second, InGlue);
     InGlue = Chain.getValue(1);
   }

   // Build the operands for the call instruction itself.
   SmallVector<SDValue, 8> Ops;
   Ops.push_back(Chain);
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                   RegsToPass[i].second.getValueType()));

   // Add a register mask operand representing the call-preserved registers.
   const VERegisterInfo *TRI = Subtarget->getRegisterInfo();
   const uint32_t *Mask =
       TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));

   // Make sure the CopyToReg nodes are glued to the call instruction which
   // consumes the registers.
   if (InGlue.getNode())
     Ops.push_back(InGlue);

   // Now the call itself.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops);
   InGlue = Chain.getValue(1);

   // Revert the stack pointer immediately after the call.
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
                              DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
   InGlue = Chain.getValue(1);

   // Now extract the return values. This is more or less the same as
   // LowerFormalArguments.

   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());

   // Set inreg flag manually for codegen generated library calls that
   // return float.
   // Such calls are recognized here by the absence of a call instruction
   // (CLI.CB is null).
   if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
     CLI.Ins[0].Flags.setInReg();

   RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_VE);

   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
     CCValAssign &VA = RVLocs[i];
     unsigned Reg = VA.getLocReg();

     // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
     // reside in the same register in the high and low bits. Reuse the
     // CopyFromReg previous node to avoid duplicate copies.
     SDValue RV;
     if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
       if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
         RV = Chain.getValue(0);

     // But usually we'll create a new CopyFromReg for a different register.
     if (!RV.getNode()) {
       RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
       // Result 1 of the copy is its chain and result 2 is its glue.
       Chain = RV.getValue(1);
       InGlue = Chain.getValue(2);
     }

     // Get the high bits for i32 struct elements.
     if (VA.getValVT() == MVT::i32 && VA.needsCustom())
       RV = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), RV,
                        DAG.getConstant(32, DL, MVT::i32));

     // The callee promoted the return value, so insert an Assert?ext SDNode so
     // we won't promote the value again in this function.
     switch (VA.getLocInfo()) {
     case CCValAssign::SExt:
       RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
                        DAG.getValueType(VA.getValVT()));
       break;
     case CCValAssign::ZExt:
       RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
                        DAG.getValueType(VA.getValVT()));
       break;
     case CCValAssign::BCvt: {
       // Extract a float return value from i64 with padding.
       //     63     31   0
       //    +------+------+
       //    | float|   0  |
       //    +------+------+
       assert(VA.getLocVT() == MVT::i64);
       assert(VA.getValVT() == MVT::f32);
       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
       RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
                                       MVT::f32, RV, Sub_f32),
                    0);
       break;
     }
     default:
       break;
     }

     // Truncate the register down to the return value type.
     if (VA.isExtInLoc())
       RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);

     InVals.push_back(RV);
   }

   return Chain;
 }

 /// isFPImmLegal - Returns true if the target can instruction select the
 /// specified FP immediate natively. If false, the legalizer will
 /// materialize the FP immediate as a load from a constant pool.
 ///
 /// The answer depends only on \p VT: every f32 and f64 immediate is
 /// accepted, whatever its value and whatever \p ForCodeSize says.
 bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
   if (VT == MVT::f32)
     return true;
   return VT == MVT::f64;
 }

 /// Determine if the target supports unaligned memory accesses.
 ///
 /// This function returns true if the target allows unaligned memory accesses
 /// of the specified type in the given address space. If true, it also returns
 /// whether the unaligned memory access is "fast" in the last argument by
 /// reference. This is used, for example, in situations where an array
 /// copy/move/set is converted to a sequence of store operations. Its use
 /// helps to ensure that such replacements don't generate code that causes an
 /// alignment error (trap) on the target machine.
 ///
 /// This implementation ignores the type, address space, alignment and flags:
 /// it always returns true and, when \p Fast is non-null, sets it to true.
 bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                       unsigned AddrSpace,
                                                       unsigned Align,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
   // It's fast anytime on VE; report that whenever the caller asked for it.
   if (Fast != nullptr)
     *Fast = true;

   return true;
 }

 /// Report whether an and-not operation should be formed with \p Y as an
 /// operand.  Vector-typed values and constants are rejected; everything
 /// else is accepted.
 bool VETargetLowering::hasAndNot(SDValue Y) const {
   // VE doesn't have vector and not instruction.
   const bool IsVectorValue = Y.getValueType().isVector();

   // VE allows different immediate values for X and Y where ~X & Y.
   // Only simm7 works for X, and only mimm works for Y on VE.  However, this
   // function is used to check whether an immediate value is OK for and-not
   // instruction as both X and Y.  Generating additional instruction to
   // retrieve an immediate value is no good since the purpose of this
   // function is to convert a series of 3 instructions to another series of
   // 3 instructions with better parallelism.  Therefore, we return false
   // for all immediate values now.
   // FIXME: Change hasAndNot function to have two operands to make it work
   //        correctly with Aurora VE.
   const bool IsImmediate = isa<ConstantSDNode>(Y);

   // It's ok for generic registers, i.e. anything not excluded above.
   return !IsVectorValue && !IsImmediate;
 }

 /// Configure the lowering for VE: boolean contents, register classes, the
 /// legalization action of each operation/type pair, the stack pointer
 /// register, DAG combines and alignment requirements.
 VETargetLowering::VETargetLowering(const TargetMachine &TM,
                                    const VESubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
   // Instructions which use registers as conditionals examine all the
   // bits (as does the pseudo SELECT_CC expansion). I don't think it
   // matters much whether it's ZeroOrOneBooleanContent, or
   // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
   // former.
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrOneBooleanContent);

   // Set up the register classes.
   addRegisterClass(MVT::i32, &VE::I32RegClass);
   addRegisterClass(MVT::i64, &VE::I64RegClass);
   addRegisterClass(MVT::f32, &VE::F32RegClass);
   addRegisterClass(MVT::f64, &VE::I64RegClass);

   /// Load & Store {
   for (MVT ValVT : MVT::fp_valuetypes()) {
     for (MVT MemVT : MVT::fp_valuetypes()) {
       // Turn FP extload into load/fpextend
       setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);

       // Turn FP truncstore into trunc + store.
       setTruncStoreAction(ValVT, MemVT, Expand);
     }
   }

   // VE doesn't have i1 sign extending load
   for (MVT IntTy : MVT::integer_valuetypes()) {
     for (auto ExtKind : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
       setLoadExtAction(ExtKind, IntTy, MVT::i1, Promote);
     setTruncStoreAction(IntTy, MVT::i1, Expand);
   }
   /// } Load & Store

   // Custom legalize address nodes into LO/HI parts.
   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
   for (auto AddrOp :
        {ISD::BlockAddress, ISD::GlobalAddress, ISD::GlobalTLSAddress})
     setOperationAction(AddrOp, PtrVT, Custom);

   /// VAARG handling {
   setOperationAction(ISD::VASTART, MVT::Other, Custom);
   // VAARG needs to be lowered to access with 8 bytes alignment.
   setOperationAction(ISD::VAARG, MVT::Other, Custom);
   // Use the default implementation.
   for (auto VaOp : {ISD::VACOPY, ISD::VAEND})
     setOperationAction(VaOp, MVT::Other, Expand);
   /// } VAARG handling

   /// Stack {
   for (MVT SizeVT : {MVT::i32, MVT::i64})
     setOperationAction(ISD::DYNAMIC_STACKALLOC, SizeVT, Custom);
   /// } Stack

   /// Int Ops {
   for (MVT IntVT : {MVT::i32, MVT::i64}) {
     // VE has no REM or DIVREM operations.
     for (auto DivRemOp : {ISD::UREM, ISD::SREM, ISD::SDIVREM, ISD::UDIVREM})
       setOperationAction(DivRemOp, IntVT, Expand);

     // VE has no MULHU/S or U/SMUL_LOHI operations.
     // TODO: Use MPD instruction to implement SMUL_LOHI for i32 type.
     for (auto MulOp :
          {ISD::MULHU, ISD::MULHS, ISD::UMUL_LOHI, ISD::SMUL_LOHI})
       setOperationAction(MulOp, IntVT, Expand);

     // VE has no CTTZ, ROTL, ROTR operations.
     for (auto BitOp : {ISD::CTTZ, ISD::ROTL, ISD::ROTR})
       setOperationAction(BitOp, IntVT, Expand);

     // VE has 64 bits instruction which works as i64 BSWAP operation.  This
     // instruction works fine as i32 BSWAP operation with an additional
     // parameter.  Use isel patterns to lower BSWAP.
     setOperationAction(ISD::BSWAP, IntVT, Legal);

     // VE has only 64 bits instructions which work as i64 BITREVERSE/CTLZ/CTPOP
     // operations.  Use isel patterns for i64, promote for i32.
     LegalizeAction WideOnlyAct = Legal;
     if (IntVT == MVT::i32)
       WideOnlyAct = Promote;
     for (auto CountOp :
          {ISD::BITREVERSE, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, ISD::CTPOP})
       setOperationAction(CountOp, IntVT, WideOnlyAct);

     // VE has only 64 bits instructions which work as i64 AND/OR/XOR operations.
     // Use isel patterns for i64, promote for i32.
     for (auto LogicOp : {ISD::AND, ISD::OR, ISD::XOR})
       setOperationAction(LogicOp, IntVT, WideOnlyAct);
   }
   /// } Int Ops

   /// Conversion {
   // VE doesn't have instructions for fp<->uint, so expand them by llvm
   for (auto ConvOp : {ISD::FP_TO_UINT, ISD::UINT_TO_FP})
     setOperationAction(ConvOp, MVT::i32, Promote); // use i64
   for (auto ConvOp : {ISD::FP_TO_UINT, ISD::UINT_TO_FP})
     setOperationAction(ConvOp, MVT::i64, Expand);

   // fp16 not supported
   for (MVT FloatTy : MVT::fp_valuetypes()) {
     for (auto HalfOp : {ISD::FP16_TO_FP, ISD::FP_TO_FP16})
       setOperationAction(HalfOp, FloatTy, Expand);
   }
   /// } Conversion

   setStackPointerRegisterToSaveRestore(VE::SX11);

   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::TRUNCATE);

   // Set function alignment to 16 bytes
   setMinFunctionAlignment(Align(16));

   // VE stores all argument by 8 bytes alignment
   setMinStackArgumentAlignment(Align(8));

   computeRegisterProperties(Subtarget->getRegisterInfo());
 }

 const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
 #define TARGET_NODE_CASE(NAME)                                                 \
   case VEISD::NAME:                                                            \
     return "VEISD::" #NAME;
   switch ((VEISD::NodeType)Opcode) {
   case VEISD::FIRST_NUMBER:
     break;
     TARGET_NODE_CASE(Lo)
     TARGET_NODE_CASE(Hi)
     TARGET_NODE_CASE(GETFUNPLT)
     TARGET_NODE_CASE(GETSTACKTOP)
     TARGET_NODE_CASE(GETTLSADDR)
     TARGET_NODE_CASE(CALL)
     TARGET_NODE_CASE(RET_FLAG)
     TARGET_NODE_CASE(GLOBAL_BASE_REG)
   }
 #undef TARGET_NODE_CASE
   return nullptr;
 }

 /// Return the type used for the result of a SETCC node.  The answer is i32
 /// no matter which operand type \p VT is passed in.
 EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                          EVT VT) const {
   const EVT ResultVT = MVT::i32;
   return ResultVT;
 }

 // Rebuild an address node as its Target* counterpart carrying the target
 // flags TF.  GlobalAddress, BlockAddress and ExternalSymbol nodes are
 // handled; any other node is a programming error.
 SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
                                           SelectionDAG &DAG) const {
   if (const auto *Global = dyn_cast<GlobalAddressSDNode>(Op)) {
     // The global's offset is carried over to the target node.
     return DAG.getTargetGlobalAddress(Global->getGlobal(), SDLoc(Global),
                                       Global->getValueType(0),
                                       Global->getOffset(), TF);
   }

   if (const auto *Block = dyn_cast<BlockAddressSDNode>(Op)) {
     // Block addresses are always rebuilt with a zero offset.
     return DAG.getTargetBlockAddress(Block->getBlockAddress(),
                                      Op.getValueType(), 0, TF);
   }

   if (const auto *Symbol = dyn_cast<ExternalSymbolSDNode>(Op)) {
     return DAG.getTargetExternalSymbol(Symbol->getSymbol(),
                                        Symbol->getValueType(0), TF);
   }

   llvm_unreachable("Unhandled address SDNode");
 }

 // Wrap Op once with the HiTF target flags and once with the LoTF target
 // flags, feed the two results to VEISD::Hi and VEISD::Lo nodes, and return
 // an ADD node that combines both parts.
 SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
                                        SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT AddrVT = Op.getValueType();

   // High part.
   SDValue HiTarget = withTargetFlags(Op, HiTF, DAG);
   SDValue HiPart = DAG.getNode(VEISD::Hi, DL, AddrVT, HiTarget);

   // Low part.
   SDValue LoTarget = withTargetFlags(Op, LoTF, DAG);
   SDValue LoPart = DAG.getNode(VEISD::Lo, DL, AddrVT, LoTarget);

   return DAG.getNode(ISD::ADD, DL, AddrVT, HiPart, LoPart);
 }

 // Build the SDNodes that compute the address denoted by the symbolic address
 // node Op (for instance a GlobalAddress or BlockAddress).
 //
 // Without PIC the address is an absolute 64-bit hi/lo pair. With PIC it is
 // formed relative to the GOT register, and for symbols that are not known to
 // be local it is additionally loaded from the GOT.
 SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
   // Absolute code models all share the abs64 form.
   if (!isPositionIndependent()) {
     switch (getTargetMachine().getCodeModel()) {
     case CodeModel::Small:
     case CodeModel::Medium:
     case CodeModel::Large:
       return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
     default:
       llvm_unreachable("Unsupported absolute code model");
     }
   }

   // PIC mode from here on. VE needs a got load for every variable!
   //
   // GLOBAL_BASE_REG is codegen'ed with a call, so record in the frame info
   // that this function has calls.
   DAG.getMachineFunction().getFrameInfo().setHasCalls(true);

   const SDLoc Loc(Op);
   const EVT AddrVT = Op.getValueType();

   // Constant pool nodes and globals with local linkage are reached with a
   // GOT-relative offset; everything else goes through its GOT entry.
   bool IsLocal = isa<ConstantPoolSDNode>(Op);
   if (!IsLocal)
     if (auto GlobalN = dyn_cast<GlobalAddressSDNode>(Op))
       IsLocal = GlobalN->getGlobal()->hasLocalLinkage();

   const unsigned HiTF =
       IsLocal ? VEMCExpr::VK_VE_GOTOFF_HI32 : VEMCExpr::VK_VE_GOT_HI32;
   const unsigned LoTF =
       IsLocal ? VEMCExpr::VK_VE_GOTOFF_LO32 : VEMCExpr::VK_VE_GOT_LO32;

   // Local linkage PIC code:
   //     lea %s35, %gotoff_lo(.LCPI0_0)
   //     and %s35, %s35, (32)0
   //     lea.sl %s35, %gotoff_hi(.LCPI0_0)(%s35)
   //     adds.l %s35, %s15, %s35                  ; %s15 is GOT
   // Not local linkage PIC code:
   //     lea %s35, %got_lo(.LCPI0_0)
   //     and %s35, %s35, (32)0
   //     lea.sl %s35, %got_hi(.LCPI0_0)(%s35)
   //     adds.l %s35, %s15, %s35                  ; %s15 is GOT
   //     ld     %s35, (,%s35)
   // FIXME: use lea.sl %s35, %gotoff_hi(.LCPI0_0)(%s35, %s15)
   SDValue GOTRelative = makeHiLoPair(Op, HiTF, LoTF, DAG);
   SDValue GOTBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, Loc, AddrVT);
   SDValue Sum = DAG.getNode(ISD::ADD, Loc, AddrVT, GOTBase, GOTRelative);
   if (IsLocal)
     return Sum;

   // The sum only names the GOT slot; the real address is stored there.
   return DAG.getLoad(AddrVT, Loc, DAG.getEntryNode(), Sum,
                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
 }

 /// Custom Lower {

 /// Custom lowering for ISD::GlobalAddress: forwards \p Op unchanged to
 /// makeAddress() and returns whatever address computation it builds.
 SDValue VETargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
   return makeAddress(Op, DAG);
 }

 /// Custom lowering for ISD::BlockAddress: forwards \p Op unchanged to
 /// makeAddress() and returns whatever address computation it builds.
 SDValue VETargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
   return makeAddress(Op, DAG);
 }

 /// Lower a TLS address using the general dynamic model.
 ///
 /// The DAG built here has the following shape, and t4 is returned:
 ///   t1: ch,glue = callseq_start t0, 64, 0
 ///   t2: ch,glue = VEISD::GETTLSADDR t1, label, regmask, t1:1
 ///   t3: ch,glue = callseq_end t2, 64, 0, t2:1
 ///   t4: PtrVT,ch,glue = CopyFromReg t3, Register $sx0, t3:1
 ///
 /// As a side effect the function is marked as having calls, and in PIC mode
 /// the global base register is requested from the instruction info.
 SDValue
 VETargetLowering::LowerToTLSGeneralDynamicModel(SDValue Op,
                                                 SelectionDAG &DAG) const {
   const SDLoc Loc(Op);
   MachineFunction &MF = DAG.getMachineFunction();
   const EVT PtrVT = Op.getValueType();

   // The symbol handed to GETTLSADDR, converted to a target node with no flags.
   SDValue TLSSymbol = withTargetFlags(Op, 0, DAG);

   // Lowering the machine isd will make sure everything is in the right
   // location.
   SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);
   const uint32_t *PreservedMask =
       Subtarget->getRegisterInfo()->getCallPreservedMask(MF, CallingConv::C);

   // Bracket the GETTLSADDR pseudo call in a call sequence.
   SDValue Seq = DAG.getCALLSEQ_START(DAG.getEntryNode(), 64, 0, Loc);
   SDValue CallOps[] = {Seq, TLSSymbol, DAG.getRegisterMask(PreservedMask),
                        Seq.getValue(1)};
   Seq = DAG.getNode(VEISD::GETTLSADDR, Loc, ChainAndGlue, CallOps);
   Seq = DAG.getCALLSEQ_END(Seq, DAG.getIntPtrConstant(64, Loc, true),
                            DAG.getIntPtrConstant(0, Loc, true),
                            Seq.getValue(1), Loc);

   // The resulting address is read back from %sx0.
   SDValue TLSAddr =
       DAG.getCopyFromReg(Seq, Loc, VE::SX0, PtrVT, Seq.getValue(1));

   // GETTLSADDR will be codegen'ed as call. Inform MFI that function has calls.
   MF.getFrameInfo().setHasCalls(true);

   // Also generate code to prepare a GOT register if it is PIC.
   if (isPositionIndependent())
     Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);

   return TLSAddr;
 }

 /// Custom lowering for ISD::GlobalTLSAddress. Every request is forwarded to
 /// LowerToTLSGeneralDynamicModel(); no other TLS model is selected here.
 SDValue VETargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
   // The current implementation of nld (2.26) doesn't allow local exec model
   // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
   // generate the general dynamic model code sequence.
   //
   // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
   return LowerToTLSGeneralDynamicModel(Op, DAG);
 }

 /// Lower VASTART.
 ///
 /// vastart just stores the address of the VarArgsFrameIndex slot into the
 /// memory location argument. Operand 0 of \p Op is the chain, operand 1 the
 /// destination pointer and operand 2 the SrcValue describing it. The result
 /// is the store node.
 SDValue VETargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   VEMachineFunctionInfo *VEFI = MF.getInfo<VEMachineFunctionInfo>();
   const auto PtrVT = getPointerTy(DAG.getDataLayout());
   const SDLoc Loc(Op);

   // Need frame address to find the address of VarArgsFrameIndex.
   MF.getFrameInfo().setFrameAddressIsTaken(true);

   // Address of the variadic area: %sx9 plus the recorded var-args offset.
   SDValue VarArgsAddr = DAG.getNode(
       ISD::ADD, Loc, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
       DAG.getIntPtrConstant(VEFI->getVarArgsFrameOffset(), Loc));

   SDValue InChain = Op.getOperand(0);
   SDValue DestPtr = Op.getOperand(1);
   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   return DAG.getStore(InChain, Loc, VarArgsAddr, DestPtr,
                       MachinePointerInfo(DestSV));
 }

 /// Lower VAARG: load the current argument pointer from the va_list, write
 /// back the pointer advanced by 8 bytes, and load the argument itself.
 ///
 /// Operand 0 of \p Op is the chain, operand 1 the pointer to the va_list and
 /// operand 2 the SrcValue describing it. The returned value is the final
 /// load of the argument.
 SDValue VETargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   SDNode *VAArg = Op.getNode();
   const SDLoc Loc(VAArg);
   EVT ArgVT = VAArg->getValueType(0);
   SDValue VAListPtr = VAArg->getOperand(1);
   EVT PtrVT = VAListPtr.getValueType();
   const Value *ListSV = cast<SrcValueSDNode>(VAArg->getOperand(2))->getValue();

   // Fetch the current argument pointer out of the va_list.
   SDValue ArgPtr = DAG.getLoad(PtrVT, Loc, VAArg->getOperand(0), VAListPtr,
                                MachinePointerInfo(ListSV));
   SDValue Chain = ArgPtr.getValue(1);

   // Increment the pointer by 8 to the next vaarg; this is the same for every
   // argument type handled here.
   SDValue NextPtr = DAG.getNode(ISD::ADD, Loc, PtrVT, ArgPtr,
                                 DAG.getIntPtrConstant(8, Loc));

   if (ArgVT == MVT::f32) {
     // float --> need special handling like below.
     //    0      4
     //    +------+------+
     //    | empty| float|
     //    +------+------+
     // The value sits in the second half of its slot, so adjust the pointer
     // used for the load.
     const unsigned InternalOffset = 4;
     ArgPtr = DAG.getNode(ISD::ADD, Loc, PtrVT, ArgPtr,
                          DAG.getConstant(InternalOffset, Loc, PtrVT));
   }

   // Store the incremented VAList to the legalized pointer.
   Chain =
       DAG.getStore(Chain, Loc, NextPtr, VAListPtr, MachinePointerInfo(ListSV));

   // Load the actual argument out of the pointer VAList.
   // We can't count on greater alignment than the word size.
   return DAG.getLoad(ArgVT, Loc, Chain, ArgPtr, MachinePointerInfo(),
                      std::min(PtrVT.getSizeInBits(), ArgVT.getSizeInBits()) /
                          8);
 }

 /// Lower DYNAMIC_STACKALLOC into a call that grows the stack followed by a
 /// read of the new stack top.
 ///
 /// Generated code:
 ///   (void)__ve_grow_stack(size);
 ///   ret = GETSTACKTOP;        // pseudo instruction
 /// or, when the requested alignment exceeds the stack alignment:
 ///   (void)__ve_grow_stack_align(size, ~(align - 1));
 ///   ret = GETSTACKTOP;        // pseudo instruction
 ///   ret = (ret + (align - 1)) & ~(align - 1);
 ///
 /// Operand 0 of \p Op is the chain, operand 1 the size and operand 2 the
 /// constant alignment. The merged pair {address, chain} is returned.
 SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
                                                   SelectionDAG &DAG) const {
   const SDLoc Loc(Op);
   auto &Ctx = *DAG.getContext();

   // Get the inputs.
   SDValue Chain = Op.getOperand(0);
   SDValue AllocSize = Op.getOperand(1);
   MaybeAlign ReqAlign(Op.getConstantOperandVal(2));
   EVT ResVT = Op.getNode()->getValueType(0);

   // Chain the dynamic stack allocation so that it doesn't modify the stack
   // pointer when other instructions are using the stack.
   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, Loc);

   // Extra work is needed only when the request is stricter than the stack
   // alignment.
   const TargetFrameLowering &FrameLowering = *Subtarget->getFrameLowering();
   Align StackAlign = FrameLowering.getStackAlign();
   const bool NeedsAlign = ReqAlign.valueOrOne() > StackAlign;

   // Prepare arguments: the size, plus the alignment mask when over-aligned.
   TargetLowering::ArgListTy CallArgs;
   TargetLowering::ArgListEntry Arg;
   Arg.Node = AllocSize;
   Arg.Ty = Arg.Node.getValueType().getTypeForEVT(Ctx);
   CallArgs.push_back(Arg);
   if (NeedsAlign) {
     Arg.Node = DAG.getConstant(~(ReqAlign->value() - 1ULL), Loc, ResVT);
     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(Ctx);
     CallArgs.push_back(Arg);
   }
   Type *VoidTy = Type::getVoidTy(Ctx);

   // Pick the runtime routine matching the argument list built above.
   const char *GrowFn =
       NeedsAlign ? "__ve_grow_stack_align" : "__ve_grow_stack";
   SDValue Callee = DAG.getTargetExternalSymbol(GrowFn, Op.getValueType(), 0);

   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(Loc)
       .setChain(Chain)
       .setCallee(CallingConv::PreserveAll, VoidTy, Callee,
                  std::move(CallArgs))
       .setDiscardResult(true);
   Chain = LowerCallTo(CLI).second;

   // Read the stack top after the call and round it up when over-aligned.
   SDValue StackTop = DAG.getNode(VEISD::GETSTACKTOP, Loc, ResVT, Chain);
   if (NeedsAlign) {
     const auto LowBits = ReqAlign->value() - 1ULL;
     StackTop = DAG.getNode(ISD::ADD, Loc, ResVT, StackTop,
                            DAG.getConstant(LowBits, Loc, ResVT));
     StackTop = DAG.getNode(ISD::AND, Loc, ResVT, StackTop,
                            DAG.getConstant(~LowBits, Loc, ResVT));
   }

   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, Loc, true),
                              DAG.getIntPtrConstant(0, Loc, true), SDValue(),
                              Loc);

   SDValue Results[2] = {StackTop, Chain};
   return DAG.getMergeValues(Results, Loc);
 }

 /// Entry point for custom lowering: route \p Op to the handler for its
 /// opcode. Reaching this function with any opcode not listed below is a
 /// programming error.
 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   const unsigned Opcode = Op.getOpcode();
   switch (Opcode) {
   // Address materialization.
   case ISD::GlobalAddress:
     return LowerGlobalAddress(Op, DAG);
   case ISD::BlockAddress:
     return LowerBlockAddress(Op, DAG);
   case ISD::GlobalTLSAddress:
     return LowerGlobalTLSAddress(Op, DAG);

   // Variadic argument handling.
   case ISD::VASTART:
     return LowerVASTART(Op, DAG);
   case ISD::VAARG:
     return LowerVAARG(Op, DAG);

   // Dynamic stack allocation.
   case ISD::DYNAMIC_STACKALLOC:
     return lowerDYNAMIC_STACKALLOC(Op, DAG);

   default:
     llvm_unreachable("Should not custom lower this!");
   }
 }
 /// } Custom Lower

 /// Decide whether \p User, a user of the truncate \p N, is an instruction
 /// that treats its source operand as i32, so that the truncate can be
 /// avoided as far as this user is concerned.
 ///
 /// Arithmetic, compare, conversion and atomic opcodes in the first case list
 /// are accepted directly. Selections, bit operations and copies are accepted
 /// only when all of their own uses are accepted in turn.
 static bool isI32Insn(const SDNode *User, const SDNode *N) {
   const unsigned UserOpc = User->getOpcode();

   switch (UserOpc) {
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::SETCC:
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::SHL:
   case ISD::SRA:
   case ISD::BSWAP:
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
   case ISD::BR_CC:
   case ISD::BITCAST:
   case ISD::ATOMIC_CMP_SWAP:
   case ISD::ATOMIC_SWAP:
     return true;

   case ISD::SRL:
     // (srl (trunc (srl ...))) may be optimized by combining srl, so
     // doesn't optimize trunc now.
     return N->getOperand(0).getOpcode() != ISD::SRL;

   case ISD::SELECT_CC:
     // N is neither operand 2 nor operand 3 of the select: accept at once.
     // Otherwise the uses of the select have to be inspected below.
     if (User->getOperand(2).getNode() != N &&
         User->getOperand(3).getNode() != N)
       return true;
     break;

   case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
   case ISD::SELECT:
   case ISD::CopyToReg:
     break;

   default:
     return false;
   }

   // Check all use of selections, bit operations, and copies.  If all of them
   // are safe, optimize truncate to extract_subreg.
   const bool UserIsSelect =
       UserOpc == ISD::SELECT_CC || UserOpc == ISD::SELECT;
   for (SDNode::use_iterator UI = User->use_begin(), UE = User->use_end();
        UI != UE; ++UI) {
     const unsigned UseOpc = (*UI)->getOpcode();
     const bool UseIsExtend = UseOpc == ISD::ANY_EXTEND ||
                              UseOpc == ISD::SIGN_EXTEND ||
                              UseOpc == ISD::ZERO_EXTEND;

     if (!UseIsExtend) {
       // If the use is an instruction which treats the source operand as i32,
       // it is safe to avoid truncate here.
       if (!isI32Insn(*UI, N))
         return false;
       continue;
     }

     // Special optimizations to the combination of ext and trunc.
     // (ext ... (select ... (trunc ...))) is safe to avoid truncate here
     // since this truncate instruction clears higher 32 bits which is filled
     // by one of ext instructions later.
     assert(N->getValueType(0) == MVT::i32 &&
            "find truncate to not i32 integer");
     if (!UserIsSelect)
       return false;
   }
   return true;
 }

 // Optimize TRUNCATE in DAG combining.  Doing it in CUSTOM lowering is
 // sometimes too early, and doing it in the DAG pattern matching of
 // VEInstrInfo.td is sometimes too late, so it is done here.
 //
 // When the DAG is already legalized and every user of the TRUNCATE is
 // accepted by isI32Insn(), the node is replaced by an EXTRACT_SUBREG of its
 // operand with the sub_i32 index. In every other case an empty SDValue is
 // returned and nothing is changed.
 SDValue VETargetLowering::combineTRUNCATE(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
   assert(N->getOpcode() == ISD::TRUNCATE &&
          "Should be called with a TRUNCATE node");

   // We prefer to do this when all types are legal.
   if (!DCI.isAfterLegalizeDAG())
     return SDValue();

   // Skip combine TRUNCATE atm if the operand of TRUNCATE might be a constant.
   const SDNode *Src = N->getOperand(0).getNode();
   const bool SrcMayBeConstant = Src->getOpcode() == ISD::SELECT_CC &&
                                 isa<ConstantSDNode>(Src->getOperand(0)) &&
                                 isa<ConstantSDNode>(Src->getOperand(1));
   if (SrcMayBeConstant)
     return SDValue();

   // Check all use of this TRUNCATE. Make sure that we're not going to
   // replace TRUNCATE for non i32 instructions.
   //
   // FIXME: Although we could sometimes handle this, and it does occur in
   // practice that one of the condition inputs to the select is also one of
   // the outputs, we currently can't deal with this.
   for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
        ++UI)
     if (!isI32Insn(*UI, N))
       return SDValue();

   // All users are fine: take the i32 sub-register of the operand instead.
   SelectionDAG &DAG = DCI.DAG;
   const SDLoc Loc(N);
   SDValue SubRegIdx = DAG.getTargetConstant(VE::sub_i32, Loc, MVT::i32);
   SDNode *Extract =
       DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, Loc,
                          N->getValueType(0), N->getOperand(0), SubRegIdx);
   return SDValue(Extract, 0);
 }

 /// Target hook for DAG combining. Only ISD::TRUNCATE nodes are handled, by
 /// combineTRUNCATE(); for every other opcode an empty SDValue is returned.
 SDValue VETargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   const unsigned Opcode = N->getOpcode();
   switch (Opcode) {
   case ISD::TRUNCATE:
     return combineTRUNCATE(N, DCI);
   default:
     // Nothing to combine for this opcode.
     return SDValue();
   }
 }