95 files changed, 10849 insertions(+), 146 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h index b3a7310f04b..6834818fc37 100644 --- a/llvm/lib/Target/SystemZ/SystemZ.h +++ b/llvm/lib/Target/SystemZ/SystemZ.h @@ -87,6 +87,13 @@ const unsigned IPM_CC = 28; const unsigned PFD_READ = 1; const unsigned PFD_WRITE = 2; +// Number of bits in a vector register. +const unsigned VectorBits = 128; + +// Number of bytes in a vector register (and consequently the number of +// bytes in a general permute vector). +const unsigned VectorBytes = VectorBits / 8; + // Return true if Val fits an LLILL operand. static inline bool isImmLL(uint64_t Val) { return (Val & ~0x000000000000ffffULL) == 0; diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index c83a3d0af0d..5f46e6a6313 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -151,6 +151,13 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) { LoweredMI = lowerRIEfLow(MI, SystemZ::RISBLG); break; + case SystemZ::VLVGP32: + LoweredMI = MCInstBuilder(SystemZ::VLVGP) + .addReg(MI->getOperand(0).getReg()) + .addReg(SystemZMC::getRegAsGR64(MI->getOperand(1).getReg())) + .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg())); + break; + #define LOWER_LOW(NAME) \ case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h index 71605ac1126..8b8146762b6 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h @@ -10,6 +10,9 @@ #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/CallingConvLower.h" + namespace llvm { namespace SystemZ { const unsigned NumArgGPRs = 5; @@ -18,6 +21,47 @@ namespace SystemZ { const unsigned NumArgFPRs = 4; extern const unsigned ArgFPRs[NumArgFPRs]; } // end namespace SystemZ + +class SystemZCCState : public CCState { +private: + /// Records whether the value was a fixed argument. + /// See ISD::OutputArg::IsFixed. + SmallVector<bool, 4> ArgIsFixed; + +public: + SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, + SmallVectorImpl<CCValAssign> &locs, LLVMContext &C) + : CCState(CC, isVarArg, MF, locs, C) {} + + void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, + CCAssignFn Fn) { + // Formal arguments are always fixed. + ArgIsFixed.clear(); + for (unsigned i = 0; i < Ins.size(); ++i) + ArgIsFixed.push_back(true); + + CCState::AnalyzeFormalArguments(Ins, Fn); + } + + void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, + CCAssignFn Fn) { + // Record whether the call operand was a fixed argument. + ArgIsFixed.clear(); + for (unsigned i = 0; i < Outs.size(); ++i) + ArgIsFixed.push_back(Outs[i].IsFixed); + + CCState::AnalyzeCallOperands(Outs, Fn); + } + + // This version of AnalyzeCallOperands in the base class is not usable + // since we must provide a means of accessing ISD::OutputArg::IsFixed. 
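// Illustrative only: the CCIfFixed class added in SystemZCallingConv.td
// reaches this state object as
//   static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)
// so a named v4i32 argument is assigned to one of V24-V31, while the same
// type passed through the "..." of a varargs call falls through to a
// 16-byte stack slot.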
+ void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs, + SmallVectorImpl<ISD::ArgFlagsTy> &Flags, + CCAssignFn Fn) = delete; + + bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; } +}; + } // end namespace llvm #endif diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index fb0d1d8a3fe..f5eb32c0a60 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -12,6 +12,15 @@ class CCIfExtend<CCAction A> : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>; +class CCIfSubtarget<string F, CCAction A> + : CCIf<!strconcat("static_cast<const SystemZSubtarget&>" + "(State.getMachineFunction().getSubtarget()).", F), + A>; + +// Match if this specific argument is a fixed (i.e. named) argument. +class CCIfFixed<CCAction A> + : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>; + //===----------------------------------------------------------------------===// // z/Linux return value calling convention //===----------------------------------------------------------------------===// @@ -31,7 +40,12 @@ def RetCC_SystemZ : CallingConv<[ // doesn't care about the ABI. All floating-point argument registers // are call-clobbered, so we can use all of them here. CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>, - CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>> + CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>, + + // Similarly for vectors, with V24 being the ABI-compliant choice. + CCIfSubtarget<"hasVector()", + CCIfType<[v16i8, v8i16, v4i32, v2i64], + CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>> // ABI-compliant code returns long double by reference, but that conversion // is left to higher-level code. Perhaps we could add an f128 definition @@ -60,6 +74,17 @@ def CC_SystemZ : CallingConv<[ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>, CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>, + // The first 8 named vector arguments are passed in V24-V31. + CCIfSubtarget<"hasVector()", + CCIfType<[v16i8, v8i16, v4i32, v2i64], + CCIfFixed<CCAssignToReg<[V24, V26, V28, V30, + V25, V27, V29, V31]>>>>, + + // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots. + CCIfSubtarget<"hasVector()", + CCIfType<[v16i8, v8i16, v4i32, v2i64], + CCAssignToStack<16, 8>>>, + // Other arguments are passed in 8-byte-aligned 8-byte stack slots. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> ]>; diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 80a98772db7..63992936813 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -255,6 +255,13 @@ class SystemZDAGToDAGISel : public SelectionDAGISel { Addr, Base, Disp, Index); } + // Try to match Addr as an address with a base, 12-bit displacement + // and index, where the index is element Elem of a vector. + // Return true on success, storing the base, displacement and vector + // in Base, Disp and Index respectively. + bool selectBDVAddr12Only(SDValue Addr, SDValue Elem, SDValue &Base, + SDValue &Disp, SDValue &Index) const; + // Check whether (or Op (and X InsertMask)) is effectively an insertion // of X into bits InsertMask of some Y != Op. Return true if so and // set Op to that Y. 
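// Illustrative example of what selectBDVAddr12Only (declared above) accepts:
// a load or store whose address computes to
//   (add %base, (zext (extract_vector_elt %vindex, %elem)))
// matches with Base = %base, Disp = 0 and Index = %vindex, which is exactly
// the base + displacement + vector-index form that VGEF/VGEG and
// VSCEF/VSCEG expect.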
@@ -292,6 +299,12 @@ class SystemZDAGToDAGISel : public SelectionDAGISel { SDNode *splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0, uint64_t UpperVal, uint64_t LowerVal); + // Try to use gather instruction Opcode to implement vector insertion N. + SDNode *tryGather(SDNode *N, unsigned Opcode); + + // Try to use scatter instruction Opcode to implement store Store. + SDNode *tryScatter(StoreSDNode *Store, unsigned Opcode); + // Return true if Load and Store are loads and stores of the same size // and are guaranteed not to overlap. Such operations can be implemented // using block (SS-format) instructions. @@ -645,6 +658,30 @@ bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form, return true; } +bool SystemZDAGToDAGISel::selectBDVAddr12Only(SDValue Addr, SDValue Elem, + SDValue &Base, + SDValue &Disp, + SDValue &Index) const { + SDValue Regs[2]; + if (selectBDXAddr12Only(Addr, Regs[0], Disp, Regs[1]) && + Regs[0].getNode() && Regs[1].getNode()) { + for (unsigned int I = 0; I < 2; ++I) { + Base = Regs[I]; + Index = Regs[1 - I]; + // We can't tell here whether the index vector has the right type + // for the access; the caller needs to do that instead. + if (Index.getOpcode() == ISD::ZERO_EXTEND) + Index = Index.getOperand(0); + if (Index.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Index.getOperand(1) == Elem) { + Index = Index.getOperand(0); + return true; + } + } + } + return false; +} + bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op, uint64_t InsertMask) const { // We're only interested in cases where the insertion is into some operand @@ -984,6 +1021,71 @@ SDNode *SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node, return Or.getNode(); } +SDNode *SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) { + SDValue ElemV = N->getOperand(2); + auto *ElemN = dyn_cast<ConstantSDNode>(ElemV); + if (!ElemN) + return 0; + + unsigned Elem = ElemN->getZExtValue(); + EVT VT = N->getValueType(0); + if (Elem >= VT.getVectorNumElements()) + return 0; + + auto *Load = dyn_cast<LoadSDNode>(N->getOperand(1)); + if (!Load || !Load->hasOneUse()) + return 0; + if (Load->getMemoryVT().getSizeInBits() != + Load->getValueType(0).getSizeInBits()) + return 0; + + SDValue Base, Disp, Index; + if (!selectBDVAddr12Only(Load->getBasePtr(), ElemV, Base, Disp, Index) || + Index.getValueType() != VT.changeVectorElementTypeToInteger()) + return 0; + + SDLoc DL(Load); + SDValue Ops[] = { + N->getOperand(0), Base, Disp, Index, + CurDAG->getTargetConstant(Elem, DL, MVT::i32), Load->getChain() + }; + SDNode *Res = CurDAG->getMachineNode(Opcode, DL, VT, MVT::Other, Ops); + ReplaceUses(SDValue(Load, 1), SDValue(Res, 1)); + return Res; +} + +SDNode *SystemZDAGToDAGISel::tryScatter(StoreSDNode *Store, unsigned Opcode) { + SDValue Value = Store->getValue(); + if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return 0; + if (Store->getMemoryVT().getSizeInBits() != + Value.getValueType().getSizeInBits()) + return 0; + + SDValue ElemV = Value.getOperand(1); + auto *ElemN = dyn_cast<ConstantSDNode>(ElemV); + if (!ElemN) + return 0; + + SDValue Vec = Value.getOperand(0); + EVT VT = Vec.getValueType(); + unsigned Elem = ElemN->getZExtValue(); + if (Elem >= VT.getVectorNumElements()) + return 0; + + SDValue Base, Disp, Index; + if (!selectBDVAddr12Only(Store->getBasePtr(), ElemV, Base, Disp, Index) || + Index.getValueType() != VT.changeVectorElementTypeToInteger()) + return 0; + + SDLoc DL(Store); + SDValue Ops[] = { + Vec, Base, Disp, Index, 
CurDAG->getTargetConstant(Elem, DL, MVT::i32), + Store->getChain() + }; + return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops); +} + bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store, LoadSDNode *Load) const { // Check that the two memory operands have the same size. @@ -1120,6 +1222,26 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { } break; } + + case ISD::INSERT_VECTOR_ELT: { + EVT VT = Node->getValueType(0); + unsigned ElemBitSize = VT.getVectorElementType().getSizeInBits(); + if (ElemBitSize == 32) + ResNode = tryGather(Node, SystemZ::VGEF); + else if (ElemBitSize == 64) + ResNode = tryGather(Node, SystemZ::VGEG); + break; + } + + case ISD::STORE: { + auto *Store = cast<StoreSDNode>(Node); + unsigned ElemBitSize = Store->getValue().getValueType().getSizeInBits(); + if (ElemBitSize == 32) + ResNode = tryScatter(Store, SystemZ::VSCEF); + else if (ElemBitSize == 64) + ResNode = tryScatter(Store, SystemZ::VSCEG); + break; + } } // Select the default instruction diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 44bc8acb6d8..ddcb792ee09 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -96,6 +96,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); + if (Subtarget.hasVector()) { + addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass); + addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass); + addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass); + addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass); + } + // Compute derived properties from the register classes computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -111,7 +118,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, setSchedulingPreference(Sched::RegPressure); setBooleanContents(ZeroOrOneBooleanContent); - setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // Instructions are strings of 2-byte aligned 2-byte values. setMinFunctionAlignment(2); @@ -250,6 +257,76 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, // Handle prefetches with PFD or PFDRL. setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + for (MVT VT : MVT::vector_valuetypes()) { + // Assume by default that all vector operations need to be expanded. + for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode) + if (getOperationAction(Opcode, VT) == Legal) + setOperationAction(Opcode, VT, Expand); + + // Likewise all truncating stores and extending loads. + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(VT, InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + } + + if (isTypeLegal(VT)) { + // These operations are legal for anything that can be stored in a + // vector register, even if there is no native support for the format + // as such. + setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::BITCAST, VT, Legal); + setOperationAction(ISD::UNDEF, VT, Legal); + + // Likewise, except that we need to replace the nodes with something + // more specific. 
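// For instance, a constant v16i8 BUILD_VECTOR is turned into a single
// BYTE_MASK (VGBM) or REPLICATE (VREPI) node by lowerBUILD_VECTOR below,
// rather than being left as a generic node.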
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + } + } + + // Handle integer vector types. + for (MVT VT : MVT::integer_vector_valuetypes()) { + if (isTypeLegal(VT)) { + // These operations have direct equivalents. + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal); + setOperationAction(ISD::ADD, VT, Legal); + setOperationAction(ISD::SUB, VT, Legal); + if (VT != MVT::v2i64) + setOperationAction(ISD::MUL, VT, Legal); + setOperationAction(ISD::AND, VT, Legal); + setOperationAction(ISD::OR, VT, Legal); + setOperationAction(ISD::XOR, VT, Legal); + setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Legal); + setOperationAction(ISD::CTLZ, VT, Legal); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); + + // Convert a GPR scalar to a vector by inserting it into element 0. + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + + // Detect shifts by a scalar amount and convert them into + // V*_BY_SCALAR. + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + + // At present ROTL isn't matched by DAGCombiner. ROTR should be + // converted into ROTL. + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + + // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands + // and inverting the result as necessary. + setOperationAction(ISD::SETCC, VT, Custom); + } + } + // Handle floating-point types. for (unsigned I = MVT::FIRST_FP_VALUETYPE; I <= MVT::LAST_FP_VALUETYPE; @@ -304,6 +381,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, // Codes for which we want to perform some z-specific combinations. setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); // Handle intrinsics. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -703,7 +782,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ); unsigned NumFixedGPRs = 0; @@ -735,6 +814,12 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, NumFixedFPRs += 1; RC = &SystemZ::FP64BitRegClass; break; + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v2i64: + RC = &SystemZ::VR128BitRegClass; + break; } unsigned VReg = MRI.createVirtualRegister(RC); @@ -842,7 +927,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Analyze the operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ); // We don't support GuaranteedTailCallOpt, only automatically-detected @@ -1809,12 +1894,78 @@ static SDValue emitSETCC(SelectionDAG &DAG, SDLoc DL, SDValue Glue, return Result; } +// Return the SystemZISD vector comparison operation for CC, or 0 if it cannot +// be done directly. 
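// A few illustrative cases of how the helpers below combine:
//   SETEQ -> VICMPE directly.
//   SETNE -> no direct form; getVectorComparisonOrInvert inverts it to
//            SETEQ, so lowerVectorSETCC emits VICMPE and XORs the result
//            with an all-ones mask.
//   SETLT -> neither form works; lowerVectorSETCC swaps the operands and
//            retries as SETGT, emitting VICMPH with reversed operands.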
+static unsigned getVectorComparison(ISD::CondCode CC) { + switch (CC) { + case ISD::SETEQ: + return SystemZISD::VICMPE; + + case ISD::SETGT: + return SystemZISD::VICMPH; + + case ISD::SETUGT: + return SystemZISD::VICMPHL; + + default: + return 0; + } +} + +// Return the SystemZISD vector comparison operation for CC or its inverse, +// or 0 if neither can be done directly. Indicate in Invert whether the +// result is for the inverse of CC. +static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool &Invert) { + if (unsigned Opcode = getVectorComparison(CC)) { + Invert = false; + return Opcode; + } + + CC = ISD::getSetCCInverse(CC, true); + if (unsigned Opcode = getVectorComparison(CC)) { + Invert = true; + return Opcode; + } + + return 0; +} + +// Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing +// an integer mask of type VT. +static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT, + ISD::CondCode CC, SDValue CmpOp0, + SDValue CmpOp1) { + bool Invert = false; + SDValue Cmp; + // It doesn't really matter whether we try the inversion or the swap first, + // since there are no cases where both work. + if (unsigned Opcode = getVectorComparisonOrInvert(CC, Invert)) + Cmp = DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1); + else { + CC = ISD::getSetCCSwappedOperands(CC); + if (unsigned Opcode = getVectorComparisonOrInvert(CC, Invert)) + Cmp = DAG.getNode(Opcode, DL, VT, CmpOp1, CmpOp0); + else + llvm_unreachable("Unhandled comparison"); + } + if (Invert) { + SDValue Mask = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, + DAG.getConstant(65535, DL, MVT::i32)); + Mask = DAG.getNode(ISD::BITCAST, DL, VT, Mask); + Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask); + } + return Cmp; +} + SDValue SystemZTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue CmpOp0 = Op.getOperand(0); SDValue CmpOp1 = Op.getOperand(1); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); SDLoc DL(Op); + EVT VT = Op.getValueType(); + if (VT.isVector()) + return lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1); Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL)); SDValue Glue = emitCmp(DAG, DL, C); @@ -2146,6 +2297,13 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op, EVT InVT = In.getValueType(); EVT ResVT = Op.getValueType(); + // Convert loads directly. This is normally done by DAGCombiner, + // but we need this case for bitcasts that are created during lowering + // and which are then lowered themselves. + if (auto *LoadN = dyn_cast<LoadSDNode>(In)) + return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(), + LoadN->getMemOperand()); + if (InVT == MVT::i32 && ResVT == MVT::f32) { SDValue In64; if (Subtarget.hasHighWord()) { @@ -2421,11 +2579,44 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const { SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - int64_t OrigBitSize = VT.getSizeInBits(); SDLoc DL(Op); + Op = Op.getOperand(0); + + // Handle vector types via VPOPCT. 
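// Worked example for v8i16, matching the code below: VPOPCT produces a
// per-byte bit count, and for each halfword [a, b] of byte counts:
//   x << 8       = [b, 0]
//   x + (x << 8) = [a+b, b]
//   sum >> 8     = [0, a+b]
// leaving the halfword's population count in its low byte.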
+ if (VT.isVector()) { + Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op); + Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::v16i8, Op); + switch (VT.getVectorElementType().getSizeInBits()) { + case 8: + break; + case 16: { + Op = DAG.getNode(ISD::BITCAST, DL, VT, Op); + SDValue Shift = DAG.getConstant(8, DL, MVT::i32); + SDValue Tmp = DAG.getNode(SystemZISD::VSHL_BY_SCALAR, DL, VT, Op, Shift); + Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp); + Op = DAG.getNode(SystemZISD::VSRL_BY_SCALAR, DL, VT, Op, Shift); + break; + } + case 32: { + SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, + DAG.getConstant(0, DL, MVT::i32)); + Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp); + break; + } + case 64: { + SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, + DAG.getConstant(0, DL, MVT::i32)); + Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp); + Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp); + break; + } + default: + llvm_unreachable("Unexpected type"); + } + return Op; + } // Get the known-zero mask for the operand. - Op = Op.getOperand(0); APInt KnownZero, KnownOne; DAG.computeKnownBits(Op, KnownZero, KnownOne); unsigned NumSignificantBits = (~KnownZero).getActiveBits(); @@ -2433,6 +2624,7 @@ SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op, return DAG.getConstant(0, DL, VT); // Skip known-zero high parts of the operand. + int64_t OrigBitSize = VT.getSizeInBits(); int64_t BitSize = (int64_t)1 << Log2_32_Ceil(NumSignificantBits); BitSize = std::min(BitSize, OrigBitSize); @@ -2698,6 +2890,837 @@ SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, return SDValue(); } +namespace { +// Says that SystemZISD operation Opcode can be used to perform the equivalent +// of a VPERM with permute vector Bytes. If Opcode takes three operands, +// Operand is the constant third operand, otherwise it is the number of +// bytes in each element of the result. 
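// For example, in the VMRHG entry below Bytes is { 0..7, 16..23 }: byte
// selectors 0-15 refer to operand 0 and 16-31 to operand 1, so the entry
// describes taking the high doubleword of each operand, which is what
// VMRHG with 8-byte elements does.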
+struct Permute { + unsigned Opcode; + unsigned Operand; + unsigned char Bytes[SystemZ::VectorBytes]; +}; +} + +static const Permute PermuteForms[] = { + // VMRHG + { SystemZISD::MERGE_HIGH, 8, + { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 } }, + // VMRHF + { SystemZISD::MERGE_HIGH, 4, + { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 } }, + // VMRHH + { SystemZISD::MERGE_HIGH, 2, + { 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } }, + // VMRHB + { SystemZISD::MERGE_HIGH, 1, + { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 } }, + // VMRLG + { SystemZISD::MERGE_LOW, 8, + { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 } }, + // VMRLF + { SystemZISD::MERGE_LOW, 4, + { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } }, + // VMRLH + { SystemZISD::MERGE_LOW, 2, + { 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } }, + // VMRLB + { SystemZISD::MERGE_LOW, 1, + { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 } }, + // VPKG + { SystemZISD::PACK, 4, + { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 } }, + // VPKF + { SystemZISD::PACK, 2, + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } }, + // VPKH + { SystemZISD::PACK, 1, + { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } }, + // VPDI V1, V2, 4 (low half of V1, high half of V2) + { SystemZISD::PERMUTE_DWORDS, 4, + { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 } }, + // VPDI V1, V2, 1 (high half of V1, low half of V2) + { SystemZISD::PERMUTE_DWORDS, 1, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 } } +}; + +// Called after matching a vector shuffle against a particular pattern. +// Both the original shuffle and the pattern have two vector operands. +// OpNos[0] is the operand of the original shuffle that should be used for +// operand 0 of the pattern, or -1 if operand 0 of the pattern can be anything. +// OpNos[1] is the same for operand 1 of the pattern. Resolve these -1s and +// set OpNo0 and OpNo1 to the shuffle operands that should actually be used +// for operands 0 and 1 of the pattern. +static bool chooseShuffleOpNos(int *OpNos, unsigned &OpNo0, unsigned &OpNo1) { + if (OpNos[0] < 0) { + if (OpNos[1] < 0) + return false; + OpNo0 = OpNo1 = OpNos[1]; + } else if (OpNos[1] < 0) { + OpNo0 = OpNo1 = OpNos[0]; + } else { + OpNo0 = OpNos[0]; + OpNo1 = OpNos[1]; + } + return true; +} + +// Bytes is a VPERM-like permute vector, except that -1 is used for +// undefined bytes. Return true if the VPERM can be implemented using P. +// When returning true set OpNo0 to the VPERM operand that should be +// used for operand 0 of P and likewise OpNo1 for operand 1 of P. +// +// For example, if swapping the VPERM operands allows P to match, OpNo0 +// will be 1 and OpNo1 will be 0. If instead Bytes only refers to one +// operand, but rewriting it to use two duplicated operands allows it to +// match P, then OpNo0 and OpNo1 will be the same. +static bool matchPermute(const SmallVectorImpl<int> &Bytes, const Permute &P, + unsigned &OpNo0, unsigned &OpNo1) { + int OpNos[] = { -1, -1 }; + for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) { + int Elt = Bytes[I]; + if (Elt >= 0) { + // Make sure that the two permute vectors use the same suboperand + // byte number. Only the operand numbers (the high bits) are + // allowed to differ. 
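// For example, Elt == 19 is compatible with P.Bytes[I] == 3: both select
// byte 3 within their operand (19 ^ 3 == 16, which the low-four-bit mask
// ignores), and only the operand number differs (1 versus 0).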
+ if ((Elt ^ P.Bytes[I]) & (SystemZ::VectorBytes - 1)) + return false; + int ModelOpNo = P.Bytes[I] / SystemZ::VectorBytes; + int RealOpNo = unsigned(Elt) / SystemZ::VectorBytes; + // Make sure that the operand mappings are consistent with previous + // elements. + if (OpNos[ModelOpNo] == 1 - RealOpNo) + return false; + OpNos[ModelOpNo] = RealOpNo; + } + } + return chooseShuffleOpNos(OpNos, OpNo0, OpNo1); +} + +// As above, but search for a matching permute. +static const Permute *matchPermute(const SmallVectorImpl<int> &Bytes, + unsigned &OpNo0, unsigned &OpNo1) { + for (auto &P : PermuteForms) + if (matchPermute(Bytes, P, OpNo0, OpNo1)) + return &P; + return nullptr; +} + +// Bytes is a VPERM-like permute vector, except that -1 is used for +// undefined bytes. This permute is an operand of an outer permute. +// See whether redistributing the -1 bytes gives a shuffle that can be +// implemented using P. If so, set Transform to a VPERM-like permute vector +// that, when applied to the result of P, gives the original permute in Bytes. +static bool matchDoublePermute(const SmallVectorImpl<int> &Bytes, + const Permute &P, + SmallVectorImpl<int> &Transform) { + unsigned To = 0; + for (unsigned From = 0; From < SystemZ::VectorBytes; ++From) { + int Elt = Bytes[From]; + if (Elt < 0) + // Byte number From of the result is undefined. + Transform[From] = -1; + else { + while (P.Bytes[To] != Elt) { + To += 1; + if (To == SystemZ::VectorBytes) + return false; + } + Transform[From] = To; + } + } + return true; +} + +// As above, but search for a matching permute. +static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes, + SmallVectorImpl<int> &Transform) { + for (auto &P : PermuteForms) + if (matchDoublePermute(Bytes, P, Transform)) + return &P; + return nullptr; +} + +// Convert the mask of the given VECTOR_SHUFFLE into a byte-level mask, +// as if it had type vNi8. +static void getVPermMask(ShuffleVectorSDNode *VSN, + SmallVectorImpl<int> &Bytes) { + EVT VT = VSN->getValueType(0); + unsigned NumElements = VT.getVectorNumElements(); + unsigned BytesPerElement = VT.getVectorElementType().getStoreSize(); + Bytes.resize(NumElements * BytesPerElement, -1); + for (unsigned I = 0; I < NumElements; ++I) { + int Index = VSN->getMaskElt(I); + if (Index >= 0) + for (unsigned J = 0; J < BytesPerElement; ++J) + Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J; + } +} + +// Bytes is a VPERM-like permute vector, except that -1 is used for +// undefined bytes. See whether bytes [Start, Start + BytesPerElement) of +// the result come from a contiguous sequence of bytes from one input. +// Set Base to the selector for the first byte if so. +static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start, + unsigned BytesPerElement, int &Base) { + Base = -1; + for (unsigned I = 0; I < BytesPerElement; ++I) { + if (Bytes[Start + I] >= 0) { + unsigned Elem = Bytes[Start + I]; + if (Base < 0) { + Base = Elem - I; + // Make sure the bytes would come from one input operand. + if (unsigned(Base) % Bytes.size() + BytesPerElement > Bytes.size()) + return false; + } else if (unsigned(Base) != Elem - I) + return false; + } + } + return true; +} + +// Bytes is a VPERM-like permute vector, except that -1 is used for +// undefined bytes. Return true if it can be performed using VSLDI. +// When returning true, set StartIndex to the shift amount and OpNo0 +// and OpNo1 to the VPERM operands that should be used as the first +// and second shift operand respectively. 
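// For example, Bytes == { 1, 2, ..., 15, 16 } is a double shift left by
// one byte: StartIndex == 1, with bytes 1-15 taken from the first operand
// and byte 16 (byte 0 of the second operand) filling the last position.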
+static bool isShlDoublePermute(const SmallVectorImpl<int> &Bytes, + unsigned &StartIndex, unsigned &OpNo0, + unsigned &OpNo1) { + int OpNos[] = { -1, -1 }; + int Shift = -1; + for (unsigned I = 0; I < 16; ++I) { + int Index = Bytes[I]; + if (Index >= 0) { + int ExpectedShift = (Index - I) % SystemZ::VectorBytes; + int ModelOpNo = unsigned(ExpectedShift + I) / SystemZ::VectorBytes; + int RealOpNo = unsigned(Index) / SystemZ::VectorBytes; + if (Shift < 0) + Shift = ExpectedShift; + else if (Shift != ExpectedShift) + return false; + // Make sure that the operand mappings are consistent with previous + // elements. + if (OpNos[ModelOpNo] == 1 - RealOpNo) + return false; + OpNos[ModelOpNo] = RealOpNo; + } + } + StartIndex = Shift; + return chooseShuffleOpNos(OpNos, OpNo0, OpNo1); +} + +// Create a node that performs P on operands Op0 and Op1, casting the +// operands to the appropriate type. The type of the result is determined by P. +static SDValue getPermuteNode(SelectionDAG &DAG, SDLoc DL, + const Permute &P, SDValue Op0, SDValue Op1) { + // VPDI (PERMUTE_DWORDS) always operates on v2i64s. The input + // elements of a PACK are twice as wide as the outputs. + unsigned InBytes = (P.Opcode == SystemZISD::PERMUTE_DWORDS ? 8 : + P.Opcode == SystemZISD::PACK ? P.Operand * 2 : + P.Operand); + // Cast both operands to the appropriate type. + MVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBytes * 8), + SystemZ::VectorBytes / InBytes); + Op0 = DAG.getNode(ISD::BITCAST, DL, InVT, Op0); + Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1); + SDValue Op; + if (P.Opcode == SystemZISD::PERMUTE_DWORDS) { + SDValue Op2 = DAG.getConstant(P.Operand, DL, MVT::i32); + Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2); + } else if (P.Opcode == SystemZISD::PACK) { + MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8), + SystemZ::VectorBytes / P.Operand); + Op = DAG.getNode(SystemZISD::PACK, DL, OutVT, Op0, Op1); + } else { + Op = DAG.getNode(P.Opcode, DL, InVT, Op0, Op1); + } + return Op; +} + +// Bytes is a VPERM-like permute vector, except that -1 is used for +// undefined bytes. Implement it on operands Ops[0] and Ops[1] using +// VSLDI or VPERM. +static SDValue getGeneralPermuteNode(SelectionDAG &DAG, SDLoc DL, SDValue *Ops, + const SmallVectorImpl<int> &Bytes) { + for (unsigned I = 0; I < 2; ++I) + Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]); + + // First see whether VSLDI can be used. + unsigned StartIndex, OpNo0, OpNo1; + if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1)) + return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0], + Ops[OpNo1], DAG.getConstant(StartIndex, DL, MVT::i32)); + + // Fall back on VPERM. Construct an SDNode for the permute vector. + SDValue IndexNodes[SystemZ::VectorBytes]; + for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) + if (Bytes[I] >= 0) + IndexNodes[I] = DAG.getConstant(Bytes[I], DL, MVT::i32); + else + IndexNodes[I] = DAG.getUNDEF(MVT::i32); + SDValue Op2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, IndexNodes); + return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2); +} + +namespace { +// Describes a general N-operand vector shuffle. +struct GeneralShuffle { + GeneralShuffle(EVT vt) : VT(vt) {} + void addUndef(); + void add(SDValue, unsigned); + SDValue getNode(SelectionDAG &, SDLoc); + + // The operands of the shuffle. + SmallVector<SDValue, SystemZ::VectorBytes> Ops; + + // Index I is -1 if byte I of the result is undefined. 
Otherwise the + // result comes from byte Bytes[I] % SystemZ::VectorBytes of operand + // Bytes[I] / SystemZ::VectorBytes. + SmallVector<int, SystemZ::VectorBytes> Bytes; + + // The type of the shuffle result. + EVT VT; +}; +} + +// Add an extra undefined element to the shuffle. +void GeneralShuffle::addUndef() { + unsigned BytesPerElement = VT.getVectorElementType().getStoreSize(); + for (unsigned I = 0; I < BytesPerElement; ++I) + Bytes.push_back(-1); +} + +// Add an extra element to the shuffle, taking it from element Elem of Op. +// A null Op indicates a vector input whose value will be calculated later; +// there is at most one such input per shuffle and it always has the same +// type as the result. +void GeneralShuffle::add(SDValue Op, unsigned Elem) { + unsigned BytesPerElement = VT.getVectorElementType().getStoreSize(); + + // The source vector can have wider elements than the result, + // either through an explicit TRUNCATE or because of type legalization. + // We want the least significant part. + EVT FromVT = Op.getNode() ? Op.getValueType() : VT; + unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize(); + assert(FromBytesPerElement >= BytesPerElement && + "Invalid EXTRACT_VECTOR_ELT"); + unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes + + (FromBytesPerElement - BytesPerElement)); + + // Look through things like shuffles and bitcasts. + while (Op.getNode()) { + if (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + else if (Op.getOpcode() == ISD::VECTOR_SHUFFLE && Op.hasOneUse()) { + // See whether the bytes we need come from a contiguous part of one + // operand. + SmallVector<int, SystemZ::VectorBytes> OpBytes; + getVPermMask(cast<ShuffleVectorSDNode>(Op), OpBytes); + int NewByte; + if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte)) + break; + if (NewByte < 0) { + addUndef(); + return; + } + Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes); + Byte = unsigned(NewByte) % SystemZ::VectorBytes; + } else if (Op.getOpcode() == ISD::UNDEF) { + addUndef(); + return; + } else + break; + } + + // Make sure that the source of the extraction is in Ops. + unsigned OpNo = 0; + for (; OpNo < Ops.size(); ++OpNo) + if (Ops[OpNo] == Op) + break; + if (OpNo == Ops.size()) + Ops.push_back(Op); + + // Add the element to Bytes. + unsigned Base = OpNo * SystemZ::VectorBytes + Byte; + for (unsigned I = 0; I < BytesPerElement; ++I) + Bytes.push_back(Base + I); +} + +// Return SDNodes for the completed shuffle. +SDValue GeneralShuffle::getNode(SelectionDAG &DAG, SDLoc DL) { + assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector"); + + if (Ops.size() == 0) + return DAG.getUNDEF(VT); + + // Make sure that there are at least two shuffle operands. + if (Ops.size() == 1) + Ops.push_back(DAG.getUNDEF(MVT::v16i8)); + + // Create a tree of shuffles, deferring root node until after the loop. + // Try to redistribute the undefined elements of non-root nodes so that + // the non-root shuffles match something like a pack or merge, then adjust + // the parent node's permute vector to compensate for the new order. + // Among other things, this copes with vectors like <2 x i16> that were + // padded with undefined elements during type legalization. + // + // In the best case this redistribution will lead to the whole tree + // using packs and merges. It should rarely be a loss in other cases. 
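// For example, with four operands the loop below first builds
//   Ops[0] = shuffle(Ops[0], Ops[1])
//   Ops[2] = shuffle(Ops[2], Ops[3])
// and the code after the loop then moves Ops[2] into Ops[1] so that the
// root shuffle combines the two intermediate results.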
+ unsigned Stride = 1; + for (; Stride * 2 < Ops.size(); Stride *= 2) { + for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) { + SDValue SubOps[] = { Ops[I], Ops[I + Stride] }; + + // Create a mask for just these two operands. + SmallVector<int, SystemZ::VectorBytes> NewBytes(SystemZ::VectorBytes); + for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) { + unsigned OpNo = unsigned(Bytes[J]) / SystemZ::VectorBytes; + unsigned Byte = unsigned(Bytes[J]) % SystemZ::VectorBytes; + if (OpNo == I) + NewBytes[J] = Byte; + else if (OpNo == I + Stride) + NewBytes[J] = SystemZ::VectorBytes + Byte; + else + NewBytes[J] = -1; + } + // See if it would be better to reorganize NewMask to avoid using VPERM. + SmallVector<int, SystemZ::VectorBytes> NewBytesMap(SystemZ::VectorBytes); + if (const Permute *P = matchDoublePermute(NewBytes, NewBytesMap)) { + Ops[I] = getPermuteNode(DAG, DL, *P, SubOps[0], SubOps[1]); + // Applying NewBytesMap to Ops[I] gets back to NewBytes. + for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) { + if (NewBytes[J] >= 0) { + assert(unsigned(NewBytesMap[J]) < SystemZ::VectorBytes && + "Invalid double permute"); + Bytes[J] = I * SystemZ::VectorBytes + NewBytesMap[J]; + } else + assert(NewBytesMap[J] < 0 && "Invalid double permute"); + } + } else { + // Just use NewBytes on the operands. + Ops[I] = getGeneralPermuteNode(DAG, DL, SubOps, NewBytes); + for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) + if (NewBytes[J] >= 0) + Bytes[J] = I * SystemZ::VectorBytes + J; + } + } + } + + // Now we just have 2 inputs. Put the second operand in Ops[1]. + if (Stride > 1) { + Ops[1] = Ops[Stride]; + for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) + if (Bytes[I] >= int(SystemZ::VectorBytes)) + Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes; + } + + // Look for an instruction that can do the permute without resorting + // to VPERM. + unsigned OpNo0, OpNo1; + SDValue Op; + if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1)) + Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]); + else + Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes); + return DAG.getNode(ISD::BITCAST, DL, VT, Op); +} + +// Extend GPR scalars Op0 and Op1 to doublewords and return a v2i64 +// vector for them. +static SDValue joinDwords(SelectionDAG &DAG, SDLoc DL, SDValue Op0, + SDValue Op1) { + if (Op0.getOpcode() == ISD::UNDEF && Op1.getOpcode() == ISD::UNDEF) + return DAG.getUNDEF(MVT::v2i64); + // If one of the two inputs is undefined then replicate the other one, + // in order to avoid using another register unnecessarily. + if (Op0.getOpcode() == ISD::UNDEF) + Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1); + else if (Op1.getOpcode() == ISD::UNDEF) + Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0); + else { + Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0); + Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1); + } + return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Op0, Op1); +} + +// Try to represent constant BUILD_VECTOR node BVN using a +// SystemZISD::BYTE_MASK-style mask. Store the mask value in Mask +// on success. 
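// For example (illustrative), the v4i32 constant < -1, 0, 0, 0 > yields
// Mask == 0xf000: element 0 occupies bytes 0-3 of the result, which
// correspond to mask bits 15-12 under the BYTE_MASK convention that bit
// 15-N of the operand fills byte N of the result.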
+static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) { + EVT ElemVT = BVN->getValueType(0).getVectorElementType(); + unsigned BytesPerElement = ElemVT.getStoreSize(); + for (unsigned I = 0, E = BVN->getNumOperands(); I != E; ++I) { + SDValue Op = BVN->getOperand(I); + if (Op.getOpcode() != ISD::UNDEF) { + uint64_t Value; + if (Op.getOpcode() == ISD::Constant) + Value = dyn_cast<ConstantSDNode>(Op)->getZExtValue(); + else if (Op.getOpcode() == ISD::ConstantFP) + Value = (dyn_cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt() + .getZExtValue()); + else + return false; + for (unsigned J = 0; J < BytesPerElement; ++J) { + uint64_t Byte = (Value >> (J * 8)) & 0xff; + if (Byte == 0xff) + Mask |= 1 << ((E - I - 1) * BytesPerElement + J); + else if (Byte != 0) + return false; + } + } + } + return true; +} + +// Try to load a vector constant in which BitsPerElement-bit value Value +// is replicated to fill the vector. VT is the type of the resulting +// constant, which may have elements of a different size from BitsPerElement. +// Return the SDValue of the constant on success, otherwise return +// an empty value. +static SDValue tryBuildVectorReplicate(SelectionDAG &DAG, + const SystemZInstrInfo *TII, + SDLoc DL, EVT VT, uint64_t Value, + unsigned BitsPerElement) { + // Signed 16-bit values can be replicated using VREPI. + int64_t SignedValue = SignExtend64(Value, BitsPerElement); + if (isInt<16>(SignedValue)) { + MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), + SystemZ::VectorBits / BitsPerElement); + SDValue Op = DAG.getNode(SystemZISD::REPLICATE, DL, VecVT, + DAG.getConstant(SignedValue, DL, MVT::i32)); + return DAG.getNode(ISD::BITCAST, DL, VT, Op); + } + // See whether rotating the constant left some N places gives a value that + // is one less than a power of 2 (i.e. all zeros followed by all ones). + // If so we can use VGM. + unsigned Start, End; + if (TII->isRxSBGMask(Value, BitsPerElement, Start, End)) { + // isRxSBGMask returns the bit numbers for a full 64-bit value, + // with 0 denoting 1 << 63 and 63 denoting 1. Convert them to + // bit numbers for an BitsPerElement value, so that 0 denotes + // 1 << (BitsPerElement-1). + Start -= 64 - BitsPerElement; + End -= 64 - BitsPerElement; + MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), + SystemZ::VectorBits / BitsPerElement); + SDValue Op = DAG.getNode(SystemZISD::ROTATE_MASK, DL, VecVT, + DAG.getConstant(Start, DL, MVT::i32), + DAG.getConstant(End, DL, MVT::i32)); + return DAG.getNode(ISD::BITCAST, DL, VT, Op); + } + return SDValue(); +} + +// If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually +// better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for +// the non-EXTRACT_VECTOR_ELT elements. See if the given BUILD_VECTOR +// would benefit from this representation and return it if so. +static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, + BuildVectorSDNode *BVN) { + EVT VT = BVN->getValueType(0); + unsigned NumElements = VT.getVectorNumElements(); + + // Represent the BUILD_VECTOR as an N-operand VECTOR_SHUFFLE-like operation + // on byte vectors. If there are non-EXTRACT_VECTOR_ELT elements that still + // need a BUILD_VECTOR, add an additional placeholder operand for that + // BUILD_VECTOR and store its operands in ResidueOps. 
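// For example (hypothetical values), the v4i32 BUILD_VECTOR
//   (extract_vector_elt %A, 0), %x, (extract_vector_elt %A, 2), undef
// becomes a shuffle of %A and a residual BUILD_VECTOR
// (%x, undef, undef, undef), with the placeholder operand filled in
// afterwards.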
+ GeneralShuffle GS(VT); + SmallVector<SDValue, SystemZ::VectorBytes> ResidueOps; + bool FoundOne = false; + for (unsigned I = 0; I < NumElements; ++I) { + SDValue Op = BVN->getOperand(I); + if (Op.getOpcode() == ISD::TRUNCATE) + Op = Op.getOperand(0); + if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Op.getOperand(1).getOpcode() == ISD::Constant) { + unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + GS.add(Op.getOperand(0), Elem); + FoundOne = true; + } else if (Op.getOpcode() == ISD::UNDEF) { + GS.addUndef(); + } else { + GS.add(SDValue(), ResidueOps.size()); + ResidueOps.push_back(Op); + } + } + + // Nothing to do if there are no EXTRACT_VECTOR_ELTs. + if (!FoundOne) + return SDValue(); + + // Create the BUILD_VECTOR for the remaining elements, if any. + if (!ResidueOps.empty()) { + while (ResidueOps.size() < NumElements) + ResidueOps.push_back(DAG.getUNDEF(VT.getVectorElementType())); + for (auto &Op : GS.Ops) { + if (!Op.getNode()) { + Op = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BVN), VT, ResidueOps); + break; + } + } + } + return GS.getNode(DAG, SDLoc(BVN)); +} + +// Combine GPR scalar values Elems into a vector of type VT. +static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT, + SmallVectorImpl<SDValue> &Elems) { + // See whether there is a single replicated value. + SDValue Single; + unsigned int NumElements = Elems.size(); + unsigned int Count = 0; + for (auto Elem : Elems) { + if (Elem.getOpcode() != ISD::UNDEF) { + if (!Single.getNode()) + Single = Elem; + else if (Elem != Single) { + Single = SDValue(); + break; + } + Count += 1; + } + } + // There are three cases here: + // + // - if the only defined element is a loaded one, the best sequence + // is a replicating load. + // + // - otherwise, if the only defined element is an i64 value, we will + // end up with the same VLVGP sequence regardless of whether we short-cut + // for replication or fall through to the later code. + // + // - otherwise, if the only defined element is an i32 or smaller value, + // we would need 2 instructions to replicate it: VLVGP followed by VREPx. + // This is only a win if the single defined element is used more than once. + // In other cases we're better off using a single VLVGx. + if (Single.getNode() && (Count > 1 || Single.getOpcode() == ISD::LOAD)) + return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single); + + // The best way of building a v2i64 from two i64s is to use VLVGP. + if (VT == MVT::v2i64) + return joinDwords(DAG, DL, Elems[0], Elems[1]); + + // Collect the constant terms. + SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue()); + SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false); + + unsigned NumConstants = 0; + for (unsigned I = 0; I < NumElements; ++I) { + SDValue Elem = Elems[I]; + if (Elem.getOpcode() == ISD::Constant || + Elem.getOpcode() == ISD::ConstantFP) { + NumConstants += 1; + Constants[I] = Elem; + Done[I] = true; + } + } + // If there was at least one constant, fill in the other elements of + // Constants with undefs to get a full vector constant and use that + // as the starting point. + SDValue Result; + if (NumConstants > 0) { + for (unsigned I = 0; I < NumElements; ++I) + if (!Constants[I].getNode()) + Constants[I] = DAG.getUNDEF(Elems[I].getValueType()); + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Constants); + } else { + // Otherwise try to use VLVGP to start the sequence in order to + // avoid a false dependency on any previous contents of the vector + // register. 
This only makes sense if one of the associated elements + // is defined. + unsigned I1 = NumElements / 2 - 1; + unsigned I2 = NumElements - 1; + bool Def1 = (Elems[I1].getOpcode() != ISD::UNDEF); + bool Def2 = (Elems[I2].getOpcode() != ISD::UNDEF); + if (Def1 || Def2) { + SDValue Elem1 = Elems[Def1 ? I1 : I2]; + SDValue Elem2 = Elems[Def2 ? I2 : I1]; + Result = DAG.getNode(ISD::BITCAST, DL, VT, + joinDwords(DAG, DL, Elem1, Elem2)); + Done[I1] = true; + Done[I2] = true; + } else + Result = DAG.getUNDEF(VT); + } + + // Use VLVGx to insert the other elements. + for (unsigned I = 0; I < NumElements; ++I) + if (!Done[I] && Elems[I].getOpcode() != ISD::UNDEF) + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I], + DAG.getConstant(I, DL, MVT::i32)); + return Result; +} + +SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + auto *BVN = cast<BuildVectorSDNode>(Op.getNode()); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + if (BVN->isConstant()) { + // Try using VECTOR GENERATE BYTE MASK. This is the architecturally- + // preferred way of creating all-zero and all-one vectors so give it + // priority over other methods below. + uint64_t Mask = 0; + if (tryBuildVectorByteMask(BVN, Mask)) { + SDValue Op = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, + DAG.getConstant(Mask, DL, MVT::i32)); + return DAG.getNode(ISD::BITCAST, DL, VT, Op); + } + + // Try using some form of replication. + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, + 8, true) && + SplatBitSize <= 64) { + // First try assuming that any undefined bits above the highest set bit + // and below the lowest set bit are 1s. This increases the likelihood of + // being able to use a sign-extended element value in VECTOR REPLICATE + // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK. + uint64_t SplatBitsZ = SplatBits.getZExtValue(); + uint64_t SplatUndefZ = SplatUndef.getZExtValue(); + uint64_t Lower = (SplatUndefZ + & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1)); + uint64_t Upper = (SplatUndefZ + & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1)); + uint64_t Value = SplatBitsZ | Upper | Lower; + SDValue Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, + SplatBitSize); + if (Op.getNode()) + return Op; + + // Now try assuming that any undefined bits between the first and + // last defined set bits are set. This increases the chances of + // using a non-wraparound mask. + uint64_t Middle = SplatUndefZ & ~Upper & ~Lower; + Value = SplatBitsZ | Middle; + Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, SplatBitSize); + if (Op.getNode()) + return Op; + } + + // Fall back to loading it from memory. + return SDValue(); + } + + // See if we should use shuffles to construct the vector from other vectors. + SDValue Res = tryBuildVectorShuffle(DAG, BVN); + if (Res.getNode()) + return Res; + + // Otherwise use buildVector to build the vector up from GPRs. 
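// For example, building a v4i32 from four GPRs { a, b, c, d } starts with
// one VLVGP that places b and d in elements 1 and 3 (the low halves of the
// two doublewords), followed by two VLVGF insertions for a and c.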
+ unsigned NumElements = Op.getNumOperands(); + SmallVector<SDValue, SystemZ::VectorBytes> Ops(NumElements); + for (unsigned I = 0; I < NumElements; ++I) + Ops[I] = Op.getOperand(I); + return buildVector(DAG, DL, VT, Ops); +} + +SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + auto *VSN = cast<ShuffleVectorSDNode>(Op.getNode()); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + unsigned NumElements = VT.getVectorNumElements(); + + if (VSN->isSplat()) { + SDValue Op0 = Op.getOperand(0); + unsigned Index = VSN->getSplatIndex(); + assert(Index < VT.getVectorNumElements() && + "Splat index should be defined and in first operand"); + // See whether the value we're splatting is directly available as a scalar. + if ((Index == 0 && Op0.getOpcode() == ISD::SCALAR_TO_VECTOR) || + Op0.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index)); + // Otherwise keep it as a vector-to-vector operation. + return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op.getOperand(0), + DAG.getConstant(Index, DL, MVT::i32)); + } + + GeneralShuffle GS(VT); + for (unsigned I = 0; I < NumElements; ++I) { + int Elt = VSN->getMaskElt(I); + if (Elt < 0) + GS.addUndef(); + else + GS.add(Op.getOperand(unsigned(Elt) / NumElements), + unsigned(Elt) % NumElements); + } + return GS.getNode(DAG, SDLoc(VSN)); +} + +SDValue SystemZTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + // Just insert the scalar into element 0 of an undefined vector. + return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, + Op.getValueType(), DAG.getUNDEF(Op.getValueType()), + Op.getOperand(0), DAG.getConstant(0, DL, MVT::i32)); +} + +SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG, + unsigned ByScalar) const { + // Look for cases where a vector shift can use the *_BY_SCALAR form. + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + unsigned ElemBitSize = VT.getVectorElementType().getSizeInBits(); + + // See whether the shift vector is a splat represented as BUILD_VECTOR. + if (auto *BVN = dyn_cast<BuildVectorSDNode>(Op1)) { + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + // Check for constant splats. Use ElemBitSize as the minimum element + // width and reject splats that need wider elements. + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, + ElemBitSize, true) && + SplatBitSize == ElemBitSize) { + SDValue Shift = DAG.getConstant(SplatBits.getZExtValue() & 0xfff, + DL, MVT::i32); + return DAG.getNode(ByScalar, DL, VT, Op0, Shift); + } + // Check for variable splats. + BitVector UndefElements; + SDValue Splat = BVN->getSplatValue(&UndefElements); + if (Splat) { + // Since i32 is the smallest legal type, we either need a no-op + // or a truncation. + SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Splat); + return DAG.getNode(ByScalar, DL, VT, Op0, Shift); + } + } + + // See whether the shift vector is a splat represented as SHUFFLE_VECTOR, + // and the shift amount is directly available in a GPR. 
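// For example (illustrative), (shl v4i32 %X, (splat %y)) where the splat
// is a shuffle-splat of (scalar_to_vector %y) becomes
// VSHL_BY_SCALAR %X, %y, i.e. a single element shift (VESLF) whose shift
// amount lives in a GPR, instead of a full vector-vector shift.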
+ if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(Op1)) { + if (VSN->isSplat()) { + SDValue VSNOp0 = VSN->getOperand(0); + unsigned Index = VSN->getSplatIndex(); + assert(Index < VT.getVectorNumElements() && + "Splat index should be defined and in first operand"); + if ((Index == 0 && VSNOp0.getOpcode() == ISD::SCALAR_TO_VECTOR) || + VSNOp0.getOpcode() == ISD::BUILD_VECTOR) { + // Since i32 is the smallest legal type, we either need a no-op + // or a truncation. + SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, + VSNOp0.getOperand(Index)); + return DAG.getNode(ByScalar, DL, VT, Op0, Shift); + } + } + } + + // Otherwise just treat the current form as legal. + return Op; +} + SDValue SystemZTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -2737,6 +3760,12 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerOR(Op, DAG); case ISD::CTPOP: return lowerCTPOP(Op, DAG); + case ISD::CTLZ_ZERO_UNDEF: + return DAG.getNode(ISD::CTLZ, SDLoc(Op), + Op.getValueType(), Op.getOperand(0)); + case ISD::CTTZ_ZERO_UNDEF: + return DAG.getNode(ISD::CTTZ, SDLoc(Op), + Op.getValueType(), Op.getOperand(0)); case ISD::ATOMIC_SWAP: return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW); case ISD::ATOMIC_STORE: @@ -2773,6 +3802,18 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerPREFETCH(Op, DAG); case ISD::INTRINSIC_W_CHAIN: return lowerINTRINSIC_W_CHAIN(Op, DAG); + case ISD::BUILD_VECTOR: + return lowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return lowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SCALAR_TO_VECTOR: + return lowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::SHL: + return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR); + case ISD::SRL: + return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR); + case ISD::SRA: + return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR); default: llvm_unreachable("Unexpected node to lower"); } @@ -2820,6 +3861,24 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(TBEGIN); OPCODE(TBEGIN_NOFLOAT); OPCODE(TEND); + OPCODE(BYTE_MASK); + OPCODE(ROTATE_MASK); + OPCODE(REPLICATE); + OPCODE(JOIN_DWORDS); + OPCODE(SPLAT); + OPCODE(MERGE_HIGH); + OPCODE(MERGE_LOW); + OPCODE(SHL_DOUBLE); + OPCODE(PERMUTE_DWORDS); + OPCODE(PERMUTE); + OPCODE(PACK); + OPCODE(VSHL_BY_SCALAR); + OPCODE(VSRL_BY_SCALAR); + OPCODE(VSRA_BY_SCALAR); + OPCODE(VSUM); + OPCODE(VICMPE); + OPCODE(VICMPH); + OPCODE(VICMPHL); OPCODE(ATOMIC_SWAPW); OPCODE(ATOMIC_LOADW_ADD); OPCODE(ATOMIC_LOADW_SUB); @@ -2838,6 +3897,157 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { #undef OPCODE } +// Return true if VT is a vector whose elements are a whole number of bytes +// in width. +static bool canTreatAsByteVector(EVT VT) { + return VT.isVector() && VT.getVectorElementType().getSizeInBits() % 8 == 0; +} + +// Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT +// producing a result of type ResVT. Op is a possibly bitcast version +// of the input vector and Index is the index (based on type VecVT) that +// should be extracted. Return the new extraction if a simplification +// was possible or if Force is true. +SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT, + SDValue Op, unsigned Index, + DAGCombinerInfo &DCI, + bool Force) const { + SelectionDAG &DAG = DCI.DAG; + + // The number of bytes being extracted. 
+ unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize(); + + for (;;) { + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::BITCAST) + // Look through bitcasts. + Op = Op.getOperand(0); + else if (Opcode == ISD::VECTOR_SHUFFLE && + canTreatAsByteVector(Op.getValueType())) { + // Get a VPERM-like permute mask and see whether the bytes covered + // by the extracted element are a contiguous sequence from one + // source operand. + SmallVector<int, SystemZ::VectorBytes> Bytes; + getVPermMask(cast<ShuffleVectorSDNode>(Op), Bytes); + int First; + if (!getShuffleInput(Bytes, Index * BytesPerElement, + BytesPerElement, First)) + break; + if (First < 0) + return DAG.getUNDEF(ResVT); + // Make sure the contiguous sequence starts at a multiple of the + // original element size. + unsigned Byte = unsigned(First) % Bytes.size(); + if (Byte % BytesPerElement != 0) + break; + // We can get the extracted value directly from an input. + Index = Byte / BytesPerElement; + Op = Op.getOperand(unsigned(First) / Bytes.size()); + Force = true; + } else if (Opcode == ISD::BUILD_VECTOR && + canTreatAsByteVector(Op.getValueType())) { + // We can only optimize this case if the BUILD_VECTOR elements are + // at least as wide as the extracted value. + EVT OpVT = Op.getValueType(); + unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize(); + if (OpBytesPerElement < BytesPerElement) + break; + // Make sure that the least-significant bit of the extracted value + // is the least significant bit of an input. + unsigned End = (Index + 1) * BytesPerElement; + if (End % OpBytesPerElement != 0) + break; + // We're extracting the low part of one operand of the BUILD_VECTOR. + Op = Op.getOperand(End / OpBytesPerElement - 1); + if (!Op.getValueType().isInteger()) { + EVT VT = MVT::getIntegerVT(Op.getValueType().getSizeInBits()); + Op = DAG.getNode(ISD::BITCAST, DL, VT, Op); + DCI.AddToWorklist(Op.getNode()); + } + EVT VT = MVT::getIntegerVT(ResVT.getSizeInBits()); + Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op); + if (VT != ResVT) { + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(ISD::BITCAST, DL, ResVT, Op); + } + return Op; + } else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG || + Opcode == ISD::ZERO_EXTEND_VECTOR_INREG || + Opcode == ISD::ANY_EXTEND_VECTOR_INREG) && + canTreatAsByteVector(Op.getValueType()) && + canTreatAsByteVector(Op.getOperand(0).getValueType())) { + // Make sure that only the unextended bits are significant. + EVT ExtVT = Op.getValueType(); + EVT OpVT = Op.getOperand(0).getValueType(); + unsigned ExtBytesPerElement = ExtVT.getVectorElementType().getStoreSize(); + unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize(); + unsigned Byte = Index * BytesPerElement; + unsigned SubByte = Byte % ExtBytesPerElement; + unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement; + if (SubByte < MinSubByte || + SubByte + BytesPerElement > ExtBytesPerElement) + break; + // Get the byte offset of the unextended element + Byte = Byte / ExtBytesPerElement * OpBytesPerElement; + // ...then add the byte offset relative to that element. 
+ Byte += SubByte - MinSubByte; + if (Byte % BytesPerElement != 0) + break; + Op = Op.getOperand(0); + Index = Byte / BytesPerElement; + Force = true; + } else + break; + } + if (Force) { + if (Op.getValueType() != VecVT) { + Op = DAG.getNode(ISD::BITCAST, DL, VecVT, Op); + DCI.AddToWorklist(Op.getNode()); + } + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op, + DAG.getConstant(Index, DL, MVT::i32)); + } + return SDValue(); +} + +// Optimize vector operations in scalar value Op on the basis that Op +// is truncated to TruncVT. +SDValue +SystemZTargetLowering::combineTruncateExtract(SDLoc DL, EVT TruncVT, SDValue Op, + DAGCombinerInfo &DCI) const { + // If we have (trunc (extract_vector_elt X, Y)), try to turn it into + // (extract_vector_elt (bitcast X), Y'), where (bitcast X) has elements + // of type TruncVT. + if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + TruncVT.getSizeInBits() % 8 == 0) { + SDValue Vec = Op.getOperand(0); + EVT VecVT = Vec.getValueType(); + if (canTreatAsByteVector(VecVT)) { + if (auto *IndexN = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize(); + unsigned TruncBytes = TruncVT.getStoreSize(); + if (BytesPerElement % TruncBytes == 0) { + // Calculate the value of Y' in the above description. We are + // splitting the original elements into Scale equal-sized pieces + // and for truncation purposes want the last (least-significant) + // of these pieces for IndexN. This is easiest to do by calculating + // the start index of the following element and then subtracting 1. + unsigned Scale = BytesPerElement / TruncBytes; + unsigned NewIndex = (IndexN->getZExtValue() + 1) * Scale - 1; + + // Defer the creation of the bitcast from X to combineExtract, + // which might be able to optimize the extraction. + VecVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8), + VecVT.getStoreSize() / TruncBytes); + EVT ResVT = (TruncBytes < 4 ? MVT::i32 : TruncVT); + return combineExtract(DL, ResVT, VecVT, Vec, NewIndex, DCI, true); + } + } + } + } + return SDValue(); +} + SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2869,6 +4079,40 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, } } } + // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better + // for the extraction to be done on a vMiN value, so that we can use VSTE. + // If X has wider elements then convert it to: + // (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z). + if (Opcode == ISD::STORE) { + auto *SN = cast<StoreSDNode>(N); + EVT MemVT = SN->getMemoryVT(); + if (MemVT.isInteger()) { + SDValue Value = combineTruncateExtract(SDLoc(N), MemVT, + SN->getValue(), DCI); + if (Value.getNode()) { + DCI.AddToWorklist(Value.getNode()); + + // Rewrite the store with the new form of stored value. + return DAG.getTruncStore(SN->getChain(), SDLoc(SN), Value, + SN->getBasePtr(), SN->getMemoryVT(), + SN->getMemOperand()); + } + } + } + // Try to simplify a vector extraction. 
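+ // For example, an extraction from a VECTOR_SHUFFLE, BUILD_VECTOR or + // *_EXTEND_VECTOR_INREG can often be rewritten as a direct extraction + // from one of the original inputs; see combineExtract above.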
+ if (Opcode == ISD::EXTRACT_VECTOR_ELT) { + if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + SDValue Op0 = N->getOperand(0); + EVT VecVT = Op0.getValueType(); + return combineExtract(SDLoc(N), N->getValueType(0), VecVT, Op0, + IndexN->getZExtValue(), DCI, false); + } + } + // (join_dwords X, X) == (replicate X) + if (Opcode == SystemZISD::JOIN_DWORDS && + N->getOperand(0) == N->getOperand(1)) + return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0), + N->getOperand(0)); return SDValue(); } @@ -3681,11 +4925,18 @@ SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI, } } - // Add FPR clobbers. + // Add FPR/VR clobbers. if (!NoFloat && (Control & 4) != 0) { - for (int I = 0; I < 16; I++) { - unsigned Reg = SystemZMC::FP64Regs[I]; - MI->addOperand(MachineOperand::CreateReg(Reg, true, true)); + if (Subtarget.hasVector()) { + for (int I = 0; I < 32; I++) { + unsigned Reg = SystemZMC::VR128Regs[I]; + MI->addOperand(MachineOperand::CreateReg(Reg, true, true)); + } + } else { + for (int I = 0; I < 16; I++) { + unsigned Reg = SystemZMC::FP64Regs[I]; + MI->addOperand(MachineOperand::CreateReg(Reg, true, true)); + } } } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 56d7ef45568..4b7d5908946 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -155,6 +155,70 @@ enum { // Transaction end. Just the chain operand. Returns chain and glue. TEND, + // Create a vector constant by filling byte N of the result with bit + // 15-N of the single operand. + BYTE_MASK, + + // Create a vector constant by replicating an element-sized RISBG-style mask. + // The first operand specifies the starting set bit and the second operand + // specifies the ending set bit. Both operands count from the MSB of the + // element. + ROTATE_MASK, + + // Replicate a GPR scalar value into all elements of a vector. + REPLICATE, + + // Create a vector from two i64 GPRs. + JOIN_DWORDS, + + // Replicate one element of a vector into all elements. The first operand + // is the vector and the second is the index of the element to replicate. + SPLAT, + + // Interleave elements from the high half of operand 0 and the high half + // of operand 1. + MERGE_HIGH, + + // Likewise for the low halves. + MERGE_LOW, + + // Concatenate the vectors in the first two operands, shift them left + // by the third operand, and take the first half of the result. + SHL_DOUBLE, + + // Take one element of the first v2i64 operand and one element of + // the second v2i64 operand and concatenate them to form a v2i64 result. + // The third operand is a 4-bit value of the form 0A0B, where A and B + // are the element selectors for the first and second operands + // respectively. + PERMUTE_DWORDS, + + // Perform a general vector permute on vector operands 0 and 1. + // Each byte of operand 2 controls the corresponding byte of the result, + // in the same way as a byte-level VECTOR_SHUFFLE mask. + PERMUTE, + + // Pack vector operands 0 and 1 into a single vector with half-sized elements. + PACK, + + // Shift each element of vector operand 0 by the number of bits specified + // by scalar operand 1. + VSHL_BY_SCALAR, + VSRL_BY_SCALAR, + VSRA_BY_SCALAR, + + // For each element of the output type, sum across all sub-elements of + // operand 0 belonging to the corresponding element, and add in the + // rightmost sub-element of the corresponding element of operand 1.
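+ // For example, a v4i32 result taken from a v16i8 operand 0 sums each + // group of four bytes and adds in the rightmost byte of the corresponding + // word of operand 1 (this is the VSUMB instruction).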
+ VSUM, + + // Compare integer vector operands 0 and 1 to produce the usual 0/-1 + // vector result. VICMPE is for equality, VICMPH for "signed greater than" + // and VICMPHL for "unsigned greater than". + VICMPE, + VICMPH, + VICMPHL, + // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or // ATOMIC_LOAD_<op>. // @@ -222,6 +286,11 @@ public: MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + MVT getVectorIdxTy() const override { + // Only the lower 12 bits of an element index are used, so we don't + // want to clobber the upper 32 bits of a GPR unnecessarily. + return MVT::i32; + } EVT getSetCCResultType(LLVMContext &, EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; @@ -328,6 +397,16 @@ private: SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; + + SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp, + unsigned Index, DAGCombinerInfo &DCI, + bool Force) const; + SDValue combineTruncateExtract(SDLoc DL, EVT TruncVT, SDValue Op, + DAGCombinerInfo &DCI) const; // If the last instruction before MBBI in MBB was some form of COMPARE, // try to replace it with a COMPARE AND BRANCH just before MBBI. diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index 2c87871cdca..d7bfc12b938 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2414,6 +2414,10 @@ class BinaryAliasRIL<SDPatternOperator operator, RegisterOperand cls, let Constraints = "$R1 = $R1src"; } +// An alias of a BinaryVRRf, but with different register sizes. +class BinaryAliasVRRf<RegisterOperand cls> + : Alias<6, (outs VR128:$V1), (ins cls:$R2, cls:$R3), []>; + // An alias of a CompareRI, but with different register sizes. 
class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls, Immediate imm> diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 3a028594fa4..63101a9d000 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -578,6 +578,8 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opcode = SystemZ::LDR; else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg)) Opcode = SystemZ::LXR; + else if (SystemZ::VR128BitRegClass.contains(DestReg, SrcReg)) + Opcode = SystemZ::VLR; else llvm_unreachable("Impossible reg-to-reg copy"); @@ -1116,6 +1118,10 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC, } else if (RC == &SystemZ::FP128BitRegClass) { LoadOpcode = SystemZ::LX; StoreOpcode = SystemZ::STX; + } else if (RC == &SystemZ::VF128BitRegClass || + RC == &SystemZ::VR128BitRegClass) { + LoadOpcode = SystemZ::VL; + StoreOpcode = SystemZ::VST; } else llvm_unreachable("Unsupported regclass to load or store"); } @@ -1185,6 +1191,7 @@ static bool isStringOfOnes(uint64_t Mask, unsigned &LSB, unsigned &Length) { bool SystemZInstrInfo::isRxSBGMask(uint64_t Mask, unsigned BitSize, unsigned &Start, unsigned &End) const { // Reject trivial all-zero masks. + Mask &= allOnes(BitSize); if (Mask == 0) return false; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td index 650cae0b35d..d94725b7913 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td @@ -19,18 +19,34 @@ let Predicates = [FeatureVector] in { def VLGVB : BinaryVRSc<"vlgvb", 0xE721, null_frag, v128b, 0>; def VLGVH : BinaryVRSc<"vlgvh", 0xE721, null_frag, v128h, 1>; def VLGVF : BinaryVRSc<"vlgvf", 0xE721, null_frag, v128f, 2>; - def VLGVG : BinaryVRSc<"vlgvg", 0xE721, null_frag, v128g, 3>; + def VLGVG : BinaryVRSc<"vlgvg", 0xE721, z_vector_extract, v128g, 3>; // Load VR element from GR. - def VLVGB : TernaryVRSb<"vlvgb", 0xE722, null_frag, v128b, v128b, GR32, 0>; - def VLVGH : TernaryVRSb<"vlvgh", 0xE722, null_frag, v128h, v128h, GR32, 1>; - def VLVGF : TernaryVRSb<"vlvgf", 0xE722, null_frag, v128f, v128f, GR32, 2>; - def VLVGG : TernaryVRSb<"vlvgg", 0xE722, null_frag, v128g, v128g, GR64, 3>; + def VLVGB : TernaryVRSb<"vlvgb", 0xE722, z_vector_insert, + v128b, v128b, GR32, 0>; + def VLVGH : TernaryVRSb<"vlvgh", 0xE722, z_vector_insert, + v128h, v128h, GR32, 1>; + def VLVGF : TernaryVRSb<"vlvgf", 0xE722, z_vector_insert, + v128f, v128f, GR32, 2>; + def VLVGG : TernaryVRSb<"vlvgg", 0xE722, z_vector_insert, + v128g, v128g, GR64, 3>; // Load VR from GRs disjoint. - def VLVGP : BinaryVRRf<"vlvgp", 0xE762, null_frag, v128g>; + def VLVGP : BinaryVRRf<"vlvgp", 0xE762, z_join_dwords, v128g>; + def VLVGP32 : BinaryAliasVRRf<GR32>; } +// Extractions always assign to the full GR64, even if the element would +// fit in the lower 32 bits. Sub-i64 extracts therefore need to take a +// subreg of the result. 
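+// For example, an i32 extracted with VLGVF lives in the low 32 bits of the +// GR64 that the instruction defines, so the patterns below wrap the +// instruction in an EXTRACT_SUBREG of subreg_l32.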
+class VectorExtractSubreg<ValueType type, Instruction insn> + : Pat<(i32 (z_vector_extract (type VR128:$vec), shift12only:$index)), + (EXTRACT_SUBREG (insn VR128:$vec, shift12only:$index), subreg_l32)>; + +def : VectorExtractSubreg<v16i8, VLGVB>; +def : VectorExtractSubreg<v8i16, VLGVH>; +def : VectorExtractSubreg<v4i32, VLGVF>; + //===----------------------------------------------------------------------===// // Immediate instructions //===----------------------------------------------------------------------===// @@ -39,29 +55,38 @@ let Predicates = [FeatureVector] in { // Generate byte mask. def VZERO : InherentVRIa<"vzero", 0xE744, 0>; def VONE : InherentVRIa<"vone", 0xE744, 0xffff>; - def VGBM : UnaryVRIa<"vgbm", 0xE744, null_frag, v128b, imm32zx16>; + def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>; // Generate mask. - def VGMB : BinaryVRIb<"vgmb", 0xE746, null_frag, v128b, 0>; - def VGMH : BinaryVRIb<"vgmh", 0xE746, null_frag, v128h, 1>; - def VGMF : BinaryVRIb<"vgmf", 0xE746, null_frag, v128f, 2>; - def VGMG : BinaryVRIb<"vgmg", 0xE746, null_frag, v128g, 3>; + def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>; + def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>; + def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>; + def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>; // Load element immediate. - def VLEIB : TernaryVRIa<"vleib", 0xE740, null_frag, - v128b, v128b, imm32sx16trunc, imm32zx4>; - def VLEIH : TernaryVRIa<"vleih", 0xE741, null_frag, - v128h, v128h, imm32sx16trunc, imm32zx3>; - def VLEIF : TernaryVRIa<"vleif", 0xE743, null_frag, - v128f, v128f, imm32sx16, imm32zx2>; - def VLEIG : TernaryVRIa<"vleig", 0xE742, null_frag, - v128g, v128g, imm64sx16, imm32zx1>; + // + // We want these instructions to be used ahead of VLVG* where possible. + // However, VLVG* takes a variable BD-format index whereas VLEI takes + // a plain immediate index. This means that VLVG* has an extra "base" + // register operand and is 3 units more complex. Bumping the complexity + // of the VLEI* instructions by 4 means that they are strictly better + // than VLVG* in cases where both forms match. + let AddedComplexity = 4 in { + def VLEIB : TernaryVRIa<"vleib", 0xE740, z_vector_insert, + v128b, v128b, imm32sx16trunc, imm32zx4>; + def VLEIH : TernaryVRIa<"vleih", 0xE741, z_vector_insert, + v128h, v128h, imm32sx16trunc, imm32zx3>; + def VLEIF : TernaryVRIa<"vleif", 0xE743, z_vector_insert, + v128f, v128f, imm32sx16, imm32zx2>; + def VLEIG : TernaryVRIa<"vleig", 0xE742, z_vector_insert, + v128g, v128g, imm64sx16, imm32zx1>; + } // Replicate immediate. 
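+ // The signed 16-bit immediate is sign-extended to the element size and + // then replicated, so e.g. vrepig -1 fills both doublewords with -1.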
- def VREPIB : UnaryVRIa<"vrepib", 0xE745, null_frag, v128b, imm32sx16, 0>; - def VREPIH : UnaryVRIa<"vrepih", 0xE745, null_frag, v128h, imm32sx16, 1>; - def VREPIF : UnaryVRIa<"vrepif", 0xE745, null_frag, v128f, imm32sx16, 2>; - def VREPIG : UnaryVRIa<"vrepig", 0xE745, null_frag, v128g, imm32sx16, 3>; + def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>; + def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>; + def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>; + def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>; } //===----------------------------------------------------------------------===// @@ -89,28 +114,45 @@ let Predicates = [FeatureVector] in { def VLM : LoadMultipleVRSa<"vlm", 0xE736>; // Load and replicate - def VLREPB : UnaryVRX<"vlrepb", 0xE705, null_frag, v128b, 1, 0>; - def VLREPH : UnaryVRX<"vlreph", 0xE705, null_frag, v128h, 2, 1>; - def VLREPF : UnaryVRX<"vlrepf", 0xE705, null_frag, v128f, 4, 2>; - def VLREPG : UnaryVRX<"vlrepg", 0xE705, null_frag, v128g, 8, 3>; + def VLREPB : UnaryVRX<"vlrepb", 0xE705, z_replicate_loadi8, v128b, 1, 0>; + def VLREPH : UnaryVRX<"vlreph", 0xE705, z_replicate_loadi16, v128h, 2, 1>; + def VLREPF : UnaryVRX<"vlrepf", 0xE705, z_replicate_loadi32, v128f, 4, 2>; + def VLREPG : UnaryVRX<"vlrepg", 0xE705, z_replicate_loadi64, v128g, 8, 3>; // Load logical element and zero. - def VLLEZB : UnaryVRX<"vllezb", 0xE704, null_frag, v128b, 1, 0>; - def VLLEZH : UnaryVRX<"vllezh", 0xE704, null_frag, v128h, 2, 1>; - def VLLEZF : UnaryVRX<"vllezf", 0xE704, null_frag, v128f, 4, 2>; - def VLLEZG : UnaryVRX<"vllezg", 0xE704, null_frag, v128g, 8, 3>; + def VLLEZB : UnaryVRX<"vllezb", 0xE704, z_vllezi8, v128b, 1, 0>; + def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>; + def VLLEZF : UnaryVRX<"vllezf", 0xE704, z_vllezi32, v128f, 4, 2>; + def VLLEZG : UnaryVRX<"vllezg", 0xE704, z_vllezi64, v128g, 8, 3>; // Load element. - def VLEB : TernaryVRX<"vleb", 0xE700, null_frag, v128b, v128b, 1, imm32zx4>; - def VLEH : TernaryVRX<"vleh", 0xE701, null_frag, v128h, v128h, 2, imm32zx3>; - def VLEF : TernaryVRX<"vlef", 0xE703, null_frag, v128f, v128f, 4, imm32zx2>; - def VLEG : TernaryVRX<"vleg", 0xE702, null_frag, v128g, v128g, 8, imm32zx1>; + def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8, v128b, v128b, 1, imm32zx4>; + def VLEH : TernaryVRX<"vleh", 0xE701, z_vlei16, v128h, v128h, 2, imm32zx3>; + def VLEF : TernaryVRX<"vlef", 0xE703, z_vlei32, v128f, v128f, 4, imm32zx2>; + def VLEG : TernaryVRX<"vleg", 0xE702, z_vlei64, v128g, v128g, 8, imm32zx1>; // Gather element. def VGEF : TernaryVRV<"vgef", 0xE713, 4, imm32zx2>; def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>; } +// Use replicating loads if we're inserting a single element into an +// undefined vector. This avoids a false dependency on the previous +// register contents. 
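+// For example, (v4i32 (scalar_to_vector (i32 (load X)))) becomes a single +// VLREPF; replicating into the remaining lanes is harmless because those +// lanes were undefined anyway.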
+multiclass ReplicatePeephole<Instruction vlrep, ValueType vectype, + SDPatternOperator load, ValueType scalartype> { + def : Pat<(vectype (z_vector_insert + (undef), (scalartype (load bdxaddr12only:$addr)), 0)), + (vlrep bdxaddr12only:$addr)>; + def : Pat<(vectype (scalar_to_vector + (scalartype (load bdxaddr12only:$addr)))), + (vlrep bdxaddr12only:$addr)>; +} +defm : ReplicatePeephole<VLREPB, v16i8, anyextloadi8, i32>; +defm : ReplicatePeephole<VLREPH, v8i16, anyextloadi16, i32>; +defm : ReplicatePeephole<VLREPF, v4i32, load, i32>; +defm : ReplicatePeephole<VLREPG, v2i64, load, i64>; + //===----------------------------------------------------------------------===// // Stores //===----------------------------------------------------------------------===// @@ -126,10 +168,10 @@ let Predicates = [FeatureVector] in { def VSTM : StoreMultipleVRSa<"vstm", 0xE73E>; // Store element. - def VSTEB : StoreBinaryVRX<"vsteb", 0xE708, null_frag, v128b, 1, imm32zx4>; - def VSTEH : StoreBinaryVRX<"vsteh", 0xE709, null_frag, v128h, 2, imm32zx3>; - def VSTEF : StoreBinaryVRX<"vstef", 0xE70B, null_frag, v128f, 4, imm32zx2>; - def VSTEG : StoreBinaryVRX<"vsteg", 0xE70A, null_frag, v128g, 8, imm32zx1>; + def VSTEB : StoreBinaryVRX<"vsteb", 0xE708, z_vstei8, v128b, 1, imm32zx4>; + def VSTEH : StoreBinaryVRX<"vsteh", 0xE709, z_vstei16, v128h, 2, imm32zx3>; + def VSTEF : StoreBinaryVRX<"vstef", 0xE70B, z_vstei32, v128f, 4, imm32zx2>; + def VSTEG : StoreBinaryVRX<"vsteg", 0xE70A, z_vstei64, v128g, 8, imm32zx1>; // Scatter element. def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>; @@ -142,28 +184,28 @@ let Predicates = [FeatureVector] in { let Predicates = [FeatureVector] in { // Merge high. - def VMRHB : BinaryVRRc<"vmrhb", 0xE761, null_frag, v128b, v128b, 0>; - def VMRHH : BinaryVRRc<"vmrhh", 0xE761, null_frag, v128h, v128h, 1>; - def VMRHF : BinaryVRRc<"vmrhf", 0xE761, null_frag, v128f, v128f, 2>; - def VMRHG : BinaryVRRc<"vmrhg", 0xE761, null_frag, v128g, v128g, 3>; + def VMRHB : BinaryVRRc<"vmrhb", 0xE761, z_merge_high, v128b, v128b, 0>; + def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>; + def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>; + def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>; // Merge low. - def VMRLB : BinaryVRRc<"vmrlb", 0xE760, null_frag, v128b, v128b, 0>; - def VMRLH : BinaryVRRc<"vmrlh", 0xE760, null_frag, v128h, v128h, 1>; - def VMRLF : BinaryVRRc<"vmrlf", 0xE760, null_frag, v128f, v128f, 2>; - def VMRLG : BinaryVRRc<"vmrlg", 0xE760, null_frag, v128g, v128g, 3>; + def VMRLB : BinaryVRRc<"vmrlb", 0xE760, z_merge_low, v128b, v128b, 0>; + def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>; + def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>; + def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>; // Permute. - def VPERM : TernaryVRRe<"vperm", 0xE78C, null_frag, v128b, v128b>; + def VPERM : TernaryVRRe<"vperm", 0xE78C, z_permute, v128b, v128b>; // Permute doubleword immediate. - def VPDI : TernaryVRRc<"vpdi", 0xE784, null_frag, v128b, v128b>; + def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>; // Replicate. 
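+ // E.g. vrepf V1, V2, 2 copies word element 2 of V2 into all four word + // elements of V1.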
- def VREPB : BinaryVRIc<"vrepb", 0xE74D, null_frag, v128b, v128b, 0>; - def VREPH : BinaryVRIc<"vreph", 0xE74D, null_frag, v128h, v128h, 1>; - def VREPF : BinaryVRIc<"vrepf", 0xE74D, null_frag, v128f, v128f, 2>; - def VREPG : BinaryVRIc<"vrepg", 0xE74D, null_frag, v128g, v128g, 3>; + def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>; + def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>; + def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>; + def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>; // Select. def VSEL : TernaryVRRe<"vsel", 0xE78D, null_frag, v128any, v128any>; @@ -175,9 +217,9 @@ let Predicates = [FeatureVector] in { let Predicates = [FeatureVector] in { // Pack - def VPKH : BinaryVRRc<"vpkh", 0xE794, null_frag, v128b, v128h, 1>; - def VPKF : BinaryVRRc<"vpkf", 0xE794, null_frag, v128h, v128f, 2>; - def VPKG : BinaryVRRc<"vpkg", 0xE794, null_frag, v128f, v128g, 3>; + def VPKH : BinaryVRRc<"vpkh", 0xE794, z_pack, v128b, v128h, 1>; + def VPKF : BinaryVRRc<"vpkf", 0xE794, z_pack, v128h, v128f, 2>; + def VPKG : BinaryVRRc<"vpkg", 0xE794, z_pack, v128f, v128g, 3>; // Pack saturate. defm VPKSH : BinaryVRRbSPair<"vpksh", 0xE797, null_frag, null_frag, @@ -196,9 +238,12 @@ let Predicates = [FeatureVector] in { v128f, v128g, 3>; // Sign-extend to doubleword. - def VSEGB : UnaryVRRa<"vsegb", 0xE75F, null_frag, v128g, v128b, 0>; - def VSEGH : UnaryVRRa<"vsegh", 0xE75F, null_frag, v128g, v128h, 1>; - def VSEGF : UnaryVRRa<"vsegf", 0xE75F, null_frag, v128g, v128f, 2>; + def VSEGB : UnaryVRRa<"vsegb", 0xE75F, z_vsei8, v128g, v128g, 0>; + def VSEGH : UnaryVRRa<"vsegh", 0xE75F, z_vsei16, v128g, v128g, 1>; + def VSEGF : UnaryVRRa<"vsegf", 0xE75F, z_vsei32, v128g, v128g, 2>; + def : Pat<(z_vsei8_by_parts (v16i8 VR128:$src)), (VSEGB VR128:$src)>; + def : Pat<(z_vsei16_by_parts (v8i16 VR128:$src)), (VSEGH VR128:$src)>; + def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>; // Unpack high. def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, null_frag, v128h, v128b, 0>; @@ -222,15 +267,37 @@ let Predicates = [FeatureVector] in { } //===----------------------------------------------------------------------===// +// Instantiating generic operations for specific types. +//===----------------------------------------------------------------------===// + +multiclass GenericVectorOps<ValueType type, ValueType inttype> { + let Predicates = [FeatureVector] in { + def : Pat<(type (load bdxaddr12only:$addr)), + (VL bdxaddr12only:$addr)>; + def : Pat<(store (type VR128:$src), bdxaddr12only:$addr), + (VST VR128:$src, bdxaddr12only:$addr)>; + def : Pat<(type (vselect (inttype VR128:$x), VR128:$y, VR128:$z)), + (VSEL VR128:$y, VR128:$z, VR128:$x)>; + def : Pat<(type (vselect (inttype (z_vnot VR128:$x)), VR128:$y, VR128:$z)), + (VSEL VR128:$z, VR128:$y, VR128:$x)>; + } +} + +defm : GenericVectorOps<v16i8, v16i8>; +defm : GenericVectorOps<v8i16, v8i16>; +defm : GenericVectorOps<v4i32, v4i32>; +defm : GenericVectorOps<v2i64, v2i64>; + +//===----------------------------------------------------------------------===// // Integer arithmetic //===----------------------------------------------------------------------===// let Predicates = [FeatureVector] in { // Add. 
- def VAB : BinaryVRRc<"vab", 0xE7F3, null_frag, v128b, v128b, 0>; - def VAH : BinaryVRRc<"vah", 0xE7F3, null_frag, v128h, v128h, 1>; - def VAF : BinaryVRRc<"vaf", 0xE7F3, null_frag, v128f, v128f, 2>; - def VAG : BinaryVRRc<"vag", 0xE7F3, null_frag, v128g, v128g, 3>; + def VAB : BinaryVRRc<"vab", 0xE7F3, add, v128b, v128b, 0>; + def VAH : BinaryVRRc<"vah", 0xE7F3, add, v128h, v128h, 1>; + def VAF : BinaryVRRc<"vaf", 0xE7F3, add, v128f, v128f, 2>; + def VAG : BinaryVRRc<"vag", 0xE7F3, add, v128g, v128g, 3>; def VAQ : BinaryVRRc<"vaq", 0xE7F3, null_frag, v128q, v128q, 4>; // Add compute carry. @@ -268,16 +335,16 @@ let Predicates = [FeatureVector] in { def VCKSM : BinaryVRRc<"vcksm", 0xE766, null_frag, v128any, v128any>; // Count leading zeros. - def VCLZB : UnaryVRRa<"vclzb", 0xE753, null_frag, v128b, v128b, 0>; - def VCLZH : UnaryVRRa<"vclzh", 0xE753, null_frag, v128h, v128h, 1>; - def VCLZF : UnaryVRRa<"vclzf", 0xE753, null_frag, v128f, v128f, 2>; - def VCLZG : UnaryVRRa<"vclzg", 0xE753, null_frag, v128g, v128g, 3>; + def VCLZB : UnaryVRRa<"vclzb", 0xE753, ctlz, v128b, v128b, 0>; + def VCLZH : UnaryVRRa<"vclzh", 0xE753, ctlz, v128h, v128h, 1>; + def VCLZF : UnaryVRRa<"vclzf", 0xE753, ctlz, v128f, v128f, 2>; + def VCLZG : UnaryVRRa<"vclzg", 0xE753, ctlz, v128g, v128g, 3>; // Count trailing zeros. - def VCTZB : UnaryVRRa<"vctzb", 0xE752, null_frag, v128b, v128b, 0>; - def VCTZH : UnaryVRRa<"vctzh", 0xE752, null_frag, v128h, v128h, 1>; - def VCTZF : UnaryVRRa<"vctzf", 0xE752, null_frag, v128f, v128f, 2>; - def VCTZG : UnaryVRRa<"vctzg", 0xE752, null_frag, v128g, v128g, 3>; + def VCTZB : UnaryVRRa<"vctzb", 0xE752, cttz, v128b, v128b, 0>; + def VCTZH : UnaryVRRa<"vctzh", 0xE752, cttz, v128h, v128h, 1>; + def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>; + def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>; // Exclusive or. def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>; @@ -295,16 +362,16 @@ let Predicates = [FeatureVector] in { def VGFMAG : TernaryVRRd<"vgfmag", 0xE7BC, null_frag, v128g, v128g, 3>; // Load complement. - def VLCB : UnaryVRRa<"vlcb", 0xE7DE, null_frag, v128b, v128b, 0>; - def VLCH : UnaryVRRa<"vlch", 0xE7DE, null_frag, v128h, v128h, 1>; - def VLCF : UnaryVRRa<"vlcf", 0xE7DE, null_frag, v128f, v128f, 2>; - def VLCG : UnaryVRRa<"vlcg", 0xE7DE, null_frag, v128g, v128g, 3>; + def VLCB : UnaryVRRa<"vlcb", 0xE7DE, z_vneg, v128b, v128b, 0>; + def VLCH : UnaryVRRa<"vlch", 0xE7DE, z_vneg, v128h, v128h, 1>; + def VLCF : UnaryVRRa<"vlcf", 0xE7DE, z_vneg, v128f, v128f, 2>; + def VLCG : UnaryVRRa<"vlcg", 0xE7DE, z_vneg, v128g, v128g, 3>; // Load positive. - def VLPB : UnaryVRRa<"vlpb", 0xE7DF, null_frag, v128b, v128b, 0>; - def VLPH : UnaryVRRa<"vlph", 0xE7DF, null_frag, v128h, v128h, 1>; - def VLPF : UnaryVRRa<"vlpf", 0xE7DF, null_frag, v128f, v128f, 2>; - def VLPG : UnaryVRRa<"vlpg", 0xE7DF, null_frag, v128g, v128g, 3>; + def VLPB : UnaryVRRa<"vlpb", 0xE7DF, z_viabs8, v128b, v128b, 0>; + def VLPH : UnaryVRRa<"vlph", 0xE7DF, z_viabs16, v128h, v128h, 1>; + def VLPF : UnaryVRRa<"vlpf", 0xE7DF, z_viabs32, v128f, v128f, 2>; + def VLPG : UnaryVRRa<"vlpg", 0xE7DF, z_viabs64, v128g, v128g, 3>; // Maximum. def VMXB : BinaryVRRc<"vmxb", 0xE7FF, null_frag, v128b, v128b, 0>; @@ -331,9 +398,9 @@ let Predicates = [FeatureVector] in { def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>; // Multiply and add low. 
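+ // z_muladd matches (add (mul x, y), z); the "low" forms keep only the + // low element-sized half of each product, which is exactly the + // wrap-around multiplication that the IR performs.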
- def VMALB : TernaryVRRd<"vmalb", 0xE7AA, null_frag, v128b, v128b, 0>; - def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, null_frag, v128h, v128h, 1>; - def VMALF : TernaryVRRd<"vmalf", 0xE7AA, null_frag, v128f, v128f, 2>; + def VMALB : TernaryVRRd<"vmalb", 0xE7AA, z_muladd, v128b, v128b, 0>; + def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, z_muladd, v128h, v128h, 1>; + def VMALF : TernaryVRRd<"vmalf", 0xE7AA, z_muladd, v128f, v128f, 2>; // Multiply and add high. def VMAHB : TernaryVRRd<"vmahb", 0xE7AB, null_frag, v128b, v128b, 0>; @@ -376,9 +443,9 @@ let Predicates = [FeatureVector] in { def VMLHF : BinaryVRRc<"vmlhf", 0xE7A1, null_frag, v128f, v128f, 2>; // Multiply low. - def VMLB : BinaryVRRc<"vmlb", 0xE7A2, null_frag, v128b, v128b, 0>; - def VMLHW : BinaryVRRc<"vmlhw", 0xE7A2, null_frag, v128h, v128h, 1>; - def VMLF : BinaryVRRc<"vmlf", 0xE7A2, null_frag, v128f, v128f, 2>; + def VMLB : BinaryVRRc<"vmlb", 0xE7A2, mul, v128b, v128b, 0>; + def VMLHW : BinaryVRRc<"vmlhw", 0xE7A2, mul, v128h, v128h, 1>; + def VMLF : BinaryVRRc<"vmlf", 0xE7A2, mul, v128f, v128f, 2>; // Multiply even. def VMEB : BinaryVRRc<"vmeb", 0xE7A6, null_frag, v128h, v128b, 0>; @@ -408,6 +475,7 @@ let Predicates = [FeatureVector] in { // Population count. def VPOPCT : BinaryVRRa<"vpopct", 0xE750>; + def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>; // Element rotate left logical (with vector shift amount). def VERLLVB : BinaryVRRc<"verllvb", 0xE773, null_frag, v128b, v128b, 0>; @@ -428,40 +496,40 @@ let Predicates = [FeatureVector] in { def VERIMG : QuaternaryVRId<"verimg", 0xE772, null_frag, v128g, v128g, 3>; // Element shift left (with vector shift amount). - def VESLVB : BinaryVRRc<"veslvb", 0xE770, null_frag, v128b, v128b, 0>; - def VESLVH : BinaryVRRc<"veslvh", 0xE770, null_frag, v128h, v128h, 1>; - def VESLVF : BinaryVRRc<"veslvf", 0xE770, null_frag, v128f, v128f, 2>; - def VESLVG : BinaryVRRc<"veslvg", 0xE770, null_frag, v128g, v128g, 3>; + def VESLVB : BinaryVRRc<"veslvb", 0xE770, z_vshl, v128b, v128b, 0>; + def VESLVH : BinaryVRRc<"veslvh", 0xE770, z_vshl, v128h, v128h, 1>; + def VESLVF : BinaryVRRc<"veslvf", 0xE770, z_vshl, v128f, v128f, 2>; + def VESLVG : BinaryVRRc<"veslvg", 0xE770, z_vshl, v128g, v128g, 3>; // Element shift left (with scalar shift amount). - def VESLB : BinaryVRSa<"veslb", 0xE730, null_frag, v128b, v128b, 0>; - def VESLH : BinaryVRSa<"veslh", 0xE730, null_frag, v128h, v128h, 1>; - def VESLF : BinaryVRSa<"veslf", 0xE730, null_frag, v128f, v128f, 2>; - def VESLG : BinaryVRSa<"veslg", 0xE730, null_frag, v128g, v128g, 3>; + def VESLB : BinaryVRSa<"veslb", 0xE730, z_vshl_by_scalar, v128b, v128b, 0>; + def VESLH : BinaryVRSa<"veslh", 0xE730, z_vshl_by_scalar, v128h, v128h, 1>; + def VESLF : BinaryVRSa<"veslf", 0xE730, z_vshl_by_scalar, v128f, v128f, 2>; + def VESLG : BinaryVRSa<"veslg", 0xE730, z_vshl_by_scalar, v128g, v128g, 3>; // Element shift right arithmetic (with vector shift amount). 
- def VESRAVB : BinaryVRRc<"vesravb", 0xE77A, null_frag, v128b, v128b, 0>; - def VESRAVH : BinaryVRRc<"vesravh", 0xE77A, null_frag, v128h, v128h, 1>; - def VESRAVF : BinaryVRRc<"vesravf", 0xE77A, null_frag, v128f, v128f, 2>; - def VESRAVG : BinaryVRRc<"vesravg", 0xE77A, null_frag, v128g, v128g, 3>; + def VESRAVB : BinaryVRRc<"vesravb", 0xE77A, z_vsra, v128b, v128b, 0>; + def VESRAVH : BinaryVRRc<"vesravh", 0xE77A, z_vsra, v128h, v128h, 1>; + def VESRAVF : BinaryVRRc<"vesravf", 0xE77A, z_vsra, v128f, v128f, 2>; + def VESRAVG : BinaryVRRc<"vesravg", 0xE77A, z_vsra, v128g, v128g, 3>; // Element shift right arithmetic (with scalar shift amount). - def VESRAB : BinaryVRSa<"vesrab", 0xE73A, null_frag, v128b, v128b, 0>; - def VESRAH : BinaryVRSa<"vesrah", 0xE73A, null_frag, v128h, v128h, 1>; - def VESRAF : BinaryVRSa<"vesraf", 0xE73A, null_frag, v128f, v128f, 2>; - def VESRAG : BinaryVRSa<"vesrag", 0xE73A, null_frag, v128g, v128g, 3>; + def VESRAB : BinaryVRSa<"vesrab", 0xE73A, z_vsra_by_scalar, v128b, v128b, 0>; + def VESRAH : BinaryVRSa<"vesrah", 0xE73A, z_vsra_by_scalar, v128h, v128h, 1>; + def VESRAF : BinaryVRSa<"vesraf", 0xE73A, z_vsra_by_scalar, v128f, v128f, 2>; + def VESRAG : BinaryVRSa<"vesrag", 0xE73A, z_vsra_by_scalar, v128g, v128g, 3>; // Element shift right logical (with vector shift amount). - def VESRLVB : BinaryVRRc<"vesrlvb", 0xE778, null_frag, v128b, v128b, 0>; - def VESRLVH : BinaryVRRc<"vesrlvh", 0xE778, null_frag, v128h, v128h, 1>; - def VESRLVF : BinaryVRRc<"vesrlvf", 0xE778, null_frag, v128f, v128f, 2>; - def VESRLVG : BinaryVRRc<"vesrlvg", 0xE778, null_frag, v128g, v128g, 3>; + def VESRLVB : BinaryVRRc<"vesrlvb", 0xE778, z_vsrl, v128b, v128b, 0>; + def VESRLVH : BinaryVRRc<"vesrlvh", 0xE778, z_vsrl, v128h, v128h, 1>; + def VESRLVF : BinaryVRRc<"vesrlvf", 0xE778, z_vsrl, v128f, v128f, 2>; + def VESRLVG : BinaryVRRc<"vesrlvg", 0xE778, z_vsrl, v128g, v128g, 3>; // Element shift right logical (with scalar shift amount). - def VESRLB : BinaryVRSa<"vesrlb", 0xE738, null_frag, v128b, v128b, 0>; - def VESRLH : BinaryVRSa<"vesrlh", 0xE738, null_frag, v128h, v128h, 1>; - def VESRLF : BinaryVRSa<"vesrlf", 0xE738, null_frag, v128f, v128f, 2>; - def VESRLG : BinaryVRSa<"vesrlg", 0xE738, null_frag, v128g, v128g, 3>; + def VESRLB : BinaryVRSa<"vesrlb", 0xE738, z_vsrl_by_scalar, v128b, v128b, 0>; + def VESRLH : BinaryVRSa<"vesrlh", 0xE738, z_vsrl_by_scalar, v128h, v128h, 1>; + def VESRLF : BinaryVRSa<"vesrlf", 0xE738, z_vsrl_by_scalar, v128f, v128f, 2>; + def VESRLG : BinaryVRSa<"vesrlg", 0xE738, z_vsrl_by_scalar, v128g, v128g, 3>; // Shift left. def VSL : BinaryVRRc<"vsl", 0xE774, null_frag, v128b, v128b>; @@ -470,7 +538,7 @@ let Predicates = [FeatureVector] in { def VSLB : BinaryVRRc<"vslb", 0xE775, null_frag, v128b, v128b>; // Shift left double by byte. - def VSLDB : TernaryVRId<"vsldb", 0xE777, null_frag, v128b, v128b, 0>; + def VSLDB : TernaryVRId<"vsldb", 0xE777, z_shl_double, v128b, v128b, 0>; // Shift right arithmetic. def VSRA : BinaryVRRc<"vsra", 0xE77E, null_frag, v128b, v128b>; @@ -485,10 +553,10 @@ let Predicates = [FeatureVector] in { def VSRLB : BinaryVRRc<"vsrlb", 0xE77D, null_frag, v128b, v128b>; // Subtract. 
- def VSB : BinaryVRRc<"vsb", 0xE7F7, null_frag, v128b, v128b, 0>; - def VSH : BinaryVRRc<"vsh", 0xE7F7, null_frag, v128h, v128h, 1>; - def VSF : BinaryVRRc<"vsf", 0xE7F7, null_frag, v128f, v128f, 2>; - def VSG : BinaryVRRc<"vsg", 0xE7F7, null_frag, v128g, v128g, 3>; + def VSB : BinaryVRRc<"vsb", 0xE7F7, sub, v128b, v128b, 0>; + def VSH : BinaryVRRc<"vsh", 0xE7F7, sub, v128h, v128h, 1>; + def VSF : BinaryVRRc<"vsf", 0xE7F7, sub, v128f, v128f, 2>; + def VSG : BinaryVRRc<"vsg", 0xE7F7, sub, v128g, v128g, 3>; def VSQ : BinaryVRRc<"vsq", 0xE7F7, null_frag, v128q, v128q, 4>; // Subtract compute borrow indication. @@ -505,18 +573,107 @@ let Predicates = [FeatureVector] in { def VSBCBIQ : TernaryVRRd<"vsbcbiq", 0xE7BD, null_frag, v128q, v128q, 4>; // Sum across doubleword. - def VSUMGH : BinaryVRRc<"vsumgh", 0xE765, null_frag, v128g, v128h, 1>; - def VSUMGF : BinaryVRRc<"vsumgf", 0xE765, null_frag, v128g, v128f, 2>; + def VSUMGH : BinaryVRRc<"vsumgh", 0xE765, z_vsum, v128g, v128h, 1>; + def VSUMGF : BinaryVRRc<"vsumgf", 0xE765, z_vsum, v128g, v128f, 2>; // Sum across quadword. - def VSUMQF : BinaryVRRc<"vsumqf", 0xE767, null_frag, v128q, v128f, 2>; - def VSUMQG : BinaryVRRc<"vsumqg", 0xE767, null_frag, v128q, v128g, 3>; + def VSUMQF : BinaryVRRc<"vsumqf", 0xE767, z_vsum, v128q, v128f, 2>; + def VSUMQG : BinaryVRRc<"vsumqg", 0xE767, z_vsum, v128q, v128g, 3>; // Sum across word. - def VSUMB : BinaryVRRc<"vsumb", 0xE764, null_frag, v128f, v128b, 0>; - def VSUMH : BinaryVRRc<"vsumh", 0xE764, null_frag, v128f, v128h, 1>; + def VSUMB : BinaryVRRc<"vsumb", 0xE764, z_vsum, v128f, v128b, 0>; + def VSUMH : BinaryVRRc<"vsumh", 0xE764, z_vsum, v128f, v128h, 1>; +} + +// Instantiate the bitwise ops for type TYPE. +multiclass BitwiseVectorOps<ValueType type> { + let Predicates = [FeatureVector] in { + def : Pat<(type (and VR128:$x, VR128:$y)), (VN VR128:$x, VR128:$y)>; + def : Pat<(type (and VR128:$x, (z_vnot VR128:$y))), + (VNC VR128:$x, VR128:$y)>; + def : Pat<(type (or VR128:$x, VR128:$y)), (VO VR128:$x, VR128:$y)>; + def : Pat<(type (xor VR128:$x, VR128:$y)), (VX VR128:$x, VR128:$y)>; + def : Pat<(type (or (and VR128:$x, VR128:$z), + (and VR128:$y, (z_vnot VR128:$z)))), + (VSEL VR128:$x, VR128:$y, VR128:$z)>; + def : Pat<(type (z_vnot (or VR128:$x, VR128:$y))), + (VNO VR128:$x, VR128:$y)>; + def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>; + } +} + +defm : BitwiseVectorOps<v16i8>; +defm : BitwiseVectorOps<v8i16>; +defm : BitwiseVectorOps<v4i32>; +defm : BitwiseVectorOps<v2i64>; + +// Instantiate additional patterns for absolute-related expressions on +// type TYPE. LC is the negate instruction for TYPE and LP is the absolute +// instruction. 
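+// The vselect patterns below match (x > 0 ? -x : x) and its variants, all +// of which compute -|x|; the or/and patterns match the equivalent sign-mask +// expansions ((x >>s (bits - 1)) is all-ones exactly when x is negative).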
+multiclass IntegerAbsoluteVectorOps<ValueType type, Instruction lc, + Instruction lp, int shift> { + let Predicates = [FeatureVector] in { + def : Pat<(type (vselect (type (z_vicmph_zero VR128:$x)), + (z_vneg VR128:$x), VR128:$x)), + (lc (lp VR128:$x))>; + def : Pat<(type (vselect (type (z_vnot (z_vicmph_zero VR128:$x))), + VR128:$x, (z_vneg VR128:$x))), + (lc (lp VR128:$x))>; + def : Pat<(type (vselect (type (z_vicmpl_zero VR128:$x)), + VR128:$x, (z_vneg VR128:$x))), + (lc (lp VR128:$x))>; + def : Pat<(type (vselect (type (z_vnot (z_vicmpl_zero VR128:$x))), + (z_vneg VR128:$x), VR128:$x)), + (lc (lp VR128:$x))>; + def : Pat<(type (or (and (z_vsra_by_scalar VR128:$x, (i32 shift)), + (z_vneg VR128:$x)), + (and (z_vnot (z_vsra_by_scalar VR128:$x, (i32 shift))), + VR128:$x))), + (lp VR128:$x)>; + def : Pat<(type (or (and (z_vsra_by_scalar VR128:$x, (i32 shift)), + VR128:$x), + (and (z_vnot (z_vsra_by_scalar VR128:$x, (i32 shift))), + (z_vneg VR128:$x)))), + (lc (lp VR128:$x))>; + } } +defm : IntegerAbsoluteVectorOps<v16i8, VLCB, VLPB, 7>; +defm : IntegerAbsoluteVectorOps<v8i16, VLCH, VLPH, 15>; +defm : IntegerAbsoluteVectorOps<v4i32, VLCF, VLPF, 31>; +defm : IntegerAbsoluteVectorOps<v2i64, VLCG, VLPG, 63>; + +// Instantiate minimum- and maximum-related patterns for TYPE. CMPH is the +// signed or unsigned "set if greater than" comparison instruction and +// MIN and MAX are the associated minimum and maximum instructions. +multiclass IntegerMinMaxVectorOps<ValueType type, SDPatternOperator cmph, + Instruction min, Instruction max> { + let Predicates = [FeatureVector] in { + def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$x, VR128:$y)), + (max VR128:$x, VR128:$y)>; + def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$y, VR128:$x)), + (min VR128:$x, VR128:$y)>; + def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)), + VR128:$x, VR128:$y)), + (min VR128:$x, VR128:$y)>; + def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)), + VR128:$y, VR128:$x)), + (max VR128:$x, VR128:$y)>; + } +} + +// Signed min/max. +defm : IntegerMinMaxVectorOps<v16i8, z_vicmph, VMNB, VMXB>; +defm : IntegerMinMaxVectorOps<v8i16, z_vicmph, VMNH, VMXH>; +defm : IntegerMinMaxVectorOps<v4i32, z_vicmph, VMNF, VMXF>; +defm : IntegerMinMaxVectorOps<v2i64, z_vicmph, VMNG, VMXG>; + +// Unsigned min/max. +defm : IntegerMinMaxVectorOps<v16i8, z_vicmphl, VMNLB, VMXLB>; +defm : IntegerMinMaxVectorOps<v8i16, z_vicmphl, VMNLH, VMXLH>; +defm : IntegerMinMaxVectorOps<v4i32, z_vicmphl, VMNLF, VMXLF>; +defm : IntegerMinMaxVectorOps<v2i64, z_vicmphl, VMNLG, VMXLG>; + //===----------------------------------------------------------------------===// // Integer comparison //===----------------------------------------------------------------------===// @@ -539,33 +696,33 @@ let Predicates = [FeatureVector] in { } // Compare equal. - defm VCEQB : BinaryVRRbSPair<"vceqb", 0xE7F8, null_frag, null_frag, + defm VCEQB : BinaryVRRbSPair<"vceqb", 0xE7F8, z_vicmpe, null_frag, v128b, v128b, 0>; - defm VCEQH : BinaryVRRbSPair<"vceqh", 0xE7F8, null_frag, null_frag, + defm VCEQH : BinaryVRRbSPair<"vceqh", 0xE7F8, z_vicmpe, null_frag, v128h, v128h, 1>; - defm VCEQF : BinaryVRRbSPair<"vceqf", 0xE7F8, null_frag, null_frag, + defm VCEQF : BinaryVRRbSPair<"vceqf", 0xE7F8, z_vicmpe, null_frag, v128f, v128f, 2>; - defm VCEQG : BinaryVRRbSPair<"vceqg", 0xE7F8, null_frag, null_frag, + defm VCEQG : BinaryVRRbSPair<"vceqg", 0xE7F8, z_vicmpe, null_frag, v128g, v128g, 3>; // Compare high. 
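+ // "High" is the z/Architecture term for signed greater-than; as with the + // equality comparisons, true elements are set to all-ones and false + // elements to zero.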
- defm VCHB : BinaryVRRbSPair<"vchb", 0xE7FB, null_frag, null_frag, + defm VCHB : BinaryVRRbSPair<"vchb", 0xE7FB, z_vicmph, null_frag, v128b, v128b, 0>; - defm VCHH : BinaryVRRbSPair<"vchh", 0xE7FB, null_frag, null_frag, + defm VCHH : BinaryVRRbSPair<"vchh", 0xE7FB, z_vicmph, null_frag, v128h, v128h, 1>; - defm VCHF : BinaryVRRbSPair<"vchf", 0xE7FB, null_frag, null_frag, + defm VCHF : BinaryVRRbSPair<"vchf", 0xE7FB, z_vicmph, null_frag, v128f, v128f, 2>; - defm VCHG : BinaryVRRbSPair<"vchg", 0xE7FB, null_frag, null_frag, + defm VCHG : BinaryVRRbSPair<"vchg", 0xE7FB, z_vicmph, null_frag, v128g, v128g, 3>; // Compare high logical. - defm VCHLB : BinaryVRRbSPair<"vchlb", 0xE7F9, null_frag, null_frag, + defm VCHLB : BinaryVRRbSPair<"vchlb", 0xE7F9, z_vicmphl, null_frag, v128b, v128b, 0>; - defm VCHLH : BinaryVRRbSPair<"vchlh", 0xE7F9, null_frag, null_frag, + defm VCHLH : BinaryVRRbSPair<"vchlh", 0xE7F9, z_vicmphl, null_frag, v128h, v128h, 1>; - defm VCHLF : BinaryVRRbSPair<"vchlf", 0xE7F9, null_frag, null_frag, + defm VCHLF : BinaryVRRbSPair<"vchlf", 0xE7F9, z_vicmphl, null_frag, v128f, v128f, 2>; - defm VCHLG : BinaryVRRbSPair<"vchlg", 0xE7F9, null_frag, null_frag, + defm VCHLG : BinaryVRRbSPair<"vchlg", 0xE7F9, z_vicmphl, null_frag, v128g, v128g, 3>; // Test under mask. @@ -686,6 +843,44 @@ let Predicates = [FeatureVector] in { } //===----------------------------------------------------------------------===// +// Conversions +//===----------------------------------------------------------------------===// + +def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; + +def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; + +def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; + +def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; + +//===----------------------------------------------------------------------===// +// Replicating scalars +//===----------------------------------------------------------------------===// + +// Define patterns for replicating a scalar GR32 into a vector of type TYPE. +// VLVGP32 leaves the scalar in the low 32 bits of each doubleword, so INDEX +// selects the element that ends the first doubleword (its byte offset is +// 8 minus the element size in bytes). +class VectorReplicateScalar<ValueType type, Instruction insn, bits<16> index> + : Pat<(type (z_replicate GR32:$scalar)), + (insn (VLVGP32 GR32:$scalar, GR32:$scalar), index)>; + +def : VectorReplicateScalar<v16i8, VREPB, 7>; +def : VectorReplicateScalar<v8i16, VREPH, 3>; +def : VectorReplicateScalar<v4i32, VREPF, 1>; + +// i64 replications are just a single instruction.
+def : Pat<(v2i64 (z_replicate GR64:$scalar)), + (VLVGP GR64:$scalar, GR64:$scalar)>; + +//===----------------------------------------------------------------------===// // String instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index 3151052ecf5..2e431859a86 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -82,6 +82,45 @@ def SDT_ZPrefetch : SDTypeProfile<0, 2, def SDT_ZTBegin : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; +def SDT_ZInsertVectorElt : SDTypeProfile<1, 3, + [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisVT<3, i32>]>; +def SDT_ZExtractVectorElt : SDTypeProfile<1, 2, + [SDTCisVec<1>, + SDTCisVT<2, i32>]>; +def SDT_ZReplicate : SDTypeProfile<1, 1, + [SDTCisVec<0>]>; +def SDT_ZVecBinary : SDTypeProfile<1, 2, + [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def SDT_ZVecBinaryInt : SDTypeProfile<1, 2, + [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def SDT_ZVecBinaryConv : SDTypeProfile<1, 2, + [SDTCisVec<0>, + SDTCisVec<1>, + SDTCisSameAs<1, 2>]>; +def SDT_ZRotateMask : SDTypeProfile<1, 2, + [SDTCisVec<0>, + SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; +def SDT_ZJoinDwords : SDTypeProfile<1, 2, + [SDTCisVT<0, v2i64>, + SDTCisVT<1, i64>, + SDTCisVT<2, i64>]>; +def SDT_ZVecTernary : SDTypeProfile<1, 3, + [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; +def SDT_ZVecTernaryInt : SDTypeProfile<1, 3, + [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisVT<3, i32>]>; //===----------------------------------------------------------------------===// // Node definitions @@ -134,6 +173,34 @@ def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>; def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone, [SDNPHasChain, SDNPMayStore]>; +// Defined because the index is an i32 rather than a pointer. 
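+// (This matches the getVectorIdxTy override in SystemZISelLowering.h, which +// returns MVT::i32 so that indices do not clobber the upper halves of GPRs.)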
+def z_vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT", + SDT_ZInsertVectorElt>; +def z_vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT", + SDT_ZExtractVectorElt>; +def z_byte_mask : SDNode<"SystemZISD::BYTE_MASK", SDT_ZReplicate>; +def z_rotate_mask : SDNode<"SystemZISD::ROTATE_MASK", SDT_ZRotateMask>; +def z_replicate : SDNode<"SystemZISD::REPLICATE", SDT_ZReplicate>; +def z_join_dwords : SDNode<"SystemZISD::JOIN_DWORDS", SDT_ZJoinDwords>; +def z_splat : SDNode<"SystemZISD::SPLAT", SDT_ZVecBinaryInt>; +def z_merge_high : SDNode<"SystemZISD::MERGE_HIGH", SDT_ZVecBinary>; +def z_merge_low : SDNode<"SystemZISD::MERGE_LOW", SDT_ZVecBinary>; +def z_shl_double : SDNode<"SystemZISD::SHL_DOUBLE", SDT_ZVecTernaryInt>; +def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS", + SDT_ZVecTernaryInt>; +def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>; +def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>; +def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR", + SDT_ZVecBinaryInt>; +def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR", + SDT_ZVecBinaryInt>; +def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR", + SDT_ZVecBinaryInt>; +def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZVecBinaryConv>; +def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>; +def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>; +def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>; + class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW> : SDNode<"SystemZISD::"##name, profile, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; @@ -192,6 +259,10 @@ def z_tbegin_nofloat : SDNode<"SystemZISD::TBEGIN_NOFLOAT", SDT_ZTBegin, def z_tend : SDNode<"SystemZISD::TEND", SDTNone, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def z_vshl : SDNode<"ISD::SHL", SDT_ZVecBinary>; +def z_vsra : SDNode<"ISD::SRA", SDT_ZVecBinary>; +def z_vsrl : SDNode<"ISD::SRL", SDT_ZVecBinary>; + //===----------------------------------------------------------------------===// // Pattern fragments //===----------------------------------------------------------------------===// @@ -215,11 +286,21 @@ def sext8 : PatFrag<(ops node:$src), (sext_inreg node:$src, i8)>; def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>; def sext32 : PatFrag<(ops node:$src), (sext (i32 node:$src))>; +// Match extensions of an i32 to an i64, followed by an in-register sign +// extension from a sub-i32 value. +def sext8dbl : PatFrag<(ops node:$src), (sext8 (anyext node:$src))>; +def sext16dbl : PatFrag<(ops node:$src), (sext16 (anyext node:$src))>; + // Register zero-extend operations. Sub-32-bit values are represented as i32s. def zext8 : PatFrag<(ops node:$src), (and node:$src, 0xff)>; def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>; def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>; +// Match extensions of an i32 to an i64, followed by an AND of the low +// i8 or i16 part. +def zext8dbl : PatFrag<(ops node:$src), (zext8 (anyext node:$src))>; +def zext16dbl : PatFrag<(ops node:$src), (zext16 (anyext node:$src))>; + // Typed floating-point loads. 
def loadf32 : PatFrag<(ops node:$src), (f32 (load node:$src))>; def loadf64 : PatFrag<(ops node:$src), (f64 (load node:$src))>; @@ -383,6 +464,10 @@ def z_iabs64 : PatFrag<(ops node:$src), def z_inegabs32 : PatFrag<(ops node:$src), (ineg (z_iabs32 node:$src))>; def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>; +// Integer multiply-and-add +def z_muladd : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (add (mul node:$src1, node:$src2), node:$src3)>; + // Fused multiply-add and multiply-subtract, but with the order of the // operands matching SystemZ's MA and MS instructions. def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3), @@ -403,3 +488,88 @@ class loadu<SDPatternOperator operator, SDPatternOperator load = load> class storeu<SDPatternOperator operator, SDPatternOperator store = store> : PatFrag<(ops node:$value, node:$addr), (store (operator node:$value), node:$addr)>; + +// Vector representation of all-zeros and all-ones. +def z_vzero : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 0))))>; +def z_vones : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 65535))))>; + +// Load a scalar and replicate it in all elements of a vector. +class z_replicate_load<ValueType scalartype, SDPatternOperator load> + : PatFrag<(ops node:$addr), + (z_replicate (scalartype (load node:$addr)))>; +def z_replicate_loadi8 : z_replicate_load<i32, anyextloadi8>; +def z_replicate_loadi16 : z_replicate_load<i32, anyextloadi16>; +def z_replicate_loadi32 : z_replicate_load<i32, load>; +def z_replicate_loadi64 : z_replicate_load<i64, load>; + +// Load a scalar and insert it into a single element of a vector. +class z_vle<ValueType scalartype, SDPatternOperator load> + : PatFrag<(ops node:$vec, node:$addr, node:$index), + (z_vector_insert node:$vec, (scalartype (load node:$addr)), + node:$index)>; +def z_vlei8 : z_vle<i32, anyextloadi8>; +def z_vlei16 : z_vle<i32, anyextloadi16>; +def z_vlei32 : z_vle<i32, load>; +def z_vlei64 : z_vle<i64, load>; + +// Load a scalar and insert it into the low element of the high i64 of a +// zeroed vector. +class z_vllez<ValueType scalartype, SDPatternOperator load, int index> + : PatFrag<(ops node:$addr), + (z_vector_insert (z_vzero), + (scalartype (load node:$addr)), (i32 index))>; +def z_vllezi8 : z_vllez<i32, anyextloadi8, 7>; +def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>; +def z_vllezi32 : z_vllez<i32, load, 1>; +def z_vllezi64 : PatFrag<(ops node:$addr), + (z_join_dwords (i64 (load node:$addr)), (i64 0))>; + +// Store one element of a vector. +class z_vste<ValueType scalartype, SDPatternOperator store> + : PatFrag<(ops node:$vec, node:$addr, node:$index), + (store (scalartype (z_vector_extract node:$vec, node:$index)), + node:$addr)>; +def z_vstei8 : z_vste<i32, truncstorei8>; +def z_vstei16 : z_vste<i32, truncstorei16>; +def z_vstei32 : z_vste<i32, store>; +def z_vstei64 : z_vste<i64, store>; + +// Arithmetic negation on vectors. +def z_vneg : PatFrag<(ops node:$x), (sub (z_vzero), node:$x)>; + +// Bitwise negation on vectors. +def z_vnot : PatFrag<(ops node:$x), (xor node:$x, (z_vones))>; + +// Signed "integer greater than zero" on vectors. +def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, (z_vzero))>; + +// Signed "integer less than zero" on vectors. +def z_vicmpl_zero : PatFrag<(ops node:$x), (z_vicmph (z_vzero), node:$x)>; + +// Integer absolute on vectors. 
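+// This is the usual shift-based expansion: with s = x >>s (bits - 1), +// the absolute value is (x + s) ^ s.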
+class z_viabs<int shift> + : PatFrag<(ops node:$src), + (xor (add node:$src, (z_vsra_by_scalar node:$src, (i32 shift))), + (z_vsra_by_scalar node:$src, (i32 shift)))>; +def z_viabs8 : z_viabs<7>; +def z_viabs16 : z_viabs<15>; +def z_viabs32 : z_viabs<31>; +def z_viabs64 : z_viabs<63>; + +// Sign-extend the i64 elements of a vector. +class z_vse<int shift> + : PatFrag<(ops node:$src), + (z_vsra_by_scalar (z_vshl_by_scalar node:$src, shift), shift)>; +def z_vsei8 : z_vse<56>; +def z_vsei16 : z_vse<48>; +def z_vsei32 : z_vse<32>; + +// ...and again with the extensions being done on individual i64 scalars. +class z_vse_by_parts<SDPatternOperator operator, int index1, int index2> + : PatFrag<(ops node:$src), + (z_join_dwords + (operator (z_vector_extract node:$src, index1)), + (operator (z_vector_extract node:$src, index2)))>; +def z_vsei8_by_parts : z_vse_by_parts<sext8dbl, 7, 15>; +def z_vsei16_by_parts : z_vse_by_parts<sext16dbl, 3, 7>; +def z_vsei32_by_parts : z_vse_by_parts<sext32, 1, 3>; diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index b2f8175579f..a34cdaf8030 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -21,15 +21,70 @@ extern "C" void LLVMInitializeSystemZTarget() { RegisterTargetMachine<SystemZTargetMachine> X(TheSystemZTarget); } +// Determine whether we use the vector ABI. +static bool UsesVectorABI(StringRef CPU, StringRef FS) { + // We use the vector ABI whenever the vector facility is available. + // This is the case by default if CPU is z13 or later, and can be + // overridden via "[+-]vector" feature string elements. + bool VectorABI = true; + if (CPU.empty() || CPU == "generic" || + CPU == "z10" || CPU == "z196" || CPU == "zEC12") + VectorABI = false; + + SmallVector<StringRef, 3> Features; + FS.split(Features, ",", -1, false /* KeepEmpty */); + for (auto &Feature : Features) { + if (Feature == "vector" || Feature == "+vector") + VectorABI = true; + if (Feature == "-vector") + VectorABI = false; + } + + return VectorABI; +} + +static std::string computeDataLayout(StringRef TT, StringRef CPU, + StringRef FS) { + const Triple Triple(TT); + bool VectorABI = UsesVectorABI(CPU, FS); + std::string Ret = ""; + + // Big endian. + Ret += "E"; + + // Data mangling. + Ret += DataLayout::getManglingComponent(Triple); + + // Make sure that global data has at least 16 bits of alignment by + // default, so that we can refer to it using LARL. We don't have any + // special requirements for stack variables though. + Ret += "-i1:8:16-i8:8:16"; + + // 64-bit integers are naturally aligned. + Ret += "-i64:64"; + + // 128-bit floats are aligned only to 64 bits. + Ret += "-f128:64"; + + // When using the vector ABI, 128-bit vectors are also aligned to 64 bits. + if (VectorABI) + Ret += "-v128:64"; + + // We prefer 16 bits of alignment for all globals; see above. + Ret += "-a:8:16"; + + // Integer registers are 32 or 64 bits. + Ret += "-n32:64"; + + return Ret; +} SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - // Make sure that global data has at least 16 bits of alignment by - // default, so that we can refer to it using LARL. We don't have any - // special requirements for stack variables though.
- : LLVMTargetMachine(T, "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64", + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options, RM, CM, OL), TLOF(make_unique<TargetLoweringObjectFileELF>()), Subtarget(TT, CPU, FS, *this) { diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 3337f6388bd..5a87df1976c 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -238,3 +238,21 @@ SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } +unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) { + if (!Vector) + // Discount the stack pointer. Also leave out %r0, since it can't + // be used in an address. + return 14; + if (ST->hasVector()) + return 32; + return 0; +} + +unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) { + if (!Vector) + return 64; + if (ST->hasVector()) + return 128; + return 0; +} + diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index d4989130679..e9cabe968ee 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -63,6 +63,14 @@ public: TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); /// @} + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + + /// @} }; } // end namespace llvm diff --git a/llvm/test/CodeGen/SystemZ/frame-19.ll b/llvm/test/CodeGen/SystemZ/frame-19.ll new file mode 100644 index 00000000000..f6e327c3ae3 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/frame-19.ll @@ -0,0 +1,314 @@ +; Test spilling of vector registers. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; We need to allocate a 16-byte spill slot and save the 8 call-saved FPRs. +; The frame size should be exactly 160 + 16 + 8 * 8 = 240. 
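+; (160 is the size of the z/Linux ABI-defined register save area that each +; frame provides for its callees; 8 * 8 covers the call-saved FPRs %f8-%f15.)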
+define void @f1(<16 x i8> *%ptr) { +; CHECK-LABEL: f1: +; CHECK: aghi %r15, -240 +; CHECK-DAG: std %f8, +; CHECK-DAG: std %f9, +; CHECK-DAG: std %f10, +; CHECK-DAG: std %f11, +; CHECK-DAG: std %f12, +; CHECK-DAG: std %f13, +; CHECK-DAG: std %f14, +; CHECK-DAG: std %f15, +; CHECK: vst {{%v[0-9]+}}, 160(%r15) +; CHECK: vl {{%v[0-9]+}}, 160(%r15) +; CHECK-DAG: ld %f8, +; CHECK-DAG: ld %f9, +; CHECK-DAG: ld %f10, +; CHECK-DAG: ld %f11, +; CHECK-DAG: ld %f12, +; CHECK-DAG: ld %f13, +; CHECK-DAG: ld %f14, +; CHECK-DAG: ld %f15, +; CHECK: aghi %r15, 240 +; CHECK: br %r14 + %v0 = load volatile <16 x i8>, <16 x i8> *%ptr + %v1 = load volatile <16 x i8>, <16 x i8> *%ptr + %v2 = load volatile <16 x i8>, <16 x i8> *%ptr + %v3 = load volatile <16 x i8>, <16 x i8> *%ptr + %v4 = load volatile <16 x i8>, <16 x i8> *%ptr + %v5 = load volatile <16 x i8>, <16 x i8> *%ptr + %v6 = load volatile <16 x i8>, <16 x i8> *%ptr + %v7 = load volatile <16 x i8>, <16 x i8> *%ptr + %v8 = load volatile <16 x i8>, <16 x i8> *%ptr + %v9 = load volatile <16 x i8>, <16 x i8> *%ptr + %v10 = load volatile <16 x i8>, <16 x i8> *%ptr + %v11 = load volatile <16 x i8>, <16 x i8> *%ptr + %v12 = load volatile <16 x i8>, <16 x i8> *%ptr + %v13 = load volatile <16 x i8>, <16 x i8> *%ptr + %v14 = load volatile <16 x i8>, <16 x i8> *%ptr + %v15 = load volatile <16 x i8>, <16 x i8> *%ptr + %v16 = load volatile <16 x i8>, <16 x i8> *%ptr + %v17 = load volatile <16 x i8>, <16 x i8> *%ptr + %v18 = load volatile <16 x i8>, <16 x i8> *%ptr + %v19 = load volatile <16 x i8>, <16 x i8> *%ptr + %v20 = load volatile <16 x i8>, <16 x i8> *%ptr + %v21 = load volatile <16 x i8>, <16 x i8> *%ptr + %v22 = load volatile <16 x i8>, <16 x i8> *%ptr + %v23 = load volatile <16 x i8>, <16 x i8> *%ptr + %v24 = load volatile <16 x i8>, <16 x i8> *%ptr + %v25 = load volatile <16 x i8>, <16 x i8> *%ptr + %v26 = load volatile <16 x i8>, <16 x i8> *%ptr + %v27 = load volatile <16 x i8>, <16 x i8> *%ptr + %v28 = load volatile <16 x i8>, <16 x i8> *%ptr + %v29 = load volatile <16 x i8>, <16 x i8> *%ptr + %v30 = load volatile <16 x i8>, <16 x i8> *%ptr + %v31 = load volatile <16 x i8>, <16 x i8> *%ptr + %vx = load volatile <16 x i8>, <16 x i8> *%ptr + store volatile <16 x i8> %vx, <16 x i8> *%ptr + store volatile <16 x i8> %v31, <16 x i8> *%ptr + store volatile <16 x i8> %v30, <16 x i8> *%ptr + store volatile <16 x i8> %v29, <16 x i8> *%ptr + store volatile <16 x i8> %v28, <16 x i8> *%ptr + store volatile <16 x i8> %v27, <16 x i8> *%ptr + store volatile <16 x i8> %v26, <16 x i8> *%ptr + store volatile <16 x i8> %v25, <16 x i8> *%ptr + store volatile <16 x i8> %v24, <16 x i8> *%ptr + store volatile <16 x i8> %v23, <16 x i8> *%ptr + store volatile <16 x i8> %v22, <16 x i8> *%ptr + store volatile <16 x i8> %v21, <16 x i8> *%ptr + store volatile <16 x i8> %v20, <16 x i8> *%ptr + store volatile <16 x i8> %v19, <16 x i8> *%ptr + store volatile <16 x i8> %v18, <16 x i8> *%ptr + store volatile <16 x i8> %v17, <16 x i8> *%ptr + store volatile <16 x i8> %v16, <16 x i8> *%ptr + store volatile <16 x i8> %v15, <16 x i8> *%ptr + store volatile <16 x i8> %v14, <16 x i8> *%ptr + store volatile <16 x i8> %v13, <16 x i8> *%ptr + store volatile <16 x i8> %v12, <16 x i8> *%ptr + store volatile <16 x i8> %v11, <16 x i8> *%ptr + store volatile <16 x i8> %v10, <16 x i8> *%ptr + store volatile <16 x i8> %v9, <16 x i8> *%ptr + store volatile <16 x i8> %v8, <16 x i8> *%ptr + store volatile <16 x i8> %v7, <16 x i8> *%ptr + store volatile <16 x i8> %v6, <16 x i8> *%ptr + store volatile <16 x 
i8> %v5, <16 x i8> *%ptr + store volatile <16 x i8> %v4, <16 x i8> *%ptr + store volatile <16 x i8> %v3, <16 x i8> *%ptr + store volatile <16 x i8> %v2, <16 x i8> *%ptr + store volatile <16 x i8> %v1, <16 x i8> *%ptr + store volatile <16 x i8> %v0, <16 x i8> *%ptr + ret void +} + +; Like f1, but no 16-byte slot should be needed. +define void @f2(<16 x i8> *%ptr) { +; CHECK-LABEL: f2: +; CHECK: aghi %r15, -224 +; CHECK-DAG: std %f8, +; CHECK-DAG: std %f9, +; CHECK-DAG: std %f10, +; CHECK-DAG: std %f11, +; CHECK-DAG: std %f12, +; CHECK-DAG: std %f13, +; CHECK-DAG: std %f14, +; CHECK-DAG: std %f15, +; CHECK-NOT: vst {{.*}}(%r15) +; CHECK-NOT: vl {{.*}}(%r15) +; CHECK-DAG: ld %f8, +; CHECK-DAG: ld %f9, +; CHECK-DAG: ld %f10, +; CHECK-DAG: ld %f11, +; CHECK-DAG: ld %f12, +; CHECK-DAG: ld %f13, +; CHECK-DAG: ld %f14, +; CHECK-DAG: ld %f15, +; CHECK: aghi %r15, 224 +; CHECK: br %r14 + %v0 = load volatile <16 x i8>, <16 x i8> *%ptr + %v1 = load volatile <16 x i8>, <16 x i8> *%ptr + %v2 = load volatile <16 x i8>, <16 x i8> *%ptr + %v3 = load volatile <16 x i8>, <16 x i8> *%ptr + %v4 = load volatile <16 x i8>, <16 x i8> *%ptr + %v5 = load volatile <16 x i8>, <16 x i8> *%ptr + %v6 = load volatile <16 x i8>, <16 x i8> *%ptr + %v7 = load volatile <16 x i8>, <16 x i8> *%ptr + %v8 = load volatile <16 x i8>, <16 x i8> *%ptr + %v9 = load volatile <16 x i8>, <16 x i8> *%ptr + %v10 = load volatile <16 x i8>, <16 x i8> *%ptr + %v11 = load volatile <16 x i8>, <16 x i8> *%ptr + %v12 = load volatile <16 x i8>, <16 x i8> *%ptr + %v13 = load volatile <16 x i8>, <16 x i8> *%ptr + %v14 = load volatile <16 x i8>, <16 x i8> *%ptr + %v15 = load volatile <16 x i8>, <16 x i8> *%ptr + %v16 = load volatile <16 x i8>, <16 x i8> *%ptr + %v17 = load volatile <16 x i8>, <16 x i8> *%ptr + %v18 = load volatile <16 x i8>, <16 x i8> *%ptr + %v19 = load volatile <16 x i8>, <16 x i8> *%ptr + %v20 = load volatile <16 x i8>, <16 x i8> *%ptr + %v21 = load volatile <16 x i8>, <16 x i8> *%ptr + %v22 = load volatile <16 x i8>, <16 x i8> *%ptr + %v23 = load volatile <16 x i8>, <16 x i8> *%ptr + %v24 = load volatile <16 x i8>, <16 x i8> *%ptr + %v25 = load volatile <16 x i8>, <16 x i8> *%ptr + %v26 = load volatile <16 x i8>, <16 x i8> *%ptr + %v27 = load volatile <16 x i8>, <16 x i8> *%ptr + %v28 = load volatile <16 x i8>, <16 x i8> *%ptr + %v29 = load volatile <16 x i8>, <16 x i8> *%ptr + %v30 = load volatile <16 x i8>, <16 x i8> *%ptr + %v31 = load volatile <16 x i8>, <16 x i8> *%ptr + store volatile <16 x i8> %v31, <16 x i8> *%ptr + store volatile <16 x i8> %v30, <16 x i8> *%ptr + store volatile <16 x i8> %v29, <16 x i8> *%ptr + store volatile <16 x i8> %v28, <16 x i8> *%ptr + store volatile <16 x i8> %v27, <16 x i8> *%ptr + store volatile <16 x i8> %v26, <16 x i8> *%ptr + store volatile <16 x i8> %v25, <16 x i8> *%ptr + store volatile <16 x i8> %v24, <16 x i8> *%ptr + store volatile <16 x i8> %v23, <16 x i8> *%ptr + store volatile <16 x i8> %v22, <16 x i8> *%ptr + store volatile <16 x i8> %v21, <16 x i8> *%ptr + store volatile <16 x i8> %v20, <16 x i8> *%ptr + store volatile <16 x i8> %v19, <16 x i8> *%ptr + store volatile <16 x i8> %v18, <16 x i8> *%ptr + store volatile <16 x i8> %v17, <16 x i8> *%ptr + store volatile <16 x i8> %v16, <16 x i8> *%ptr + store volatile <16 x i8> %v15, <16 x i8> *%ptr + store volatile <16 x i8> %v14, <16 x i8> *%ptr + store volatile <16 x i8> %v13, <16 x i8> *%ptr + store volatile <16 x i8> %v12, <16 x i8> *%ptr + store volatile <16 x i8> %v11, <16 x i8> *%ptr + store volatile <16 x i8> %v10, <16 x i8> 
*%ptr
+  store volatile <16 x i8> %v9, <16 x i8> *%ptr
+  store volatile <16 x i8> %v8, <16 x i8> *%ptr
+  store volatile <16 x i8> %v7, <16 x i8> *%ptr
+  store volatile <16 x i8> %v6, <16 x i8> *%ptr
+  store volatile <16 x i8> %v5, <16 x i8> *%ptr
+  store volatile <16 x i8> %v4, <16 x i8> *%ptr
+  store volatile <16 x i8> %v3, <16 x i8> *%ptr
+  store volatile <16 x i8> %v2, <16 x i8> *%ptr
+  store volatile <16 x i8> %v1, <16 x i8> *%ptr
+  store volatile <16 x i8> %v0, <16 x i8> *%ptr
+  ret void
+}
+
+; Like f2, but only %f8 should be saved.
+define void @f3(<16 x i8> *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK: aghi %r15, -168
+; CHECK-DAG: std %f8,
+; CHECK-NOT: vst {{.*}}(%r15)
+; CHECK-NOT: vl {{.*}}(%r15)
+; CHECK-NOT: %v9
+; CHECK-NOT: %v10
+; CHECK-NOT: %v11
+; CHECK-NOT: %v12
+; CHECK-NOT: %v13
+; CHECK-NOT: %v14
+; CHECK-NOT: %v15
+; CHECK-DAG: ld %f8,
+; CHECK: aghi %r15, 168
+; CHECK: br %r14
+  %v0 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v1 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v2 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v3 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v4 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v5 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v6 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v7 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v8 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v16 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v17 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v18 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v19 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v20 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v21 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v22 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v23 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v24 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v25 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v26 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v27 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v28 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v29 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v30 = load volatile <16 x i8>, <16 x i8> *%ptr
+  %v31 = load volatile <16 x i8>, <16 x i8> *%ptr
+  store volatile <16 x i8> %v31, <16 x i8> *%ptr
+  store volatile <16 x i8> %v30, <16 x i8> *%ptr
+  store volatile <16 x i8> %v29, <16 x i8> *%ptr
+  store volatile <16 x i8> %v28, <16 x i8> *%ptr
+  store volatile <16 x i8> %v27, <16 x i8> *%ptr
+  store volatile <16 x i8> %v26, <16 x i8> *%ptr
+  store volatile <16 x i8> %v25, <16 x i8> *%ptr
+  store volatile <16 x i8> %v24, <16 x i8> *%ptr
+  store volatile <16 x i8> %v23, <16 x i8> *%ptr
+  store volatile <16 x i8> %v22, <16 x i8> *%ptr
+  store volatile <16 x i8> %v21, <16 x i8> *%ptr
+  store volatile <16 x i8> %v20, <16 x i8> *%ptr
+  store volatile <16 x i8> %v19, <16 x i8> *%ptr
+  store volatile <16 x i8> %v18, <16 x i8> *%ptr
+  store volatile <16 x i8> %v17, <16 x i8> *%ptr
+  store volatile <16 x i8> %v16, <16 x i8> *%ptr
+  store volatile <16 x i8> %v8, <16 x i8> *%ptr
+  store volatile <16 x i8> %v7, <16 x i8> *%ptr
+  store volatile <16 x i8> %v6, <16 x i8> *%ptr
+  store volatile <16 x i8> %v5, <16 x i8> *%ptr
+  store volatile <16 x i8> %v4, <16 x i8> *%ptr
+  store volatile <16 x i8> %v3, <16 x i8> *%ptr
+  store volatile <16 x i8> %v2, <16 x i8> *%ptr
+  store volatile <16 x i8> %v1, <16 x i8> *%ptr
+  store volatile <16 x i8> %v0, <16 x i8> *%ptr
+  ret void
+}
+
+; Like f3, but no registers should be saved.
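+; (%v0-%v7 overlap the call-clobbered %f0-%f7, and %v16-%v31 have no
+; FPR aliases at all, so a function that touches only those registers
+; needs neither the FPR saves nor a frame of its own.)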
+define void @f4(<16 x i8> *%ptr) { +; CHECK-LABEL: f4: +; CHECK-NOT: %r15 +; CHECK: br %r14 + %v0 = load volatile <16 x i8>, <16 x i8> *%ptr + %v1 = load volatile <16 x i8>, <16 x i8> *%ptr + %v2 = load volatile <16 x i8>, <16 x i8> *%ptr + %v3 = load volatile <16 x i8>, <16 x i8> *%ptr + %v4 = load volatile <16 x i8>, <16 x i8> *%ptr + %v5 = load volatile <16 x i8>, <16 x i8> *%ptr + %v6 = load volatile <16 x i8>, <16 x i8> *%ptr + %v7 = load volatile <16 x i8>, <16 x i8> *%ptr + %v16 = load volatile <16 x i8>, <16 x i8> *%ptr + %v17 = load volatile <16 x i8>, <16 x i8> *%ptr + %v18 = load volatile <16 x i8>, <16 x i8> *%ptr + %v19 = load volatile <16 x i8>, <16 x i8> *%ptr + %v20 = load volatile <16 x i8>, <16 x i8> *%ptr + %v21 = load volatile <16 x i8>, <16 x i8> *%ptr + %v22 = load volatile <16 x i8>, <16 x i8> *%ptr + %v23 = load volatile <16 x i8>, <16 x i8> *%ptr + %v24 = load volatile <16 x i8>, <16 x i8> *%ptr + %v25 = load volatile <16 x i8>, <16 x i8> *%ptr + %v26 = load volatile <16 x i8>, <16 x i8> *%ptr + %v27 = load volatile <16 x i8>, <16 x i8> *%ptr + %v28 = load volatile <16 x i8>, <16 x i8> *%ptr + %v29 = load volatile <16 x i8>, <16 x i8> *%ptr + %v30 = load volatile <16 x i8>, <16 x i8> *%ptr + %v31 = load volatile <16 x i8>, <16 x i8> *%ptr + store volatile <16 x i8> %v31, <16 x i8> *%ptr + store volatile <16 x i8> %v30, <16 x i8> *%ptr + store volatile <16 x i8> %v29, <16 x i8> *%ptr + store volatile <16 x i8> %v28, <16 x i8> *%ptr + store volatile <16 x i8> %v27, <16 x i8> *%ptr + store volatile <16 x i8> %v26, <16 x i8> *%ptr + store volatile <16 x i8> %v25, <16 x i8> *%ptr + store volatile <16 x i8> %v24, <16 x i8> *%ptr + store volatile <16 x i8> %v23, <16 x i8> *%ptr + store volatile <16 x i8> %v22, <16 x i8> *%ptr + store volatile <16 x i8> %v21, <16 x i8> *%ptr + store volatile <16 x i8> %v20, <16 x i8> *%ptr + store volatile <16 x i8> %v19, <16 x i8> *%ptr + store volatile <16 x i8> %v18, <16 x i8> *%ptr + store volatile <16 x i8> %v17, <16 x i8> *%ptr + store volatile <16 x i8> %v16, <16 x i8> *%ptr + store volatile <16 x i8> %v7, <16 x i8> *%ptr + store volatile <16 x i8> %v6, <16 x i8> *%ptr + store volatile <16 x i8> %v5, <16 x i8> *%ptr + store volatile <16 x i8> %v4, <16 x i8> *%ptr + store volatile <16 x i8> %v3, <16 x i8> *%ptr + store volatile <16 x i8> %v2, <16 x i8> *%ptr + store volatile <16 x i8> %v1, <16 x i8> *%ptr + store volatile <16 x i8> %v0, <16 x i8> *%ptr + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/vec-abi-align.ll b/llvm/test/CodeGen/SystemZ/vec-abi-align.ll new file mode 100644 index 00000000000..01b97a8583e --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-abi-align.ll @@ -0,0 +1,49 @@ +; Verify that we use the vector ABI datalayout if and only if +; the vector facility is present. 
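+;
+; With the vector facility the datalayout is expected to gain a "v128:64"
+; component, making <2 x i64> 8-byte aligned: the vector member of
+; %struct.S then sits at offset 8 (hence the VL below), while the default
+; 16-byte vector alignment would place it at offset 16 (the two AGSIs).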
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu | \ +; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=generic | \ +; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \ +; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | \ +; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | \ +; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ +; RUN: FileCheck -check-prefix=CHECK-VECTOR %s + +; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=vector | \ +; RUN: FileCheck -check-prefix=CHECK-VECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+vector | \ +; RUN: FileCheck -check-prefix=CHECK-VECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=-vector,vector | \ +; RUN: FileCheck -check-prefix=CHECK-VECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=-vector,+vector | \ +; RUN: FileCheck -check-prefix=CHECK-VECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=-vector | \ +; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=vector,-vector | \ +; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+vector,-vector | \ +; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s + +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -mattr=-vector | \ +; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s + +%struct.S = type { i8, <2 x i64> } + +define void @test(%struct.S* %s) nounwind { +; CHECK-VECTOR-LABEL: @test +; CHECK-VECTOR: vl %v0, 8(%r2) +; CHECK-NOVECTOR-LABEL: @test +; CHECK-NOVECTOR-DAG: agsi 16(%r2), 1 +; CHECK-NOVECTOR-DAG: agsi 24(%r2), 1 + %ptr = getelementptr %struct.S, %struct.S* %s, i64 0, i32 1 + %vec = load <2 x i64>, <2 x i64>* %ptr + %add = add <2 x i64> %vec, <i64 1, i64 1> + store <2 x i64> %add, <2 x i64>* %ptr + ret void +} + diff --git a/llvm/test/CodeGen/SystemZ/vec-abs-01.ll b/llvm/test/CodeGen/SystemZ/vec-abs-01.ll new file mode 100644 index 00000000000..aec3b9314f1 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-abs-01.ll @@ -0,0 +1,146 @@ +; Test v16i8 absolute. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <16 x i8> @f1(<16 x i8> %val) { +; CHECK-LABEL: f1: +; CHECK: vlpb %v24, %v24 +; CHECK: br %r14 + %cmp = icmp slt <16 x i8> %val, zeroinitializer + %neg = sub <16 x i8> zeroinitializer, %val + %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val + ret <16 x i8> %ret +} + +; Test with sle. +define <16 x i8> @f2(<16 x i8> %val) { +; CHECK-LABEL: f2: +; CHECK: vlpb %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sle <16 x i8> %val, zeroinitializer + %neg = sub <16 x i8> zeroinitializer, %val + %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val + ret <16 x i8> %ret +} + +; Test with sgt. +define <16 x i8> @f3(<16 x i8> %val) { +; CHECK-LABEL: f3: +; CHECK: vlpb %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sgt <16 x i8> %val, zeroinitializer + %neg = sub <16 x i8> zeroinitializer, %val + %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg + ret <16 x i8> %ret +} + +; Test with sge. 
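+; (sge and sgt differ only when %val is zero, where %val and %neg
+; coincide, so all four signed predicates should fold to the same VLPB.)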
+define <16 x i8> @f4(<16 x i8> %val) { +; CHECK-LABEL: f4: +; CHECK: vlpb %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sge <16 x i8> %val, zeroinitializer + %neg = sub <16 x i8> zeroinitializer, %val + %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg + ret <16 x i8> %ret +} + +; Test that negative absolute uses VLPB too. There is no vector equivalent +; of LOAD NEGATIVE. +define <16 x i8> @f5(<16 x i8> %val) { +; CHECK-LABEL: f5: +; CHECK: vlpb [[REG:%v[0-9]+]], %v24 +; CHECK: vlcb %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp slt <16 x i8> %val, zeroinitializer + %neg = sub <16 x i8> zeroinitializer, %val + %abs = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val + %ret = sub <16 x i8> zeroinitializer, %abs + ret <16 x i8> %ret +} + +; Try another form of negative absolute (slt version). +define <16 x i8> @f6(<16 x i8> %val) { +; CHECK-LABEL: f6: +; CHECK: vlpb [[REG:%v[0-9]+]], %v24 +; CHECK: vlcb %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp slt <16 x i8> %val, zeroinitializer + %neg = sub <16 x i8> zeroinitializer, %val + %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg + ret <16 x i8> %ret +} + +; Test with sle. +define <16 x i8> @f7(<16 x i8> %val) { +; CHECK-LABEL: f7: +; CHECK: vlpb [[REG:%v[0-9]+]], %v24 +; CHECK: vlcb %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sle <16 x i8> %val, zeroinitializer + %neg = sub <16 x i8> zeroinitializer, %val + %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg + ret <16 x i8> %ret +} + +; Test with sgt. +define <16 x i8> @f8(<16 x i8> %val) { +; CHECK-LABEL: f8: +; CHECK: vlpb [[REG:%v[0-9]+]], %v24 +; CHECK: vlcb %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sgt <16 x i8> %val, zeroinitializer + %neg = sub <16 x i8> zeroinitializer, %val + %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val + ret <16 x i8> %ret +} + +; Test with sge. +define <16 x i8> @f9(<16 x i8> %val) { +; CHECK-LABEL: f9: +; CHECK: vlpb [[REG:%v[0-9]+]], %v24 +; CHECK: vlcb %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sge <16 x i8> %val, zeroinitializer + %neg = sub <16 x i8> zeroinitializer, %val + %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val + ret <16 x i8> %ret +} + +; Test with an SRA-based boolean vector. 
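+; Here the boolean vector is open-coded: %shr is all-ones in the negative
+; lanes and all-zeros elsewhere, so, lane by lane,
+;   (%shr & -%val) | (~%shr & %val) == (%val < 0 ? -%val : %val)
+; and the combiner should still recognise the pattern as a plain VLPB.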
+define <16 x i8> @f10(<16 x i8> %val) { +; CHECK-LABEL: f10: +; CHECK: vlpb %v24, %v24 +; CHECK: br %r14 + %shr = ashr <16 x i8> %val, + <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + %neg = sub <16 x i8> zeroinitializer, %val + %and1 = and <16 x i8> %shr, %neg + %not = xor <16 x i8> %shr, + <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %and2 = and <16 x i8> %not, %val + %ret = or <16 x i8> %and1, %and2 + ret <16 x i8> %ret +} + +; ...and again in reverse +define <16 x i8> @f11(<16 x i8> %val) { +; CHECK-LABEL: f11: +; CHECK: vlpb [[REG:%v[0-9]+]], %v24 +; CHECK: vlcb %v24, [[REG]] +; CHECK: br %r14 + %shr = ashr <16 x i8> %val, + <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + %and1 = and <16 x i8> %shr, %val + %not = xor <16 x i8> %shr, + <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %neg = sub <16 x i8> zeroinitializer, %val + %and2 = and <16 x i8> %not, %neg + %ret = or <16 x i8> %and1, %and2 + ret <16 x i8> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-abs-02.ll b/llvm/test/CodeGen/SystemZ/vec-abs-02.ll new file mode 100644 index 00000000000..c5af619f0ba --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-abs-02.ll @@ -0,0 +1,142 @@ +; Test v8i16 absolute. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <8 x i16> @f1(<8 x i16> %val) { +; CHECK-LABEL: f1: +; CHECK: vlph %v24, %v24 +; CHECK: br %r14 + %cmp = icmp slt <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val + ret <8 x i16> %ret +} + +; Test with sle. +define <8 x i16> @f2(<8 x i16> %val) { +; CHECK-LABEL: f2: +; CHECK: vlph %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sle <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val + ret <8 x i16> %ret +} + +; Test with sgt. +define <8 x i16> @f3(<8 x i16> %val) { +; CHECK-LABEL: f3: +; CHECK: vlph %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sgt <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg + ret <8 x i16> %ret +} + +; Test with sge. +define <8 x i16> @f4(<8 x i16> %val) { +; CHECK-LABEL: f4: +; CHECK: vlph %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sge <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg + ret <8 x i16> %ret +} + +; Test that negative absolute uses VLPH too. There is no vector equivalent +; of LOAD NEGATIVE. +define <8 x i16> @f5(<8 x i16> %val) { +; CHECK-LABEL: f5: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp slt <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %abs = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val + %ret = sub <8 x i16> zeroinitializer, %abs + ret <8 x i16> %ret +} + +; Try another form of negative absolute (slt version). 
+define <8 x i16> @f6(<8 x i16> %val) { +; CHECK-LABEL: f6: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp slt <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg + ret <8 x i16> %ret +} + +; Test with sle. +define <8 x i16> @f7(<8 x i16> %val) { +; CHECK-LABEL: f7: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sle <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg + ret <8 x i16> %ret +} + +; Test with sgt. +define <8 x i16> @f8(<8 x i16> %val) { +; CHECK-LABEL: f8: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sgt <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val + ret <8 x i16> %ret +} + +; Test with sge. +define <8 x i16> @f9(<8 x i16> %val) { +; CHECK-LABEL: f9: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sge <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val + ret <8 x i16> %ret +} + +; Test with an SRA-based boolean vector. +define <8 x i16> @f10(<8 x i16> %val) { +; CHECK-LABEL: f10: +; CHECK: vlph %v24, %v24 +; CHECK: br %r14 + %shr = ashr <8 x i16> %val, + <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> + %neg = sub <8 x i16> zeroinitializer, %val + %and1 = and <8 x i16> %shr, %neg + %not = xor <8 x i16> %shr, + <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %and2 = and <8 x i16> %not, %val + %ret = or <8 x i16> %and1, %and2 + ret <8 x i16> %ret +} + +; ...and again in reverse +define <8 x i16> @f11(<8 x i16> %val) { +; CHECK-LABEL: f11: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %shr = ashr <8 x i16> %val, + <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> + %and1 = and <8 x i16> %shr, %val + %not = xor <8 x i16> %shr, + <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %neg = sub <8 x i16> zeroinitializer, %val + %and2 = and <8 x i16> %not, %neg + %ret = or <8 x i16> %and1, %and2 + ret <8 x i16> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-abs-03.ll b/llvm/test/CodeGen/SystemZ/vec-abs-03.ll new file mode 100644 index 00000000000..cb17a8895e1 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-abs-03.ll @@ -0,0 +1,138 @@ +; Test v4i32 absolute. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <4 x i32> @f1(<4 x i32> %val) { +; CHECK-LABEL: f1: +; CHECK: vlpf %v24, %v24 +; CHECK: br %r14 + %cmp = icmp slt <4 x i32> %val, zeroinitializer + %neg = sub <4 x i32> zeroinitializer, %val + %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val + ret <4 x i32> %ret +} + +; Test with sle. +define <4 x i32> @f2(<4 x i32> %val) { +; CHECK-LABEL: f2: +; CHECK: vlpf %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sle <4 x i32> %val, zeroinitializer + %neg = sub <4 x i32> zeroinitializer, %val + %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val + ret <4 x i32> %ret +} + +; Test with sgt. 
+define <4 x i32> @f3(<4 x i32> %val) { +; CHECK-LABEL: f3: +; CHECK: vlpf %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sgt <4 x i32> %val, zeroinitializer + %neg = sub <4 x i32> zeroinitializer, %val + %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg + ret <4 x i32> %ret +} + +; Test with sge. +define <4 x i32> @f4(<4 x i32> %val) { +; CHECK-LABEL: f4: +; CHECK: vlpf %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sge <4 x i32> %val, zeroinitializer + %neg = sub <4 x i32> zeroinitializer, %val + %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg + ret <4 x i32> %ret +} + +; Test that negative absolute uses VLPF too. There is no vector equivalent +; of LOAD NEGATIVE. +define <4 x i32> @f5(<4 x i32> %val) { +; CHECK-LABEL: f5: +; CHECK: vlpf [[REG:%v[0-9]+]], %v24 +; CHECK: vlcf %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp slt <4 x i32> %val, zeroinitializer + %neg = sub <4 x i32> zeroinitializer, %val + %abs = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val + %ret = sub <4 x i32> zeroinitializer, %abs + ret <4 x i32> %ret +} + +; Try another form of negative absolute (slt version). +define <4 x i32> @f6(<4 x i32> %val) { +; CHECK-LABEL: f6: +; CHECK: vlpf [[REG:%v[0-9]+]], %v24 +; CHECK: vlcf %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp slt <4 x i32> %val, zeroinitializer + %neg = sub <4 x i32> zeroinitializer, %val + %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg + ret <4 x i32> %ret +} + +; Test with sle. +define <4 x i32> @f7(<4 x i32> %val) { +; CHECK-LABEL: f7: +; CHECK: vlpf [[REG:%v[0-9]+]], %v24 +; CHECK: vlcf %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sle <4 x i32> %val, zeroinitializer + %neg = sub <4 x i32> zeroinitializer, %val + %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg + ret <4 x i32> %ret +} + +; Test with sgt. +define <4 x i32> @f8(<4 x i32> %val) { +; CHECK-LABEL: f8: +; CHECK: vlpf [[REG:%v[0-9]+]], %v24 +; CHECK: vlcf %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sgt <4 x i32> %val, zeroinitializer + %neg = sub <4 x i32> zeroinitializer, %val + %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val + ret <4 x i32> %ret +} + +; Test with sge. +define <4 x i32> @f9(<4 x i32> %val) { +; CHECK-LABEL: f9: +; CHECK: vlpf [[REG:%v[0-9]+]], %v24 +; CHECK: vlcf %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sge <4 x i32> %val, zeroinitializer + %neg = sub <4 x i32> zeroinitializer, %val + %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val + ret <4 x i32> %ret +} + +; Test with an SRA-based boolean vector. 
+define <4 x i32> @f10(<4 x i32> %val) { +; CHECK-LABEL: f10: +; CHECK: vlpf %v24, %v24 +; CHECK: br %r14 + %shr = ashr <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31> + %neg = sub <4 x i32> zeroinitializer, %val + %and1 = and <4 x i32> %shr, %neg + %not = xor <4 x i32> %shr, <i32 -1, i32 -1, i32 -1, i32 -1> + %and2 = and <4 x i32> %not, %val + %ret = or <4 x i32> %and1, %and2 + ret <4 x i32> %ret +} + +; ...and again in reverse +define <4 x i32> @f11(<4 x i32> %val) { +; CHECK-LABEL: f11: +; CHECK: vlpf [[REG:%v[0-9]+]], %v24 +; CHECK: vlcf %v24, [[REG]] +; CHECK: br %r14 + %shr = ashr <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31> + %and1 = and <4 x i32> %shr, %val + %not = xor <4 x i32> %shr, <i32 -1, i32 -1, i32 -1, i32 -1> + %neg = sub <4 x i32> zeroinitializer, %val + %and2 = and <4 x i32> %not, %neg + %ret = or <4 x i32> %and1, %and2 + ret <4 x i32> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-abs-04.ll b/llvm/test/CodeGen/SystemZ/vec-abs-04.ll new file mode 100644 index 00000000000..31c489b00b3 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-abs-04.ll @@ -0,0 +1,138 @@ +; Test v2i64 absolute. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <2 x i64> @f1(<2 x i64> %val) { +; CHECK-LABEL: f1: +; CHECK: vlpg %v24, %v24 +; CHECK: br %r14 + %cmp = icmp slt <2 x i64> %val, zeroinitializer + %neg = sub <2 x i64> zeroinitializer, %val + %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val + ret <2 x i64> %ret +} + +; Test with sle. +define <2 x i64> @f2(<2 x i64> %val) { +; CHECK-LABEL: f2: +; CHECK: vlpg %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sle <2 x i64> %val, zeroinitializer + %neg = sub <2 x i64> zeroinitializer, %val + %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val + ret <2 x i64> %ret +} + +; Test with sgt. +define <2 x i64> @f3(<2 x i64> %val) { +; CHECK-LABEL: f3: +; CHECK: vlpg %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sgt <2 x i64> %val, zeroinitializer + %neg = sub <2 x i64> zeroinitializer, %val + %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg + ret <2 x i64> %ret +} + +; Test with sge. +define <2 x i64> @f4(<2 x i64> %val) { +; CHECK-LABEL: f4: +; CHECK: vlpg %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sge <2 x i64> %val, zeroinitializer + %neg = sub <2 x i64> zeroinitializer, %val + %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg + ret <2 x i64> %ret +} + +; Test that negative absolute uses VLPG too. There is no vector equivalent +; of LOAD NEGATIVE. +define <2 x i64> @f5(<2 x i64> %val) { +; CHECK-LABEL: f5: +; CHECK: vlpg [[REG:%v[0-9]+]], %v24 +; CHECK: vlcg %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp slt <2 x i64> %val, zeroinitializer + %neg = sub <2 x i64> zeroinitializer, %val + %abs = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val + %ret = sub <2 x i64> zeroinitializer, %abs + ret <2 x i64> %ret +} + +; Try another form of negative absolute (slt version). +define <2 x i64> @f6(<2 x i64> %val) { +; CHECK-LABEL: f6: +; CHECK: vlpg [[REG:%v[0-9]+]], %v24 +; CHECK: vlcg %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp slt <2 x i64> %val, zeroinitializer + %neg = sub <2 x i64> zeroinitializer, %val + %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg + ret <2 x i64> %ret +} + +; Test with sle. 
+define <2 x i64> @f7(<2 x i64> %val) { +; CHECK-LABEL: f7: +; CHECK: vlpg [[REG:%v[0-9]+]], %v24 +; CHECK: vlcg %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sle <2 x i64> %val, zeroinitializer + %neg = sub <2 x i64> zeroinitializer, %val + %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg + ret <2 x i64> %ret +} + +; Test with sgt. +define <2 x i64> @f8(<2 x i64> %val) { +; CHECK-LABEL: f8: +; CHECK: vlpg [[REG:%v[0-9]+]], %v24 +; CHECK: vlcg %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sgt <2 x i64> %val, zeroinitializer + %neg = sub <2 x i64> zeroinitializer, %val + %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val + ret <2 x i64> %ret +} + +; Test with sge. +define <2 x i64> @f9(<2 x i64> %val) { +; CHECK-LABEL: f9: +; CHECK: vlpg [[REG:%v[0-9]+]], %v24 +; CHECK: vlcg %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sge <2 x i64> %val, zeroinitializer + %neg = sub <2 x i64> zeroinitializer, %val + %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val + ret <2 x i64> %ret +} + +; Test with an SRA-based boolean vector. +define <2 x i64> @f10(<2 x i64> %val) { +; CHECK-LABEL: f10: +; CHECK: vlpg %v24, %v24 +; CHECK: br %r14 + %shr = ashr <2 x i64> %val, <i64 63, i64 63> + %neg = sub <2 x i64> zeroinitializer, %val + %and1 = and <2 x i64> %shr, %neg + %not = xor <2 x i64> %shr, <i64 -1, i64 -1> + %and2 = and <2 x i64> %not, %val + %ret = or <2 x i64> %and1, %and2 + ret <2 x i64> %ret +} + +; ...and again in reverse +define <2 x i64> @f11(<2 x i64> %val) { +; CHECK-LABEL: f11: +; CHECK: vlpg [[REG:%v[0-9]+]], %v24 +; CHECK: vlcg %v24, [[REG]] +; CHECK: br %r14 + %shr = ashr <2 x i64> %val, <i64 63, i64 63> + %and1 = and <2 x i64> %shr, %val + %not = xor <2 x i64> %shr, <i64 -1, i64 -1> + %neg = sub <2 x i64> zeroinitializer, %val + %and2 = and <2 x i64> %not, %neg + %ret = or <2 x i64> %and1, %and2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-add-01.ll b/llvm/test/CodeGen/SystemZ/vec-add-01.ll new file mode 100644 index 00000000000..a59a8da1cf8 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-add-01.ll @@ -0,0 +1,39 @@ +; Test vector addition. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 addition. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vab %v24, %v26, %v28 +; CHECK: br %r14 + %ret = add <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 addition. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vah %v24, %v26, %v28 +; CHECK: br %r14 + %ret = add <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 addition. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vaf %v24, %v26, %v28 +; CHECK: br %r14 + %ret = add <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 addition. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vag %v24, %v26, %v28 +; CHECK: br %r14 + %ret = add <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-and-01.ll b/llvm/test/CodeGen/SystemZ/vec-and-01.ll new file mode 100644 index 00000000000..d467de69cea --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-and-01.ll @@ -0,0 +1,39 @@ +; Test vector AND. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 AND. 
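+; (Bitwise operations are element-size agnostic, so the same VN opcode
+; should be selected for all four element types below.)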
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 AND. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 AND. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 AND. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-and-02.ll b/llvm/test/CodeGen/SystemZ/vec-and-02.ll new file mode 100644 index 00000000000..30bc9241689 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-and-02.ll @@ -0,0 +1,91 @@ +; Test vector AND-NOT. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 AND-NOT. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vnc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <16 x i8> %val2, <i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1> + %ret = and <16 x i8> %val1, %not + ret <16 x i8> %ret +} + +; ...and again with the reverse. +define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vnc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <16 x i8> %val1, <i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1> + %ret = and <16 x i8> %not, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 AND-NOT. +define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f3: +; CHECK: vnc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <8 x i16> %val2, <i16 -1, i16 -1, i16 -1, i16 -1, + i16 -1, i16 -1, i16 -1, i16 -1> + %ret = and <8 x i16> %val1, %not + ret <8 x i16> %ret +} + +; ...and again with the reverse. +define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f4: +; CHECK: vnc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <8 x i16> %val1, <i16 -1, i16 -1, i16 -1, i16 -1, + i16 -1, i16 -1, i16 -1, i16 -1> + %ret = and <8 x i16> %not, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 AND-NOT. +define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f5: +; CHECK: vnc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <4 x i32> %val2, <i32 -1, i32 -1, i32 -1, i32 -1> + %ret = and <4 x i32> %val1, %not + ret <4 x i32> %ret +} + +; ...and again with the reverse. +define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f6: +; CHECK: vnc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <4 x i32> %val1, <i32 -1, i32 -1, i32 -1, i32 -1> + %ret = and <4 x i32> %not, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 AND-NOT. +define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f7: +; CHECK: vnc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <2 x i64> %val2, <i64 -1, i64 -1> + %ret = and <2 x i64> %val1, %not + ret <2 x i64> %ret +} + +; ...and again with the reverse. 
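+; (VNC complements its second source operand, so and(not(%val1), %val2)
+; should come out with the operands commuted, as checked below.)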
+define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f8: +; CHECK: vnc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <2 x i64> %val1, <i64 -1, i64 -1> + %ret = and <2 x i64> %not, %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-and-03.ll b/llvm/test/CodeGen/SystemZ/vec-and-03.ll new file mode 100644 index 00000000000..c73d570fb7b --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-and-03.ll @@ -0,0 +1,113 @@ +; Test vector zero extensions, which need to be implemented as ANDs. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i1->v16i8 extension. +define <16 x i8> @f1(<16 x i8> %val) { +; CHECK-LABEL: f1: +; CHECK: vrepib [[REG:%v[0-9]+]], 1 +; CHECK: vn %v24, %v24, [[REG]] +; CHECK: br %r14 + %trunc = trunc <16 x i8> %val to <16 x i1> + %ret = zext <16 x i1> %trunc to <16 x i8> + ret <16 x i8> %ret +} + +; Test a v8i1->v8i16 extension. +define <8 x i16> @f2(<8 x i16> %val) { +; CHECK-LABEL: f2: +; CHECK: vrepih [[REG:%v[0-9]+]], 1 +; CHECK: vn %v24, %v24, [[REG]] +; CHECK: br %r14 + %trunc = trunc <8 x i16> %val to <8 x i1> + %ret = zext <8 x i1> %trunc to <8 x i16> + ret <8 x i16> %ret +} + +; Test a v8i8->v8i16 extension. +define <8 x i16> @f3(<8 x i16> %val) { +; CHECK-LABEL: f3: +; CHECK: vgbm [[REG:%v[0-9]+]], 21845 +; CHECK: vn %v24, %v24, [[REG]] +; CHECK: br %r14 + %trunc = trunc <8 x i16> %val to <8 x i8> + %ret = zext <8 x i8> %trunc to <8 x i16> + ret <8 x i16> %ret +} + +; Test a v4i1->v4i32 extension. +define <4 x i32> @f4(<4 x i32> %val) { +; CHECK-LABEL: f4: +; CHECK: vrepif [[REG:%v[0-9]+]], 1 +; CHECK: vn %v24, %v24, [[REG]] +; CHECK: br %r14 + %trunc = trunc <4 x i32> %val to <4 x i1> + %ret = zext <4 x i1> %trunc to <4 x i32> + ret <4 x i32> %ret +} + +; Test a v4i8->v4i32 extension. +define <4 x i32> @f5(<4 x i32> %val) { +; CHECK-LABEL: f5: +; CHECK: vgbm [[REG:%v[0-9]+]], 4369 +; CHECK: vn %v24, %v24, [[REG]] +; CHECK: br %r14 + %trunc = trunc <4 x i32> %val to <4 x i8> + %ret = zext <4 x i8> %trunc to <4 x i32> + ret <4 x i32> %ret +} + +; Test a v4i16->v4i32 extension. +define <4 x i32> @f6(<4 x i32> %val) { +; CHECK-LABEL: f6: +; CHECK: vgbm [[REG:%v[0-9]+]], 13107 +; CHECK: vn %v24, %v24, [[REG]] +; CHECK: br %r14 + %trunc = trunc <4 x i32> %val to <4 x i16> + %ret = zext <4 x i16> %trunc to <4 x i32> + ret <4 x i32> %ret +} + +; Test a v2i1->v2i64 extension. +define <2 x i64> @f7(<2 x i64> %val) { +; CHECK-LABEL: f7: +; CHECK: vrepig [[REG:%v[0-9]+]], 1 +; CHECK: vn %v24, %v24, [[REG]] +; CHECK: br %r14 + %trunc = trunc <2 x i64> %val to <2 x i1> + %ret = zext <2 x i1> %trunc to <2 x i64> + ret <2 x i64> %ret +} + +; Test a v2i8->v2i64 extension. +define <2 x i64> @f8(<2 x i64> %val) { +; CHECK-LABEL: f8: +; CHECK: vgbm [[REG:%v[0-9]+]], 257 +; CHECK: vn %v24, %v24, [[REG]] +; CHECK: br %r14 + %trunc = trunc <2 x i64> %val to <2 x i8> + %ret = zext <2 x i8> %trunc to <2 x i64> + ret <2 x i64> %ret +} + +; Test a v2i16->v2i64 extension. +define <2 x i64> @f9(<2 x i64> %val) { +; CHECK-LABEL: f9: +; CHECK: vgbm [[REG:%v[0-9]+]], 771 +; CHECK: vn %v24, %v24, [[REG]] +; CHECK: br %r14 + %trunc = trunc <2 x i64> %val to <2 x i16> + %ret = zext <2 x i16> %trunc to <2 x i64> + ret <2 x i64> %ret +} + +; Test a v2i32->v2i64 extension. 
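+; Each bit of the VGBM immediate selects 0xff or 0x00 for one result
+; byte, most significant bit first.  Keeping the low i32 of each
+; big-endian i64 keeps bytes 4-7 and 12-15, i.e. mask
+; 0b0000111100001111 = 0x0f0f = 3855, the constant checked below.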
+define <2 x i64> @f10(<2 x i64> %val) {
+; CHECK-LABEL: f10:
+; CHECK: vgbm [[REG:%v[0-9]+]], 3855
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <2 x i64> %val to <2 x i32>
+  %ret = zext <2 x i32> %trunc to <2 x i64>
+  ret <2 x i64> %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-args-01.ll b/llvm/test/CodeGen/SystemZ/vec-args-01.ll
new file mode 100644
index 00000000000..e07ab7447b2
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-args-01.ll
@@ -0,0 +1,48 @@
+; Test the handling of named vector arguments.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
+
+; This routine has 6 integer arguments, which fill up r2-r6 and
+; the stack slot at offset 160, and 10 vector arguments, which
+; fill up v24-v31 and the two double-wide stack slots at 168
+; and 184.
+declare void @bar(i64, i64, i64, i64, i64, i64,
+                  <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>,
+                  <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>,
+                  <4 x i32>, <4 x i32>)
+
+define void @foo() {
+; CHECK-VEC-LABEL: foo:
+; CHECK-VEC-DAG: vrepif %v24, 1
+; CHECK-VEC-DAG: vrepif %v26, 2
+; CHECK-VEC-DAG: vrepif %v28, 3
+; CHECK-VEC-DAG: vrepif %v30, 4
+; CHECK-VEC-DAG: vrepif %v25, 5
+; CHECK-VEC-DAG: vrepif %v27, 6
+; CHECK-VEC-DAG: vrepif %v29, 7
+; CHECK-VEC-DAG: vrepif %v31, 8
+; CHECK-VEC: brasl %r14, bar@PLT
+;
+; CHECK-STACK-LABEL: foo:
+; CHECK-STACK: aghi %r15, -200
+; CHECK-STACK-DAG: mvghi 160(%r15), 6
+; CHECK-STACK-DAG: vrepif [[REG1:%v[0-9]+]], 9
+; CHECK-STACK-DAG: vst [[REG1]], 168(%r15)
+; CHECK-STACK-DAG: vrepif [[REG2:%v[0-9]+]], 10
+; CHECK-STACK-DAG: vst [[REG2]], 184(%r15)
+; CHECK-STACK: brasl %r14, bar@PLT
+
+  call void @bar (i64 1, i64 2, i64 3, i64 4, i64 5, i64 6,
+                  <4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+                  <4 x i32> <i32 2, i32 2, i32 2, i32 2>,
+                  <4 x i32> <i32 3, i32 3, i32 3, i32 3>,
+                  <4 x i32> <i32 4, i32 4, i32 4, i32 4>,
+                  <4 x i32> <i32 5, i32 5, i32 5, i32 5>,
+                  <4 x i32> <i32 6, i32 6, i32 6, i32 6>,
+                  <4 x i32> <i32 7, i32 7, i32 7, i32 7>,
+                  <4 x i32> <i32 8, i32 8, i32 8, i32 8>,
+                  <4 x i32> <i32 9, i32 9, i32 9, i32 9>,
+                  <4 x i32> <i32 10, i32 10, i32 10, i32 10>)
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-args-02.ll b/llvm/test/CodeGen/SystemZ/vec-args-02.ll
new file mode 100644
index 00000000000..b6081598326
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-args-02.ll
@@ -0,0 +1,31 @@
+; Test the handling of unnamed vector arguments.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
+
+; This routine is called with two named vector arguments (passed
+; in %v24 and %v26) and two unnamed vector arguments (passed
+; in the double-wide stack slots at 160 and 176).
+declare void @bar(<4 x i32>, <4 x i32>, ...)
+
+define void @foo() {
+; CHECK-VEC-LABEL: foo:
+; CHECK-VEC-DAG: vrepif %v24, 1
+; CHECK-VEC-DAG: vrepif %v26, 2
+; CHECK-VEC: brasl %r14, bar@PLT
+;
+; CHECK-STACK-LABEL: foo:
+; CHECK-STACK: aghi %r15, -192
+; CHECK-STACK-DAG: vrepif [[REG1:%v[0-9]+]], 3
+; CHECK-STACK-DAG: vst [[REG1]], 160(%r15)
+; CHECK-STACK-DAG: vrepif [[REG2:%v[0-9]+]], 4
+; CHECK-STACK-DAG: vst [[REG2]], 176(%r15)
+; CHECK-STACK: brasl %r14, bar@PLT
+
+  call void (<4 x i32>, <4 x i32>, ...) 
@bar + (<4 x i32> <i32 1, i32 1, i32 1, i32 1>, + <4 x i32> <i32 2, i32 2, i32 2, i32 2>, + <4 x i32> <i32 3, i32 3, i32 3, i32 3>, + <4 x i32> <i32 4, i32 4, i32 4, i32 4>) + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/vec-args-03.ll b/llvm/test/CodeGen/SystemZ/vec-args-03.ll new file mode 100644 index 00000000000..e9f51c5e9ee --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-args-03.ll @@ -0,0 +1,16 @@ +; Test the handling of incoming vector arguments. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; This routine has 10 vector arguments, which fill up %v24-%v31 and +; the two double-wide stack slots at 160 and 176. +define <4 x i32> @foo(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4, + <4 x i32> %v5, <4 x i32> %v6, <4 x i32> %v7, <4 x i32> %v8, + <4 x i32> %v9, <4 x i32> %v10) { +; CHECK-LABEL: foo: +; CHECK: vl [[REG1:%v[0-9]+]], 176(%r15) +; CHECK: vsf %v24, %v26, [[REG1]] +; CHECK: br %r14 + %y = sub <4 x i32> %v2, %v10 + ret <4 x i32> %y +} diff --git a/llvm/test/CodeGen/SystemZ/vec-cmp-01.ll b/llvm/test/CodeGen/SystemZ/vec-cmp-01.ll new file mode 100644 index 00000000000..a7546db8d7f --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-cmp-01.ll @@ -0,0 +1,228 @@ +; Test v16i8 comparisons. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test eq. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vceqb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp eq <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test ne. +define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vceqb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test sgt. +define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vchb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test sge. +define <16 x i8> @f4(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f4: +; CHECK: vchb [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test sle. +define <16 x i8> @f5(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f5: +; CHECK: vchb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test slt. +define <16 x i8> @f6(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f6: +; CHECK: vchb %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp slt <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test ugt. +define <16 x i8> @f7(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f7: +; CHECK: vchlb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test uge. 
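+; There is no direct >= comparison: uge(a, b) should instead be matched
+; as ~(b >u a), i.e. VCHLB with the operands swapped followed by VNO
+; (a NOR of the result with itself, acting as a vector NOT).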
+define <16 x i8> @f8(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f8: +; CHECK: vchlb [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test ule. +define <16 x i8> @f9(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f9: +; CHECK: vchlb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test ult. +define <16 x i8> @f10(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f10: +; CHECK: vchlb %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp ult <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test eq selects. +define <16 x i8> @f11(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f11: +; CHECK: vceqb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp eq <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test ne selects. +define <16 x i8> @f12(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f12: +; CHECK: vceqb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test sgt selects. +define <16 x i8> @f13(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f13: +; CHECK: vchb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test sge selects. +define <16 x i8> @f14(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f14: +; CHECK: vchb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test sle selects. +define <16 x i8> @f15(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f15: +; CHECK: vchb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test slt selects. +define <16 x i8> @f16(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f16: +; CHECK: vchb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp slt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test ugt selects. 
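+; In select position the extra VNO is unnecessary: for the negated
+; predicates the two VSEL data operands can simply be swapped instead,
+; as in f18 and f19 below.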
+define <16 x i8> @f17(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f17: +; CHECK: vchlb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test uge selects. +define <16 x i8> @f18(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f18: +; CHECK: vchlb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test ule selects. +define <16 x i8> @f19(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f19: +; CHECK: vchlb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test ult selects. +define <16 x i8> @f20(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f20: +; CHECK: vchlb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ult <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-cmp-02.ll b/llvm/test/CodeGen/SystemZ/vec-cmp-02.ll new file mode 100644 index 00000000000..78fb46c01c0 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-cmp-02.ll @@ -0,0 +1,228 @@ +; Test v8i16 comparisons. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test eq. +define <8 x i16> @f1(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f1: +; CHECK: vceqh %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp eq <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test ne. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vceqh [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test sgt. +define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f3: +; CHECK: vchh %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test sge. +define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f4: +; CHECK: vchh [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test sle. +define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f5: +; CHECK: vchh [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test slt. 
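+; (slt needs no VNO at all: slt(a, b) is just sgt(b, a), so only the
+; operand order of the VCH changes.)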
+define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f6: +; CHECK: vchh %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp slt <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test ugt. +define <8 x i16> @f7(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f7: +; CHECK: vchlh %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test uge. +define <8 x i16> @f8(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f8: +; CHECK: vchlh [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test ule. +define <8 x i16> @f9(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f9: +; CHECK: vchlh [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test ult. +define <8 x i16> @f10(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f10: +; CHECK: vchlh %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp ult <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test eq selects. +define <8 x i16> @f11(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f11: +; CHECK: vceqh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp eq <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test ne selects. +define <8 x i16> @f12(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f12: +; CHECK: vceqh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test sgt selects. +define <8 x i16> @f13(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f13: +; CHECK: vchh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test sge selects. +define <8 x i16> @f14(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f14: +; CHECK: vchh [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test sle selects. +define <8 x i16> @f15(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f15: +; CHECK: vchh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test slt selects. 
+define <8 x i16> @f16(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f16: +; CHECK: vchh [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp slt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test ugt selects. +define <8 x i16> @f17(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f17: +; CHECK: vchlh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test uge selects. +define <8 x i16> @f18(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f18: +; CHECK: vchlh [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test ule selects. +define <8 x i16> @f19(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f19: +; CHECK: vchlh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test ult selects. +define <8 x i16> @f20(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f20: +; CHECK: vchlh [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ult <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-cmp-03.ll b/llvm/test/CodeGen/SystemZ/vec-cmp-03.ll new file mode 100644 index 00000000000..4b070acc935 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-cmp-03.ll @@ -0,0 +1,228 @@ +; Test v4i32 comparisons. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test eq. +define <4 x i32> @f1(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f1: +; CHECK: vceqf %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp eq <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ne. +define <4 x i32> @f2(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f2: +; CHECK: vceqf [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test sgt. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vchf %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test sge. +define <4 x i32> @f4(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f4: +; CHECK: vchf [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test sle. 
+define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f5: +; CHECK: vchf [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test slt. +define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f6: +; CHECK: vchf %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp slt <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ugt. +define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f7: +; CHECK: vchlf %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test uge. +define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f8: +; CHECK: vchlf [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ule. +define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f9: +; CHECK: vchlf [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ult. +define <4 x i32> @f10(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f10: +; CHECK: vchlf %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp ult <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test eq selects. +define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f11: +; CHECK: vceqf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp eq <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test ne selects. +define <4 x i32> @f12(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f12: +; CHECK: vceqf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test sgt selects. +define <4 x i32> @f13(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f13: +; CHECK: vchf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test sge selects. +define <4 x i32> @f14(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f14: +; CHECK: vchf [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test sle selects. 
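+; (here the vno is avoided by swapping the vsel operands of the
+; complementary sgt mask instead)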
+define <4 x i32> @f15(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f15: +; CHECK: vchf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test slt selects. +define <4 x i32> @f16(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f16: +; CHECK: vchf [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp slt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test ugt selects. +define <4 x i32> @f17(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f17: +; CHECK: vchlf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test uge selects. +define <4 x i32> @f18(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f18: +; CHECK: vchlf [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test ule selects. +define <4 x i32> @f19(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f19: +; CHECK: vchlf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test ult selects. +define <4 x i32> @f20(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f20: +; CHECK: vchlf [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ult <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-cmp-04.ll b/llvm/test/CodeGen/SystemZ/vec-cmp-04.ll new file mode 100644 index 00000000000..5cecaa7251b --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-cmp-04.ll @@ -0,0 +1,228 @@ +; Test v2i64 comparisons. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test eq. +define <2 x i64> @f1(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f1: +; CHECK: vceqg %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp eq <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test ne. +define <2 x i64> @f2(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f2: +; CHECK: vceqg [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test sgt. +define <2 x i64> @f3(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f3: +; CHECK: vchg %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test sge. 
+define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vchg [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test sle. +define <2 x i64> @f5(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f5: +; CHECK: vchg [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test slt. +define <2 x i64> @f6(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f6: +; CHECK: vchg %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp slt <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test ugt. +define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f7: +; CHECK: vchlg %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test uge. +define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f8: +; CHECK: vchlg [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test ule. +define <2 x i64> @f9(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f9: +; CHECK: vchlg [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test ult. +define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f10: +; CHECK: vchlg %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp ult <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test eq selects. +define <2 x i64> @f11(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f11: +; CHECK: vceqg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp eq <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test ne selects. +define <2 x i64> @f12(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f12: +; CHECK: vceqg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test sgt selects. +define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f13: +; CHECK: vchg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test sge selects. 
+define <2 x i64> @f14(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f14: +; CHECK: vchg [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test sle selects. +define <2 x i64> @f15(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f15: +; CHECK: vchg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test slt selects. +define <2 x i64> @f16(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f16: +; CHECK: vchg [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp slt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test ugt selects. +define <2 x i64> @f17(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f17: +; CHECK: vchlg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test uge selects. +define <2 x i64> @f18(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f18: +; CHECK: vchlg [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test ule selects. +define <2 x i64> @f19(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f19: +; CHECK: vchlg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test ult selects. +define <2 x i64> @f20(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f20: +; CHECK: vchlg [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ult <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-combine-01.ll b/llvm/test/CodeGen/SystemZ/vec-combine-01.ll new file mode 100644 index 00000000000..f9da34b6475 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-combine-01.ll @@ -0,0 +1,107 @@ +; Test various target-specific DAG combiner patterns. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Check that an extraction followed by a truncation is effectively treated +; as a bitcast. 
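+; (the truncating stores become single-byte vsteb stores: byte 3 holds the
+; low byte of element 0 and byte 15 the low byte of element 3)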
+define void @f1(<4 x i32> %v1, <4 x i32> %v2, i8 *%ptr1, i8 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK: vaf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vsteb [[REG]], 0(%r2), 3 +; CHECK-DAG: vsteb [[REG]], 0(%r3), 15 +; CHECK: br %r14 + %add = add <4 x i32> %v1, %v2 + %elem1 = extractelement <4 x i32> %add, i32 0 + %elem2 = extractelement <4 x i32> %add, i32 3 + %trunc1 = trunc i32 %elem1 to i8 + %trunc2 = trunc i32 %elem2 to i8 + store i8 %trunc1, i8 *%ptr1 + store i8 %trunc2, i8 *%ptr2 + ret void +} + +; Test a case where a pack-type shuffle can be eliminated. +define i16 @f2(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; CHECK-LABEL: f2: +; CHECK-NOT: vpk +; CHECK-DAG: vaf [[REG1:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vaf [[REG2:%v[0-9]+]], %v26, %v28 +; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG1]], 3 +; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG2]], 7 +; CHECK: br %r14 + %add1 = add <4 x i32> %v1, %v2 + %add2 = add <4 x i32> %v2, %v3 + %shuffle = shufflevector <4 x i32> %add1, <4 x i32> %add2, + <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %bitcast = bitcast <4 x i32> %shuffle to <8 x i16> + %elem1 = extractelement <8 x i16> %bitcast, i32 1 + %elem2 = extractelement <8 x i16> %bitcast, i32 7 + %res = add i16 %elem1, %elem2 + ret i16 %res +} + +; ...and again in a case where there's also a splat and a bitcast. +define i16 @f3(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) { +; CHECK-LABEL: f3: +; CHECK-NOT: vrepg +; CHECK-NOT: vpk +; CHECK-DAG: vaf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG]], 6 +; CHECK-DAG: vlgvh {{%r[0-5]}}, %v28, 3 +; CHECK: br %r14 + %add = add <4 x i32> %v1, %v2 + %splat = shufflevector <2 x i64> %v3, <2 x i64> undef, + <2 x i32> <i32 0, i32 0> + %splatcast = bitcast <2 x i64> %splat to <4 x i32> + %shuffle = shufflevector <4 x i32> %add, <4 x i32> %splatcast, + <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %bitcast = bitcast <4 x i32> %shuffle to <8 x i16> + %elem1 = extractelement <8 x i16> %bitcast, i32 2 + %elem2 = extractelement <8 x i16> %bitcast, i32 7 + %res = add i16 %elem1, %elem2 + ret i16 %res +} + +; ...and again with a merge low instead of a pack. +define i16 @f4(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) { +; CHECK-LABEL: f4: +; CHECK-NOT: vrepg +; CHECK-NOT: vmr +; CHECK-DAG: vaf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG]], 6 +; CHECK-DAG: vlgvh {{%r[0-5]}}, %v28, 3 +; CHECK: br %r14 + %add = add <4 x i32> %v1, %v2 + %splat = shufflevector <2 x i64> %v3, <2 x i64> undef, + <2 x i32> <i32 0, i32 0> + %splatcast = bitcast <2 x i64> %splat to <4 x i32> + %shuffle = shufflevector <4 x i32> %add, <4 x i32> %splatcast, + <4 x i32> <i32 2, i32 6, i32 3, i32 7> + %bitcast = bitcast <4 x i32> %shuffle to <8 x i16> + %elem1 = extractelement <8 x i16> %bitcast, i32 4 + %elem2 = extractelement <8 x i16> %bitcast, i32 7 + %res = add i16 %elem1, %elem2 + ret i16 %res +} + +; ...and again with a merge high. 
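+; (halfword 4 of the shuffled vector is halfword 2 of the vaf result and
+; halfword 7 is halfword 3 of the original %v3, so the merge and the splat
+; both fold away)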
+define i16 @f5(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) { +; CHECK-LABEL: f5: +; CHECK-NOT: vrepg +; CHECK-NOT: vmr +; CHECK-DAG: vaf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG]], 2 +; CHECK-DAG: vlgvh {{%r[0-5]}}, %v28, 3 +; CHECK: br %r14 + %add = add <4 x i32> %v1, %v2 + %splat = shufflevector <2 x i64> %v3, <2 x i64> undef, + <2 x i32> <i32 0, i32 0> + %splatcast = bitcast <2 x i64> %splat to <4 x i32> + %shuffle = shufflevector <4 x i32> %add, <4 x i32> %splatcast, + <4 x i32> <i32 0, i32 4, i32 1, i32 5> + %bitcast = bitcast <4 x i32> %shuffle to <8 x i16> + %elem1 = extractelement <8 x i16> %bitcast, i32 4 + %elem2 = extractelement <8 x i16> %bitcast, i32 7 + %res = add i16 %elem1, %elem2 + ret i16 %res +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-01.ll b/llvm/test/CodeGen/SystemZ/vec-const-01.ll new file mode 100644 index 00000000000..f173b92b015 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-01.ll @@ -0,0 +1,55 @@ +; Test vector byte masks, v16i8 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test an all-zeros vector. +define <16 x i8> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK: br %r14 + ret <16 x i8> zeroinitializer +} + +; Test an all-ones vector. +define <16 x i8> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 65535 +; CHECK: br %r14 + ret <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1> +} + +; Test a mixed vector (mask 0x8c75). +define <16 x i8> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 35957 +; CHECK: br %r14 + ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, + i8 -1, i8 -1, i8 0, i8 0, + i8 0, i8 -1, i8 -1, i8 -1, + i8 0, i8 -1, i8 0, i8 -1> +} + +; Test that undefs are treated as zero. +define <16 x i8> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgbm %v24, 35957 +; CHECK: br %r14 + ret <16 x i8> <i8 -1, i8 undef, i8 undef, i8 undef, + i8 -1, i8 -1, i8 undef, i8 undef, + i8 undef, i8 -1, i8 -1, i8 -1, + i8 undef, i8 -1, i8 undef, i8 -1> +} + +; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. +define <16 x i8> @f5() { +; CHECK-LABEL: f5: +; CHECK-NOT: vgbm +; CHECK: br %r14 + ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, + i8 -1, i8 -1, i8 0, i8 1, + i8 0, i8 -1, i8 -1, i8 -1, + i8 0, i8 -1, i8 0, i8 -1> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-02.ll b/llvm/test/CodeGen/SystemZ/vec-const-02.ll new file mode 100644 index 00000000000..541cbb9faca --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-02.ll @@ -0,0 +1,47 @@ +; Test vector byte masks, v8i16 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test an all-zeros vector. +define <8 x i16> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK: br %r14 + ret <8 x i16> zeroinitializer +} + +; Test an all-ones vector. +define <8 x i16> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 65535 +; CHECK: br %r14 + ret <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, + i16 -1, i16 -1, i16 -1, i16 -1> +} + +; Test a mixed vector (mask 0x8c76). +define <8 x i16> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 35958 +; CHECK: br %r14 + ret <8 x i16> <i16 65280, i16 0, i16 65535, i16 0, + i16 255, i16 65535, i16 255, i16 65280> +} + +; Test that undefs are treated as zero. 
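+; (the undef halfwords stand in for the zeros of f3, so the mask is still
+; 0x8c76)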
+define <8 x i16> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgbm %v24, 35958 +; CHECK: br %r14 + ret <8 x i16> <i16 65280, i16 undef, i16 65535, i16 undef, + i16 255, i16 65535, i16 255, i16 65280> +} + +; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. +define <8 x i16> @f5() { +; CHECK-LABEL: f5: +; CHECK-NOT: vgbm +; CHECK: br %r14 + ret <8 x i16> <i16 65280, i16 0, i16 65535, i16 0, + i16 255, i16 65535, i16 256, i16 65280> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-03.ll b/llvm/test/CodeGen/SystemZ/vec-const-03.ll new file mode 100644 index 00000000000..45ed83866d5 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-03.ll @@ -0,0 +1,43 @@ +; Test vector byte masks, v4i32 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test an all-zeros vector. +define <4 x i32> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK: br %r14 + ret <4 x i32> zeroinitializer +} + +; Test an all-ones vector. +define <4 x i32> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 65535 +; CHECK: br %r14 + ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> +} + +; Test a mixed vector (mask 0x8c76). +define <4 x i32> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 35958 +; CHECK: br %r14 + ret <4 x i32> <i32 4278190080, i32 4294901760, i32 16777215, i32 16776960> +} + +; Test that undefs are treated as zero (mask 0x8076). +define <4 x i32> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgbm %v24, 32886 +; CHECK: br %r14 + ret <4 x i32> <i32 4278190080, i32 undef, i32 16777215, i32 16776960> +} + +; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. +define <4 x i32> @f5() { +; CHECK-LABEL: f5: +; CHECK-NOT: vgbm +; CHECK: br %r14 + ret <4 x i32> <i32 4278190080, i32 1, i32 16777215, i32 16776960> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-04.ll b/llvm/test/CodeGen/SystemZ/vec-const-04.ll new file mode 100644 index 00000000000..1c2fb414d25 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-04.ll @@ -0,0 +1,43 @@ +; Test vector byte masks, v2i64 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test an all-zeros vector. +define <2 x i64> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK: br %r14 + ret <2 x i64> zeroinitializer +} + +; Test an all-ones vector. +define <2 x i64> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 65535 +; CHECK: br %r14 + ret <2 x i64> <i64 -1, i64 -1> +} + +; Test a mixed vector (mask 0x8c76). +define <2 x i64> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 35958 +; CHECK: br %r14 + ret <2 x i64> <i64 18374686483966525440, i64 72057589759737600> +} + +; Test that undefs are treated as zero (mask 0x8c00). +define <2 x i64> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgbm %v24, 35840 +; CHECK: br %r14 + ret <2 x i64> <i64 18374686483966525440, i64 undef> +} + +; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. +define <2 x i64> @f5() { +; CHECK-LABEL: f5: +; CHECK-NOT: vgbm +; CHECK: br %r14 + ret <2 x i64> <i64 18374686483966525441, i64 72057589759737600> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-07.ll b/llvm/test/CodeGen/SystemZ/vec-const-07.ll new file mode 100644 index 00000000000..6fcf95b6921 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-07.ll @@ -0,0 +1,229 @@ +; Test vector replicates, v16i8 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a byte-granularity replicate with the lowest useful value. 
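+; (0 and -1 are not useful here because all-zeros and all-ones vectors are
+; generated with vgbm instead)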
+define <16 x i8> @f1() { +; CHECK-LABEL: f1: +; CHECK: vrepib %v24, 1 +; CHECK: br %r14 + ret <16 x i8> <i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1> +} + +; Test a byte-granularity replicate with an arbitrary value. +define <16 x i8> @f2() { +; CHECK-LABEL: f2: +; CHECK: vrepib %v24, -55 +; CHECK: br %r14 + ret <16 x i8> <i8 201, i8 201, i8 201, i8 201, + i8 201, i8 201, i8 201, i8 201, + i8 201, i8 201, i8 201, i8 201, + i8 201, i8 201, i8 201, i8 201> +} + +; Test a byte-granularity replicate with the highest useful value. +define <16 x i8> @f3() { +; CHECK-LABEL: f3: +; CHECK: vrepib %v24, -2 +; CHECK: br %r14 + ret <16 x i8> <i8 254, i8 254, i8 254, i8 254, + i8 254, i8 254, i8 254, i8 254, + i8 254, i8 254, i8 254, i8 254, + i8 254, i8 254, i8 254, i8 254> +} + +; Test a halfword-granularity replicate with the lowest useful value. +define <16 x i8> @f4() { +; CHECK-LABEL: f4: +; CHECK: vrepih %v24, 1 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 1, i8 0, i8 1, + i8 0, i8 1, i8 0, i8 1, + i8 0, i8 1, i8 0, i8 1, + i8 0, i8 1, i8 0, i8 1> +} + +; Test a halfword-granularity replicate with an arbitrary value. +define <16 x i8> @f5() { +; CHECK-LABEL: f5: +; CHECK: vrepih %v24, 25650 +; CHECK: br %r14 + ret <16 x i8> <i8 100, i8 50, i8 100, i8 50, + i8 100, i8 50, i8 100, i8 50, + i8 100, i8 50, i8 100, i8 50, + i8 100, i8 50, i8 100, i8 50> +} + +; Test a halfword-granularity replicate with the highest useful value. +define <16 x i8> @f6() { +; CHECK-LABEL: f6: +; CHECK: vrepih %v24, -2 +; CHECK: br %r14 + ret <16 x i8> <i8 255, i8 254, i8 255, i8 254, + i8 255, i8 254, i8 255, i8 254, + i8 255, i8 254, i8 255, i8 254, + i8 255, i8 254, i8 255, i8 254> +} + +; Test a word-granularity replicate with the lowest useful positive value. +define <16 x i8> @f7() { +; CHECK-LABEL: f7: +; CHECK: vrepif %v24, 1 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 0, i8 0, i8 1, + i8 0, i8 0, i8 0, i8 1, + i8 0, i8 0, i8 0, i8 1, + i8 0, i8 0, i8 0, i8 1> +} + +; Test a word-granularity replicate with the highest in-range value. +define <16 x i8> @f8() { +; CHECK-LABEL: f8: +; CHECK: vrepif %v24, 32767 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 0, i8 127, i8 255, + i8 0, i8 0, i8 127, i8 255, + i8 0, i8 0, i8 127, i8 255, + i8 0, i8 0, i8 127, i8 255> +} + +; Test a word-granularity replicate with the next highest value. +; This cannot use VREPIF. +define <16 x i8> @f9() { +; CHECK-LABEL: f9: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 0, i8 128, i8 0, + i8 0, i8 0, i8 128, i8 0, + i8 0, i8 0, i8 128, i8 0, + i8 0, i8 0, i8 128, i8 0> +} + +; Test a word-granularity replicate with the lowest in-range value. +define <16 x i8> @f10() { +; CHECK-LABEL: f10: +; CHECK: vrepif %v24, -32768 +; CHECK: br %r14 + ret <16 x i8> <i8 255, i8 255, i8 128, i8 0, + i8 255, i8 255, i8 128, i8 0, + i8 255, i8 255, i8 128, i8 0, + i8 255, i8 255, i8 128, i8 0> +} + +; Test a word-granularity replicate with the next lowest value. +; This cannot use VREPIF. +define <16 x i8> @f11() { +; CHECK-LABEL: f11: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <16 x i8> <i8 255, i8 255, i8 127, i8 255, + i8 255, i8 255, i8 127, i8 255, + i8 255, i8 255, i8 127, i8 255, + i8 255, i8 255, i8 127, i8 255> +} + +; Test a word-granularity replicate with the highest useful negative value. 
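+; (-1 would be an all-ones vector, already covered by vgbm 65535)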
+define <16 x i8> @f12() { +; CHECK-LABEL: f12: +; CHECK: vrepif %v24, -2 +; CHECK: br %r14 + ret <16 x i8> <i8 255, i8 255, i8 255, i8 254, + i8 255, i8 255, i8 255, i8 254, + i8 255, i8 255, i8 255, i8 254, + i8 255, i8 255, i8 255, i8 254> +} + +; Test a doubleword-granularity replicate with the lowest useful positive +; value. +define <16 x i8> @f13() { +; CHECK-LABEL: f13: +; CHECK: vrepig %v24, 1 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, + i8 0, i8 0, i8 0, i8 1, + i8 0, i8 0, i8 0, i8 0, + i8 0, i8 0, i8 0, i8 1> +} + +; Test a doubleword-granularity replicate with the highest in-range value. +define <16 x i8> @f14() { +; CHECK-LABEL: f14: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, + i8 0, i8 0, i8 127, i8 255, + i8 0, i8 0, i8 0, i8 0, + i8 0, i8 0, i8 127, i8 255> +} + +; Test a doubleword-granularity replicate with the next highest value. +; This cannot use VREPIG. +define <16 x i8> @f15() { +; CHECK-LABEL: f15: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, + i8 0, i8 0, i8 128, i8 0, + i8 0, i8 0, i8 0, i8 0, + i8 0, i8 0, i8 128, i8 0> +} + +; Test a doubleword-granularity replicate with the lowest in-range value. +define <16 x i8> @f16() { +; CHECK-LABEL: f16: +; CHECK: vrepig %v24, -32768 +; CHECK: br %r14 + ret <16 x i8> <i8 255, i8 255, i8 255, i8 255, + i8 255, i8 255, i8 128, i8 0, + i8 255, i8 255, i8 255, i8 255, + i8 255, i8 255, i8 128, i8 0> +} + +; Test a doubleword-granularity replicate with the next lowest value. +; This cannot use VREPIG. +define <16 x i8> @f17() { +; CHECK-LABEL: f17: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <16 x i8> <i8 255, i8 255, i8 255, i8 255, + i8 255, i8 255, i8 127, i8 255, + i8 255, i8 255, i8 255, i8 255, + i8 255, i8 255, i8 127, i8 255> +} + +; Test a doubleword-granularity replicate with the highest useful negative +; value. +define <16 x i8> @f18() { +; CHECK-LABEL: f18: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <16 x i8> <i8 255, i8 255, i8 255, i8 255, + i8 255, i8 255, i8 255, i8 254, + i8 255, i8 255, i8 255, i8 255, + i8 255, i8 255, i8 255, i8 254> +} + +; Repeat f14 with undefs optimistically treated as 0. +define <16 x i8> @f19() { +; CHECK-LABEL: f19: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 undef, i8 0, i8 0, + i8 0, i8 0, i8 127, i8 255, + i8 undef, i8 0, i8 undef, i8 0, + i8 0, i8 0, i8 127, i8 255> +} + +; Repeat f18 with undefs optimistically treated as -1. +define <16 x i8> @f20() { +; CHECK-LABEL: f20: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <16 x i8> <i8 undef, i8 255, i8 255, i8 255, + i8 255, i8 255, i8 undef, i8 254, + i8 255, i8 255, i8 255, i8 undef, + i8 255, i8 undef, i8 255, i8 254> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-08.ll b/llvm/test/CodeGen/SystemZ/vec-const-08.ll new file mode 100644 index 00000000000..5ab6947e548 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-08.ll @@ -0,0 +1,189 @@ +; Test vector replicates, v8i16 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a byte-granularity replicate with the lowest useful value. +define <8 x i16> @f1() { +; CHECK-LABEL: f1: +; CHECK: vrepib %v24, 1 +; CHECK: br %r14 + ret <8 x i16> <i16 257, i16 257, i16 257, i16 257, + i16 257, i16 257, i16 257, i16 257> +} + +; Test a byte-granularity replicate with an arbitrary value. 
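+; (51657 == 0xc9c9, i.e. the byte -55 == 0xc9 in both halves of each
+; halfword)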
+define <8 x i16> @f2() { +; CHECK-LABEL: f2: +; CHECK: vrepib %v24, -55 +; CHECK: br %r14 + ret <8 x i16> <i16 51657, i16 51657, i16 51657, i16 51657, + i16 51657, i16 51657, i16 51657, i16 51657> +} + +; Test a byte-granularity replicate with the highest useful value. +define <8 x i16> @f3() { +; CHECK-LABEL: f3: +; CHECK: vrepib %v24, -2 +; CHECK: br %r14 + ret <8 x i16> <i16 -258, i16 -258, i16 -258, i16 -258, + i16 -258, i16 -258, i16 -258, i16 -258> +} + +; Test a halfword-granularity replicate with the lowest useful value. +define <8 x i16> @f4() { +; CHECK-LABEL: f4: +; CHECK: vrepih %v24, 1 +; CHECK: br %r14 + ret <8 x i16> <i16 1, i16 1, i16 1, i16 1, + i16 1, i16 1, i16 1, i16 1> +} + +; Test a halfword-granularity replicate with an arbitrary value. +define <8 x i16> @f5() { +; CHECK-LABEL: f5: +; CHECK: vrepih %v24, 25650 +; CHECK: br %r14 + ret <8 x i16> <i16 25650, i16 25650, i16 25650, i16 25650, + i16 25650, i16 25650, i16 25650, i16 25650> +} + +; Test a halfword-granularity replicate with the highest useful value. +define <8 x i16> @f6() { +; CHECK-LABEL: f6: +; CHECK: vrepih %v24, -2 +; CHECK: br %r14 + ret <8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, + i16 65534, i16 65534, i16 65534, i16 65534> +} + +; Test a word-granularity replicate with the lowest useful positive value. +define <8 x i16> @f7() { +; CHECK-LABEL: f7: +; CHECK: vrepif %v24, 1 +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 1, i16 0, i16 1, + i16 0, i16 1, i16 0, i16 1> +} + +; Test a word-granularity replicate with the highest in-range value. +define <8 x i16> @f8() { +; CHECK-LABEL: f8: +; CHECK: vrepif %v24, 32767 +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 32767, i16 0, i16 32767, + i16 0, i16 32767, i16 0, i16 32767> +} + +; Test a word-granularity replicate with the next highest value. +; This cannot use VREPIF. +define <8 x i16> @f9() { +; CHECK-LABEL: f9: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 32768, i16 0, i16 32768, + i16 0, i16 32768, i16 0, i16 32768> +} + +; Test a word-granularity replicate with the lowest in-range value. +define <8 x i16> @f10() { +; CHECK-LABEL: f10: +; CHECK: vrepif %v24, -32768 +; CHECK: br %r14 + ret <8 x i16> <i16 -1, i16 -32768, i16 -1, i16 -32768, + i16 -1, i16 -32768, i16 -1, i16 -32768> +} + +; Test a word-granularity replicate with the next lowest value. +; This cannot use VREPIF. +define <8 x i16> @f11() { +; CHECK-LABEL: f11: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <8 x i16> <i16 -1, i16 -32769, i16 -1, i16 -32769, + i16 -1, i16 -32769, i16 -1, i16 -32769> +} + +; Test a word-granularity replicate with the highest useful negative value. +define <8 x i16> @f12() { +; CHECK-LABEL: f12: +; CHECK: vrepif %v24, -2 +; CHECK: br %r14 + ret <8 x i16> <i16 -1, i16 -2, i16 -1, i16 -2, + i16 -1, i16 -2, i16 -1, i16 -2> +} + +; Test a doubleword-granularity replicate with the lowest useful positive +; value. +define <8 x i16> @f13() { +; CHECK-LABEL: f13: +; CHECK: vrepig %v24, 1 +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 0, i16 0, i16 1, + i16 0, i16 0, i16 0, i16 1> +} + +; Test a doubleword-granularity replicate with the highest in-range value. +define <8 x i16> @f14() { +; CHECK-LABEL: f14: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 0, i16 0, i16 32767, + i16 0, i16 0, i16 0, i16 32767> +} + +; Test a doubleword-granularity replicate with the next highest value. +; This cannot use VREPIG. 
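+; (the replicate immediate is a sign-extended 16-bit field, so 32768 is out
+; of range; vec-const-14.ll checks that VGM is used for it instead)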
+define <8 x i16> @f15() { +; CHECK-LABEL: f15: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 0, i16 0, i16 32768, + i16 0, i16 0, i16 0, i16 32768> +} + +; Test a doubleword-granularity replicate with the lowest in-range value. +define <8 x i16> @f16() { +; CHECK-LABEL: f16: +; CHECK: vrepig %v24, -32768 +; CHECK: br %r14 + ret <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -32768, + i16 -1, i16 -1, i16 -1, i16 -32768> +} + +; Test a doubleword-granularity replicate with the next lowest value. +; This cannot use VREPIG. +define <8 x i16> @f17() { +; CHECK-LABEL: f17: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -32769, + i16 -1, i16 -1, i16 -1, i16 -32769> +} + +; Test a doubleword-granularity replicate with the highest useful negative +; value. +define <8 x i16> @f18() { +; CHECK-LABEL: f18: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -2, + i16 -1, i16 -1, i16 -1, i16 -2> +} + +; Repeat f14 with undefs optimistically treated as 0. +define <8 x i16> @f19() { +; CHECK-LABEL: f19: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 undef, i16 0, i16 32767, + i16 undef, i16 0, i16 undef, i16 32767> +} + +; Repeat f18 with undefs optimistically treated as -1. +define <8 x i16> @f20() { +; CHECK-LABEL: f20: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <8 x i16> <i16 -1, i16 -1, i16 undef, i16 -2, + i16 undef, i16 undef, i16 -1, i16 -2> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-09.ll b/llvm/test/CodeGen/SystemZ/vec-const-09.ll new file mode 100644 index 00000000000..2cbe9259452 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-09.ll @@ -0,0 +1,169 @@ +; Test vector replicates, v4i32 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a byte-granularity replicate with the lowest useful value. +define <4 x i32> @f1() { +; CHECK-LABEL: f1: +; CHECK: vrepib %v24, 1 +; CHECK: br %r14 + ret <4 x i32> <i32 16843009, i32 16843009, i32 16843009, i32 16843009> +} + +; Test a byte-granularity replicate with an arbitrary value. +define <4 x i32> @f2() { +; CHECK-LABEL: f2: +; CHECK: vrepib %v24, -55 +; CHECK: br %r14 + ret <4 x i32> <i32 3385444809, i32 3385444809, i32 3385444809, i32 3385444809> +} + +; Test a byte-granularity replicate with the highest useful value. +define <4 x i32> @f3() { +; CHECK-LABEL: f3: +; CHECK: vrepib %v24, -2 +; CHECK: br %r14 + ret <4 x i32> <i32 4278124286, i32 4278124286, i32 4278124286, i32 4278124286> +} + +; Test a halfword-granularity replicate with the lowest useful value. +define <4 x i32> @f4() { +; CHECK-LABEL: f4: +; CHECK: vrepih %v24, 1 +; CHECK: br %r14 + ret <4 x i32> <i32 65537, i32 65537, i32 65537, i32 65537> +} + +; Test a halfword-granularity replicate with an arbitrary value. +define <4 x i32> @f5() { +; CHECK-LABEL: f5: +; CHECK: vrepih %v24, 25650 +; CHECK: br %r14 + ret <4 x i32> <i32 1681024050, i32 1681024050, i32 1681024050, i32 1681024050> +} + +; Test a halfword-granularity replicate with the highest useful value. +define <4 x i32> @f6() { +; CHECK-LABEL: f6: +; CHECK: vrepih %v24, -2 +; CHECK: br %r14 + ret <4 x i32> <i32 -65538, i32 -65538, i32 -65538, i32 -65538> +} + +; Test a word-granularity replicate with the lowest useful positive value. +define <4 x i32> @f7() { +; CHECK-LABEL: f7: +; CHECK: vrepif %v24, 1 +; CHECK: br %r14 + ret <4 x i32> <i32 1, i32 1, i32 1, i32 1> +} + +; Test a word-granularity replicate with the highest in-range value. 
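+; (32767 == 0x7fff, the most positive value of the signed 16-bit immediate)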
+define <4 x i32> @f8() { +; CHECK-LABEL: f8: +; CHECK: vrepif %v24, 32767 +; CHECK: br %r14 + ret <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767> +} + +; Test a word-granularity replicate with the next highest value. +; This cannot use VREPIF. +define <4 x i32> @f9() { +; CHECK-LABEL: f9: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <4 x i32> <i32 32768, i32 32768, i32 32768, i32 32768> +} + +; Test a word-granularity replicate with the lowest in-range value. +define <4 x i32> @f10() { +; CHECK-LABEL: f10: +; CHECK: vrepif %v24, -32768 +; CHECK: br %r14 + ret <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768> +} + +; Test a word-granularity replicate with the next lowest value. +; This cannot use VREPIF. +define <4 x i32> @f11() { +; CHECK-LABEL: f11: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <4 x i32> <i32 -32769, i32 -32769, i32 -32769, i32 -32769> +} + +; Test a word-granularity replicate with the highest useful negative value. +define <4 x i32> @f12() { +; CHECK-LABEL: f12: +; CHECK: vrepif %v24, -2 +; CHECK: br %r14 + ret <4 x i32> <i32 -2, i32 -2, i32 -2, i32 -2> +} + +; Test a doubleword-granularity replicate with the lowest useful positive +; value. +define <4 x i32> @f13() { +; CHECK-LABEL: f13: +; CHECK: vrepig %v24, 1 +; CHECK: br %r14 + ret <4 x i32> <i32 0, i32 1, i32 0, i32 1> +} + +; Test a doubleword-granularity replicate with the highest in-range value. +define <4 x i32> @f14() { +; CHECK-LABEL: f14: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <4 x i32> <i32 0, i32 32767, i32 0, i32 32767> +} + +; Test a doubleword-granularity replicate with the next highest value. +; This cannot use VREPIG. +define <4 x i32> @f15() { +; CHECK-LABEL: f15: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <4 x i32> <i32 0, i32 32768, i32 0, i32 32768> +} + +; Test a doubleword-granularity replicate with the lowest in-range value. +define <4 x i32> @f16() { +; CHECK-LABEL: f16: +; CHECK: vrepig %v24, -32768 +; CHECK: br %r14 + ret <4 x i32> <i32 -1, i32 -32768, i32 -1, i32 -32768> +} + +; Test a doubleword-granularity replicate with the next lowest value. +; This cannot use VREPIG. +define <4 x i32> @f17() { +; CHECK-LABEL: f17: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <4 x i32> <i32 -1, i32 -32769, i32 -1, i32 -32769> +} + +; Test a doubleword-granularity replicate with the highest useful negative +; value. +define <4 x i32> @f18() { +; CHECK-LABEL: f18: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <4 x i32> <i32 -1, i32 -2, i32 -1, i32 -2> +} + +; Repeat f14 with undefs optimistically treated as 0, 32767. +define <4 x i32> @f19() { +; CHECK-LABEL: f19: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <4 x i32> <i32 undef, i32 undef, i32 0, i32 32767> +} + +; Repeat f18 with undefs optimistically treated as -2, -1. +define <4 x i32> @f20() { +; CHECK-LABEL: f20: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <4 x i32> <i32 -1, i32 undef, i32 undef, i32 -2> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-10.ll b/llvm/test/CodeGen/SystemZ/vec-const-10.ll new file mode 100644 index 00000000000..0613b69a277 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-10.ll @@ -0,0 +1,169 @@ +; Test vector replicates, v2i64 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a byte-granularity replicate with the lowest useful value. 
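+; (72340172838076673 == 0x0101010101010101)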
+define <2 x i64> @f1() { +; CHECK-LABEL: f1: +; CHECK: vrepib %v24, 1 +; CHECK: br %r14 + ret <2 x i64> <i64 72340172838076673, i64 72340172838076673> +} + +; Test a byte-granularity replicate with an arbitrary value. +define <2 x i64> @f2() { +; CHECK-LABEL: f2: +; CHECK: vrepib %v24, -55 +; CHECK: br %r14 + ret <2 x i64> <i64 -3906369333256140343, i64 -3906369333256140343> +} + +; Test a byte-granularity replicate with the highest useful value. +define <2 x i64> @f3() { +; CHECK-LABEL: f3: +; CHECK: vrepib %v24, -2 +; CHECK: br %r14 + ret <2 x i64> <i64 -72340172838076674, i64 -72340172838076674> +} + +; Test a halfword-granularity replicate with the lowest useful value. +define <2 x i64> @f4() { +; CHECK-LABEL: f4: +; CHECK: vrepih %v24, 1 +; CHECK: br %r14 + ret <2 x i64> <i64 281479271743489, i64 281479271743489> +} + +; Test a halfword-granularity replicate with an arbitrary value. +define <2 x i64> @f5() { +; CHECK-LABEL: f5: +; CHECK: vrepih %v24, 25650 +; CHECK: br %r14 + ret <2 x i64> <i64 7219943320220492850, i64 7219943320220492850> +} + +; Test a halfword-granularity replicate with the highest useful value. +define <2 x i64> @f6() { +; CHECK-LABEL: f6: +; CHECK: vrepih %v24, -2 +; CHECK: br %r14 + ret <2 x i64> <i64 -281479271743490, i64 -281479271743490> +} + +; Test a word-granularity replicate with the lowest useful positive value. +define <2 x i64> @f7() { +; CHECK-LABEL: f7: +; CHECK: vrepif %v24, 1 +; CHECK: br %r14 + ret <2 x i64> <i64 4294967297, i64 4294967297> +} + +; Test a word-granularity replicate with the highest in-range value. +define <2 x i64> @f8() { +; CHECK-LABEL: f8: +; CHECK: vrepif %v24, 32767 +; CHECK: br %r14 + ret <2 x i64> <i64 140733193420799, i64 140733193420799> +} + +; Test a word-granularity replicate with the next highest value. +; This cannot use VREPIF. +define <2 x i64> @f9() { +; CHECK-LABEL: f9: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <2 x i64> <i64 140737488388096, i64 140737488388096> +} + +; Test a word-granularity replicate with the lowest in-range value. +define <2 x i64> @f10() { +; CHECK-LABEL: f10: +; CHECK: vrepif %v24, -32768 +; CHECK: br %r14 + ret <2 x i64> <i64 -140733193420800, i64 -140733193420800> +} + +; Test a word-granularity replicate with the next lowest value. +; This cannot use VREPIF. +define <2 x i64> @f11() { +; CHECK-LABEL: f11: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <2 x i64> <i64 -140737488388097, i64 -140737488388097> +} + +; Test a word-granularity replicate with the highest useful negative value. +define <2 x i64> @f12() { +; CHECK-LABEL: f12: +; CHECK: vrepif %v24, -2 +; CHECK: br %r14 + ret <2 x i64> <i64 -4294967298, i64 -4294967298> +} + +; Test a doubleword-granularity replicate with the lowest useful positive +; value. +define <2 x i64> @f13() { +; CHECK-LABEL: f13: +; CHECK: vrepig %v24, 1 +; CHECK: br %r14 + ret <2 x i64> <i64 1, i64 1> +} + +; Test a doubleword-granularity replicate with the highest in-range value. +define <2 x i64> @f14() { +; CHECK-LABEL: f14: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <2 x i64> <i64 32767, i64 32767> +} + +; Test a doubleword-granularity replicate with the next highest value. +; This cannot use VREPIG. +define <2 x i64> @f15() { +; CHECK-LABEL: f15: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <2 x i64> <i64 32768, i64 32768> +} + +; Test a doubleword-granularity replicate with the lowest in-range value. 
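+; (-32768 sign-extends to 0xffffffffffff8000 in each doubleword)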
+define <2 x i64> @f16() { +; CHECK-LABEL: f16: +; CHECK: vrepig %v24, -32768 +; CHECK: br %r14 + ret <2 x i64> <i64 -32768, i64 -32768> +} + +; Test a doubleword-granularity replicate with the next lowest value. +; This cannot use VREPIG. +define <2 x i64> @f17() { +; CHECK-LABEL: f17: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <2 x i64> <i64 -32769, i64 -32769> +} + +; Test a doubleword-granularity replicate with the highest useful negative +; value. +define <2 x i64> @f18() { +; CHECK-LABEL: f18: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <2 x i64> <i64 -2, i64 -2> +} + +; Repeat f14 with undefs optimistically treated as 32767. +define <2 x i64> @f19() { +; CHECK-LABEL: f19: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <2 x i64> <i64 undef, i64 32767> +} + +; Repeat f18 with undefs optimistically treated as -2. +define <2 x i64> @f20() { +; CHECK-LABEL: f20: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <2 x i64> <i64 undef, i64 -2> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-13.ll b/llvm/test/CodeGen/SystemZ/vec-const-13.ll new file mode 100644 index 00000000000..2cc425252c2 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-13.ll @@ -0,0 +1,193 @@ +; Test vector replicates that use VECTOR GENERATE MASK, v16i8 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a word-granularity replicate with the lowest value that cannot use +; VREPIF. +define <16 x i8> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 0, i8 128, i8 0, + i8 0, i8 0, i8 128, i8 0, + i8 0, i8 0, i8 128, i8 0, + i8 0, i8 0, i8 128, i8 0> +} + +; Test a word-granularity replicate that has the lower 17 bits set. +define <16 x i8> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgmf %v24, 15, 31 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 1, i8 255, i8 255, + i8 0, i8 1, i8 255, i8 255, + i8 0, i8 1, i8 255, i8 255, + i8 0, i8 1, i8 255, i8 255> +} + +; Test a word-granularity replicate that has the upper 15 bits set. +define <16 x i8> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgmf %v24, 0, 14 +; CHECK: br %r14 + ret <16 x i8> <i8 255, i8 254, i8 0, i8 0, + i8 255, i8 254, i8 0, i8 0, + i8 255, i8 254, i8 0, i8 0, + i8 255, i8 254, i8 0, i8 0> +} + +; Test a word-granularity replicate that has middle bits set. +define <16 x i8> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgmf %v24, 12, 17 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 15, i8 192, i8 0, + i8 0, i8 15, i8 192, i8 0, + i8 0, i8 15, i8 192, i8 0, + i8 0, i8 15, i8 192, i8 0> +} + +; Test a word-granularity replicate with a wrap-around mask. +define <16 x i8> @f5() { +; CHECK-LABEL: f5: +; CHECK: vgmf %v24, 17, 15 +; CHECK: br %r14 + ret <16 x i8> <i8 255, i8 255, i8 127, i8 255, + i8 255, i8 255, i8 127, i8 255, + i8 255, i8 255, i8 127, i8 255, + i8 255, i8 255, i8 127, i8 255> +} + +; Test a doubleword-granularity replicate with the lowest value that cannot +; use VREPIG. +define <16 x i8> @f6() { +; CHECK-LABEL: f6: +; CHECK: vgmg %v24, 48, 48 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, + i8 0, i8 0, i8 128, i8 0, + i8 0, i8 0, i8 0, i8 0, + i8 0, i8 0, i8 128, i8 0> +} + +; Test a doubleword-granularity replicate that has the lower 22 bits set. +define <16 x i8> @f7() { +; CHECK-LABEL: f7: +; CHECK: vgmg %v24, 42, 63 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, + i8 0, i8 63, i8 255, i8 255, + i8 0, i8 0, i8 0, i8 0, + i8 0, i8 63, i8 255, i8 255> +} + +; Test a doubleword-granularity replicate that has the upper 45 bits set. 
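+; (VGM bit positions are counted from the most significant bit, so
+; vgmg 0, 44 sets bits 0 through 44 of each doubleword)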
+define <16 x i8> @f8() { +; CHECK-LABEL: f8: +; CHECK: vgmg %v24, 0, 44 +; CHECK: br %r14 + ret <16 x i8> <i8 255, i8 255, i8 255, i8 255, + i8 255, i8 248, i8 0, i8 0, + i8 255, i8 255, i8 255, i8 255, + i8 255, i8 248, i8 0, i8 0> +} + +; Test a doubleword-granularity replicate that has middle bits set. +define <16 x i8> @f9() { +; CHECK-LABEL: f9: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 0, i8 0, i8 1, + i8 255, i8 224, i8 0, i8 0, + i8 0, i8 0, i8 0, i8 1, + i8 255, i8 224, i8 0, i8 0> +} + +; Test a doubleword-granularity replicate with a wrap-around mask. +define <16 x i8> @f10() { +; CHECK-LABEL: f10: +; CHECK: vgmg %v24, 18, 0 +; CHECK: br %r14 + ret <16 x i8> <i8 128, i8 0, i8 63, i8 255, + i8 255, i8 255, i8 255, i8 255, + i8 128, i8 0, i8 63, i8 255, + i8 255, i8 255, i8 255, i8 255> +} + +; Retest f1 with arbitrary undefs instead of 0s. +define <16 x i8> @f11() { +; CHECK-LABEL: f11: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 undef, i8 128, i8 0, + i8 0, i8 0, i8 128, i8 undef, + i8 undef, i8 0, i8 128, i8 0, + i8 undef, i8 undef, i8 128, i8 0> +} + +; Try a case where we want consistent undefs to be treated as 0. +define <16 x i8> @f12() { +; CHECK-LABEL: f12: +; CHECK: vgmf %v24, 15, 23 +; CHECK: br %r14 + ret <16 x i8> <i8 undef, i8 1, i8 255, i8 0, + i8 undef, i8 1, i8 255, i8 0, + i8 undef, i8 1, i8 255, i8 0, + i8 undef, i8 1, i8 255, i8 0> +} + +; ...and again with the lower bits of the replicated constant. +define <16 x i8> @f13() { +; CHECK-LABEL: f13: +; CHECK: vgmf %v24, 15, 22 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 1, i8 254, i8 undef, + i8 0, i8 1, i8 254, i8 undef, + i8 0, i8 1, i8 254, i8 undef, + i8 0, i8 1, i8 254, i8 undef> +} + +; Try a case where we want consistent undefs to be treated as -1. +define <16 x i8> @f14() { +; CHECK-LABEL: f14: +; CHECK: vgmf %v24, 28, 8 +; CHECK: br %r14 + ret <16 x i8> <i8 undef, i8 128, i8 0, i8 15, + i8 undef, i8 128, i8 0, i8 15, + i8 undef, i8 128, i8 0, i8 15, + i8 undef, i8 128, i8 0, i8 15> +} + +; ...and again with the lower bits of the replicated constant. +define <16 x i8> @f15() { +; CHECK-LABEL: f15: +; CHECK: vgmf %v24, 18, 3 +; CHECK: br %r14 + ret <16 x i8> <i8 240, i8 0, i8 63, i8 undef, + i8 240, i8 0, i8 63, i8 undef, + i8 240, i8 0, i8 63, i8 undef, + i8 240, i8 0, i8 63, i8 undef> +} + +; Repeat f9 with arbitrary undefs. +define <16 x i8> @f16() { +; CHECK-LABEL: f16: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <16 x i8> <i8 undef, i8 0, i8 undef, i8 1, + i8 255, i8 undef, i8 0, i8 0, + i8 0, i8 0, i8 0, i8 1, + i8 undef, i8 224, i8 undef, i8 undef> +} + +; Try a case where we want some consistent undefs to be treated as 0 +; and some to be treated as 255. +define <16 x i8> @f17() { +; CHECK-LABEL: f17: +; CHECK: vgmg %v24, 23, 35 +; CHECK: br %r14 + ret <16 x i8> <i8 0, i8 undef, i8 1, i8 undef, + i8 240, i8 undef, i8 0, i8 0, + i8 0, i8 undef, i8 1, i8 undef, + i8 240, i8 undef, i8 0, i8 0> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-14.ll b/llvm/test/CodeGen/SystemZ/vec-const-14.ll new file mode 100644 index 00000000000..0e3f124dbf6 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-14.ll @@ -0,0 +1,113 @@ +; Test vector replicates that use VECTOR GENERATE MASK, v8i16 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a word-granularity replicate with the lowest value that cannot use +; VREPIF. 
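+; (0x00008000 per word is a single mask bit, bit 16, hence vgmf 16, 16)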
+define <8 x i16> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 32768, i16 0, i16 32768, + i16 0, i16 32768, i16 0, i16 32768> +} + +; Test a word-granularity replicate that has the lower 17 bits set. +define <8 x i16> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgmf %v24, 15, 31 +; CHECK: br %r14 + ret <8 x i16> <i16 1, i16 -1, i16 1, i16 -1, + i16 1, i16 -1, i16 1, i16 -1> +} + +; Test a word-granularity replicate that has the upper 15 bits set. +define <8 x i16> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgmf %v24, 0, 14 +; CHECK: br %r14 + ret <8 x i16> <i16 -2, i16 0, i16 -2, i16 0, + i16 -2, i16 0, i16 -2, i16 0> +} + +; Test a word-granularity replicate that has middle bits set. +define <8 x i16> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgmf %v24, 12, 17 +; CHECK: br %r14 + ret <8 x i16> <i16 15, i16 49152, i16 15, i16 49152, + i16 15, i16 49152, i16 15, i16 49152> +} + +; Test a word-granularity replicate with a wrap-around mask. +define <8 x i16> @f5() { +; CHECK-LABEL: f5: +; CHECK: vgmf %v24, 17, 15 +; CHECK: br %r14 + ret <8 x i16> <i16 -1, i16 32767, i16 -1, i16 32767, + i16 -1, i16 32767, i16 -1, i16 32767> +} + +; Test a doubleword-granularity replicate with the lowest value that cannot +; use VREPIG. +define <8 x i16> @f6() { +; CHECK-LABEL: f6: +; CHECK: vgmg %v24, 48, 48 +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 0, i16 0, i16 32768, + i16 0, i16 0, i16 0, i16 32768> +} + +; Test a doubleword-granularity replicate that has the lower 22 bits set. +define <8 x i16> @f7() { +; CHECK-LABEL: f7: +; CHECK: vgmg %v24, 42, 63 +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 0, i16 63, i16 -1, + i16 0, i16 0, i16 63, i16 -1> +} + +; Test a doubleword-granularity replicate that has the upper 45 bits set. +define <8 x i16> @f8() { +; CHECK-LABEL: f8: +; CHECK: vgmg %v24, 0, 44 +; CHECK: br %r14 + ret <8 x i16> <i16 -1, i16 -1, i16 -8, i16 0, + i16 -1, i16 -1, i16 -8, i16 0> +} + +; Test a doubleword-granularity replicate that has middle bits set. +define <8 x i16> @f9() { +; CHECK-LABEL: f9: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <8 x i16> <i16 0, i16 1, i16 -32, i16 0, + i16 0, i16 1, i16 -32, i16 0> +} + +; Test a doubleword-granularity replicate with a wrap-around mask. +define <8 x i16> @f10() { +; CHECK-LABEL: f10: +; CHECK: vgmg %v24, 18, 0 +; CHECK: br %r14 + ret <8 x i16> <i16 32768, i16 16383, i16 -1, i16 -1, + i16 32768, i16 16383, i16 -1, i16 -1> +} + +; Retest f1 with arbitrary undefs instead of 0s. +define <8 x i16> @f11() { +; CHECK-LABEL: f11: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <8 x i16> <i16 undef, i16 32768, i16 0, i16 32768, + i16 0, i16 32768, i16 undef, i16 32768> +} + +; ...likewise f9. +define <8 x i16> @f12() { +; CHECK-LABEL: f12: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <8 x i16> <i16 undef, i16 1, i16 -32, i16 0, + i16 0, i16 1, i16 -32, i16 undef> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-15.ll b/llvm/test/CodeGen/SystemZ/vec-const-15.ll new file mode 100644 index 00000000000..cec445efe89 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-15.ll @@ -0,0 +1,85 @@ +; Test vector replicates that use VECTOR GENERATE MASK, v4i32 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a word-granularity replicate with the lowest value that cannot use +; VREPIF. 
+define <4 x i32> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <4 x i32> <i32 32768, i32 32768, i32 32768, i32 32768> +} + +; Test a word-granularity replicate that has the lower 17 bits set. +define <4 x i32> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgmf %v24, 15, 31 +; CHECK: br %r14 + ret <4 x i32> <i32 131071, i32 131071, i32 131071, i32 131071> +} + +; Test a word-granularity replicate that has the upper 15 bits set. +define <4 x i32> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgmf %v24, 0, 14 +; CHECK: br %r14 + ret <4 x i32> <i32 -131072, i32 -131072, i32 -131072, i32 -131072> +} + +; Test a word-granularity replicate that has middle bits set. +define <4 x i32> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgmf %v24, 12, 17 +; CHECK: br %r14 + ret <4 x i32> <i32 1032192, i32 1032192, i32 1032192, i32 1032192> +} + +; Test a word-granularity replicate with a wrap-around mask. +define <4 x i32> @f5() { +; CHECK-LABEL: f5: +; CHECK: vgmf %v24, 17, 15 +; CHECK: br %r14 + ret <4 x i32> <i32 -32769, i32 -32769, i32 -32769, i32 -32769> +} + +; Test a doubleword-granularity replicate with the lowest value that cannot +; use VREPIG. +define <4 x i32> @f6() { +; CHECK-LABEL: f6: +; CHECK: vgmg %v24, 48, 48 +; CHECK: br %r14 + ret <4 x i32> <i32 0, i32 32768, i32 0, i32 32768> +} + +; Test a doubleword-granularity replicate that has the lower 22 bits set. +define <4 x i32> @f7() { +; CHECK-LABEL: f7: +; CHECK: vgmg %v24, 42, 63 +; CHECK: br %r14 + ret <4 x i32> <i32 0, i32 4194303, i32 0, i32 4194303> +} + +; Test a doubleword-granularity replicate that has the upper 45 bits set. +define <4 x i32> @f8() { +; CHECK-LABEL: f8: +; CHECK: vgmg %v24, 0, 44 +; CHECK: br %r14 + ret <4 x i32> <i32 -1, i32 -524288, i32 -1, i32 -524288> +} + +; Test a doubleword-granularity replicate that has middle bits set. +define <4 x i32> @f9() { +; CHECK-LABEL: f9: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <4 x i32> <i32 1, i32 -2097152, i32 1, i32 -2097152> +} + +; Test a doubleword-granularity replicate with a wrap-around mask. +define <4 x i32> @f10() { +; CHECK-LABEL: f10: +; CHECK: vgmg %v24, 18, 0 +; CHECK: br %r14 + ret <4 x i32> <i32 -2147467265, i32 -1, i32 -2147467265, i32 -1> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-const-16.ll b/llvm/test/CodeGen/SystemZ/vec-const-16.ll new file mode 100644 index 00000000000..1ab7de2761c --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-const-16.ll @@ -0,0 +1,85 @@ +; Test vector replicates that use VECTOR GENERATE MASK, v2i64 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a word-granularity replicate with the lowest value that cannot use +; VREPIF. +define <2 x i64> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <2 x i64> <i64 140737488388096, i64 140737488388096> +} + +; Test a word-granularity replicate that has the lower 17 bits set. +define <2 x i64> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgmf %v24, 15, 31 +; CHECK: br %r14 + ret <2 x i64> <i64 562945658585087, i64 562945658585087> +} + +; Test a word-granularity replicate that has the upper 15 bits set. +define <2 x i64> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgmf %v24, 0, 14 +; CHECK: br %r14 + ret <2 x i64> <i64 -562945658585088, i64 -562945658585088> +} + +; Test a word-granularity replicate that has middle bits set. 
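+; (0x000fc000 in each word, i.e. bits 12 through 17)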
+define <2 x i64> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgmf %v24, 12, 17 +; CHECK: br %r14 + ret <2 x i64> <i64 4433230884225024, i64 4433230884225024> +} + +; Test a word-granularity replicate with a wrap-around mask. +define <2 x i64> @f5() { +; CHECK-LABEL: f5: +; CHECK: vgmf %v24, 17, 15 +; CHECK: br %r14 + ret <2 x i64> <i64 -140737488388097, i64 -140737488388097> +} + +; Test a doubleword-granularity replicate with the lowest value that cannot +; use VREPIG. +define <2 x i64> @f6() { +; CHECK-LABEL: f6: +; CHECK: vgmg %v24, 48, 48 +; CHECK: br %r14 + ret <2 x i64> <i64 32768, i64 32768> +} + +; Test a doubleword-granularity replicate that has the lower 22 bits set. +define <2 x i64> @f7() { +; CHECK-LABEL: f7: +; CHECK: vgmg %v24, 42, 63 +; CHECK: br %r14 + ret <2 x i64> <i64 4194303, i64 4194303> +} + +; Test a doubleword-granularity replicate that has the upper 45 bits set. +define <2 x i64> @f8() { +; CHECK-LABEL: f8: +; CHECK: vgmg %v24, 0, 44 +; CHECK: br %r14 + ret <2 x i64> <i64 -524288, i64 -524288> +} + +; Test a doubleword-granularity replicate that has middle bits set. +define <2 x i64> @f9() { +; CHECK-LABEL: f9: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <2 x i64> <i64 8587837440, i64 8587837440> +} + +; Test a doubleword-granularity replicate with a wrap-around mask. +define <2 x i64> @f10() { +; CHECK-LABEL: f10: +; CHECK: vgmg %v24, 18, 0 +; CHECK: br %r14 + ret <2 x i64> <i64 -9223301668110598145, i64 -9223301668110598145> +} diff --git a/llvm/test/CodeGen/SystemZ/vec-ctlz-01.ll b/llvm/test/CodeGen/SystemZ/vec-ctlz-01.ll new file mode 100644 index 00000000000..f6502202ef5 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-ctlz-01.ll @@ -0,0 +1,81 @@ +; Test vector count leading zeros +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %src, i1 %is_zero_undef) +declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %src, i1 %is_zero_undef) +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %src, i1 %is_zero_undef) +declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %src, i1 %is_zero_undef) + +define <16 x i8> @f1(<16 x i8> %a) { +; CHECK-LABEL: f1: +; CHECK: vclzb %v24, %v24 +; CHECK: br %r14 + + %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) + ret <16 x i8> %res +} + +define <16 x i8> @f2(<16 x i8> %a) { +; CHECK-LABEL: f2: +; CHECK: vclzb %v24, %v24 +; CHECK: br %r14 + + %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true) + ret <16 x i8> %res +} + +define <8 x i16> @f3(<8 x i16> %a) { +; CHECK-LABEL: f3: +; CHECK: vclzh %v24, %v24 +; CHECK: br %r14 + + %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) + ret <8 x i16> %res +} + +define <8 x i16> @f4(<8 x i16> %a) { +; CHECK-LABEL: f4: +; CHECK: vclzh %v24, %v24 +; CHECK: br %r14 + + %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true) + ret <8 x i16> %res +} + +define <4 x i32> @f5(<4 x i32> %a) { +; CHECK-LABEL: f5: +; CHECK: vclzf %v24, %v24 +; CHECK: br %r14 + + %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) + ret <4 x i32> %res +} + +define <4 x i32> @f6(<4 x i32> %a) { +; CHECK-LABEL: f6: +; CHECK: vclzf %v24, %v24 +; CHECK: br %r14 + + %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true) + ret <4 x i32> %res +} + +define <2 x i64> @f7(<2 x i64> %a) { +; CHECK-LABEL: f7: +; CHECK: vclzg %v24, %v24 +; CHECK: br %r14 + + %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) + ret <2 x i64> %res +} + +define <2 x i64> @f8(<2 x i64> %a) { +; CHECK-LABEL: f8: +; CHECK: vclzg %v24, 
%v24 +; CHECK: br %r14 + + %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true) + ret <2 x i64> %res +} + diff --git a/llvm/test/CodeGen/SystemZ/vec-ctpop-01.ll b/llvm/test/CodeGen/SystemZ/vec-ctpop-01.ll new file mode 100644 index 00000000000..0056af73a2e --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-ctpop-01.ll @@ -0,0 +1,53 @@ +; Test vector population-count instruction +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) +declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + +define <16 x i8> @f1(<16 x i8> %a) { +; CHECK-LABEL: f1: +; CHECK: vpopct %v24, %v24, 0 +; CHECK: br %r14 + + %popcnt = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) + ret <16 x i8> %popcnt +} + +define <8 x i16> @f2(<8 x i16> %a) { +; CHECK-LABEL: f2: +; CHECK: vpopct [[T1:%v[0-9]+]], %v24, 0 +; CHECK: veslh [[T2:%v[0-9]+]], [[T1]], 8 +; CHECK: vah [[T3:%v[0-9]+]], [[T1]], [[T2]] +; CHECK: vesrlh %v24, [[T3]], 8 +; CHECK: br %r14 + + %popcnt = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) + ret <8 x i16> %popcnt +} + +define <4 x i32> @f3(<4 x i32> %a) { +; CHECK-LABEL: f3: +; CHECK: vpopct [[T1:%v[0-9]+]], %v24, 0 +; CHECK: vgbm [[T2:%v[0-9]+]], 0 +; CHECK: vsumb %v24, [[T1]], [[T2]] +; CHECK: br %r14 + + %popcnt = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) + ret <4 x i32> %popcnt +} + +define <2 x i64> @f4(<2 x i64> %a) { +; CHECK-LABEL: f4: +; CHECK: vpopct [[T1:%v[0-9]+]], %v24, 0 +; CHECK: vgbm [[T2:%v[0-9]+]], 0 +; CHECK: vsumb [[T3:%v[0-9]+]], [[T1]], [[T2]] +; CHECK: vsumgf %v24, [[T3]], [[T2]] +; CHECK: br %r14 + + %popcnt = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + ret <2 x i64> %popcnt +} + diff --git a/llvm/test/CodeGen/SystemZ/vec-cttz-01.ll b/llvm/test/CodeGen/SystemZ/vec-cttz-01.ll new file mode 100644 index 00000000000..00a0d21b42f --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-cttz-01.ll @@ -0,0 +1,81 @@ +; Test vector count trailing zeros +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +declare <16 x i8> @llvm.cttz.v16i8(<16 x i8> %src, i1 %is_zero_undef) +declare <8 x i16> @llvm.cttz.v8i16(<8 x i16> %src, i1 %is_zero_undef) +declare <4 x i32> @llvm.cttz.v4i32(<4 x i32> %src, i1 %is_zero_undef) +declare <2 x i64> @llvm.cttz.v2i64(<2 x i64> %src, i1 %is_zero_undef) + +define <16 x i8> @f1(<16 x i8> %a) { +; CHECK-LABEL: f1: +; CHECK: vctzb %v24, %v24 +; CHECK: br %r14 + + %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false) + ret <16 x i8> %res +} + +define <16 x i8> @f2(<16 x i8> %a) { +; CHECK-LABEL: f2: +; CHECK: vctzb %v24, %v24 +; CHECK: br %r14 + + %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true) + ret <16 x i8> %res +} + +define <8 x i16> @f3(<8 x i16> %a) { +; CHECK-LABEL: f3: +; CHECK: vctzh %v24, %v24 +; CHECK: br %r14 + + %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false) + ret <8 x i16> %res +} + +define <8 x i16> @f4(<8 x i16> %a) { +; CHECK-LABEL: f4: +; CHECK: vctzh %v24, %v24 +; CHECK: br %r14 + + %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true) + ret <8 x i16> %res +} + +define <4 x i32> @f5(<4 x i32> %a) { +; CHECK-LABEL: f5: +; CHECK: vctzf %v24, %v24 +; CHECK: br %r14 + + %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false) + ret <4 x i32> %res +} + +define <4 x i32> @f6(<4 x i32> %a) { +; CHECK-LABEL: f6: +; CHECK: vctzf %v24, %v24 +; CHECK: br %r14 + + %res = call <4 x i32> @llvm.cttz.v4i32(<4 x 
i32> %a, i1 true) + ret <4 x i32> %res +} + +define <2 x i64> @f7(<2 x i64> %a) { +; CHECK-LABEL: f7: +; CHECK: vctzg %v24, %v24 +; CHECK: br %r14 + + %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false) + ret <2 x i64> %res +} + +define <2 x i64> @f8(<2 x i64> %a) { +; CHECK-LABEL: f8: +; CHECK: vctzg %v24, %v24 +; CHECK: br %r14 + + %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) + ret <2 x i64> %res +} + diff --git a/llvm/test/CodeGen/SystemZ/vec-div-01.ll b/llvm/test/CodeGen/SystemZ/vec-div-01.ll new file mode 100644 index 00000000000..3c5ec4f54ee --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-div-01.ll @@ -0,0 +1,62 @@ +; Test vector division. There is no native support for this, so it's really +; a test of the operation legalization code. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 division. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vlvgp [[REG:%v[0-9]+]], +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 0 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 1 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 2 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 3 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 4 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 5 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 6 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 8 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 9 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 10 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 11 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 12 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 13 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 14 +; CHECK: br %r14 + %ret = sdiv <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 division. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vlvgp [[REG:%v[0-9]+]], +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 0 +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 1 +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 2 +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 4 +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 5 +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 6 +; CHECK: br %r14 + %ret = sdiv <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 division. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vlvgp [[REG:%v[0-9]+]], +; CHECK-DAG: vlvgf [[REG]], {{%r[0-5]}}, 0 +; CHECK-DAG: vlvgf [[REG]], {{%r[0-5]}}, 2 +; CHECK: br %r14 + %ret = sdiv <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 division. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vlvgp %v24, +; CHECK: br %r14 + %ret = sdiv <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-max-01.ll b/llvm/test/CodeGen/SystemZ/vec-max-01.ll new file mode 100644 index 00000000000..ca6f08aa493 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-max-01.ll @@ -0,0 +1,83 @@ +; Test v16i8 maximum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with sle. 
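+; (slt and sle are expected to lower to the same VMXB: when the operands
+; compare equal, either select result is the maximum, so both predicates
+; describe max(%val1, %val2).  The instruction's operand order is therefore
+; not fixed, which is why the CHECK lines use an {{...|...}} alternation.)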
+define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with sgt. +define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with sge. +define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with ult. +define <16 x i8> @f5(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with ule. +define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with ugt. +define <16 x i8> @f7(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with uge. +define <16 x i8> @f8(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-max-02.ll b/llvm/test/CodeGen/SystemZ/vec-max-02.ll new file mode 100644 index 00000000000..2c61603b6f3 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-max-02.ll @@ -0,0 +1,83 @@ +; Test v8i16 maximum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <8 x i16> @f1(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with sle. +define <8 x i16> @f2(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with sgt. +define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with sge. +define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with ult. 
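+; (The unsigned predicates below should pick the logical form, vmxlh, which
+; presumably compares the element bits as unsigned values; that is the only
+; difference from the signed vmxh tests above.)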
+define <8 x i16> @f5(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with ule. +define <8 x i16> @f6(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with ugt. +define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with uge. +define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-max-03.ll b/llvm/test/CodeGen/SystemZ/vec-max-03.ll new file mode 100644 index 00000000000..a4387948399 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-max-03.ll @@ -0,0 +1,83 @@ +; Test v4i32 maximum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <4 x i32> @f1(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with sle. +define <4 x i32> @f2(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with sgt. +define <4 x i32> @f3(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with sge. +define <4 x i32> @f4(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with ult. +define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with ule. +define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with ugt. +define <4 x i32> @f7(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with uge. 
+define <4 x i32> @f8(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-max-04.ll b/llvm/test/CodeGen/SystemZ/vec-max-04.ll new file mode 100644 index 00000000000..ab7c6239127 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-max-04.ll @@ -0,0 +1,83 @@ +; Test v2i64 maximum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <2 x i64> @f1(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with sle. +define <2 x i64> @f2(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with sgt. +define <2 x i64> @f3(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with sge. +define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with ult. +define <2 x i64> @f5(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with ule. +define <2 x i64> @f6(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with ugt. +define <2 x i64> @f7(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with uge. +define <2 x i64> @f8(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-min-01.ll b/llvm/test/CodeGen/SystemZ/vec-min-01.ll new file mode 100644 index 00000000000..255dc57e113 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-min-01.ll @@ -0,0 +1,83 @@ +; Test v16i8 minimum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. 
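+; (These minimum tests reuse the maximum test shapes but swap the icmp
+; operands, so the same select structure now describes min(%val1, %val2);
+; signed predicates are expected to map to VMN* and unsigned ones to VMNL*.)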
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with sle. +define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with sgt. +define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with sge. +define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with ult. +define <16 x i8> @f5(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with ule. +define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with ugt. +define <16 x i8> @f7(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with uge. +define <16 x i8> @f8(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-min-02.ll b/llvm/test/CodeGen/SystemZ/vec-min-02.ll new file mode 100644 index 00000000000..cad8a61506c --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-min-02.ll @@ -0,0 +1,83 @@ +; Test v8i16 minimum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <8 x i16> @f1(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with sle. +define <8 x i16> @f2(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with sgt. +define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with sge. 
+define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with ult. +define <8 x i16> @f5(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with ule. +define <8 x i16> @f6(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with ugt. +define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with uge. +define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-min-03.ll b/llvm/test/CodeGen/SystemZ/vec-min-03.ll new file mode 100644 index 00000000000..febac50aa46 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-min-03.ll @@ -0,0 +1,83 @@ +; Test v4i32 minimum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <4 x i32> @f1(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with sle. +define <4 x i32> @f2(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with sgt. +define <4 x i32> @f3(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with sge. +define <4 x i32> @f4(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with ult. +define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with ule. +define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with ugt. 
+define <4 x i32> @f7(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with uge. +define <4 x i32> @f8(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-min-04.ll b/llvm/test/CodeGen/SystemZ/vec-min-04.ll new file mode 100644 index 00000000000..765ce1956b5 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-min-04.ll @@ -0,0 +1,83 @@ +; Test v2i64 minimum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <2 x i64> @f1(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with sle. +define <2 x i64> @f2(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with sgt. +define <2 x i64> @f3(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with sge. +define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with ult. +define <2 x i64> @f5(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with ule. +define <2 x i64> @f6(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with ugt. +define <2 x i64> @f7(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with uge. +define <2 x i64> @f8(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-move-01.ll b/llvm/test/CodeGen/SystemZ/vec-move-01.ll new file mode 100644 index 00000000000..952e5a42126 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-move-01.ll @@ -0,0 +1,35 @@ +; Test vector register moves. 
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8 moves.
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <16 x i8> %val2
+}
+
+; Test v8i16 moves.
+define <8 x i16> @f2(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <8 x i16> %val2
+}
+
+; Test v4i32 moves.
+define <4 x i32> @f3(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <4 x i32> %val2
+}
+
+; Test v2i64 moves.
+define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <2 x i64> %val2
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-02.ll b/llvm/test/CodeGen/SystemZ/vec-move-02.ll
new file mode 100644
index 00000000000..b7b3ab6798d
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-move-02.ll
@@ -0,0 +1,93 @@
+; Test vector loads.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8 loads.
+define <16 x i8> @f1(<16 x i8> *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: vl %v24, 0(%r2)
+; CHECK: br %r14
+  %ret = load <16 x i8>, <16 x i8> *%ptr
+  ret <16 x i8> %ret
+}
+
+; Test v8i16 loads.
+define <8 x i16> @f2(<8 x i16> *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: vl %v24, 0(%r2)
+; CHECK: br %r14
+  %ret = load <8 x i16>, <8 x i16> *%ptr
+  ret <8 x i16> %ret
+}
+
+; Test v4i32 loads.
+define <4 x i32> @f3(<4 x i32> *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK: vl %v24, 0(%r2)
+; CHECK: br %r14
+  %ret = load <4 x i32>, <4 x i32> *%ptr
+  ret <4 x i32> %ret
+}
+
+; Test v2i64 loads.
+define <2 x i64> @f4(<2 x i64> *%ptr) {
+; CHECK-LABEL: f4:
+; CHECK: vl %v24, 0(%r2)
+; CHECK: br %r14
+  %ret = load <2 x i64>, <2 x i64> *%ptr
+  ret <2 x i64> %ret
+}
+
+; Test the highest aligned in-range offset.
+define <16 x i8> @f7(<16 x i8> *%base) {
+; CHECK-LABEL: f7:
+; CHECK: vl %v24, 4080(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 255
+  %ret = load <16 x i8>, <16 x i8> *%ptr
+  ret <16 x i8> %ret
+}
+
+; Test the highest unaligned in-range offset.
+define <16 x i8> @f8(i8 *%base) {
+; CHECK-LABEL: f8:
+; CHECK: vl %v24, 4095(%r2)
+; CHECK: br %r14
+  %addr = getelementptr i8, i8 *%base, i64 4095
+  %ptr = bitcast i8 *%addr to <16 x i8> *
+  %ret = load <16 x i8>, <16 x i8> *%ptr, align 1
+  ret <16 x i8> %ret
+}
+
+; Test the next offset up, which requires separate address logic.
+define <16 x i8> @f9(<16 x i8> *%base) {
+; CHECK-LABEL: f9:
+; CHECK: aghi %r2, 4096
+; CHECK: vl %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 256
+  %ret = load <16 x i8>, <16 x i8> *%ptr
+  ret <16 x i8> %ret
+}
+
+; Test negative offsets, which also require separate address logic.
+define <16 x i8> @f10(<16 x i8> *%base) {
+; CHECK-LABEL: f10:
+; CHECK: aghi %r2, -16
+; CHECK: vl %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 -1
+  %ret = load <16 x i8>, <16 x i8> *%ptr
+  ret <16 x i8> %ret
+}
+
+; Check that indexes are allowed.
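+; (VL presumably uses base + index + 12-bit unsigned displacement
+; addressing, so the variable byte offset can fold into the address as an
+; index register, giving the 0(%r3,%r2) operand below rather than separate
+; address arithmetic.)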
+define <16 x i8> @f11(i8 *%base, i64 %index) {
+; CHECK-LABEL: f11:
+; CHECK: vl %v24, 0(%r3,%r2)
+; CHECK: br %r14
+  %addr = getelementptr i8, i8 *%base, i64 %index
+  %ptr = bitcast i8 *%addr to <16 x i8> *
+  %ret = load <16 x i8>, <16 x i8> *%ptr, align 1
+  ret <16 x i8> %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-03.ll b/llvm/test/CodeGen/SystemZ/vec-move-03.ll
new file mode 100644
index 00000000000..ddce4ef209a
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-move-03.ll
@@ -0,0 +1,93 @@
+; Test vector stores.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8 stores.
+define void @f1(<16 x i8> %val, <16 x i8> *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: vst %v24, 0(%r2)
+; CHECK: br %r14
+  store <16 x i8> %val, <16 x i8> *%ptr
+  ret void
+}
+
+; Test v8i16 stores.
+define void @f2(<8 x i16> %val, <8 x i16> *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: vst %v24, 0(%r2)
+; CHECK: br %r14
+  store <8 x i16> %val, <8 x i16> *%ptr
+  ret void
+}
+
+; Test v4i32 stores.
+define void @f3(<4 x i32> %val, <4 x i32> *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK: vst %v24, 0(%r2)
+; CHECK: br %r14
+  store <4 x i32> %val, <4 x i32> *%ptr
+  ret void
+}
+
+; Test v2i64 stores.
+define void @f4(<2 x i64> %val, <2 x i64> *%ptr) {
+; CHECK-LABEL: f4:
+; CHECK: vst %v24, 0(%r2)
+; CHECK: br %r14
+  store <2 x i64> %val, <2 x i64> *%ptr
+  ret void
+}
+
+; Test the highest aligned in-range offset.
+define void @f7(<16 x i8> %val, <16 x i8> *%base) {
+; CHECK-LABEL: f7:
+; CHECK: vst %v24, 4080(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 255
+  store <16 x i8> %val, <16 x i8> *%ptr
+  ret void
+}
+
+; Test the highest unaligned in-range offset.
+define void @f8(<16 x i8> %val, i8 *%base) {
+; CHECK-LABEL: f8:
+; CHECK: vst %v24, 4095(%r2)
+; CHECK: br %r14
+  %addr = getelementptr i8, i8 *%base, i64 4095
+  %ptr = bitcast i8 *%addr to <16 x i8> *
+  store <16 x i8> %val, <16 x i8> *%ptr, align 1
+  ret void
+}
+
+; Test the next offset up, which requires separate address logic.
+define void @f9(<16 x i8> %val, <16 x i8> *%base) {
+; CHECK-LABEL: f9:
+; CHECK: aghi %r2, 4096
+; CHECK: vst %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 256
+  store <16 x i8> %val, <16 x i8> *%ptr
+  ret void
+}
+
+; Test negative offsets, which also require separate address logic.
+define void @f10(<16 x i8> %val, <16 x i8> *%base) {
+; CHECK-LABEL: f10:
+; CHECK: aghi %r2, -16
+; CHECK: vst %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 -1
+  store <16 x i8> %val, <16 x i8> *%ptr
+  ret void
+}
+
+; Check that indexes are allowed.
+define void @f11(<16 x i8> %val, i8 *%base, i64 %index) {
+; CHECK-LABEL: f11:
+; CHECK: vst %v24, 0(%r3,%r2)
+; CHECK: br %r14
+  %addr = getelementptr i8, i8 *%base, i64 %index
+  %ptr = bitcast i8 *%addr to <16 x i8> *
+  store <16 x i8> %val, <16 x i8> *%ptr, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-04.ll b/llvm/test/CodeGen/SystemZ/vec-move-04.ll
new file mode 100644
index 00000000000..f43c0b71491
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-move-04.ll
@@ -0,0 +1,121 @@
+; Test vector insertion of register variables.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8 insertion into the first element.
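+; (VLVG inserts a GPR value into a single element: a constant index becomes
+; an immediate element number, while a variable index is presumably encoded
+; like an address, as a 12-bit displacement plus a register, as the later
+; tests show.)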
+define <16 x i8> @f1(<16 x i8> %val, i8 %element) { +; CHECK-LABEL: f1: +; CHECK: vlvgb %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 %element, i32 0 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into the last element. +define <16 x i8> @f2(<16 x i8> %val, i8 %element) { +; CHECK-LABEL: f2: +; CHECK: vlvgb %v24, %r2, 15 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 %element, i32 15 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into a variable element. +define <16 x i8> @f3(<16 x i8> %val, i8 %element, i32 %index) { +; CHECK-LABEL: f3: +; CHECK: vlvgb %v24, %r2, 0(%r3) +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 %element, i32 %index + ret <16 x i8> %ret +} + +; Test v8i16 insertion into the first element. +define <8 x i16> @f4(<8 x i16> %val, i16 %element) { +; CHECK-LABEL: f4: +; CHECK: vlvgh %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 %element, i32 0 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into the last element. +define <8 x i16> @f5(<8 x i16> %val, i16 %element) { +; CHECK-LABEL: f5: +; CHECK: vlvgh %v24, %r2, 7 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 %element, i32 7 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into a variable element. +define <8 x i16> @f6(<8 x i16> %val, i16 %element, i32 %index) { +; CHECK-LABEL: f6: +; CHECK: vlvgh %v24, %r2, 0(%r3) +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 %element, i32 %index + ret <8 x i16> %ret +} + +; Test v4i32 insertion into the first element. +define <4 x i32> @f7(<4 x i32> %val, i32 %element) { +; CHECK-LABEL: f7: +; CHECK: vlvgf %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 %element, i32 0 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into the last element. +define <4 x i32> @f8(<4 x i32> %val, i32 %element) { +; CHECK-LABEL: f8: +; CHECK: vlvgf %v24, %r2, 3 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 %element, i32 3 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into a variable element. +define <4 x i32> @f9(<4 x i32> %val, i32 %element, i32 %index) { +; CHECK-LABEL: f9: +; CHECK: vlvgf %v24, %r2, 0(%r3) +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 %element, i32 %index + ret <4 x i32> %ret +} + +; Test v2i64 insertion into the first element. +define <2 x i64> @f10(<2 x i64> %val, i64 %element) { +; CHECK-LABEL: f10: +; CHECK: vlvgg %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 %element, i32 0 + ret <2 x i64> %ret +} + +; Test v2i64 insertion into the last element. +define <2 x i64> @f11(<2 x i64> %val, i64 %element) { +; CHECK-LABEL: f11: +; CHECK: vlvgg %v24, %r2, 1 +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 %element, i32 1 + ret <2 x i64> %ret +} + +; Test v2i64 insertion into a variable element. +define <2 x i64> @f12(<2 x i64> %val, i64 %element, i32 %index) { +; CHECK-LABEL: f12: +; CHECK: vlvgg %v24, %r2, 0(%r3) +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 %element, i32 %index + ret <2 x i64> %ret +} + +; Test v16i8 insertion into a variable element plus one. 
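+; (Because the element number is encoded like an address, the "+ 1" here is
+; expected to fold into the displacement, giving 1(%r3) rather than a
+; separate addition.)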
+define <16 x i8> @f19(<16 x i8> %val, i8 %element, i32 %index) {
+; CHECK-LABEL: f19:
+; CHECK: vlvgb %v24, %r2, 1(%r3)
+; CHECK: br %r14
+  %add = add i32 %index, 1
+  %ret = insertelement <16 x i8> %val, i8 %element, i32 %add
+  ret <16 x i8> %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-05.ll b/llvm/test/CodeGen/SystemZ/vec-move-05.ll
new file mode 100644
index 00000000000..60a0666c2f9
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-move-05.ll
@@ -0,0 +1,161 @@
+; Test vector extraction.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8 extraction of the first element.
+define i8 @f1(<16 x i8> %val) {
+; CHECK-LABEL: f1:
+; CHECK: vlgvb %r2, %v24, 0
+; CHECK: br %r14
+  %ret = extractelement <16 x i8> %val, i32 0
+  ret i8 %ret
+}
+
+; Test v16i8 extraction of the last element.
+define i8 @f2(<16 x i8> %val) {
+; CHECK-LABEL: f2:
+; CHECK: vlgvb %r2, %v24, 15
+; CHECK: br %r14
+  %ret = extractelement <16 x i8> %val, i32 15
+  ret i8 %ret
+}
+
+; Test v16i8 extraction of an absurd element number. This must compile,
+; but we don't care what it does.
+define i8 @f3(<16 x i8> %val) {
+; CHECK-LABEL: f3:
+; CHECK-NOT: vlgvb %r2, %v24, 100000
+; CHECK: br %r14
+  %ret = extractelement <16 x i8> %val, i32 100000
+  ret i8 %ret
+}
+
+; Test v16i8 extraction of a variable element.
+define i8 @f4(<16 x i8> %val, i32 %index) {
+; CHECK-LABEL: f4:
+; CHECK: vlgvb %r2, %v24, 0(%r2)
+; CHECK: br %r14
+  %ret = extractelement <16 x i8> %val, i32 %index
+  ret i8 %ret
+}
+
+; Test v8i16 extraction of the first element.
+define i16 @f5(<8 x i16> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vlgvh %r2, %v24, 0
+; CHECK: br %r14
+  %ret = extractelement <8 x i16> %val, i32 0
+  ret i16 %ret
+}
+
+; Test v8i16 extraction of the last element.
+define i16 @f6(<8 x i16> %val) {
+; CHECK-LABEL: f6:
+; CHECK: vlgvh %r2, %v24, 7
+; CHECK: br %r14
+  %ret = extractelement <8 x i16> %val, i32 7
+  ret i16 %ret
+}
+
+; Test v8i16 extraction of an absurd element number. This must compile,
+; but we don't care what it does.
+define i16 @f7(<8 x i16> %val) {
+; CHECK-LABEL: f7:
+; CHECK-NOT: vlgvh %r2, %v24, 100000
+; CHECK: br %r14
+  %ret = extractelement <8 x i16> %val, i32 100000
+  ret i16 %ret
+}
+
+; Test v8i16 extraction of a variable element.
+define i16 @f8(<8 x i16> %val, i32 %index) {
+; CHECK-LABEL: f8:
+; CHECK: vlgvh %r2, %v24, 0(%r2)
+; CHECK: br %r14
+  %ret = extractelement <8 x i16> %val, i32 %index
+  ret i16 %ret
+}
+
+; Test v4i32 extraction of the first element.
+define i32 @f9(<4 x i32> %val) {
+; CHECK-LABEL: f9:
+; CHECK: vlgvf %r2, %v24, 0
+; CHECK: br %r14
+  %ret = extractelement <4 x i32> %val, i32 0
+  ret i32 %ret
+}
+
+; Test v4i32 extraction of the last element.
+define i32 @f10(<4 x i32> %val) {
+; CHECK-LABEL: f10:
+; CHECK: vlgvf %r2, %v24, 3
+; CHECK: br %r14
+  %ret = extractelement <4 x i32> %val, i32 3
+  ret i32 %ret
+}
+
+; Test v4i32 extraction of an absurd element number. This must compile,
+; but we don't care what it does.
+define i32 @f11(<4 x i32> %val) {
+; CHECK-LABEL: f11:
+; CHECK-NOT: vlgvf %r2, %v24, 100000
+; CHECK: br %r14
+  %ret = extractelement <4 x i32> %val, i32 100000
+  ret i32 %ret
+}
+
+; Test v4i32 extraction of a variable element.
+define i32 @f12(<4 x i32> %val, i32 %index) {
+; CHECK-LABEL: f12:
+; CHECK: vlgvf %r2, %v24, 0(%r2)
+; CHECK: br %r14
+  %ret = extractelement <4 x i32> %val, i32 %index
+  ret i32 %ret
+}
+
+; Test v2i64 extraction of the first element.
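+; (VLGV is presumably the converse of VLVG: it copies one element into a
+; GPR, with the element number again encoded as a displacement plus an
+; optional register.)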
+define i64 @f13(<2 x i64> %val) {
+; CHECK-LABEL: f13:
+; CHECK: vlgvg %r2, %v24, 0
+; CHECK: br %r14
+  %ret = extractelement <2 x i64> %val, i32 0
+  ret i64 %ret
+}
+
+; Test v2i64 extraction of the last element.
+define i64 @f14(<2 x i64> %val) {
+; CHECK-LABEL: f14:
+; CHECK: vlgvg %r2, %v24, 1
+; CHECK: br %r14
+  %ret = extractelement <2 x i64> %val, i32 1
+  ret i64 %ret
+}
+
+; Test v2i64 extraction of an absurd element number. This must compile,
+; but we don't care what it does.
+define i64 @f15(<2 x i64> %val) {
+; CHECK-LABEL: f15:
+; CHECK-NOT: vlgvg %r2, %v24, 100000
+; CHECK: br %r14
+  %ret = extractelement <2 x i64> %val, i32 100000
+  ret i64 %ret
+}
+
+; Test v2i64 extraction of a variable element.
+define i64 @f16(<2 x i64> %val, i32 %index) {
+; CHECK-LABEL: f16:
+; CHECK: vlgvg %r2, %v24, 0(%r2)
+; CHECK: br %r14
+  %ret = extractelement <2 x i64> %val, i32 %index
+  ret i64 %ret
+}
+
+; Test v16i8 extraction of a variable element with an offset.
+define i8 @f27(<16 x i8> %val, i32 %index) {
+; CHECK-LABEL: f27:
+; CHECK: vlgvb %r2, %v24, 1(%r2)
+; CHECK: br %r14
+  %add = add i32 %index, 1
+  %ret = extractelement <16 x i8> %val, i32 %add
+  ret i8 %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-06.ll b/llvm/test/CodeGen/SystemZ/vec-move-06.ll
new file mode 100644
index 00000000000..de3960cad95
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-move-06.ll
@@ -0,0 +1,13 @@
+; Test vector builds using VLVGP.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test the basic v2i64 usage.
+define <2 x i64> @f1(i64 %a, i64 %b) {
+; CHECK-LABEL: f1:
+; CHECK: vlvgp %v24, %r2, %r3
+; CHECK: br %r14
+  %veca = insertelement <2 x i64> undef, i64 %a, i32 0
+  %vecb = insertelement <2 x i64> %veca, i64 %b, i32 1
+  ret <2 x i64> %vecb
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-07.ll b/llvm/test/CodeGen/SystemZ/vec-move-07.ll
new file mode 100644
index 00000000000..a688b089b97
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-move-07.ll
@@ -0,0 +1,39 @@
+; Test scalar_to_vector expansion.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8.
+define <16 x i8> @f1(i8 %val) {
+; CHECK-LABEL: f1:
+; CHECK: vlvgb %v24, %r2, 0
+; CHECK: br %r14
+  %ret = insertelement <16 x i8> undef, i8 %val, i32 0
+  ret <16 x i8> %ret
+}
+
+; Test v8i16.
+define <8 x i16> @f2(i16 %val) {
+; CHECK-LABEL: f2:
+; CHECK: vlvgh %v24, %r2, 0
+; CHECK: br %r14
+  %ret = insertelement <8 x i16> undef, i16 %val, i32 0
+  ret <8 x i16> %ret
+}
+
+; Test v4i32.
+define <4 x i32> @f3(i32 %val) {
+; CHECK-LABEL: f3:
+; CHECK: vlvgf %v24, %r2, 0
+; CHECK: br %r14
+  %ret = insertelement <4 x i32> undef, i32 %val, i32 0
+  ret <4 x i32> %ret
+}
+
+; Test v2i64. Here we load %val into both halves.
+define <2 x i64> @f4(i64 %val) {
+; CHECK-LABEL: f4:
+; CHECK: vlvgp %v24, %r2, %r2
+; CHECK: br %r14
+  %ret = insertelement <2 x i64> undef, i64 %val, i32 0
+  ret <2 x i64> %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-08.ll b/llvm/test/CodeGen/SystemZ/vec-move-08.ll
new file mode 100644
index 00000000000..94a3b3aefba
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-move-08.ll
@@ -0,0 +1,284 @@
+; Test vector insertion of memory values.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8 insertion into the first element.
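+; (VLEB/VLEH/VLEF/VLEG load a single element straight from memory into the
+; chosen lane, avoiding a GPR round trip.  The displacement is presumably a
+; 12-bit unsigned field, which is why the 4095/4096 boundaries are probed
+; below.)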
+define <16 x i8> @f1(<16 x i8> %val, i8 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: vleb %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = load i8, i8 *%ptr
+  %ret = insertelement <16 x i8> %val, i8 %element, i32 0
+  ret <16 x i8> %ret
+}
+
+; Test v16i8 insertion into the last element.
+define <16 x i8> @f2(<16 x i8> %val, i8 *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: vleb %v24, 0(%r2), 15
+; CHECK: br %r14
+  %element = load i8, i8 *%ptr
+  %ret = insertelement <16 x i8> %val, i8 %element, i32 15
+  ret <16 x i8> %ret
+}
+
+; Test v16i8 insertion with the highest in-range offset.
+define <16 x i8> @f3(<16 x i8> %val, i8 *%base) {
+; CHECK-LABEL: f3:
+; CHECK: vleb %v24, 4095(%r2), 10
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i32 4095
+  %element = load i8, i8 *%ptr
+  %ret = insertelement <16 x i8> %val, i8 %element, i32 10
+  ret <16 x i8> %ret
+}
+
+; Test v16i8 insertion with the first out-of-range offset.
+define <16 x i8> @f4(<16 x i8> %val, i8 *%base) {
+; CHECK-LABEL: f4:
+; CHECK: aghi %r2, 4096
+; CHECK: vleb %v24, 0(%r2), 5
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i32 4096
+  %element = load i8, i8 *%ptr
+  %ret = insertelement <16 x i8> %val, i8 %element, i32 5
+  ret <16 x i8> %ret
+}
+
+; Test v16i8 insertion into a variable element.
+define <16 x i8> @f5(<16 x i8> %val, i8 *%ptr, i32 %index) {
+; CHECK-LABEL: f5:
+; CHECK-NOT: vleb
+; CHECK: br %r14
+  %element = load i8, i8 *%ptr
+  %ret = insertelement <16 x i8> %val, i8 %element, i32 %index
+  ret <16 x i8> %ret
+}
+
+; Test v8i16 insertion into the first element.
+define <8 x i16> @f6(<8 x i16> %val, i16 *%ptr) {
+; CHECK-LABEL: f6:
+; CHECK: vleh %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = load i16, i16 *%ptr
+  %ret = insertelement <8 x i16> %val, i16 %element, i32 0
+  ret <8 x i16> %ret
+}
+
+; Test v8i16 insertion into the last element.
+define <8 x i16> @f7(<8 x i16> %val, i16 *%ptr) {
+; CHECK-LABEL: f7:
+; CHECK: vleh %v24, 0(%r2), 7
+; CHECK: br %r14
+  %element = load i16, i16 *%ptr
+  %ret = insertelement <8 x i16> %val, i16 %element, i32 7
+  ret <8 x i16> %ret
+}
+
+; Test v8i16 insertion with the highest in-range offset.
+define <8 x i16> @f8(<8 x i16> %val, i16 *%base) {
+; CHECK-LABEL: f8:
+; CHECK: vleh %v24, 4094(%r2), 5
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%base, i32 2047
+  %element = load i16, i16 *%ptr
+  %ret = insertelement <8 x i16> %val, i16 %element, i32 5
+  ret <8 x i16> %ret
+}
+
+; Test v8i16 insertion with the first out-of-range offset.
+define <8 x i16> @f9(<8 x i16> %val, i16 *%base) {
+; CHECK-LABEL: f9:
+; CHECK: aghi %r2, 4096
+; CHECK: vleh %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%base, i32 2048
+  %element = load i16, i16 *%ptr
+  %ret = insertelement <8 x i16> %val, i16 %element, i32 1
+  ret <8 x i16> %ret
+}
+
+; Test v8i16 insertion into a variable element.
+define <8 x i16> @f10(<8 x i16> %val, i16 *%ptr, i32 %index) {
+; CHECK-LABEL: f10:
+; CHECK-NOT: vleh
+; CHECK: br %r14
+  %element = load i16, i16 *%ptr
+  %ret = insertelement <8 x i16> %val, i16 %element, i32 %index
+  ret <8 x i16> %ret
+}
+
+; Test v4i32 insertion into the first element.
+define <4 x i32> @f11(<4 x i32> %val, i32 *%ptr) {
+; CHECK-LABEL: f11:
+; CHECK: vlef %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = load i32, i32 *%ptr
+  %ret = insertelement <4 x i32> %val, i32 %element, i32 0
+  ret <4 x i32> %ret
+}
+
+; Test v4i32 insertion into the last element.
+define <4 x i32> @f12(<4 x i32> %val, i32 *%ptr) {
+; CHECK-LABEL: f12:
+; CHECK: vlef %v24, 0(%r2), 3
+; CHECK: br %r14
+  %element = load i32, i32 *%ptr
+  %ret = insertelement <4 x i32> %val, i32 %element, i32 3
+  ret <4 x i32> %ret
+}
+
+; Test v4i32 insertion with the highest in-range offset.
+define <4 x i32> @f13(<4 x i32> %val, i32 *%base) {
+; CHECK-LABEL: f13:
+; CHECK: vlef %v24, 4092(%r2), 2
+; CHECK: br %r14
+  %ptr = getelementptr i32, i32 *%base, i32 1023
+  %element = load i32, i32 *%ptr
+  %ret = insertelement <4 x i32> %val, i32 %element, i32 2
+  ret <4 x i32> %ret
+}
+
+; Test v4i32 insertion with the first out-of-range offset.
+define <4 x i32> @f14(<4 x i32> %val, i32 *%base) {
+; CHECK-LABEL: f14:
+; CHECK: aghi %r2, 4096
+; CHECK: vlef %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr i32, i32 *%base, i32 1024
+  %element = load i32, i32 *%ptr
+  %ret = insertelement <4 x i32> %val, i32 %element, i32 1
+  ret <4 x i32> %ret
+}
+
+; Test v4i32 insertion into a variable element.
+define <4 x i32> @f15(<4 x i32> %val, i32 *%ptr, i32 %index) {
+; CHECK-LABEL: f15:
+; CHECK-NOT: vlef
+; CHECK: br %r14
+  %element = load i32, i32 *%ptr
+  %ret = insertelement <4 x i32> %val, i32 %element, i32 %index
+  ret <4 x i32> %ret
+}
+
+; Test v2i64 insertion into the first element.
+define <2 x i64> @f16(<2 x i64> %val, i64 *%ptr) {
+; CHECK-LABEL: f16:
+; CHECK: vleg %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = load i64, i64 *%ptr
+  %ret = insertelement <2 x i64> %val, i64 %element, i32 0
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 insertion into the last element.
+define <2 x i64> @f17(<2 x i64> %val, i64 *%ptr) {
+; CHECK-LABEL: f17:
+; CHECK: vleg %v24, 0(%r2), 1
+; CHECK: br %r14
+  %element = load i64, i64 *%ptr
+  %ret = insertelement <2 x i64> %val, i64 %element, i32 1
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 insertion with the highest in-range offset.
+define <2 x i64> @f18(<2 x i64> %val, i64 *%base) {
+; CHECK-LABEL: f18:
+; CHECK: vleg %v24, 4088(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i32 511
+  %element = load i64, i64 *%ptr
+  %ret = insertelement <2 x i64> %val, i64 %element, i32 1
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 insertion with the first out-of-range offset.
+define <2 x i64> @f19(<2 x i64> %val, i64 *%base) {
+; CHECK-LABEL: f19:
+; CHECK: aghi %r2, 4096
+; CHECK: vleg %v24, 0(%r2), 0
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i32 512
+  %element = load i64, i64 *%ptr
+  %ret = insertelement <2 x i64> %val, i64 %element, i32 0
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 insertion into a variable element.
+define <2 x i64> @f20(<2 x i64> %val, i64 *%ptr, i32 %index) {
+; CHECK-LABEL: f20:
+; CHECK-NOT: vleg
+; CHECK: br %r14
+  %element = load i64, i64 *%ptr
+  %ret = insertelement <2 x i64> %val, i64 %element, i32 %index
+  ret <2 x i64> %ret
+}
+
+; Test a v4i32 gather of the first element.
+define <4 x i32> @f31(<4 x i32> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f31:
+; CHECK: vgef %v24, 0(%v26,%r2), 0
+; CHECK: br %r14
+  %elem = extractelement <4 x i32> %index, i32 0
+  %ext = zext i32 %elem to i64
+  %add = add i64 %base, %ext
+  %ptr = inttoptr i64 %add to i32 *
+  %element = load i32, i32 *%ptr
+  %ret = insertelement <4 x i32> %val, i32 %element, i32 0
+  ret <4 x i32> %ret
+}
+
+; Test a v4i32 gather of the last element.
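+; (For these gathers the address is assumed to be the base register plus the
+; zero-extended element of the index vector plus a 12-bit displacement; the
+; same element number selects both the index element used and the lane that
+; is loaded.)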
+define <4 x i32> @f32(<4 x i32> %val, <4 x i32> %index, i64 %base) { +; CHECK-LABEL: f32: +; CHECK: vgef %v24, 0(%v26,%r2), 3 +; CHECK: br %r14 + %elem = extractelement <4 x i32> %index, i32 3 + %ext = zext i32 %elem to i64 + %add = add i64 %base, %ext + %ptr = inttoptr i64 %add to i32 * + %element = load i32, i32 *%ptr + %ret = insertelement <4 x i32> %val, i32 %element, i32 3 + ret <4 x i32> %ret +} + +; Test a v4i32 gather with the highest in-range offset. +define <4 x i32> @f33(<4 x i32> %val, <4 x i32> %index, i64 %base) { +; CHECK-LABEL: f33: +; CHECK: vgef %v24, 4095(%v26,%r2), 1 +; CHECK: br %r14 + %elem = extractelement <4 x i32> %index, i32 1 + %ext = zext i32 %elem to i64 + %add1 = add i64 %base, %ext + %add2 = add i64 %add1, 4095 + %ptr = inttoptr i64 %add2 to i32 * + %element = load i32, i32 *%ptr + %ret = insertelement <4 x i32> %val, i32 %element, i32 1 + ret <4 x i32> %ret +} + +; Test a v2i64 gather of the first element. +define <2 x i64> @f34(<2 x i64> %val, <2 x i64> %index, i64 %base) { +; CHECK-LABEL: f34: +; CHECK: vgeg %v24, 0(%v26,%r2), 0 +; CHECK: br %r14 + %elem = extractelement <2 x i64> %index, i32 0 + %add = add i64 %base, %elem + %ptr = inttoptr i64 %add to i64 * + %element = load i64, i64 *%ptr + %ret = insertelement <2 x i64> %val, i64 %element, i32 0 + ret <2 x i64> %ret +} + +; Test a v2i64 gather of the last element. +define <2 x i64> @f35(<2 x i64> %val, <2 x i64> %index, i64 %base) { +; CHECK-LABEL: f35: +; CHECK: vgeg %v24, 0(%v26,%r2), 1 +; CHECK: br %r14 + %elem = extractelement <2 x i64> %index, i32 1 + %add = add i64 %base, %elem + %ptr = inttoptr i64 %add to i64 * + %element = load i64, i64 *%ptr + %ret = insertelement <2 x i64> %val, i64 %element, i32 1 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-move-09.ll b/llvm/test/CodeGen/SystemZ/vec-move-09.ll new file mode 100644 index 00000000000..7863e4305f9 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-move-09.ll @@ -0,0 +1,237 @@ +; Test vector insertion of constants. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 insertion into the first element. +define <16 x i8> @f1(<16 x i8> %val) { +; CHECK-LABEL: f1: +; CHECK: vleib %v24, 0, 0 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 0, i32 0 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into the last element. +define <16 x i8> @f2(<16 x i8> %val) { +; CHECK-LABEL: f2: +; CHECK: vleib %v24, 100, 15 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 100, i32 15 + ret <16 x i8> %ret +} + +; Test v16i8 insertion with the maximum signed value. +define <16 x i8> @f3(<16 x i8> %val) { +; CHECK-LABEL: f3: +; CHECK: vleib %v24, 127, 10 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 127, i32 10 + ret <16 x i8> %ret +} + +; Test v16i8 insertion with the minimum signed value. +define <16 x i8> @f4(<16 x i8> %val) { +; CHECK-LABEL: f4: +; CHECK: vleib %v24, -128, 11 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 128, i32 11 + ret <16 x i8> %ret +} + +; Test v16i8 insertion with the maximum unsigned value. +define <16 x i8> @f5(<16 x i8> %val) { +; CHECK-LABEL: f5: +; CHECK: vleib %v24, -1, 12 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 255, i32 12 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into a variable element. 
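+; (VLEI* takes its element number as an immediate, so a variable index
+; cannot use it; the CHECK-NOT lines verify that some fallback path is taken
+; instead.)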
+define <16 x i8> @f6(<16 x i8> %val, i32 %index) { +; CHECK-LABEL: f6: +; CHECK-NOT: vleib +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 0, i32 %index + ret <16 x i8> %ret +} + +; Test v8i16 insertion into the first element. +define <8 x i16> @f7(<8 x i16> %val) { +; CHECK-LABEL: f7: +; CHECK: vleih %v24, 0, 0 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 0, i32 0 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into the last element. +define <8 x i16> @f8(<8 x i16> %val) { +; CHECK-LABEL: f8: +; CHECK: vleih %v24, 0, 7 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 0, i32 7 + ret <8 x i16> %ret +} + +; Test v8i16 insertion with the maximum signed value. +define <8 x i16> @f9(<8 x i16> %val) { +; CHECK-LABEL: f9: +; CHECK: vleih %v24, 32767, 4 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 32767, i32 4 + ret <8 x i16> %ret +} + +; Test v8i16 insertion with the minimum signed value. +define <8 x i16> @f10(<8 x i16> %val) { +; CHECK-LABEL: f10: +; CHECK: vleih %v24, -32768, 5 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 32768, i32 5 + ret <8 x i16> %ret +} + +; Test v8i16 insertion with the maximum unsigned value. +define <8 x i16> @f11(<8 x i16> %val) { +; CHECK-LABEL: f11: +; CHECK: vleih %v24, -1, 6 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 65535, i32 6 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into a variable element. +define <8 x i16> @f12(<8 x i16> %val, i32 %index) { +; CHECK-LABEL: f12: +; CHECK-NOT: vleih +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 0, i32 %index + ret <8 x i16> %ret +} + +; Test v4i32 insertion into the first element. +define <4 x i32> @f13(<4 x i32> %val) { +; CHECK-LABEL: f13: +; CHECK: vleif %v24, 0, 0 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 0, i32 0 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into the last element. +define <4 x i32> @f14(<4 x i32> %val) { +; CHECK-LABEL: f14: +; CHECK: vleif %v24, 0, 3 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 0, i32 3 + ret <4 x i32> %ret +} + +; Test v4i32 insertion with the maximum value allowed by VLEIF. +define <4 x i32> @f15(<4 x i32> %val) { +; CHECK-LABEL: f15: +; CHECK: vleif %v24, 32767, 1 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 32767, i32 1 + ret <4 x i32> %ret +} + +; Test v4i32 insertion with the next value up. +define <4 x i32> @f16(<4 x i32> %val) { +; CHECK-LABEL: f16: +; CHECK-NOT: vleif +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 32768, i32 1 + ret <4 x i32> %ret +} + +; Test v4i32 insertion with the minimum value allowed by VLEIF. +define <4 x i32> @f17(<4 x i32> %val) { +; CHECK-LABEL: f17: +; CHECK: vleif %v24, -32768, 2 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 -32768, i32 2 + ret <4 x i32> %ret +} + +; Test v4i32 insertion with the next value down. +define <4 x i32> @f18(<4 x i32> %val) { +; CHECK-LABEL: f18: +; CHECK-NOT: vleif +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 -32769, i32 2 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into a variable element. +define <4 x i32> @f19(<4 x i32> %val, i32 %index) { +; CHECK-LABEL: f19: +; CHECK-NOT: vleif +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 0, i32 %index + ret <4 x i32> %ret +} + +; Test v2i64 insertion into the first element. 
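+; (As with VLEIF, the VLEIG immediate is a signed 16-bit field that is
+; sign-extended to the element width, so only constants in the range
+; [-32768, 32767] qualify; the tests below probe both edges of that range.)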
+define <2 x i64> @f20(<2 x i64> %val) {
+; CHECK-LABEL: f20:
+; CHECK: vleig %v24, 0, 0
+; CHECK: br %r14
+  %ret = insertelement <2 x i64> %val, i64 0, i32 0
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 insertion into the last element.
+define <2 x i64> @f21(<2 x i64> %val) {
+; CHECK-LABEL: f21:
+; CHECK: vleig %v24, 0, 1
+; CHECK: br %r14
+  %ret = insertelement <2 x i64> %val, i64 0, i32 1
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 insertion with the maximum value allowed by VLEIG.
+define <2 x i64> @f22(<2 x i64> %val) {
+; CHECK-LABEL: f22:
+; CHECK: vleig %v24, 32767, 1
+; CHECK: br %r14
+  %ret = insertelement <2 x i64> %val, i64 32767, i32 1
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 insertion with the next value up.
+define <2 x i64> @f23(<2 x i64> %val) {
+; CHECK-LABEL: f23:
+; CHECK-NOT: vleig
+; CHECK: br %r14
+  %ret = insertelement <2 x i64> %val, i64 32768, i32 1
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 insertion with the minimum value allowed by VLEIG.
+define <2 x i64> @f24(<2 x i64> %val) {
+; CHECK-LABEL: f24:
+; CHECK: vleig %v24, -32768, 0
+; CHECK: br %r14
+  %ret = insertelement <2 x i64> %val, i64 -32768, i32 0
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 insertion with the next value down.
+define <2 x i64> @f25(<2 x i64> %val) {
+; CHECK-LABEL: f25:
+; CHECK-NOT: vleig
+; CHECK: br %r14
+  %ret = insertelement <2 x i64> %val, i64 -32769, i32 0
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 insertion into a variable element.
+define <2 x i64> @f26(<2 x i64> %val, i32 %index) {
+; CHECK-LABEL: f26:
+; CHECK-NOT: vleig
+; CHECK: br %r14
+  %ret = insertelement <2 x i64> %val, i64 0, i32 %index
+  ret <2 x i64> %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-10.ll b/llvm/test/CodeGen/SystemZ/vec-move-10.ll
new file mode 100644
index 00000000000..852a4a7c4ed
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-move-10.ll
@@ -0,0 +1,328 @@
+; Test vector extraction to memory.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8 extraction from the first element.
+define void @f1(<16 x i8> %val, i8 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: vsteb %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = extractelement <16 x i8> %val, i32 0
+  store i8 %element, i8 *%ptr
+  ret void
+}
+
+; Test v16i8 extraction from the last element.
+define void @f2(<16 x i8> %val, i8 *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: vsteb %v24, 0(%r2), 15
+; CHECK: br %r14
+  %element = extractelement <16 x i8> %val, i32 15
+  store i8 %element, i8 *%ptr
+  ret void
+}
+
+; Test v16i8 extraction of an invalid element. This must compile,
+; but we don't care what it does.
+define void @f3(<16 x i8> %val, i8 *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK-NOT: vsteb %v24, 0(%r2), 16
+; CHECK: br %r14
+  %element = extractelement <16 x i8> %val, i32 16
+  store i8 %element, i8 *%ptr
+  ret void
+}
+
+; Test v16i8 extraction with the highest in-range offset.
+define void @f4(<16 x i8> %val, i8 *%base) {
+; CHECK-LABEL: f4:
+; CHECK: vsteb %v24, 4095(%r2), 10
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i32 4095
+  %element = extractelement <16 x i8> %val, i32 10
+  store i8 %element, i8 *%ptr
+  ret void
+}
+
+; Test v16i8 extraction with the first out-of-range offset.
+define void @f5(<16 x i8> %val, i8 *%base) {
+; CHECK-LABEL: f5:
+; CHECK: aghi %r2, 4096
+; CHECK: vsteb %v24, 0(%r2), 5
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i32 4096
+  %element = extractelement <16 x i8> %val, i32 5
+  store i8 %element, i8 *%ptr
+  ret void
+}
+
+; Test v16i8 extraction from a variable element.
+define void @f6(<16 x i8> %val, i8 *%ptr, i32 %index) {
+; CHECK-LABEL: f6:
+; CHECK-NOT: vsteb
+; CHECK: br %r14
+  %element = extractelement <16 x i8> %val, i32 %index
+  store i8 %element, i8 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction from the first element.
+define void @f7(<8 x i16> %val, i16 *%ptr) {
+; CHECK-LABEL: f7:
+; CHECK: vsteh %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = extractelement <8 x i16> %val, i32 0
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction from the last element.
+define void @f8(<8 x i16> %val, i16 *%ptr) {
+; CHECK-LABEL: f8:
+; CHECK: vsteh %v24, 0(%r2), 7
+; CHECK: br %r14
+  %element = extractelement <8 x i16> %val, i32 7
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction of an invalid element. This must compile,
+; but we don't care what it does.
+define void @f9(<8 x i16> %val, i16 *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK-NOT: vsteh %v24, 0(%r2), 8
+; CHECK: br %r14
+  %element = extractelement <8 x i16> %val, i32 8
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction with the highest in-range offset.
+define void @f10(<8 x i16> %val, i16 *%base) {
+; CHECK-LABEL: f10:
+; CHECK: vsteh %v24, 4094(%r2), 5
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%base, i32 2047
+  %element = extractelement <8 x i16> %val, i32 5
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction with the first out-of-range offset.
+define void @f11(<8 x i16> %val, i16 *%base) {
+; CHECK-LABEL: f11:
+; CHECK: aghi %r2, 4096
+; CHECK: vsteh %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%base, i32 2048
+  %element = extractelement <8 x i16> %val, i32 1
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction from a variable element.
+define void @f12(<8 x i16> %val, i16 *%ptr, i32 %index) {
+; CHECK-LABEL: f12:
+; CHECK-NOT: vsteh
+; CHECK: br %r14
+  %element = extractelement <8 x i16> %val, i32 %index
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction from the first element.
+define void @f13(<4 x i32> %val, i32 *%ptr) {
+; CHECK-LABEL: f13:
+; CHECK: vstef %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = extractelement <4 x i32> %val, i32 0
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction from the last element.
+define void @f14(<4 x i32> %val, i32 *%ptr) {
+; CHECK-LABEL: f14:
+; CHECK: vstef %v24, 0(%r2), 3
+; CHECK: br %r14
+  %element = extractelement <4 x i32> %val, i32 3
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction of an invalid element. This must compile,
+; but we don't care what it does.
+define void @f15(<4 x i32> %val, i32 *%ptr) {
+; CHECK-LABEL: f15:
+; CHECK-NOT: vstef %v24, 0(%r2), 4
+; CHECK: br %r14
+  %element = extractelement <4 x i32> %val, i32 4
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction with the highest in-range offset.
+define void @f16(<4 x i32> %val, i32 *%base) {
+; CHECK-LABEL: f16:
+; CHECK: vstef %v24, 4092(%r2), 2
+; CHECK: br %r14
+  %ptr = getelementptr i32, i32 *%base, i32 1023
+  %element = extractelement <4 x i32> %val, i32 2
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction with the first out-of-range offset.
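+; (The VSTE* displacement is a 12-bit unsigned field covering byte offsets
+; 0-4095, so the highest element-aligned offsets are 4095 for bytes, 4094
+; for halfwords, 4092 for words and 4088 for doublewords; larger offsets
+; require the base to be adjusted first, hence the AGHI in these tests.)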
+define void @f17(<4 x i32> %val, i32 *%base) {
+; CHECK-LABEL: f17:
+; CHECK: aghi %r2, 4096
+; CHECK: vstef %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr i32, i32 *%base, i32 1024
+  %element = extractelement <4 x i32> %val, i32 1
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction from a variable element.
+define void @f18(<4 x i32> %val, i32 *%ptr, i32 %index) {
+; CHECK-LABEL: f18:
+; CHECK-NOT: vstef
+; CHECK: br %r14
+  %element = extractelement <4 x i32> %val, i32 %index
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction from the first element.
+define void @f19(<2 x i64> %val, i64 *%ptr) {
+; CHECK-LABEL: f19:
+; CHECK: vsteg %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = extractelement <2 x i64> %val, i32 0
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction from the last element.
+define void @f20(<2 x i64> %val, i64 *%ptr) {
+; CHECK-LABEL: f20:
+; CHECK: vsteg %v24, 0(%r2), 1
+; CHECK: br %r14
+  %element = extractelement <2 x i64> %val, i32 1
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction of an invalid element. This must compile,
+; but we don't care what it does.
+define void @f21(<2 x i64> %val, i64 *%ptr) {
+; CHECK-LABEL: f21:
+; CHECK-NOT: vsteg %v24, 0(%r2), 2
+; CHECK: br %r14
+  %element = extractelement <2 x i64> %val, i32 2
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction with the highest in-range offset.
+define void @f22(<2 x i64> %val, i64 *%base) {
+; CHECK-LABEL: f22:
+; CHECK: vsteg %v24, 4088(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i32 511
+  %element = extractelement <2 x i64> %val, i32 1
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction with the first out-of-range offset.
+define void @f23(<2 x i64> %val, i64 *%base) {
+; CHECK-LABEL: f23:
+; CHECK: aghi %r2, 4096
+; CHECK: vsteg %v24, 0(%r2), 0
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i32 512
+  %element = extractelement <2 x i64> %val, i32 0
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction from a variable element.
+define void @f24(<2 x i64> %val, i64 *%ptr, i32 %index) {
+; CHECK-LABEL: f24:
+; CHECK-NOT: vsteg
+; CHECK: br %r14
+  %element = extractelement <2 x i64> %val, i32 %index
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test a v4i32 scatter of the first element.
+define void @f37(<4 x i32> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f37:
+; CHECK: vscef %v24, 0(%v26,%r2), 0
+; CHECK: br %r14
+  %elem = extractelement <4 x i32> %index, i32 0
+  %ext = zext i32 %elem to i64
+  %add = add i64 %base, %ext
+  %ptr = inttoptr i64 %add to i32 *
+  %element = extractelement <4 x i32> %val, i32 0
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test a v4i32 scatter of the last element.
+define void @f38(<4 x i32> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f38:
+; CHECK: vscef %v24, 0(%v26,%r2), 3
+; CHECK: br %r14
+  %elem = extractelement <4 x i32> %index, i32 3
+  %ext = zext i32 %elem to i64
+  %add = add i64 %base, %ext
+  %ptr = inttoptr i64 %add to i32 *
+  %element = extractelement <4 x i32> %val, i32 3
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test a v4i32 scatter with the highest in-range offset.
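+; (The scatter instructions mirror the gathers: VSCEF/VSCEG address memory
+; as base plus vector element plus the same 12-bit displacement, so 4095
+; is the largest offset that can be folded into the instruction.)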
+define void @f39(<4 x i32> %val, <4 x i32> %index, i64 %base) { +; CHECK-LABEL: f39: +; CHECK: vscef %v24, 4095(%v26,%r2), 1 +; CHECK: br %r14 + %elem = extractelement <4 x i32> %index, i32 1 + %ext = zext i32 %elem to i64 + %add1 = add i64 %base, %ext + %add2 = add i64 %add1, 4095 + %ptr = inttoptr i64 %add2 to i32 * + %element = extractelement <4 x i32> %val, i32 1 + store i32 %element, i32 *%ptr + ret void +} + +; Test a v2i64 scatter of the first element. +define void @f40(<2 x i64> %val, <2 x i64> %index, i64 %base) { +; CHECK-LABEL: f40: +; CHECK: vsceg %v24, 0(%v26,%r2), 0 +; CHECK: br %r14 + %elem = extractelement <2 x i64> %index, i32 0 + %add = add i64 %base, %elem + %ptr = inttoptr i64 %add to i64 * + %element = extractelement <2 x i64> %val, i32 0 + store i64 %element, i64 *%ptr + ret void +} + +; Test a v2i64 scatter of the last element. +define void @f41(<2 x i64> %val, <2 x i64> %index, i64 %base) { +; CHECK-LABEL: f41: +; CHECK: vsceg %v24, 0(%v26,%r2), 1 +; CHECK: br %r14 + %elem = extractelement <2 x i64> %index, i32 1 + %add = add i64 %base, %elem + %ptr = inttoptr i64 %add to i64 * + %element = extractelement <2 x i64> %val, i32 1 + store i64 %element, i64 *%ptr + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/vec-move-11.ll b/llvm/test/CodeGen/SystemZ/vec-move-11.ll new file mode 100644 index 00000000000..45bc91b169b --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-move-11.ll @@ -0,0 +1,93 @@ +; Test insertions of register values into a nonzero index of an undef. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 insertion into an undef, with an arbitrary index. +define <16 x i8> @f1(i8 %val) { +; CHECK-LABEL: f1: +; CHECK: vlvgb %v24, %r2, 12 +; CHECK-NEXT: br %r14 + %ret = insertelement <16 x i8> undef, i8 %val, i32 12 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into an undef, with the first good index for VLVGP. +define <16 x i8> @f2(i8 %val) { +; CHECK-LABEL: f2: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <16 x i8> undef, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into an undef, with the second good index for VLVGP. +define <16 x i8> @f3(i8 %val) { +; CHECK-LABEL: f3: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <16 x i8> undef, i8 %val, i32 15 + ret <16 x i8> %ret +} + +; Test v8i16 insertion into an undef, with an arbitrary index. +define <8 x i16> @f4(i16 %val) { +; CHECK-LABEL: f4: +; CHECK: vlvgh %v24, %r2, 5 +; CHECK-NEXT: br %r14 + %ret = insertelement <8 x i16> undef, i16 %val, i32 5 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into an undef, with the first good index for VLVGP. +define <8 x i16> @f5(i16 %val) { +; CHECK-LABEL: f5: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <8 x i16> undef, i16 %val, i32 3 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into an undef, with the second good index for VLVGP. +define <8 x i16> @f6(i16 %val) { +; CHECK-LABEL: f6: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <8 x i16> undef, i16 %val, i32 7 + ret <8 x i16> %ret +} + +; Test v4i32 insertion into an undef, with an arbitrary index. +define <4 x i32> @f7(i32 %val) { +; CHECK-LABEL: f7: +; CHECK: vlvgf %v24, %r2, 2 +; CHECK-NEXT: br %r14 + %ret = insertelement <4 x i32> undef, i32 %val, i32 2 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into an undef, with the first good index for VLVGP. 
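+; (VLVGP builds a vector from two GR64s, one per doubleword, so when the
+; other elements are undef it can implement any insertion whose target is
+; the low-order part of a doubleword: byte index 7 or 15, halfword index 3
+; or 7, word index 1 or 3.  Those are the "good" indices for VLVGP.)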
+define <4 x i32> @f8(i32 %val) { +; CHECK-LABEL: f8: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <4 x i32> undef, i32 %val, i32 1 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into an undef, with the second good index for VLVGP. +define <4 x i32> @f9(i32 %val) { +; CHECK-LABEL: f9: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <4 x i32> undef, i32 %val, i32 3 + ret <4 x i32> %ret +} + +; Test v2i64 insertion into an undef. +define <2 x i64> @f10(i64 %val) { +; CHECK-LABEL: f10: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <2 x i64> undef, i64 %val, i32 1 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-move-12.ll b/llvm/test/CodeGen/SystemZ/vec-move-12.ll new file mode 100644 index 00000000000..1fecab688e7 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-move-12.ll @@ -0,0 +1,103 @@ +; Test insertions of memory values into a nonzero index of an undef. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 insertion into an undef, with an arbitrary index. +define <16 x i8> @f1(i8 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vlrepb %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> undef, i8 %val, i32 12 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into an undef, with the first good index for VLVGP. +define <16 x i8> @f2(i8 *%ptr) { +; CHECK-LABEL: f2: +; CHECK: {{vlrepb|vllezb}} %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> undef, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into an undef, with the second good index for VLVGP. +define <16 x i8> @f3(i8 *%ptr) { +; CHECK-LABEL: f3: +; CHECK: vlrepb %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> undef, i8 %val, i32 15 + ret <16 x i8> %ret +} + +; Test v8i16 insertion into an undef, with an arbitrary index. +define <8 x i16> @f4(i16 *%ptr) { +; CHECK-LABEL: f4: +; CHECK: vlreph %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i16, i16 *%ptr + %ret = insertelement <8 x i16> undef, i16 %val, i32 5 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into an undef, with the first good index for VLVGP. +define <8 x i16> @f5(i16 *%ptr) { +; CHECK-LABEL: f5: +; CHECK: {{vlreph|vllezh}} %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i16, i16 *%ptr + %ret = insertelement <8 x i16> undef, i16 %val, i32 3 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into an undef, with the second good index for VLVGP. +define <8 x i16> @f6(i16 *%ptr) { +; CHECK-LABEL: f6: +; CHECK: vlreph %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i16, i16 *%ptr + %ret = insertelement <8 x i16> undef, i16 %val, i32 7 + ret <8 x i16> %ret +} + +; Test v4i32 insertion into an undef, with an arbitrary index. +define <4 x i32> @f7(i32 *%ptr) { +; CHECK-LABEL: f7: +; CHECK: vlrepf %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i32, i32 *%ptr + %ret = insertelement <4 x i32> undef, i32 %val, i32 2 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into an undef, with the first good index for VLVGP. +define <4 x i32> @f8(i32 *%ptr) { +; CHECK-LABEL: f8: +; CHECK: {{vlrepf|vllezf}} %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i32, i32 *%ptr + %ret = insertelement <4 x i32> undef, i32 %val, i32 1 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into an undef, with the second good index for VLVGP. 
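+; (For these memory-operand cases either a replicating load or a
+; zero-extending VLLEZ* load leaves the required value in the right
+; element, and every other element is undef anyway, so the {{...}}
+; alternatives above deliberately accept both instruction choices.)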
+define <4 x i32> @f9(i32 *%ptr) { +; CHECK-LABEL: f9: +; CHECK: vlrepf %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i32, i32 *%ptr + %ret = insertelement <4 x i32> undef, i32 %val, i32 3 + ret <4 x i32> %ret +} + +; Test v2i64 insertion into an undef. +define <2 x i64> @f10(i64 *%ptr) { +; CHECK-LABEL: f10: +; CHECK: vlrepg %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i64, i64 *%ptr + %ret = insertelement <2 x i64> undef, i64 %val, i32 1 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-move-13.ll b/llvm/test/CodeGen/SystemZ/vec-move-13.ll new file mode 100644 index 00000000000..e103affa4b1 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-move-13.ll @@ -0,0 +1,47 @@ +; Test insertions of register values into 0. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 insertion into 0. +define <16 x i8> @f1(i8 %val1, i8 %val2) { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK-DAG: vlvgb %v24, %r2, 2 +; CHECK-DAG: vlvgb %v24, %r3, 12 +; CHECK: br %r14 + %vec1 = insertelement <16 x i8> zeroinitializer, i8 %val1, i32 2 + %vec2 = insertelement <16 x i8> %vec1, i8 %val2, i32 12 + ret <16 x i8> %vec2 +} + +; Test v8i16 insertion into 0. +define <8 x i16> @f2(i16 %val1, i16 %val2) { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 0 +; CHECK-DAG: vlvgh %v24, %r2, 3 +; CHECK-DAG: vlvgh %v24, %r3, 5 +; CHECK: br %r14 + %vec1 = insertelement <8 x i16> zeroinitializer, i16 %val1, i32 3 + %vec2 = insertelement <8 x i16> %vec1, i16 %val2, i32 5 + ret <8 x i16> %vec2 +} + +; Test v4i32 insertion into 0. +define <4 x i32> @f3(i32 %val) { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 0 +; CHECK: vlvgf %v24, %r2, 3 +; CHECK: br %r14 + %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 3 + ret <4 x i32> %ret +} + +; Test v2i64 insertion into 0. +define <2 x i64> @f4(i64 %val) { +; CHECK-LABEL: f4: +; CHECK: lghi [[REG:%r[0-5]]], 0 +; CHECK: vlvgp %v24, [[REG]], %r2 +; CHECK: br %r14 + %ret = insertelement <2 x i64> zeroinitializer, i64 %val, i32 1 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-move-14.ll b/llvm/test/CodeGen/SystemZ/vec-move-14.ll new file mode 100644 index 00000000000..f0c60e7d366 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-move-14.ll @@ -0,0 +1,76 @@ +; Test insertions of memory values into 0. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test VLLEZB. +define <16 x i8> @f1(i8 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vllezb %v24, 0(%r2) +; CHECK: br %r14 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test VLLEZB with the highest in-range offset. +define <16 x i8> @f2(i8 *%base) { +; CHECK-LABEL: f2: +; CHECK: vllezb %v24, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test VLLEZB with the next highest offset. +define <16 x i8> @f3(i8 *%base) { +; CHECK-LABEL: f3: +; CHECK-NOT: vllezb %v24, 4096(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test that VLLEZB allows an index. 
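+; (VLLEZ* loads the scalar right-aligned into the leftmost doubleword and
+; zeroes the rest of the register, which is why these tests insert into
+; element 7 for bytes, 3 for halfwords, 1 for words and 0 for doublewords.
+; It uses the usual base + index + displacement addressing, so a register
+; index is acceptable, as tested here.)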
+define <16 x i8> @f4(i8 *%base, i64 %index) { +; CHECK-LABEL: f4: +; CHECK: vllezb %v24, 0({{%r2,%r3|%r3,%r2}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test VLLEZH. +define <8 x i16> @f5(i16 *%ptr) { +; CHECK-LABEL: f5: +; CHECK: vllezh %v24, 0(%r2) +; CHECK: br %r14 + %val = load i16, i16 *%ptr + %ret = insertelement <8 x i16> zeroinitializer, i16 %val, i32 3 + ret <8 x i16> %ret +} + +; Test VLLEZF. +define <4 x i32> @f6(i32 *%ptr) { +; CHECK-LABEL: f6: +; CHECK: vllezf %v24, 0(%r2) +; CHECK: br %r14 + %val = load i32, i32 *%ptr + %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 1 + ret <4 x i32> %ret +} + +; Test VLLEZG. +define <2 x i64> @f7(i64 *%ptr) { +; CHECK-LABEL: f7: +; CHECK: vllezg %v24, 0(%r2) +; CHECK: br %r14 + %val = load i64, i64 *%ptr + %ret = insertelement <2 x i64> zeroinitializer, i64 %val, i32 0 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-01.ll b/llvm/test/CodeGen/SystemZ/vec-mul-01.ll new file mode 100644 index 00000000000..209582f5893 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-mul-01.ll @@ -0,0 +1,39 @@ +; Test vector multiplication. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 multiplication. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmlb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = mul <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 multiplication. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmlhw %v24, %v26, %v28 +; CHECK: br %r14 + %ret = mul <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 multiplication. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmlf %v24, %v26, %v28 +; CHECK: br %r14 + %ret = mul <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 multiplication. There's no vector equivalent. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK-NOT: vmlg +; CHECK: br %r14 + %ret = mul <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-02.ll b/llvm/test/CodeGen/SystemZ/vec-mul-02.ll new file mode 100644 index 00000000000..7323330919a --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-mul-02.ll @@ -0,0 +1,36 @@ +; Test vector multiply-and-add. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 multiply-and-add. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3) { +; CHECK-LABEL: f1: +; CHECK: vmalb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %mul = mul <16 x i8> %val1, %val2 + %ret = add <16 x i8> %mul, %val3 + ret <16 x i8> %ret +} + +; Test a v8i16 multiply-and-add. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3) { +; CHECK-LABEL: f2: +; CHECK: vmalhw %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %mul = mul <8 x i16> %val1, %val2 + %ret = add <8 x i16> %mul, %val3 + ret <8 x i16> %ret +} + +; Test a v4i32 multiply-and-add. 
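+; (There is no multiply-and-add operation in the IR itself; these tests
+; rely on instruction selection folding the separate mul and add nodes
+; into a single VMAL* instruction.)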
+define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3) { +; CHECK-LABEL: f3: +; CHECK: vmalf %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %mul = mul <4 x i32> %val1, %val2 + %ret = add <4 x i32> %mul, %val3 + ret <4 x i32> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-neg-01.ll b/llvm/test/CodeGen/SystemZ/vec-neg-01.ll new file mode 100644 index 00000000000..357648ba4d3 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-neg-01.ll @@ -0,0 +1,39 @@ +; Test vector negation. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 negation. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val) { +; CHECK-LABEL: f1: +; CHECK: vlcb %v24, %v26 +; CHECK: br %r14 + %ret = sub <16 x i8> zeroinitializer, %val + ret <16 x i8> %ret +} + +; Test a v8i16 negation. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val) { +; CHECK-LABEL: f2: +; CHECK: vlch %v24, %v26 +; CHECK: br %r14 + %ret = sub <8 x i16> zeroinitializer, %val + ret <8 x i16> %ret +} + +; Test a v4i32 negation. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val) { +; CHECK-LABEL: f3: +; CHECK: vlcf %v24, %v26 +; CHECK: br %r14 + %ret = sub <4 x i32> zeroinitializer, %val + ret <4 x i32> %ret +} + +; Test a v2i64 negation. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val) { +; CHECK-LABEL: f4: +; CHECK: vlcg %v24, %v26 +; CHECK: br %r14 + %ret = sub <2 x i64> zeroinitializer, %val + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-or-01.ll b/llvm/test/CodeGen/SystemZ/vec-or-01.ll new file mode 100644 index 00000000000..789150ad2d1 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-or-01.ll @@ -0,0 +1,39 @@ +; Test vector OR. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 OR. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vo %v24, %v26, %v28 +; CHECK: br %r14 + %ret = or <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 OR. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vo %v24, %v26, %v28 +; CHECK: br %r14 + %ret = or <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 OR. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vo %v24, %v26, %v28 +; CHECK: br %r14 + %ret = or <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 OR. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vo %v24, %v26, %v28 +; CHECK: br %r14 + %ret = or <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-or-02.ll b/llvm/test/CodeGen/SystemZ/vec-or-02.ll new file mode 100644 index 00000000000..eeb86e36ff0 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-or-02.ll @@ -0,0 +1,107 @@ +; Test vector (or (and X, Z), (and Y, (not Z))) patterns. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8. +define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3) { +; CHECK-LABEL: f1: +; CHECK: vsel %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <16 x i8> %val3, <i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1> + %and1 = and <16 x i8> %val1, %val3 + %and2 = and <16 x i8> %val2, %not + %ret = or <16 x i8> %and1, %and2 + ret <16 x i8> %ret +} + +; ...and again with the XOR applied to the other operand of the AND. 
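+; (VSEL is a bitwise select: each result bit comes from the first source
+; where the corresponding bit of the mask, the final operand, is 1, and
+; from the second source where it is 0, i.e. (X & Z) | (Y & ~Z).  When the
+; IR applies the NOT to the other AND, the same instruction still applies,
+; with its first two operands swapped, as below.)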
+define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3) { +; CHECK-LABEL: f2: +; CHECK: vsel %v24, %v26, %v24, %v28 +; CHECK: br %r14 + %not = xor <16 x i8> %val3, <i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1, + i8 -1, i8 -1, i8 -1, i8 -1> + %and1 = and <16 x i8> %val1, %not + %and2 = and <16 x i8> %val2, %val3 + %ret = or <16 x i8> %and1, %and2 + ret <16 x i8> %ret +} + +; Test v8i16. +define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3) { +; CHECK-LABEL: f3: +; CHECK: vsel %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <8 x i16> %val3, <i16 -1, i16 -1, i16 -1, i16 -1, + i16 -1, i16 -1, i16 -1, i16 -1> + %and1 = and <8 x i16> %val1, %val3 + %and2 = and <8 x i16> %val2, %not + %ret = or <8 x i16> %and1, %and2 + ret <8 x i16> %ret +} + +; ...and again with the XOR applied to the other operand of the AND. +define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3) { +; CHECK-LABEL: f4: +; CHECK: vsel %v24, %v26, %v24, %v28 +; CHECK: br %r14 + %not = xor <8 x i16> %val3, <i16 -1, i16 -1, i16 -1, i16 -1, + i16 -1, i16 -1, i16 -1, i16 -1> + %and1 = and <8 x i16> %val1, %not + %and2 = and <8 x i16> %val2, %val3 + %ret = or <8 x i16> %and1, %and2 + ret <8 x i16> %ret +} + +; Test v4i32. +define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3) { +; CHECK-LABEL: f5: +; CHECK: vsel %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <4 x i32> %val3, <i32 -1, i32 -1, i32 -1, i32 -1> + %and1 = and <4 x i32> %val1, %val3 + %and2 = and <4 x i32> %val2, %not + %ret = or <4 x i32> %and1, %and2 + ret <4 x i32> %ret +} + +; ...and again with the XOR applied to the other operand of the AND. +define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3) { +; CHECK-LABEL: f6: +; CHECK: vsel %v24, %v26, %v24, %v28 +; CHECK: br %r14 + %not = xor <4 x i32> %val3, <i32 -1, i32 -1, i32 -1, i32 -1> + %and1 = and <4 x i32> %val1, %not + %and2 = and <4 x i32> %val2, %val3 + %ret = or <4 x i32> %and1, %and2 + ret <4 x i32> %ret +} + +; Test v2i64. +define <2 x i64> @f7(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3) { +; CHECK-LABEL: f7: +; CHECK: vsel %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <2 x i64> %val3, <i64 -1, i64 -1> + %and1 = and <2 x i64> %val1, %val3 + %and2 = and <2 x i64> %val2, %not + %ret = or <2 x i64> %and1, %and2 + ret <2 x i64> %ret +} + +; ...and again with the XOR applied to the other operand of the AND. +define <2 x i64> @f8(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3) { +; CHECK-LABEL: f8: +; CHECK: vsel %v24, %v26, %v24, %v28 +; CHECK: br %r14 + %not = xor <2 x i64> %val3, <i64 -1, i64 -1> + %and1 = and <2 x i64> %val1, %not + %and2 = and <2 x i64> %val2, %val3 + %ret = or <2 x i64> %and1, %and2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-01.ll b/llvm/test/CodeGen/SystemZ/vec-perm-01.ll new file mode 100644 index 00000000000..520ff45e7f7 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-01.ll @@ -0,0 +1,124 @@ +; Test vector splat. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 splat of the first element. +define <16 x i8> @f1(<16 x i8> %val) { +; CHECK-LABEL: f1: +; CHECK: vrepb %v24, %v24, 0 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val, <16 x i8> undef, + <16 x i32> zeroinitializer + ret <16 x i8> %ret +} + +; Test v16i8 splat of the last element. 
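+; (A splat is a shufflevector whose mask repeats a single index.  Mask
+; values 0-15 select elements of the first vector operand and 16-31
+; elements of the second, so the "second operand" tests use indices such
+; as 20, which is still a replicate of element 4 and maps to VREPB.)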
+define <16 x i8> @f2(<16 x i8> %val) { +; CHECK-LABEL: f2: +; CHECK: vrepb %v24, %v24, 15 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val, <16 x i8> undef, + <16 x i32> <i32 15, i32 15, i32 15, i32 15, + i32 15, i32 15, i32 15, i32 15, + i32 15, i32 15, i32 15, i32 15, + i32 15, i32 15, i32 15, i32 15> + ret <16 x i8> %ret +} + +; Test v16i8 splat of an arbitrary element, using the second operand of +; the shufflevector. +define <16 x i8> @f3(<16 x i8> %val) { +; CHECK-LABEL: f3: +; CHECK: vrepb %v24, %v24, 4 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> undef, <16 x i8> %val, + <16 x i32> <i32 20, i32 20, i32 20, i32 20, + i32 20, i32 20, i32 20, i32 20, + i32 20, i32 20, i32 20, i32 20, + i32 20, i32 20, i32 20, i32 20> + ret <16 x i8> %ret +} + +; Test v8i16 splat of the first element. +define <8 x i16> @f4(<8 x i16> %val) { +; CHECK-LABEL: f4: +; CHECK: vreph %v24, %v24, 0 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val, <8 x i16> undef, + <8 x i32> zeroinitializer + ret <8 x i16> %ret +} + +; Test v8i16 splat of the last element. +define <8 x i16> @f5(<8 x i16> %val) { +; CHECK-LABEL: f5: +; CHECK: vreph %v24, %v24, 7 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val, <8 x i16> undef, + <8 x i32> <i32 7, i32 7, i32 7, i32 7, + i32 7, i32 7, i32 7, i32 7> + ret <8 x i16> %ret +} + +; Test v8i16 splat of an arbitrary element, using the second operand of +; the shufflevector. +define <8 x i16> @f6(<8 x i16> %val) { +; CHECK-LABEL: f6: +; CHECK: vreph %v24, %v24, 2 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> undef, <8 x i16> %val, + <8 x i32> <i32 10, i32 10, i32 10, i32 10, + i32 10, i32 10, i32 10, i32 10> + ret <8 x i16> %ret +} + +; Test v4i32 splat of the first element. +define <4 x i32> @f7(<4 x i32> %val) { +; CHECK-LABEL: f7: +; CHECK: vrepf %v24, %v24, 0 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val, <4 x i32> undef, + <4 x i32> zeroinitializer + ret <4 x i32> %ret +} + +; Test v4i32 splat of the last element. +define <4 x i32> @f8(<4 x i32> %val) { +; CHECK-LABEL: f8: +; CHECK: vrepf %v24, %v24, 3 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val, <4 x i32> undef, + <4 x i32> <i32 3, i32 3, i32 3, i32 3> + ret <4 x i32> %ret +} + +; Test v4i32 splat of an arbitrary element, using the second operand of +; the shufflevector. +define <4 x i32> @f9(<4 x i32> %val) { +; CHECK-LABEL: f9: +; CHECK: vrepf %v24, %v24, 1 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> undef, <4 x i32> %val, + <4 x i32> <i32 5, i32 5, i32 5, i32 5> + ret <4 x i32> %ret +} + +; Test v2i64 splat of the first element. +define <2 x i64> @f10(<2 x i64> %val) { +; CHECK-LABEL: f10: +; CHECK: vrepg %v24, %v24, 0 +; CHECK: br %r14 + %ret = shufflevector <2 x i64> %val, <2 x i64> undef, + <2 x i32> zeroinitializer + ret <2 x i64> %ret +} + +; Test v2i64 splat of the last element. +define <2 x i64> @f11(<2 x i64> %val) { +; CHECK-LABEL: f11: +; CHECK: vrepg %v24, %v24, 1 +; CHECK: br %r14 + %ret = shufflevector <2 x i64> %val, <2 x i64> undef, + <2 x i32> <i32 1, i32 1> + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-02.ll b/llvm/test/CodeGen/SystemZ/vec-perm-02.ll new file mode 100644 index 00000000000..93e4112c0ef --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-02.ll @@ -0,0 +1,144 @@ +; Test replications of a scalar register value, represented as splats. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 splat of the first element. 
+define <16 x i8> @f1(i8 %scalar) { +; CHECK-LABEL: f1: +; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2 +; CHECK: vrepb %v24, [[REG]], 7 +; CHECK: br %r14 + %val = insertelement <16 x i8> undef, i8 %scalar, i32 0 + %ret = shufflevector <16 x i8> %val, <16 x i8> undef, + <16 x i32> zeroinitializer + ret <16 x i8> %ret +} + +; Test v16i8 splat of the last element. +define <16 x i8> @f2(i8 %scalar) { +; CHECK-LABEL: f2: +; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2 +; CHECK: vrepb %v24, [[REG]], 7 +; CHECK: br %r14 + %val = insertelement <16 x i8> undef, i8 %scalar, i32 15 + %ret = shufflevector <16 x i8> %val, <16 x i8> undef, + <16 x i32> <i32 15, i32 15, i32 15, i32 15, + i32 15, i32 15, i32 15, i32 15, + i32 15, i32 15, i32 15, i32 15, + i32 15, i32 15, i32 15, i32 15> + ret <16 x i8> %ret +} + +; Test v16i8 splat of an arbitrary element, using the second operand of +; the shufflevector. +define <16 x i8> @f3(i8 %scalar) { +; CHECK-LABEL: f3: +; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2 +; CHECK: vrepb %v24, [[REG]], 7 +; CHECK: br %r14 + %val = insertelement <16 x i8> undef, i8 %scalar, i32 4 + %ret = shufflevector <16 x i8> undef, <16 x i8> %val, + <16 x i32> <i32 20, i32 20, i32 20, i32 20, + i32 20, i32 20, i32 20, i32 20, + i32 20, i32 20, i32 20, i32 20, + i32 20, i32 20, i32 20, i32 20> + ret <16 x i8> %ret +} + +; Test v8i16 splat of the first element. +define <8 x i16> @f4(i16 %scalar) { +; CHECK-LABEL: f4: +; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2 +; CHECK: vreph %v24, [[REG]], 3 +; CHECK: br %r14 + %val = insertelement <8 x i16> undef, i16 %scalar, i32 0 + %ret = shufflevector <8 x i16> %val, <8 x i16> undef, + <8 x i32> zeroinitializer + ret <8 x i16> %ret +} + +; Test v8i16 splat of the last element. +define <8 x i16> @f5(i16 %scalar) { +; CHECK-LABEL: f5: +; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2 +; CHECK: vreph %v24, [[REG]], 3 +; CHECK: br %r14 + %val = insertelement <8 x i16> undef, i16 %scalar, i32 7 + %ret = shufflevector <8 x i16> %val, <8 x i16> undef, + <8 x i32> <i32 7, i32 7, i32 7, i32 7, + i32 7, i32 7, i32 7, i32 7> + ret <8 x i16> %ret +} + +; Test v8i16 splat of an arbitrary element, using the second operand of +; the shufflevector. +define <8 x i16> @f6(i16 %scalar) { +; CHECK-LABEL: f6: +; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2 +; CHECK: vreph %v24, [[REG]], 3 +; CHECK: br %r14 + %val = insertelement <8 x i16> undef, i16 %scalar, i32 2 + %ret = shufflevector <8 x i16> undef, <8 x i16> %val, + <8 x i32> <i32 10, i32 10, i32 10, i32 10, + i32 10, i32 10, i32 10, i32 10> + ret <8 x i16> %ret +} + +; Test v4i32 splat of the first element. +define <4 x i32> @f7(i32 %scalar) { +; CHECK-LABEL: f7: +; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2 +; CHECK: vrepf %v24, [[REG]], 1 +; CHECK: br %r14 + %val = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %ret = shufflevector <4 x i32> %val, <4 x i32> undef, + <4 x i32> zeroinitializer + ret <4 x i32> %ret +} + +; Test v4i32 splat of the last element. +define <4 x i32> @f8(i32 %scalar) { +; CHECK-LABEL: f8: +; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2 +; CHECK: vrepf %v24, [[REG]], 1 +; CHECK: br %r14 + %val = insertelement <4 x i32> undef, i32 %scalar, i32 3 + %ret = shufflevector <4 x i32> %val, <4 x i32> undef, + <4 x i32> <i32 3, i32 3, i32 3, i32 3> + ret <4 x i32> %ret +} + +; Test v4i32 splat of an arbitrary element, using the second operand of +; the shufflevector. 
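+; (For splats of a scalar register, the scalar is first copied into both
+; doublewords with VLVGP; its low-order bits then sit in byte element 7,
+; halfword element 3 and word element 1, so the VREP* element number is
+; fixed by the element size, whatever index the insertelement used.)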
+define <4 x i32> @f9(i32 %scalar) { +; CHECK-LABEL: f9: +; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2 +; CHECK: vrepf %v24, [[REG]], 1 +; CHECK: br %r14 + %val = insertelement <4 x i32> undef, i32 %scalar, i32 1 + %ret = shufflevector <4 x i32> undef, <4 x i32> %val, + <4 x i32> <i32 5, i32 5, i32 5, i32 5> + ret <4 x i32> %ret +} + +; Test v2i64 splat of the first element. +define <2 x i64> @f10(i64 %scalar) { +; CHECK-LABEL: f10: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK: br %r14 + %val = insertelement <2 x i64> undef, i64 %scalar, i32 0 + %ret = shufflevector <2 x i64> %val, <2 x i64> undef, + <2 x i32> zeroinitializer + ret <2 x i64> %ret +} + +; Test v2i64 splat of the last element. +define <2 x i64> @f11(i64 %scalar) { +; CHECK-LABEL: f11: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK: br %r14 + %val = insertelement <2 x i64> undef, i64 %scalar, i32 1 + %ret = shufflevector <2 x i64> %val, <2 x i64> undef, + <2 x i32> <i32 1, i32 1> + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-03.ll b/llvm/test/CodeGen/SystemZ/vec-perm-03.ll new file mode 100644 index 00000000000..d74948bdb51 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-03.ll @@ -0,0 +1,173 @@ +; Test replications of a scalar memory value, represented as splats. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 replicating load with no offset. +define <16 x i8> @f1(i8 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vlrepb %v24, 0(%r2) +; CHECK: br %r14 + %scalar = load i8, i8 *%ptr + %val = insertelement <16 x i8> undef, i8 %scalar, i32 0 + %ret = shufflevector <16 x i8> %val, <16 x i8> undef, + <16 x i32> zeroinitializer + ret <16 x i8> %ret +} + +; Test a v16i8 replicating load with the maximum in-range offset. +define <16 x i8> @f2(i8 *%base) { +; CHECK-LABEL: f2: +; CHECK: vlrepb %v24, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + %scalar = load i8, i8 *%ptr + %val = insertelement <16 x i8> undef, i8 %scalar, i32 0 + %ret = shufflevector <16 x i8> %val, <16 x i8> undef, + <16 x i32> zeroinitializer + ret <16 x i8> %ret +} + +; Test a v16i8 replicating load with the first out-of-range offset. +define <16 x i8> @f3(i8 *%base) { +; CHECK-LABEL: f3: +; CHECK: aghi %r2, 4096 +; CHECK: vlrepb %v24, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %scalar = load i8, i8 *%ptr + %val = insertelement <16 x i8> undef, i8 %scalar, i32 0 + %ret = shufflevector <16 x i8> %val, <16 x i8> undef, + <16 x i32> zeroinitializer + ret <16 x i8> %ret +} + +; Test a v8i16 replicating load with no offset. +define <8 x i16> @f4(i16 *%ptr) { +; CHECK-LABEL: f4: +; CHECK: vlreph %v24, 0(%r2) +; CHECK: br %r14 + %scalar = load i16, i16 *%ptr + %val = insertelement <8 x i16> undef, i16 %scalar, i32 0 + %ret = shufflevector <8 x i16> %val, <8 x i16> undef, + <8 x i32> zeroinitializer + ret <8 x i16> %ret +} + +; Test a v8i16 replicating load with the maximum in-range offset. +define <8 x i16> @f5(i16 *%base) { +; CHECK-LABEL: f5: +; CHECK: vlreph %v24, 4094(%r2) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%base, i64 2047 + %scalar = load i16, i16 *%ptr + %val = insertelement <8 x i16> undef, i16 %scalar, i32 0 + %ret = shufflevector <8 x i16> %val, <8 x i16> undef, + <8 x i32> zeroinitializer + ret <8 x i16> %ret +} + +; Test a v8i16 replicating load with the first out-of-range offset. 
+define <8 x i16> @f6(i16 *%base) { +; CHECK-LABEL: f6: +; CHECK: aghi %r2, 4096 +; CHECK: vlreph %v24, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%base, i64 2048 + %scalar = load i16, i16 *%ptr + %val = insertelement <8 x i16> undef, i16 %scalar, i32 0 + %ret = shufflevector <8 x i16> %val, <8 x i16> undef, + <8 x i32> zeroinitializer + ret <8 x i16> %ret +} + +; Test a v4i32 replicating load with no offset. +define <4 x i32> @f7(i32 *%ptr) { +; CHECK-LABEL: f7: +; CHECK: vlrepf %v24, 0(%r2) +; CHECK: br %r14 + %scalar = load i32, i32 *%ptr + %val = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %ret = shufflevector <4 x i32> %val, <4 x i32> undef, + <4 x i32> zeroinitializer + ret <4 x i32> %ret +} + +; Test a v4i32 replicating load with the maximum in-range offset. +define <4 x i32> @f8(i32 *%base) { +; CHECK-LABEL: f8: +; CHECK: vlrepf %v24, 4092(%r2) +; CHECK: br %r14 + %ptr = getelementptr i32, i32 *%base, i64 1023 + %scalar = load i32, i32 *%ptr + %val = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %ret = shufflevector <4 x i32> %val, <4 x i32> undef, + <4 x i32> zeroinitializer + ret <4 x i32> %ret +} + +; Test a v4i32 replicating load with the first out-of-range offset. +define <4 x i32> @f9(i32 *%base) { +; CHECK-LABEL: f9: +; CHECK: aghi %r2, 4096 +; CHECK: vlrepf %v24, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr i32, i32 *%base, i64 1024 + %scalar = load i32, i32 *%ptr + %val = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %ret = shufflevector <4 x i32> %val, <4 x i32> undef, + <4 x i32> zeroinitializer + ret <4 x i32> %ret +} + +; Test a v2i64 replicating load with no offset. +define <2 x i64> @f10(i64 *%ptr) { +; CHECK-LABEL: f10: +; CHECK: vlrepg %v24, 0(%r2) +; CHECK: br %r14 + %scalar = load i64, i64 *%ptr + %val = insertelement <2 x i64> undef, i64 %scalar, i32 0 + %ret = shufflevector <2 x i64> %val, <2 x i64> undef, + <2 x i32> zeroinitializer + ret <2 x i64> %ret +} + +; Test a v2i64 replicating load with the maximum in-range offset. +define <2 x i64> @f11(i64 *%base) { +; CHECK-LABEL: f11: +; CHECK: vlrepg %v24, 4088(%r2) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%base, i32 511 + %scalar = load i64, i64 *%ptr + %val = insertelement <2 x i64> undef, i64 %scalar, i32 0 + %ret = shufflevector <2 x i64> %val, <2 x i64> undef, + <2 x i32> zeroinitializer + ret <2 x i64> %ret +} + +; Test a v2i64 replicating load with the first out-of-range offset. +define <2 x i64> @f12(i64 *%base) { +; CHECK-LABEL: f12: +; CHECK: aghi %r2, 4096 +; CHECK: vlrepg %v24, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%base, i32 512 + %scalar = load i64, i64 *%ptr + %val = insertelement <2 x i64> undef, i64 %scalar, i32 0 + %ret = shufflevector <2 x i64> %val, <2 x i64> undef, + <2 x i32> zeroinitializer + ret <2 x i64> %ret +} + +; Test a v16i8 replicating load with an index. +define <16 x i8> @f19(i8 *%base, i64 %index) { +; CHECK-LABEL: f19: +; CHECK: vlrepb %v24, 1023(%r3,%r2) +; CHECK: br %r14 + %ptr1 = getelementptr i8, i8 *%base, i64 %index + %ptr = getelementptr i8, i8 *%ptr1, i64 1023 + %scalar = load i8, i8 *%ptr + %val = insertelement <16 x i8> undef, i8 %scalar, i32 0 + %ret = shufflevector <16 x i8> %val, <16 x i8> undef, + <16 x i32> zeroinitializer + ret <16 x i8> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-04.ll b/llvm/test/CodeGen/SystemZ/vec-perm-04.ll new file mode 100644 index 00000000000..1d449b9bb34 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-04.ll @@ -0,0 +1,160 @@ +; Test vector merge high. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a canonical v16i8 merge high. +define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmrhb %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 0, i32 16, i32 1, i32 17, + i32 2, i32 18, i32 3, i32 19, + i32 4, i32 20, i32 5, i32 21, + i32 6, i32 22, i32 7, i32 23> + ret <16 x i8> %ret +} + +; Test a reversed v16i8 merge high. +define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmrhb %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 16, i32 0, i32 17, i32 1, + i32 18, i32 2, i32 19, i32 3, + i32 20, i32 4, i32 21, i32 5, + i32 22, i32 6, i32 23, i32 7> + ret <16 x i8> %ret +} + +; Test a v16i8 merge high with only the first operand being used. +define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmrhb %v24, %v24, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 0, i32 0, i32 1, i32 1, + i32 2, i32 2, i32 3, i32 3, + i32 4, i32 4, i32 5, i32 5, + i32 6, i32 6, i32 7, i32 7> + ret <16 x i8> %ret +} + +; Test a v16i8 merge high with only the second operand being used. +; This is converted into @f3 by target-independent code. +define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmrhb %v24, %v26, %v26 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 16, i32 16, i32 17, i32 17, + i32 18, i32 18, i32 19, i32 19, + i32 20, i32 20, i32 21, i32 21, + i32 22, i32 22, i32 23, i32 23> + ret <16 x i8> %ret +} + +; Test a v16i8 merge with both operands being the same. This too is +; converted into @f3 by target-independent code. +define <16 x i8> @f5(<16 x i8> %val) { +; CHECK-LABEL: f5: +; CHECK: vmrhb %v24, %v24, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val, <16 x i8> %val, + <16 x i32> <i32 0, i32 16, i32 17, i32 17, + i32 18, i32 2, i32 3, i32 3, + i32 20, i32 20, i32 5, i32 5, + i32 6, i32 22, i32 23, i32 7> + ret <16 x i8> %ret +} + +; Test a v16i8 merge in which some of the indices are don't care. +define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmrhb %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 0, i32 undef, i32 1, i32 17, + i32 undef, i32 18, i32 undef, i32 undef, + i32 undef, i32 20, i32 5, i32 21, + i32 undef, i32 22, i32 7, i32 undef> + ret <16 x i8> %ret +} + +; Test a v16i8 merge in which one of the operands is undefined and where +; indices for that operand are "don't care". Target-independent code +; converts the indices themselves into "undef"s. +define <16 x i8> @f7(<16 x i8> %val) { +; CHECK-LABEL: f7: +; CHECK: vmrhb %v24, %v24, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> undef, <16 x i8> %val, + <16 x i32> <i32 11, i32 16, i32 17, i32 5, + i32 18, i32 10, i32 19, i32 19, + i32 20, i32 20, i32 21, i32 3, + i32 2, i32 22, i32 9, i32 23> + ret <16 x i8> %ret +} + +; Test a canonical v8i16 merge high. +define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmrhh %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 0, i32 8, i32 1, i32 9, + i32 2, i32 10, i32 3, i32 11> + ret <8 x i16> %ret +} + +; Test a reversed v8i16 merge high. 
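+; (A merge high interleaves the leftmost halves of its two operands, giving
+; a0, b0, a1, b1, ...; a "reversed" merge is the same operation with the
+; register operands commuted, so no extra instructions are needed.)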
+define <8 x i16> @f9(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f9: +; CHECK: vmrhh %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 8, i32 0, i32 9, i32 1, + i32 10, i32 2, i32 11, i32 3> + ret <8 x i16> %ret +} + +; Test a canonical v4i32 merge high. +define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f10: +; CHECK: vmrhf %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 0, i32 4, i32 1, i32 5> + ret <4 x i32> %ret +} + +; Test a reversed v4i32 merge high. +define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f11: +; CHECK: vmrhf %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 4, i32 0, i32 5, i32 1> + ret <4 x i32> %ret +} + +; Test a canonical v2i64 merge high. +define <2 x i64> @f12(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f12: +; CHECK: vmrhg %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, + <2 x i32> <i32 0, i32 2> + ret <2 x i64> %ret +} + +; Test a reversed v2i64 merge high. +define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f13: +; CHECK: vmrhg %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, + <2 x i32> <i32 2, i32 0> + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-05.ll b/llvm/test/CodeGen/SystemZ/vec-perm-05.ll new file mode 100644 index 00000000000..636228c56ba --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-05.ll @@ -0,0 +1,160 @@ +; Test vector merge low. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a canonical v16i8 merge low. +define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmrlb %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 8, i32 24, i32 9, i32 25, + i32 10, i32 26, i32 11, i32 27, + i32 12, i32 28, i32 13, i32 29, + i32 14, i32 30, i32 15, i32 31> + ret <16 x i8> %ret +} + +; Test a reversed v16i8 merge low. +define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmrlb %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 24, i32 8, i32 25, i32 9, + i32 26, i32 10, i32 27, i32 11, + i32 28, i32 12, i32 29, i32 13, + i32 30, i32 14, i32 31, i32 15> + ret <16 x i8> %ret +} + +; Test a v16i8 merge low with only the first operand being used. +define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmrlb %v24, %v24, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 8, i32 8, i32 9, i32 9, + i32 10, i32 10, i32 11, i32 11, + i32 12, i32 12, i32 13, i32 13, + i32 14, i32 14, i32 15, i32 15> + ret <16 x i8> %ret +} + +; Test a v16i8 merge low with only the second operand being used. +; This is converted into @f3 by target-independent code. +define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmrlb %v24, %v26, %v26 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 24, i32 24, i32 25, i32 25, + i32 26, i32 26, i32 27, i32 27, + i32 28, i32 28, i32 29, i32 29, + i32 30, i32 30, i32 31, i32 31> + ret <16 x i8> %ret +} + +; Test a v16i8 merge with both operands being the same. 
This too is +; converted into @f3 by target-independent code. +define <16 x i8> @f5(<16 x i8> %val) { +; CHECK-LABEL: f5: +; CHECK: vmrlb %v24, %v24, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val, <16 x i8> %val, + <16 x i32> <i32 8, i32 24, i32 25, i32 25, + i32 26, i32 10, i32 11, i32 11, + i32 28, i32 28, i32 13, i32 13, + i32 14, i32 30, i32 31, i32 15> + ret <16 x i8> %ret +} + +; Test a v16i8 merge in which some of the indices are don't care. +define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmrlb %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 8, i32 undef, i32 9, i32 25, + i32 undef, i32 26, i32 undef, i32 undef, + i32 undef, i32 28, i32 13, i32 29, + i32 undef, i32 30, i32 15, i32 undef> + ret <16 x i8> %ret +} + +; Test a v16i8 merge in which one of the operands is undefined and where +; indices for that operand are "don't care". Target-independent code +; converts the indices themselves into "undef"s. +define <16 x i8> @f7(<16 x i8> %val) { +; CHECK-LABEL: f7: +; CHECK: vmrlb %v24, %v24, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> undef, <16 x i8> %val, + <16 x i32> <i32 11, i32 24, i32 25, i32 5, + i32 26, i32 10, i32 27, i32 27, + i32 28, i32 28, i32 29, i32 3, + i32 2, i32 30, i32 9, i32 31> + ret <16 x i8> %ret +} + +; Test a canonical v8i16 merge low. +define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmrlh %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 4, i32 12, i32 5, i32 13, + i32 6, i32 14, i32 7, i32 15> + ret <8 x i16> %ret +} + +; Test a reversed v8i16 merge low. +define <8 x i16> @f9(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f9: +; CHECK: vmrlh %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 12, i32 4, i32 13, i32 5, + i32 14, i32 6, i32 15, i32 7> + ret <8 x i16> %ret +} + +; Test a canonical v4i32 merge low. +define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f10: +; CHECK: vmrlf %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 2, i32 6, i32 3, i32 7> + ret <4 x i32> %ret +} + +; Test a reversed v4i32 merge low. +define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f11: +; CHECK: vmrlf %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 6, i32 2, i32 7, i32 3> + ret <4 x i32> %ret +} + +; Test a canonical v2i64 merge low. +define <2 x i64> @f12(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f12: +; CHECK: vmrlg %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, + <2 x i32> <i32 1, i32 3> + ret <2 x i64> %ret +} + +; Test a reversed v2i64 merge low. +define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f13: +; CHECK: vmrlg %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, + <2 x i32> <i32 3, i32 1> + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-06.ll b/llvm/test/CodeGen/SystemZ/vec-perm-06.ll new file mode 100644 index 00000000000..298fc60e851 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-06.ll @@ -0,0 +1,140 @@ +; Test vector pack. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a canonical v16i8 pack. 
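+; (A pack truncates each element of the two source vectors to half its
+; width and concatenates the results, so as a shuffle it selects the
+; odd-numbered, i.e. rightmost, subelements of the concatenated inputs:
+; indices 1, 3, 5, ... in the masks below.)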
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vpkh %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 1, i32 3, i32 5, i32 7, + i32 9, i32 11, i32 13, i32 15, + i32 17, i32 19, i32 21, i32 23, + i32 25, i32 27, i32 29, i32 31> + ret <16 x i8> %ret +} + +; Test a reversed v16i8 pack. +define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vpkh %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 17, i32 19, i32 21, i32 23, + i32 25, i32 27, i32 29, i32 31, + i32 1, i32 3, i32 5, i32 7, + i32 9, i32 11, i32 13, i32 15> + ret <16 x i8> %ret +} + +; Test a v16i8 pack with only the first operand being used. +define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vpkh %v24, %v24, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 1, i32 3, i32 5, i32 7, + i32 9, i32 11, i32 13, i32 15, + i32 1, i32 3, i32 5, i32 7, + i32 9, i32 11, i32 13, i32 15> + ret <16 x i8> %ret +} + +; Test a v16i8 pack with only the second operand being used. +; This is converted into @f3 by target-independent code. +define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f4: +; CHECK: vpkh %v24, %v26, %v26 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 17, i32 19, i32 21, i32 23, + i32 25, i32 27, i32 29, i32 31, + i32 17, i32 19, i32 21, i32 23, + i32 25, i32 27, i32 29, i32 31> + ret <16 x i8> %ret +} + +; Test a v16i8 pack with both operands being the same. This too is +; converted into @f3 by target-independent code. +define <16 x i8> @f5(<16 x i8> %val) { +; CHECK-LABEL: f5: +; CHECK: vpkh %v24, %v24, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val, <16 x i8> %val, + <16 x i32> <i32 1, i32 3, i32 5, i32 7, + i32 9, i32 11, i32 13, i32 15, + i32 17, i32 19, i32 21, i32 23, + i32 25, i32 27, i32 29, i32 31> + ret <16 x i8> %ret +} + +; Test a v16i8 pack in which some of the indices are don't care. +define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f6: +; CHECK: vpkh %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 1, i32 undef, i32 5, i32 7, + i32 undef, i32 11, i32 undef, i32 undef, + i32 undef, i32 19, i32 21, i32 23, + i32 undef, i32 27, i32 29, i32 undef> + ret <16 x i8> %ret +} + +; Test a v16i8 pack in which one of the operands is undefined and where +; indices for that operand are "don't care". Target-independent code +; converts the indices themselves into "undef"s. +define <16 x i8> @f7(<16 x i8> %val) { +; CHECK-LABEL: f7: +; CHECK: vpkh %v24, %v24, %v24 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> undef, <16 x i8> %val, + <16 x i32> <i32 7, i32 1, i32 9, i32 15, + i32 15, i32 3, i32 5, i32 1, + i32 17, i32 19, i32 21, i32 23, + i32 25, i32 27, i32 29, i32 31> + ret <16 x i8> %ret +} + +; Test a canonical v8i16 pack. +define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f8: +; CHECK: vpkf %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 1, i32 3, i32 5, i32 7, + i32 9, i32 11, i32 13, i32 15> + ret <8 x i16> %ret +} + +; Test a reversed v8i16 pack. 
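+; (This mask selects the low halfword of each word of val2 ++ val1 rather
+; than val1 ++ val2, so the same vpkf is usable with its operands swapped.)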
+define <8 x i16> @f9(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f9: +; CHECK: vpkf %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 9, i32 11, i32 13, i32 15, + i32 1, i32 3, i32 5, i32 7> + ret <8 x i16> %ret +} + +; Test a canonical v4i32 pack. +define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f10: +; CHECK: vpkg %v24, %v24, %v26 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 1, i32 3, i32 5, i32 7> + ret <4 x i32> %ret +} + +; Test a reversed v4i32 pack. +define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f11: +; CHECK: vpkg %v24, %v26, %v24 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 5, i32 7, i32 1, i32 3> + ret <4 x i32> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-07.ll b/llvm/test/CodeGen/SystemZ/vec-perm-07.ll new file mode 100644 index 00000000000..40ca3995524 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-07.ll @@ -0,0 +1,125 @@ +; Test vector shift left double immediate. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 shift with the lowest useful shift amount. +define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vsldb %v24, %v24, %v26, 1 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 1, i32 2, i32 3, i32 4, + i32 5, i32 6, i32 7, i32 8, + i32 9, i32 10, i32 11, i32 12, + i32 13, i32 14, i32 15, i32 16> + ret <16 x i8> %ret +} + +; Test a v16i8 shift with the highest shift amount. +define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vsldb %v24, %v24, %v26, 15 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 15, i32 16, i32 17, i32 18, + i32 19, i32 20, i32 21, i32 22, + i32 23, i32 24, i32 25, i32 26, + i32 27, i32 28, i32 29, i32 30> + ret <16 x i8> %ret +} + +; Test a v16i8 shift in which the operands need to be reversed. +define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vsldb %v24, %v26, %v24, 4 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 20, i32 21, i32 22, i32 23, + i32 24, i32 25, i32 26, i32 27, + i32 28, i32 29, i32 30, i32 31, + i32 0, i32 1, i32 2, i32 3> + ret <16 x i8> %ret +} + +; Test a v16i8 shift in which the operands need to be duplicated. +define <16 x i8> @f4(<16 x i8> %val) { +; CHECK-LABEL: f4: +; CHECK: vsldb %v24, %v24, %v24, 7 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val, <16 x i8> undef, + <16 x i32> <i32 7, i32 8, i32 9, i32 10, + i32 11, i32 12, i32 13, i32 14, + i32 15, i32 0, i32 1, i32 2, + i32 3, i32 4, i32 5, i32 6> + ret <16 x i8> %ret +} + +; Test a v16i8 shift in which some of the indices are undefs. +define <16 x i8> @f5(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f5: +; CHECK: vsldb %v24, %v24, %v26, 11 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, + i32 15, i32 16, i32 undef, i32 18, + i32 19, i32 20, i32 21, i32 22, + i32 23, i32 24, i32 25, i32 26> + ret <16 x i8> %ret +} + +; ...and again with reversed operands. 
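+; (A sketch of how the immediate falls out, assuming the lowering looks
+; for a run of byte indices that are consecutive modulo 32: with the
+; operands swapped, result byte I must be byte 13 + I of val2 ++ val1.
+; Position 2 takes val2's last byte (index 31, at offset 15 = 13 + 2) and
+; position 3 takes val1's first (index 0, at offset 16 = 13 + 3), so the
+; shift amount is 13.)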
+define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f6: +; CHECK: vsldb %v24, %v26, %v24, 13 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 undef, i32 undef, i32 31, i32 0, + i32 1, i32 2, i32 3, i32 4, + i32 5, i32 6, i32 7, i32 8, + i32 9, i32 10, i32 11, i32 12> + ret <16 x i8> %ret +} + +; Test a v8i16 shift with the lowest useful shift amount. +define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f7: +; CHECK: vsldb %v24, %v24, %v26, 2 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 1, i32 2, i32 3, i32 4, + i32 5, i32 6, i32 7, i32 8> + ret <8 x i16> %ret +} + +; Test a v8i16 shift with the highest useful shift amount. +define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f8: +; CHECK: vsldb %v24, %v24, %v26, 14 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 7, i32 8, i32 9, i32 10, + i32 11, i32 12, i32 13, i32 14> + ret <8 x i16> %ret +} + +; Test a v4i32 shift with the lowest useful shift amount. +define <4 x i32> @f9(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f9: +; CHECK: vsldb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 1, i32 2, i32 3, i32 4> + ret <4 x i32> %ret +} + +; Test a v4i32 shift with the highest useful shift amount. +define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f10: +; CHECK: vsldb %v24, %v24, %v26, 12 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 3, i32 4, i32 5, i32 6> + ret <4 x i32> %ret +} + +; We use VPDI for v2i64 shuffles. diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-08.ll b/llvm/test/CodeGen/SystemZ/vec-perm-08.ll new file mode 100644 index 00000000000..4d06377f5a3 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-08.ll @@ -0,0 +1,130 @@ +; Test vector permutes using VPDI. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a high1/low2 permute for v16i8. +define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vpdi %v24, %v24, %v26, 1 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 0, i32 1, i32 2, i32 3, + i32 4, i32 5, i32 6, i32 7, + i32 24, i32 25, i32 26, i32 27, + i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %ret +} + +; Test a low2/high1 permute for v16i8. +define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vpdi %v24, %v26, %v24, 4 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 24, i32 25, i32 26, i32 27, + i32 28, i32 29, i32 30, i32 31, + i32 0, i32 1, i32 2, i32 3, + i32 4, i32 5, i32 6, i32 7> + ret <16 x i8> %ret +} + +; Test a low1/high2 permute for v16i8. +define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vpdi %v24, %v24, %v26, 4 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 8, i32 9, i32 10, i32 undef, + i32 12, i32 undef, i32 14, i32 15, + i32 16, i32 17, i32 undef, i32 19, + i32 20, i32 21, i32 22, i32 undef> + ret <16 x i8> %ret +} + +; Test a high2/low1 permute for v16i8. 
+define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f4: +; CHECK: vpdi %v24, %v26, %v24, 1 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 16, i32 17, i32 18, i32 19, + i32 20, i32 21, i32 22, i32 23, + i32 8, i32 9, i32 10, i32 11, + i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %ret +} + +; Test reversing two doublewords in a v16i8. +define <16 x i8> @f5(<16 x i8> %val) { +; CHECK-LABEL: f5: +; CHECK: vpdi %v24, %v24, %v24, 4 +; CHECK: br %r14 + %ret = shufflevector <16 x i8> %val, <16 x i8> undef, + <16 x i32> <i32 8, i32 9, i32 10, i32 11, + i32 12, i32 13, i32 14, i32 15, + i32 0, i32 1, i32 2, i32 3, + i32 4, i32 5, i32 6, i32 7> + ret <16 x i8> %ret +} + +; Test a high1/low2 permute for v8i16. +define <8 x i16> @f6(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f6: +; CHECK: vpdi %v24, %v24, %v26, 1 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 0, i32 1, i32 2, i32 3, + i32 12, i32 13, i32 14, i32 15> + ret <8 x i16> %ret +} + +; Test a low2/high1 permute for v8i16. +define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f7: +; CHECK: vpdi %v24, %v26, %v24, 4 +; CHECK: br %r14 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 12, i32 13, i32 14, i32 15, + i32 0, i32 1, i32 2, i32 3> + ret <8 x i16> %ret +} + +; Test a high1/low2 permute for v4i32. +define <4 x i32> @f8(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f8: +; CHECK: vpdi %v24, %v24, %v26, 1 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 0, i32 1, i32 6, i32 7> + ret <4 x i32> %ret +} + +; Test a low2/high1 permute for v4i32. +define <4 x i32> @f9(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f9: +; CHECK: vpdi %v24, %v26, %v24, 4 +; CHECK: br %r14 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i32> %ret +} + +; Test a high1/low2 permute for v2i64. +define <2 x i64> @f10(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f10: +; CHECK: vpdi %v24, %v24, %v26, 1 +; CHECK: br %r14 + %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, + <2 x i32> <i32 0, i32 3> + ret <2 x i64> %ret +} + +; Test low2/high1 permute for v2i64. +define <2 x i64> @f11(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f11: +; CHECK: vpdi %v24, %v26, %v24, 4 +; CHECK: br %r14 + %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, + <2 x i32> <i32 3, i32 0> + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-09.ll b/llvm/test/CodeGen/SystemZ/vec-perm-09.ll new file mode 100644 index 00000000000..9c9632cf030 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-09.ll @@ -0,0 +1,38 @@ +; Test general vector permute of a v16i8. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ +; RUN: FileCheck -check-prefix=CHECK-CODE %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ +; RUN: FileCheck -check-prefix=CHECK-VECTOR %s + +define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-CODE-LABEL: f1: +; CHECK-CODE: larl [[REG:%r[0-5]]], +; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]]) +; CHECK-CODE: vperm %v24, %v24, %v26, [[MASK]] +; CHECK-CODE: br %r14 +; +; CHECK-VECTOR: .byte 1 +; CHECK-VECTOR-NEXT: .byte 19 +; CHECK-VECTOR-NEXT: .byte 6 +; CHECK-VECTOR-NEXT: .byte 5 +; CHECK-VECTOR-NEXT: .byte 20 +; CHECK-VECTOR-NEXT: .byte 22 +; CHECK-VECTOR-NEXT: .byte 1 +; CHECK-VECTOR-NEXT: .byte 1 +; CHECK-VECTOR-NEXT: .byte 25 +; CHECK-VECTOR-NEXT: .byte 29 +; CHECK-VECTOR-NEXT: .byte 11 +; Any byte would be OK here +; CHECK-VECTOR-NEXT: .space 1 +; CHECK-VECTOR-NEXT: .byte 31 +; CHECK-VECTOR-NEXT: .byte 4 +; CHECK-VECTOR-NEXT: .byte 15 +; CHECK-VECTOR-NEXT: .byte 19 + %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, + <16 x i32> <i32 1, i32 19, i32 6, i32 5, + i32 20, i32 22, i32 1, i32 1, + i32 25, i32 29, i32 11, i32 undef, + i32 31, i32 4, i32 15, i32 19> + ret <16 x i8> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-10.ll b/llvm/test/CodeGen/SystemZ/vec-perm-10.ll new file mode 100644 index 00000000000..382e6dc4c3f --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-10.ll @@ -0,0 +1,36 @@ +; Test general vector permute of a v8i16. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ +; RUN: FileCheck -check-prefix=CHECK-CODE %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ +; RUN: FileCheck -check-prefix=CHECK-VECTOR %s + +define <8 x i16> @f1(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-CODE-LABEL: f1: +; CHECK-CODE: larl [[REG:%r[0-5]]], +; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]]) +; CHECK-CODE: vperm %v24, %v26, %v24, [[MASK]] +; CHECK-CODE: br %r14 +; +; CHECK-VECTOR: .byte 0 +; CHECK-VECTOR-NEXT: .byte 1 +; CHECK-VECTOR-NEXT: .byte 26 +; CHECK-VECTOR-NEXT: .byte 27 +; Any 2 bytes would be OK here +; CHECK-VECTOR-NEXT: .space 1 +; CHECK-VECTOR-NEXT: .space 1 +; CHECK-VECTOR-NEXT: .byte 28 +; CHECK-VECTOR-NEXT: .byte 29 +; CHECK-VECTOR-NEXT: .byte 6 +; CHECK-VECTOR-NEXT: .byte 7 +; CHECK-VECTOR-NEXT: .byte 14 +; CHECK-VECTOR-NEXT: .byte 15 +; CHECK-VECTOR-NEXT: .byte 8 +; CHECK-VECTOR-NEXT: .byte 9 +; CHECK-VECTOR-NEXT: .byte 16 +; CHECK-VECTOR-NEXT: .byte 17 + %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, + <8 x i32> <i32 8, i32 5, i32 undef, i32 6, + i32 11, i32 15, i32 12, i32 0> + ret <8 x i16> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-11.ll b/llvm/test/CodeGen/SystemZ/vec-perm-11.ll new file mode 100644 index 00000000000..c9e29880fe0 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-11.ll @@ -0,0 +1,35 @@ +; Test general vector permute of a v4i32. 
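+; (As in the v16i8 and v8i16 tests, the undef element leaves four bytes of
+; the permute vector unconstrained; the backend emits them as .space, and
+; as the comment in the checks notes, any values would do.)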
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ +; RUN: FileCheck -check-prefix=CHECK-CODE %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ +; RUN: FileCheck -check-prefix=CHECK-VECTOR %s + +define <4 x i32> @f1(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-CODE-LABEL: f1: +; CHECK-CODE: larl [[REG:%r[0-5]]], +; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]]) +; CHECK-CODE: vperm %v24, %v26, %v24, [[MASK]] +; CHECK-CODE: br %r14 +; +; CHECK-VECTOR: .byte 4 +; CHECK-VECTOR-NEXT: .byte 5 +; CHECK-VECTOR-NEXT: .byte 6 +; CHECK-VECTOR-NEXT: .byte 7 +; CHECK-VECTOR-NEXT: .byte 20 +; CHECK-VECTOR-NEXT: .byte 21 +; CHECK-VECTOR-NEXT: .byte 22 +; CHECK-VECTOR-NEXT: .byte 23 +; Any 4 bytes would be OK here +; CHECK-VECTOR-NEXT: .space 1 +; CHECK-VECTOR-NEXT: .space 1 +; CHECK-VECTOR-NEXT: .space 1 +; CHECK-VECTOR-NEXT: .space 1 +; CHECK-VECTOR-NEXT: .byte 12 +; CHECK-VECTOR-NEXT: .byte 13 +; CHECK-VECTOR-NEXT: .byte 14 +; CHECK-VECTOR-NEXT: .byte 15 + %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> <i32 5, i32 1, i32 undef, i32 7> + ret <4 x i32> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-shift-01.ll b/llvm/test/CodeGen/SystemZ/vec-shift-01.ll new file mode 100644 index 00000000000..be8605b182c --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-shift-01.ll @@ -0,0 +1,39 @@ +; Test vector shift left with vector shift amount. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 shift. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: veslvb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = shl <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 shift. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: veslvh %v24, %v26, %v28 +; CHECK: br %r14 + %ret = shl <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 shift. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: veslvf %v24, %v26, %v28 +; CHECK: br %r14 + %ret = shl <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 shift. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: veslvg %v24, %v26, %v28 +; CHECK: br %r14 + %ret = shl <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-shift-02.ll b/llvm/test/CodeGen/SystemZ/vec-shift-02.ll new file mode 100644 index 00000000000..2825872e023 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-shift-02.ll @@ -0,0 +1,39 @@ +; Test vector arithmetic shift right with vector shift amount. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 shift. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vesravb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = ashr <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 shift. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vesravh %v24, %v26, %v28 +; CHECK: br %r14 + %ret = ashr <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 shift. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vesravf %v24, %v26, %v28 +; CHECK: br %r14 + %ret = ashr <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 shift. 
+define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vesravg %v24, %v26, %v28 +; CHECK: br %r14 + %ret = ashr <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-shift-03.ll b/llvm/test/CodeGen/SystemZ/vec-shift-03.ll new file mode 100644 index 00000000000..c923d8b5d45 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-shift-03.ll @@ -0,0 +1,39 @@ +; Test vector logical shift right with vector shift amount. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 shift. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vesrlvb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = lshr <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 shift. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vesrlvh %v24, %v26, %v28 +; CHECK: br %r14 + %ret = lshr <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 shift. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vesrlvf %v24, %v26, %v28 +; CHECK: br %r14 + %ret = lshr <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 shift. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vesrlvg %v24, %v26, %v28 +; CHECK: br %r14 + %ret = lshr <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-shift-04.ll b/llvm/test/CodeGen/SystemZ/vec-shift-04.ll new file mode 100644 index 00000000000..6fd12897bf5 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-shift-04.ll @@ -0,0 +1,134 @@ +; Test vector shift left with scalar shift amount. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 shift by a variable. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, i32 %shift) { +; CHECK-LABEL: f1: +; CHECK: veslb %v24, %v26, 0(%r2) +; CHECK: br %r14 + %truncshift = trunc i32 %shift to i8 + %shiftvec = insertelement <16 x i8> undef, i8 %truncshift, i32 0 + %val2 = shufflevector <16 x i8> %shiftvec, <16 x i8> undef, + <16 x i32> zeroinitializer + %ret = shl <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v16i8 shift by the lowest useful constant. +define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val) { +; CHECK-LABEL: f2: +; CHECK: veslb %v24, %v26, 1 +; CHECK: br %r14 + %ret = shl <16 x i8> %val, <i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1> + ret <16 x i8> %ret +} + +; Test a v16i8 shift by the highest useful constant. +define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val) { +; CHECK-LABEL: f3: +; CHECK: veslb %v24, %v26, 7 +; CHECK: br %r14 + %ret = shl <16 x i8> %val, <i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7> + ret <16 x i8> %ret +} + +; Test a v8i16 shift by a variable. +define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, i32 %shift) { +; CHECK-LABEL: f4: +; CHECK: veslh %v24, %v26, 0(%r2) +; CHECK: br %r14 + %truncshift = trunc i32 %shift to i16 + %shiftvec = insertelement <8 x i16> undef, i16 %truncshift, i32 0 + %val2 = shufflevector <8 x i16> %shiftvec, <8 x i16> undef, + <8 x i32> zeroinitializer + %ret = shl <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v8i16 shift by the lowest useful constant. 
+define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val) { +; CHECK-LABEL: f5: +; CHECK: veslh %v24, %v26, 1 +; CHECK: br %r14 + %ret = shl <8 x i16> %val, <i16 1, i16 1, i16 1, i16 1, + i16 1, i16 1, i16 1, i16 1> + ret <8 x i16> %ret +} + +; Test a v8i16 shift by the highest useful constant. +define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val) { +; CHECK-LABEL: f6: +; CHECK: veslh %v24, %v26, 15 +; CHECK: br %r14 + %ret = shl <8 x i16> %val, <i16 15, i16 15, i16 15, i16 15, + i16 15, i16 15, i16 15, i16 15> + ret <8 x i16> %ret +} + +; Test a v4i32 shift by a variable. +define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, i32 %shift) { +; CHECK-LABEL: f7: +; CHECK: veslf %v24, %v26, 0(%r2) +; CHECK: br %r14 + %shiftvec = insertelement <4 x i32> undef, i32 %shift, i32 0 + %val2 = shufflevector <4 x i32> %shiftvec, <4 x i32> undef, + <4 x i32> zeroinitializer + %ret = shl <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v4i32 shift by the lowest useful constant. +define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val) { +; CHECK-LABEL: f8: +; CHECK: veslf %v24, %v26, 1 +; CHECK: br %r14 + %ret = shl <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %ret +} + +; Test a v4i32 shift by the highest useful constant. +define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val) { +; CHECK-LABEL: f9: +; CHECK: veslf %v24, %v26, 31 +; CHECK: br %r14 + %ret = shl <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31> + ret <4 x i32> %ret +} + +; Test a v2i64 shift by a variable. +define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, i32 %shift) { +; CHECK-LABEL: f10: +; CHECK: veslg %v24, %v26, 0(%r2) +; CHECK: br %r14 + %extshift = sext i32 %shift to i64 + %shiftvec = insertelement <2 x i64> undef, i64 %extshift, i32 0 + %val2 = shufflevector <2 x i64> %shiftvec, <2 x i64> undef, + <2 x i32> zeroinitializer + %ret = shl <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} + +; Test a v2i64 shift by the lowest useful constant. +define <2 x i64> @f11(<2 x i64> %dummy, <2 x i64> %val) { +; CHECK-LABEL: f11: +; CHECK: veslg %v24, %v26, 1 +; CHECK: br %r14 + %ret = shl <2 x i64> %val, <i64 1, i64 1> + ret <2 x i64> %ret +} + +; Test a v2i64 shift by the highest useful constant. +define <2 x i64> @f12(<2 x i64> %dummy, <2 x i64> %val) { +; CHECK-LABEL: f12: +; CHECK: veslg %v24, %v26, 63 +; CHECK: br %r14 + %ret = shl <2 x i64> %val, <i64 63, i64 63> + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-shift-05.ll b/llvm/test/CodeGen/SystemZ/vec-shift-05.ll new file mode 100644 index 00000000000..22ce46b2d0d --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-shift-05.ll @@ -0,0 +1,134 @@ +; Test vector arithmetic shift right with scalar shift amount. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 shift by a variable. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, i32 %shift) { +; CHECK-LABEL: f1: +; CHECK: vesrab %v24, %v26, 0(%r2) +; CHECK: br %r14 + %truncshift = trunc i32 %shift to i8 + %shiftvec = insertelement <16 x i8> undef, i8 %truncshift, i32 0 + %val2 = shufflevector <16 x i8> %shiftvec, <16 x i8> undef, + <16 x i32> zeroinitializer + %ret = ashr <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v16i8 shift by the lowest useful constant. 
+define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val) { +; CHECK-LABEL: f2: +; CHECK: vesrab %v24, %v26, 1 +; CHECK: br %r14 + %ret = ashr <16 x i8> %val, <i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1> + ret <16 x i8> %ret +} + +; Test a v16i8 shift by the highest useful constant. +define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val) { +; CHECK-LABEL: f3: +; CHECK: vesrab %v24, %v26, 7 +; CHECK: br %r14 + %ret = ashr <16 x i8> %val, <i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7> + ret <16 x i8> %ret +} + +; Test a v8i16 shift by a variable. +define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, i32 %shift) { +; CHECK-LABEL: f4: +; CHECK: vesrah %v24, %v26, 0(%r2) +; CHECK: br %r14 + %truncshift = trunc i32 %shift to i16 + %shiftvec = insertelement <8 x i16> undef, i16 %truncshift, i32 0 + %val2 = shufflevector <8 x i16> %shiftvec, <8 x i16> undef, + <8 x i32> zeroinitializer + %ret = ashr <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v8i16 shift by the lowest useful constant. +define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val) { +; CHECK-LABEL: f5: +; CHECK: vesrah %v24, %v26, 1 +; CHECK: br %r14 + %ret = ashr <8 x i16> %val, <i16 1, i16 1, i16 1, i16 1, + i16 1, i16 1, i16 1, i16 1> + ret <8 x i16> %ret +} + +; Test a v8i16 shift by the highest useful constant. +define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val) { +; CHECK-LABEL: f6: +; CHECK: vesrah %v24, %v26, 15 +; CHECK: br %r14 + %ret = ashr <8 x i16> %val, <i16 15, i16 15, i16 15, i16 15, + i16 15, i16 15, i16 15, i16 15> + ret <8 x i16> %ret +} + +; Test a v4i32 shift by a variable. +define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, i32 %shift) { +; CHECK-LABEL: f7: +; CHECK: vesraf %v24, %v26, 0(%r2) +; CHECK: br %r14 + %shiftvec = insertelement <4 x i32> undef, i32 %shift, i32 0 + %val2 = shufflevector <4 x i32> %shiftvec, <4 x i32> undef, + <4 x i32> zeroinitializer + %ret = ashr <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v4i32 shift by the lowest useful constant. +define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val) { +; CHECK-LABEL: f8: +; CHECK: vesraf %v24, %v26, 1 +; CHECK: br %r14 + %ret = ashr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %ret +} + +; Test a v4i32 shift by the highest useful constant. +define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val) { +; CHECK-LABEL: f9: +; CHECK: vesraf %v24, %v26, 31 +; CHECK: br %r14 + %ret = ashr <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31> + ret <4 x i32> %ret +} + +; Test a v2i64 shift by a variable. +define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, i32 %shift) { +; CHECK-LABEL: f10: +; CHECK: vesrag %v24, %v26, 0(%r2) +; CHECK: br %r14 + %extshift = sext i32 %shift to i64 + %shiftvec = insertelement <2 x i64> undef, i64 %extshift, i32 0 + %val2 = shufflevector <2 x i64> %shiftvec, <2 x i64> undef, + <2 x i32> zeroinitializer + %ret = ashr <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} + +; Test a v2i64 shift by the lowest useful constant. +define <2 x i64> @f11(<2 x i64> %dummy, <2 x i64> %val) { +; CHECK-LABEL: f11: +; CHECK: vesrag %v24, %v26, 1 +; CHECK: br %r14 + %ret = ashr <2 x i64> %val, <i64 1, i64 1> + ret <2 x i64> %ret +} + +; Test a v2i64 shift by the highest useful constant. 
+define <2 x i64> @f12(<2 x i64> %dummy, <2 x i64> %val) { +; CHECK-LABEL: f12: +; CHECK: vesrag %v24, %v26, 63 +; CHECK: br %r14 + %ret = ashr <2 x i64> %val, <i64 63, i64 63> + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-shift-06.ll b/llvm/test/CodeGen/SystemZ/vec-shift-06.ll new file mode 100644 index 00000000000..8a5bb0a9a55 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-shift-06.ll @@ -0,0 +1,134 @@ +; Test vector logical shift right with scalar shift amount. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 shift by a variable. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, i32 %shift) { +; CHECK-LABEL: f1: +; CHECK: vesrlb %v24, %v26, 0(%r2) +; CHECK: br %r14 + %truncshift = trunc i32 %shift to i8 + %shiftvec = insertelement <16 x i8> undef, i8 %truncshift, i32 0 + %val2 = shufflevector <16 x i8> %shiftvec, <16 x i8> undef, + <16 x i32> zeroinitializer + %ret = lshr <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v16i8 shift by the lowest useful constant. +define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val) { +; CHECK-LABEL: f2: +; CHECK: vesrlb %v24, %v26, 1 +; CHECK: br %r14 + %ret = lshr <16 x i8> %val, <i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1, + i8 1, i8 1, i8 1, i8 1> + ret <16 x i8> %ret +} + +; Test a v16i8 shift by the highest useful constant. +define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val) { +; CHECK-LABEL: f3: +; CHECK: vesrlb %v24, %v26, 7 +; CHECK: br %r14 + %ret = lshr <16 x i8> %val, <i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7, + i8 7, i8 7, i8 7, i8 7> + ret <16 x i8> %ret +} + +; Test a v8i16 shift by a variable. +define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, i32 %shift) { +; CHECK-LABEL: f4: +; CHECK: vesrlh %v24, %v26, 0(%r2) +; CHECK: br %r14 + %truncshift = trunc i32 %shift to i16 + %shiftvec = insertelement <8 x i16> undef, i16 %truncshift, i32 0 + %val2 = shufflevector <8 x i16> %shiftvec, <8 x i16> undef, + <8 x i32> zeroinitializer + %ret = lshr <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v8i16 shift by the lowest useful constant. +define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val) { +; CHECK-LABEL: f5: +; CHECK: vesrlh %v24, %v26, 1 +; CHECK: br %r14 + %ret = lshr <8 x i16> %val, <i16 1, i16 1, i16 1, i16 1, + i16 1, i16 1, i16 1, i16 1> + ret <8 x i16> %ret +} + +; Test a v8i16 shift by the highest useful constant. +define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val) { +; CHECK-LABEL: f6: +; CHECK: vesrlh %v24, %v26, 15 +; CHECK: br %r14 + %ret = lshr <8 x i16> %val, <i16 15, i16 15, i16 15, i16 15, + i16 15, i16 15, i16 15, i16 15> + ret <8 x i16> %ret +} + +; Test a v4i32 shift by a variable. +define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, i32 %shift) { +; CHECK-LABEL: f7: +; CHECK: vesrlf %v24, %v26, 0(%r2) +; CHECK: br %r14 + %shiftvec = insertelement <4 x i32> undef, i32 %shift, i32 0 + %val2 = shufflevector <4 x i32> %shiftvec, <4 x i32> undef, + <4 x i32> zeroinitializer + %ret = lshr <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v4i32 shift by the lowest useful constant. +define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val) { +; CHECK-LABEL: f8: +; CHECK: vesrlf %v24, %v26, 1 +; CHECK: br %r14 + %ret = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %ret +} + +; Test a v4i32 shift by the highest useful constant. 
+define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val) { +; CHECK-LABEL: f9: +; CHECK: vesrlf %v24, %v26, 31 +; CHECK: br %r14 + %ret = lshr <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31> + ret <4 x i32> %ret +} + +; Test a v2i64 shift by a variable. +define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, i32 %shift) { +; CHECK-LABEL: f10: +; CHECK: vesrlg %v24, %v26, 0(%r2) +; CHECK: br %r14 + %extshift = sext i32 %shift to i64 + %shiftvec = insertelement <2 x i64> undef, i64 %extshift, i32 0 + %val2 = shufflevector <2 x i64> %shiftvec, <2 x i64> undef, + <2 x i32> zeroinitializer + %ret = lshr <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} + +; Test a v2i64 shift by the lowest useful constant. +define <2 x i64> @f11(<2 x i64> %dummy, <2 x i64> %val) { +; CHECK-LABEL: f11: +; CHECK: vesrlg %v24, %v26, 1 +; CHECK: br %r14 + %ret = lshr <2 x i64> %val, <i64 1, i64 1> + ret <2 x i64> %ret +} + +; Test a v2i64 shift by the highest useful constant. +define <2 x i64> @f12(<2 x i64> %dummy, <2 x i64> %val) { +; CHECK-LABEL: f12: +; CHECK: vesrlg %v24, %v26, 63 +; CHECK: br %r14 + %ret = lshr <2 x i64> %val, <i64 63, i64 63> + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-shift-07.ll b/llvm/test/CodeGen/SystemZ/vec-shift-07.ll new file mode 100644 index 00000000000..f229c5e25a4 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-shift-07.ll @@ -0,0 +1,182 @@ +; Test vector sign extensions. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i1->v16i8 extension. +define <16 x i8> @f1(<16 x i8> %val) { +; CHECK-LABEL: f1: +; CHECK: veslb [[REG:%v[0-9]+]], %v24, 7 +; CHECK: vesrab %v24, [[REG]], 7 +; CHECK: br %r14 + %trunc = trunc <16 x i8> %val to <16 x i1> + %ret = sext <16 x i1> %trunc to <16 x i8> + ret <16 x i8> %ret +} + +; Test a v8i1->v8i16 extension. +define <8 x i16> @f2(<8 x i16> %val) { +; CHECK-LABEL: f2: +; CHECK: veslh [[REG:%v[0-9]+]], %v24, 15 +; CHECK: vesrah %v24, [[REG]], 15 +; CHECK: br %r14 + %trunc = trunc <8 x i16> %val to <8 x i1> + %ret = sext <8 x i1> %trunc to <8 x i16> + ret <8 x i16> %ret +} + +; Test a v8i8->v8i16 extension. +define <8 x i16> @f3(<8 x i16> %val) { +; CHECK-LABEL: f3: +; CHECK: veslh [[REG:%v[0-9]+]], %v24, 8 +; CHECK: vesrah %v24, [[REG]], 8 +; CHECK: br %r14 + %trunc = trunc <8 x i16> %val to <8 x i8> + %ret = sext <8 x i8> %trunc to <8 x i16> + ret <8 x i16> %ret +} + +; Test a v4i1->v4i32 extension. +define <4 x i32> @f4(<4 x i32> %val) { +; CHECK-LABEL: f4: +; CHECK: veslf [[REG:%v[0-9]+]], %v24, 31 +; CHECK: vesraf %v24, [[REG]], 31 +; CHECK: br %r14 + %trunc = trunc <4 x i32> %val to <4 x i1> + %ret = sext <4 x i1> %trunc to <4 x i32> + ret <4 x i32> %ret +} + +; Test a v4i8->v4i32 extension. +define <4 x i32> @f5(<4 x i32> %val) { +; CHECK-LABEL: f5: +; CHECK: veslf [[REG:%v[0-9]+]], %v24, 24 +; CHECK: vesraf %v24, [[REG]], 24 +; CHECK: br %r14 + %trunc = trunc <4 x i32> %val to <4 x i8> + %ret = sext <4 x i8> %trunc to <4 x i32> + ret <4 x i32> %ret +} + +; Test a v4i16->v4i32 extension. +define <4 x i32> @f6(<4 x i32> %val) { +; CHECK-LABEL: f6: +; CHECK: veslf [[REG:%v[0-9]+]], %v24, 16 +; CHECK: vesraf %v24, [[REG]], 16 +; CHECK: br %r14 + %trunc = trunc <4 x i32> %val to <4 x i16> + %ret = sext <4 x i16> %trunc to <4 x i32> + ret <4 x i32> %ret +} + +; Test a v2i1->v2i64 extension. 
+define <2 x i64> @f7(<2 x i64> %val) { +; CHECK-LABEL: f7: +; CHECK: veslg [[REG:%v[0-9]+]], %v24, 63 +; CHECK: vesrag %v24, [[REG]], 63 +; CHECK: br %r14 + %trunc = trunc <2 x i64> %val to <2 x i1> + %ret = sext <2 x i1> %trunc to <2 x i64> + ret <2 x i64> %ret +} + +; Test a v2i8->v2i64 extension. +define <2 x i64> @f8(<2 x i64> %val) { +; CHECK-LABEL: f8: +; CHECK: vsegb %v24, %v24 +; CHECK: br %r14 + %trunc = trunc <2 x i64> %val to <2 x i8> + %ret = sext <2 x i8> %trunc to <2 x i64> + ret <2 x i64> %ret +} + +; Test a v2i16->v2i64 extension. +define <2 x i64> @f9(<2 x i64> %val) { +; CHECK-LABEL: f9: +; CHECK: vsegh %v24, %v24 +; CHECK: br %r14 + %trunc = trunc <2 x i64> %val to <2 x i16> + %ret = sext <2 x i16> %trunc to <2 x i64> + ret <2 x i64> %ret +} + +; Test a v2i32->v2i64 extension. +define <2 x i64> @f10(<2 x i64> %val) { +; CHECK-LABEL: f10: +; CHECK: vsegf %v24, %v24 +; CHECK: br %r14 + %trunc = trunc <2 x i64> %val to <2 x i32> + %ret = sext <2 x i32> %trunc to <2 x i64> + ret <2 x i64> %ret +} + +; Test an alternative v2i8->v2i64 extension. +define <2 x i64> @f11(<2 x i64> %val) { +; CHECK-LABEL: f11: +; CHECK: vsegb %v24, %v24 +; CHECK: br %r14 + %shl = shl <2 x i64> %val, <i64 56, i64 56> + %ret = ashr <2 x i64> %shl, <i64 56, i64 56> + ret <2 x i64> %ret +} + +; Test an alternative v2i16->v2i64 extension. +define <2 x i64> @f12(<2 x i64> %val) { +; CHECK-LABEL: f12: +; CHECK: vsegh %v24, %v24 +; CHECK: br %r14 + %shl = shl <2 x i64> %val, <i64 48, i64 48> + %ret = ashr <2 x i64> %shl, <i64 48, i64 48> + ret <2 x i64> %ret +} + +; Test an alternative v2i32->v2i64 extension. +define <2 x i64> @f13(<2 x i64> %val) { +; CHECK-LABEL: f13: +; CHECK: vsegf %v24, %v24 +; CHECK: br %r14 + %shl = shl <2 x i64> %val, <i64 32, i64 32> + %ret = ashr <2 x i64> %shl, <i64 32, i64 32> + ret <2 x i64> %ret +} + +; Test an extraction-based v2i8->v2i64 extension. +define <2 x i64> @f14(<16 x i8> %val) { +; CHECK-LABEL: f14: +; CHECK: vsegb %v24, %v24 +; CHECK: br %r14 + %elt0 = extractelement <16 x i8> %val, i32 7 + %elt1 = extractelement <16 x i8> %val, i32 15 + %ext0 = sext i8 %elt0 to i64 + %ext1 = sext i8 %elt1 to i64 + %vec0 = insertelement <2 x i64> undef, i64 %ext0, i32 0 + %vec1 = insertelement <2 x i64> %vec0, i64 %ext1, i32 1 + ret <2 x i64> %vec1 +} + +; Test an extraction-based v2i16->v2i64 extension. +define <2 x i64> @f15(<16 x i16> %val) { +; CHECK-LABEL: f15: +; CHECK: vsegh %v24, %v24 +; CHECK: br %r14 + %elt0 = extractelement <16 x i16> %val, i32 3 + %elt1 = extractelement <16 x i16> %val, i32 7 + %ext0 = sext i16 %elt0 to i64 + %ext1 = sext i16 %elt1 to i64 + %vec0 = insertelement <2 x i64> undef, i64 %ext0, i32 0 + %vec1 = insertelement <2 x i64> %vec0, i64 %ext1, i32 1 + ret <2 x i64> %vec1 +} + +; Test an extraction-based v2i32->v2i64 extension. +define <2 x i64> @f16(<16 x i32> %val) { +; CHECK-LABEL: f16: +; CHECK: vsegf %v24, %v24 +; CHECK: br %r14 + %elt0 = extractelement <16 x i32> %val, i32 1 + %elt1 = extractelement <16 x i32> %val, i32 3 + %ext0 = sext i32 %elt0 to i64 + %ext1 = sext i32 %elt1 to i64 + %vec0 = insertelement <2 x i64> undef, i64 %ext0, i32 0 + %vec1 = insertelement <2 x i64> %vec0, i64 %ext1, i32 1 + ret <2 x i64> %vec1 +} diff --git a/llvm/test/CodeGen/SystemZ/vec-sub-01.ll b/llvm/test/CodeGen/SystemZ/vec-sub-01.ll new file mode 100644 index 00000000000..9e5b4f81e6d --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-sub-01.ll @@ -0,0 +1,39 @@ +; Test vector subtraction. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 subtraction. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vsb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = sub <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 subtraction. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vsh %v24, %v26, %v28 +; CHECK: br %r14 + %ret = sub <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 subtraction. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vsf %v24, %v26, %v28 +; CHECK: br %r14 + %ret = sub <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 subtraction. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vsg %v24, %v26, %v28 +; CHECK: br %r14 + %ret = sub <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/vec-xor-01.ll b/llvm/test/CodeGen/SystemZ/vec-xor-01.ll new file mode 100644 index 00000000000..063b768117c --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-xor-01.ll @@ -0,0 +1,39 @@ +; Test vector XOR. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 XOR. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 XOR. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 XOR. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 XOR. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <2 x i64> %val1, %val2 + ret <2 x i64> %ret +}
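+; (Unlike subtraction, which needs vsb/vsh/vsf/vsg per element size, XOR
+; is purely bitwise, so the same vx serves all four vector types.)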

