diff options
author | Ulrich Weigand <ulrich.weigand@de.ibm.com> | 2015-05-05 19:29:21 +0000 |
---|---|---|
committer | Ulrich Weigand <ulrich.weigand@de.ibm.com> | 2015-05-05 19:29:21 +0000 |
commit | cd2a1b5341a3c42b1a56f8f301bd0de0343b5e8e (patch) | |
tree | 1a531a74e28c6c4ffbffa35ad6872e8cbe5f38e3 /llvm/lib/Target/SystemZ | |
parent | 49506d78e7f437785f4d3f68063f4aa9c622bb2c (diff) | |
download | bcm5719-llvm-cd2a1b5341a3c42b1a56f8f301bd0de0343b5e8e.tar.gz bcm5719-llvm-cd2a1b5341a3c42b1a56f8f301bd0de0343b5e8e.zip |
[SystemZ] Handle sub-128 vectors
The ABI allows sub-128-bit vectors to be passed and returned in registers,
with the vector occupying the upper part of a register. We therefore
want to legalize those types by widening the vector rather than promoting
the elements.
The patch includes some simple tests for sub-128 vectors, as well as tests
that check we can recognize various pack sequences, some of which use
sub-128 vectors as temporary results. One of these forms is based on the
pack sequences generated by llvmpipe when no intrinsics are used.
Signed unpacks are recognized as BUILD_VECTORs whose elements are
individually sign-extended. Unsigned unpacks can have the equivalent
form with zero extension, but they also occur as shuffles in which some
elements are zero.
Based on a patch by Richard Sandiford.
llvm-svn: 236525
Diffstat (limited to 'llvm/lib/Target/SystemZ')
-rw-r--r-- | llvm/lib/Target/SystemZ/SystemZCallingConv.h | 17 | ||||
-rw-r--r-- | llvm/lib/Target/SystemZ/SystemZCallingConv.td | 17 | ||||
-rw-r--r-- | llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 81 | ||||
-rw-r--r-- | llvm/lib/Target/SystemZ/SystemZISelLowering.h | 28 | ||||
-rw-r--r-- | llvm/lib/Target/SystemZ/SystemZInstrVector.td | 24 | ||||
-rw-r--r-- | llvm/lib/Target/SystemZ/SystemZOperators.td | 15 |
6 files changed, 155 insertions, 27 deletions
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h index 8b8146762b6..bff0706618a 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h @@ -28,6 +28,14 @@ private: /// See ISD::OutputArg::IsFixed. SmallVector<bool, 4> ArgIsFixed; + /// Records whether the value was widened from a short vector type. + SmallVector<bool, 4> ArgIsShortVector; + + // Check whether ArgVT is a short vector type. + bool IsShortVectorType(EVT ArgVT) { + return ArgVT.isVector() && ArgVT.getStoreSize() <= 8; + } + public: SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, SmallVectorImpl<CCValAssign> &locs, LLVMContext &C) @@ -39,6 +47,10 @@ public: ArgIsFixed.clear(); for (unsigned i = 0; i < Ins.size(); ++i) ArgIsFixed.push_back(true); + // Record whether the call operand was a short vector. + ArgIsShortVector.clear(); + for (unsigned i = 0; i < Ins.size(); ++i) + ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT)); CCState::AnalyzeFormalArguments(Ins, Fn); } @@ -49,6 +61,10 @@ public: ArgIsFixed.clear(); for (unsigned i = 0; i < Outs.size(); ++i) ArgIsFixed.push_back(Outs[i].IsFixed); + // Record whether the call operand was a short vector. 
+ ArgIsShortVector.clear(); + for (unsigned i = 0; i < Outs.size(); ++i) + ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT)); CCState::AnalyzeCallOperands(Outs, Fn); } @@ -60,6 +76,7 @@ public: CCAssignFn Fn) = delete; bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; } + bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; } }; } // end namespace llvm diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index a2f996e60df..be8f00b57ad 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -21,6 +21,11 @@ class CCIfSubtarget<string F, CCAction A> class CCIfFixed<CCAction A> : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>; +// Match if this specific argument was widened from a short vector type. +class CCIfShortVector<CCAction A> + : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>; + + //===----------------------------------------------------------------------===// // z/Linux return value calling convention //===----------------------------------------------------------------------===// @@ -43,6 +48,8 @@ def RetCC_SystemZ : CallingConv<[ CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>, // Similarly for vectors, with V24 being the ABI-compliant choice. + // Sub-128 vectors are returned in the same way, but they're widened + // to one of these types during type legalization. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>> @@ -74,12 +81,20 @@ def CC_SystemZ : CallingConv<[ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>, CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>, - // The first 8 named vector arguments are passed in V24-V31. + // The first 8 named vector arguments are passed in V24-V31. 
Sub-128 vectors + // are passed in the same way, but they're widened to one of these types + // during type legalization. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfFixed<CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>>, + // However, sub-128 vectors which need to go on the stack occupy just a + // single 8-byte-aligned 8-byte stack slot. Pass as i64. + CCIfSubtarget<"hasVector()", + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfShortVector<CCBitConvertToType<i64>>>>, + // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index ff79a48179f..c3842519008 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -318,6 +318,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, // Convert a GPR scalar to a vector by inserting it into element 0. setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + // Use a series of unpacks for extensions. + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); + // Detect shifts by a scalar amount and convert them into // V*_BY_SCALAR. setOperationAction(ISD::SHL, VT, Custom); @@ -793,7 +797,15 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL, else if (VA.getLocInfo() == CCValAssign::Indirect) Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value, MachinePointerInfo(), false, false, false, 0); - else + else if (VA.getLocInfo() == CCValAssign::BCvt) { + // If this is a short vector argument loaded from the stack, + // extend from i64 to full vector size and then bitcast. 
+ assert(VA.getLocVT() == MVT::i64); + assert(VA.getValVT().isVector()); + Value = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64, + Value, DAG.getUNDEF(MVT::i64)); + Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value); + } else assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo"); return Value; } @@ -810,6 +822,14 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL, return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value); case CCValAssign::AExt: return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value); + case CCValAssign::BCvt: + // If this is a short vector argument to be stored to the stack, + // bitcast to v2i64 and then extract first element. + assert(VA.getLocVT() == MVT::i64); + assert(VA.getValVT().isVector()); + Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value, + DAG.getConstant(0, DL, MVT::i32)); case CCValAssign::Full: return Value; default: @@ -3910,6 +3930,23 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, return DAG.getNode(ISD::BITCAST, DL, VT, Res); } +SDValue +SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, + unsigned UnpackHigh) const { + SDValue PackedOp = Op.getOperand(0); + EVT OutVT = Op.getValueType(); + EVT InVT = PackedOp.getValueType(); + unsigned ToBits = OutVT.getVectorElementType().getSizeInBits(); + unsigned FromBits = InVT.getVectorElementType().getSizeInBits(); + do { + FromBits *= 2; + EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits), + SystemZ::VectorBits / FromBits); + PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp); + } while (FromBits != ToBits); + return PackedOp; +} + SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const { // Look for cases where a vector shift can use the *_BY_SCALAR form. 
@@ -4058,6 +4095,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::SIGN_EXTEND_VECTOR_INREG: + return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH); + case ISD::ZERO_EXTEND_VECTOR_INREG: + return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH); case ISD::SHL: return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR); case ISD::SRL: @@ -4122,6 +4163,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(PERMUTE_DWORDS); OPCODE(PERMUTE); OPCODE(PACK); + OPCODE(UNPACK_HIGH); + OPCODE(UNPACKL_HIGH); + OPCODE(UNPACK_LOW); + OPCODE(UNPACKL_LOW); OPCODE(VSHL_BY_SCALAR); OPCODE(VSRL_BY_SCALAR); OPCODE(VSRA_BY_SCALAR); @@ -4334,17 +4379,35 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, } } } - // (z_merge_high 0, 0) -> 0. This is mostly useful for using VLLEZF - // for v4f32. - if (Opcode == SystemZISD::MERGE_HIGH) { + if (Opcode == SystemZISD::MERGE_HIGH || + Opcode == SystemZISD::MERGE_LOW) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); - if (Op0 == Op1) { - if (Op0.getOpcode() == ISD::BITCAST) - Op0 = Op0.getOperand(0); - if (Op0.getOpcode() == SystemZISD::BYTE_MASK && - cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) + if (Op0.getOpcode() == ISD::BITCAST) + Op0 = Op0.getOperand(0); + if (Op0.getOpcode() == SystemZISD::BYTE_MASK && + cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) { + // (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF + // for v4f32. + if (Op1 == N->getOperand(0)) return Op1; + // (z_merge_? 0, X) -> (z_unpackl_? 0, X). + EVT VT = Op1.getValueType(); + unsigned ElemBytes = VT.getVectorElementType().getStoreSize(); + if (ElemBytes <= 4) { + Opcode = (Opcode == SystemZISD::MERGE_HIGH ? 
+ SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW); + EVT InVT = VT.changeVectorElementTypeToInteger(); + EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16), + SystemZ::VectorBytes / ElemBytes / 2); + if (VT != InVT) { + Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1); + DCI.AddToWorklist(Op1.getNode()); + } + SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1); + DCI.AddToWorklist(Op.getNode()); + return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); + } } } // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 24a3f4bb5d4..7a3b6fa85ae 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -201,6 +201,15 @@ enum { // Pack vector operands 0 and 1 into a single vector with half-sized elements. PACK, + // Unpack the first half of vector operand 0 into double-sized elements. + // UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends. + UNPACK_HIGH, + UNPACKL_HIGH, + + // Likewise for the second half. + UNPACK_LOW, + UNPACKL_LOW, + // Shift each element of vector operand 0 by the number of bits specified // by scalar operand 1. VSHL_BY_SCALAR, @@ -306,6 +315,23 @@ public: // want to clobber the upper 32 bits of a GPR unnecessarily. return MVT::i32; } + TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) + const override { + // Widen subvectors to the full width rather than promoting integer + // elements. This is better because: + // + // (a) it means that we can handle the ABI for passing and returning + // sub-128 vectors without having to handle them as legal types. + // + // (b) we don't have instructions to extend on load and truncate on store, + // so promoting the integers is less efficient. + // + // (c) there are no multiplication instructions for the widest integer + // type (v2i64). 
+ if (VT.getVectorElementType().getSizeInBits() % 8 == 0) + return TypeWidenVector; + return TargetLoweringBase::getPreferredVectorAction(VT); + } EVT getSetCCResultType(LLVMContext &, EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; @@ -417,6 +443,8 @@ private: SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, + unsigned UnpackHigh) const; SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp, diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td index 8abaeb69a20..f95714d1e70 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td @@ -290,24 +290,24 @@ let Predicates = [FeatureVector] in { def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>; // Unpack high. - def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, null_frag, v128h, v128b, 0>; - def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, null_frag, v128f, v128h, 1>; - def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, null_frag, v128g, v128f, 2>; + def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>; + def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>; + def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>; // Unpack logical high. 
- def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, null_frag, v128h, v128b, 0>; - def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, null_frag, v128f, v128h, 1>; - def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, null_frag, v128g, v128f, 2>; + def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>; + def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>; + def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>; // Unpack low. - def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, null_frag, v128h, v128b, 0>; - def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, null_frag, v128f, v128h, 1>; - def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, null_frag, v128g, v128f, 2>; + def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, z_unpack_low, v128h, v128b, 0>; + def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>; + def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, z_unpack_low, v128g, v128f, 2>; // Unpack logical low. - def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, null_frag, v128h, v128b, 0>; - def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, null_frag, v128f, v128h, 1>; - def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, null_frag, v128g, v128f, 2>; + def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>; + def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>; + def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index 63c217413ac..9bf288aa68e 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -193,6 +193,10 @@ def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS", SDT_ZVecTernaryInt>; def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>; def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>; +def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>; +def 
z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>; +def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>; +def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>; def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR", @@ -544,11 +548,12 @@ def z_vllezi64 : PatFrag<(ops node:$addr), def z_vllezf32 : PatFrag<(ops node:$addr), (bitconvert (z_merge_high - (v2i64 (bitconvert - (z_merge_high - (v4f32 (z_vzero)), - (v4f32 (scalar_to_vector - (f32 (load node:$addr))))))), + (v2i64 + (z_unpackl_high + (v4i32 + (bitconvert + (v4f32 (scalar_to_vector + (f32 (load node:$addr)))))))), (v2i64 (z_vzero))))>; def z_vllezf64 : PatFrag<(ops node:$addr), (z_merge_high |