diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-06-03 08:06:03 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-06-03 08:06:03 +0000 |
| commit | e85506b6e0de80276b8f1a778dd6156497681829 (patch) | |
| tree | 10cd578fca25e585c92d9a9e337c4ccb3267eabe /llvm/lib | |
| parent | d80c7c4808a7705decdcf9f4d9937ac02db6d6a1 (diff) | |
| download | bcm5719-llvm-e85506b6e0de80276b8f1a778dd6156497681829.tar.gz bcm5719-llvm-e85506b6e0de80276b8f1a778dd6156497681829.zip | |
[X86][XOP] Support for VPERMIL2PD/VPERMIL2PS 2-input shuffle instructions
This patch begins adding support for lowering to the XOP VPERMIL2PD/VPERMIL2PS shuffle instructions - adding the X86ISD::VPERMIL2 opcode and cleaning up the usage.
The internal LLVM intrinsics were assuming the shuffle mask operand was the same type as the float/double input operands (I guess to simplify the intrinsic definitions in X86InstrXOP.td to a single value type). These needed changing to integer types (matching the clang builtin and the AMD intrinsics definitions), and an auto-upgrade path is added to convert old calls.
Mask decoding/target shuffle support will be added in future patches.
Differential Revision: http://reviews.llvm.org/D20049
llvm-svn: 271633
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/IR/AutoUpgrade.cpp | 35 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 7 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrXOP.td | 42 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86IntrinsicsInfo.h | 4 |
6 files changed, 74 insertions, 17 deletions
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 689859d4f67..99aa2376fed 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -282,6 +282,27 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { NewFn = F; return true; } + // Upgrade any XOP PERMIL2 index operand still using a float/double vector. + if (Name.startswith("x86.xop.vpermil2")) { + auto Params = F->getFunctionType()->params(); + auto Idx = Params[2]; + if (Idx->getScalarType()->isFloatingPointTy()) { + F->setName(Name + ".old"); + unsigned IdxSize = Idx->getPrimitiveSizeInBits(); + unsigned EltSize = Idx->getScalarSizeInBits(); + Intrinsic::ID Permil2ID; + if (EltSize == 64 && IdxSize == 128) + Permil2ID = Intrinsic::x86_xop_vpermil2pd; + else if (EltSize == 32 && IdxSize == 128) + Permil2ID = Intrinsic::x86_xop_vpermil2ps; + else if (EltSize == 64 && IdxSize == 256) + Permil2ID = Intrinsic::x86_xop_vpermil2pd_256; + else + Permil2ID = Intrinsic::x86_xop_vpermil2ps_256; + NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID); + return true; + } + } break; } } @@ -911,6 +932,20 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { CI->eraseFromParent(); return; + case Intrinsic::x86_xop_vpermil2pd: + case Intrinsic::x86_xop_vpermil2ps: + case Intrinsic::x86_xop_vpermil2pd_256: + case Intrinsic::x86_xop_vpermil2ps_256: { + SmallVector<Value *, 4> Args(CI->arg_operands().begin(), + CI->arg_operands().end()); + VectorType *FltIdxTy = cast<VectorType>(Args[2]->getType()); + VectorType *IntIdxTy = VectorType::getInteger(FltIdxTy); + Args[2] = Builder.CreateBitCast(Args[2], IntIdxTy); + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args, Name)); + CI->eraseFromParent(); + return; + } + case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestnzc: { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5babe6e8515..021e1767da8 
100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -21947,6 +21947,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPSHL: return "X86ISD::VPSHL"; case X86ISD::VPCOM: return "X86ISD::VPCOM"; case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; + case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index ab7cf955cce..f532cddeaa3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -451,6 +451,8 @@ namespace llvm { VPCOM, VPCOMU, // XOP packed permute bytes. VPPERM, + // XOP two source permutation. + VPERMIL2, // Vector multiply packed unsigned doubleword integers. PMULUDQ, diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 858fb4f2f10..958bb822a06 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -245,7 +245,12 @@ def X86vpcomu : SDNode<"X86ISD::VPCOMU", SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>; - +def X86vpermil2 : SDNode<"X86ISD::VPERMIL2", + SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameSizeAs<0,3>, + SDTCisSameNumEltsAs<0, 3>, + SDTCisVT<4, i8>]>>; def X86vpperm : SDNode<"X86ISD::VPPERM", SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td index 57e6c1aec64..f49917b80f3 100644 --- a/llvm/lib/Target/X86/X86InstrXOP.td +++ b/llvm/lib/Target/X86/X86InstrXOP.td @@ -342,27 +342,34 @@ let Predicates = [HasXOP] in { (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>; } -multiclass xop5op<bits<8> 
opc, string OpcodeStr, Intrinsic Int128, - Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { +multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, ValueType vt256, + ValueType id128, ValueType id256, + PatFrag ld_128, PatFrag ld_256> { def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, - (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>; + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + (id128 VR128:$src3), (i8 imm:$src4))))]>; def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4), + (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, - (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + (id128 (bitconvert (loadv2i64 addr:$src3))), + (i8 imm:$src4))))]>, VEX_W, MemOp4; def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, - (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>; + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (ld_128 addr:$src2))), + (id128 VR128:$src3), (i8 imm:$src4))))]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), @@ -376,21 +383,24 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, - (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L; + 
(vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2), + (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L; def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4), + (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, - (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>, - VEX_W, MemOp4, VEX_L; + (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2), + (id256 (bitconvert (loadv4i64 addr:$src3))), + (i8 imm:$src4))))]>, VEX_W, MemOp4, VEX_L; def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, - (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>, - VEX_L; + (vt256 (OpNode (vt256 VR256:$src1), + (vt256 (bitconvert (ld_256 addr:$src2))), + (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rrY_REV : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), @@ -401,10 +411,10 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, } let ExeDomain = SSEPackedDouble in - defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, - int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>; + defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64, + v2i64, v4i64, loadv2f64, loadv4f64>; let ExeDomain = SSEPackedSingle in - defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, - int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>; + defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32, + v4i32, v8i32, loadv4f32, loadv8f32>; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index ed8da4e24fe..b0eea57fbb3 100644 --- 
a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -2234,6 +2234,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), + X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), + X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), + X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0), X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0), X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), |

