| field | value | date |
|---|---|---|
| author | Nate Begeman <natebegeman@mac.com> | 2010-07-27 22:37:06 +0000 |
| committer | Nate Begeman <natebegeman@mac.com> | 2010-07-27 22:37:06 +0000 |
| commit | 269a6da023c70708c7f86a89575c0fd1d2c5ae71 (patch) | |
| tree | 81a9539317098fae957e0f5e4b3e9eef61145683 | |
| parent | c1124300fe0a45b9e4a7955b7df32e37386c5b76 (diff) | |
~40% faster vector shl <4 x i32> on SSE 4.1.
Larger improvements for smaller types coming in future patches.
For:
define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
entry:
  %shl = shl <4 x i32> %r, %a                     ; <<4 x i32>> [#uses=1]
  %tmp2 = bitcast <4 x i32> %shl to <2 x i64>     ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %tmp2
}
We get:
_shl:                                   ## @shl
	pslld	$23, %xmm1
	paddd	LCPI0_0, %xmm1
	cvttps2dq	%xmm1, %xmm1
	pmulld	%xmm1, %xmm0
	ret
Instead of:
_shl:                                   ## @shl
	pshufd	$3, %xmm0, %xmm2
	movd	%xmm2, %eax
	pshufd	$3, %xmm1, %xmm2
	movd	%xmm2, %ecx
	shll	%cl, %eax
	movd	%eax, %xmm2
	pshufd	$1, %xmm0, %xmm3
	movd	%xmm3, %eax
	pshufd	$1, %xmm1, %xmm3
	movd	%xmm3, %ecx
	shll	%cl, %eax
	movd	%eax, %xmm3
	punpckldq	%xmm2, %xmm3
	movd	%xmm0, %eax
	movd	%xmm1, %ecx
	shll	%cl, %eax
	movd	%eax, %xmm2
	movhlps	%xmm0, %xmm0
	movd	%xmm0, %eax
	movhlps	%xmm1, %xmm1
	movd	%xmm1, %ecx
	shll	%cl, %eax
	movd	%eax, %xmm0
	punpckldq	%xmm0, %xmm2
	movdqa	%xmm2, %xmm0
	punpckldq	%xmm3, %xmm0
	ret
llvm-svn: 109549
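
Why the new sequence works: pslld $23 moves each lane's 5-bit shift amount into the exponent field of an IEEE-754 single, paddd with the constant-pool value 0x3f800000 (the bit pattern of 1.0f) adds the exponent bias so each lane holds the float 2^a, cvttps2dq truncates that back to the integer 2^a, and pmulld multiplies, since r << a equals r * 2^a in the low 32 bits. The following is a minimal scalar sketch of that per-lane identity, not code from this patch; the function name and test value are made up for illustration.

```cpp
// Illustrative sketch only (not from the patch): a scalar model of the
// per-lane trick behind the pslld/paddd/cvttps2dq/pmulld sequence.
#include <cassert>
#include <cstdint>
#include <cstring>

// Computes r << a for a in [0, 30] without a variable shift instruction.
static uint32_t shl_via_float(uint32_t r, uint32_t a) {
  // pslld $23: place the shift amount in the IEEE-754 exponent field.
  // paddd 0x3f800000: add the bit pattern of 1.0f (biased exponent 127),
  // producing the bit pattern of the float 2^a.
  uint32_t bits = (a << 23) + 0x3f800000u;
  float pow2f;
  std::memcpy(&pow2f, &bits, sizeof pow2f);
  // cvttps2dq: truncate back to an integer, yielding exactly 2^a.
  // (The C++ cast is only defined for a <= 30; the hardware instruction's
  // out-of-range result 0x80000000 happens to make a == 31 work as well.)
  uint32_t pow2 = static_cast<uint32_t>(static_cast<int32_t>(pow2f));
  // pmulld: r * 2^a, whose low 32 bits equal r << a.
  return r * pow2;
}

int main() {
  for (uint32_t a = 0; a <= 30; ++a)
    assert(shl_via_float(0x12345678u, a) == (0x12345678u << a));
  return 0;
}
```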
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 33 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 1 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vec_shift4.ll | 14 |
3 files changed, 48 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 618bb90d445..88bc8d0a92b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -838,6 +838,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 
+    // Can turn SHL into an integer multiply.
+    setOperationAction(ISD::SHL,                MVT::v4i32, Custom);
+
     // i8 and i16 vectors are custom , because the source register and source
     // source memory operand types are not the same width.  f32 vectors are
     // custom since the immediate controlling the insert encodes additional
@@ -7498,6 +7501,35 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
   return Res;
 }
 
+SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue R = Op.getOperand(0);
+
+  assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
+  assert(VT == MVT::v4i32 && "Only know how to lower v4i32");
+
+  Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                   DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
+                   Op.getOperand(1), DAG.getConstant(23, MVT::i32));
+
+  std::vector<Constant*> CV;
+  LLVMContext *Context = DAG.getContext();
+  CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
+  CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
+  CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
+  CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
+  Constant *C = ConstantVector::get(CV);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                               PseudoSourceValue::getConstantPool(), 0,
+                               false, false, 16);
+
+  Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
+  Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op);
+  Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
+  return DAG.getNode(ISD::MUL, dl, VT, Op, R);
+}
 SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
@@ -7730,6 +7762,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
   case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
+  case ISD::SHL:                return LowerSHL(Op, DAG);
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 96c97d9f9ef..3556579ae65 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -723,6 +723,7 @@ namespace llvm {
     SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/X86/vec_shift4.ll b/llvm/test/CodeGen/X86/vec_shift4.ll
new file mode 100644
index 00000000000..d8f4e4ec689
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec_shift4.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s
+
+define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
+entry:
+; CHECK-NOT: shll
+; CHECK: pslld
+; CHECK: paddd
+; CHECK: cvttps2dq
+; CHECK: pmulld
+
+  %shl = shl <4 x i32> %r, %a                     ; <<4 x i32>> [#uses=1]
+  %tmp2 = bitcast <4 x i32> %shl to <2 x i64>     ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %tmp2
+}
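
For comparison, the emitted pslld/paddd/cvttps2dq/pmulld sequence that the new test checks for can also be written directly with SSE4.1 intrinsics. This is an illustrative sketch only, assuming a C++ translation unit compiled with -msse4.1; the function name and example vectors are not part of the commit.

```cpp
// Illustrative sketch only (not from the patch): the emitted
// pslld/paddd/cvttps2dq/pmulld sequence written with SSE4.1 intrinsics.
#include <smmintrin.h>  // SSE4.1 (_mm_mullo_epi32 == pmulld)
#include <cstdint>
#include <cstdio>

static __m128i shl_v4i32(__m128i r, __m128i a) {
  __m128i exp  = _mm_slli_epi32(a, 23);                          // pslld $23
  __m128i bits = _mm_add_epi32(exp, _mm_set1_epi32(0x3f800000)); // paddd
  __m128i pow2 = _mm_cvttps_epi32(_mm_castsi128_ps(bits));       // cvttps2dq -> 2^a
  return _mm_mullo_epi32(r, pow2);                               // pmulld -> r << a
}

int main() {
  __m128i r = _mm_setr_epi32(1, 2, 3, 4);
  __m128i a = _mm_setr_epi32(0, 1, 4, 8);
  alignas(16) int32_t out[4];
  _mm_store_si128(reinterpret_cast<__m128i *>(out), shl_v4i32(r, a));
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 1 4 48 1024
  return 0;
}
```

The final multiply is what requires SSE4.1: pmulld is the packed 32-bit low multiply introduced there, which is why LowerSHL asserts Subtarget->hasSSE41().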

