diff options
| author | Sanjay Patel <spatel@rotateright.com> | 2018-10-04 16:25:05 +0000 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2018-10-04 16:25:05 +0000 |
| commit | 3746e11abedb8dbe017a6dc038944bf35ab5bd29 (patch) | |
| tree | 82d996de8fd3f7b697b758723db28abe90619cbf | |
| parent | a4c17dd7f38a6c9ef2199d9ee5384c0af37a2d9e (diff) | |
| download | bcm5719-llvm-3746e11abedb8dbe017a6dc038944bf35ab5bd29.tar.gz bcm5719-llvm-3746e11abedb8dbe017a6dc038944bf35ab5bd29.zip | |
[InstCombine] allow bitcast to/from FP for vector insert/extract transform
This is a follow-up to rL343482 / D52439.
This pattern initially caused that commit to be reverted, because the
transform requires a bitcast to/from FP, as shown here.
llvm-svn: 343794
3 files changed, 53 insertions, 17 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 945664de686..f01f2b0eddd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -189,9 +189,7 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext, // If the source elements are wider than the destination, try to shift and // truncate a subset of scalar bits of an insert op. - // TODO: This is limited to integer types, but we could bitcast to/from FP. - if (NumSrcElts < NumElts && SrcTy->getScalarType()->isIntegerTy() && - DestTy->getScalarType()->isIntegerTy()) { + if (NumSrcElts < NumElts) { Value *Scalar; uint64_t InsIndexC; if (!match(X, m_InsertElement(m_Value(), m_Value(Scalar), @@ -220,13 +218,42 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext, unsigned Chunk = ExtIndexC % NarrowingRatio; if (IsBigEndian) Chunk = NarrowingRatio - 1 - Chunk; - unsigned ShAmt = Chunk * DestTy->getPrimitiveSizeInBits(); + + // Bail out if this is an FP vector to FP vector sequence. That would take + // more instructions than we started with unless there is no shift, and it + // may not be handled as well in the backend. + bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy(); + bool NeedDestBitcast = DestTy->isFloatingPointTy(); + if (NeedSrcBitcast && NeedDestBitcast) + return nullptr; + + unsigned SrcWidth = SrcTy->getScalarSizeInBits(); + unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); + unsigned ShAmt = Chunk * DestWidth; + + // TODO: This limitation is more strict than necessary. We could sum the + // number of new instructions and subtract the number eliminated to know if + // we can proceed. 
+ if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse()) + if (NeedSrcBitcast || NeedDestBitcast) + return nullptr; + + if (NeedSrcBitcast) { + Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth); + Scalar = Builder.CreateBitCast(Scalar, SrcIntTy); + } + if (ShAmt) { // Bail out if we could end with more instructions than we started with. if (!Ext.getVectorOperand()->hasOneUse()) return nullptr; Scalar = Builder.CreateLShr(Scalar, ShAmt); } + + if (NeedDestBitcast) { + Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth); + return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy); + } return new TruncInst(Scalar, DestTy); } diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll index 2ca3e328d08..af34a3fd371 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -299,9 +299,8 @@ define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { ; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( ; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) -; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %tmp, i64 0 -; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <8 x i16> -; CHECK-NEXT: %tmp2 = extractelement <8 x i16> %tmp1, i32 0 +; CHECK-NEXT: %1 = bitcast float %tmp to i32 +; CHECK-NEXT: %tmp2 = trunc i32 %1 to i16 ; CHECK-NEXT: ret i16 %tmp2 define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 diff --git a/llvm/test/Transforms/InstCombine/extractelement.ll b/llvm/test/Transforms/InstCombine/extractelement.ll index ae91396e2ff..5d6a3a1c355 100644 --- 
a/llvm/test/Transforms/InstCombine/extractelement.ll +++ b/llvm/test/Transforms/InstCombine/extractelement.ll @@ -164,11 +164,16 @@ define i8 @bitcasted_inselt_wide_source_uses(i32 %x) { } define float @bitcasted_inselt_to_FP(i64 %x) { -; ANY-LABEL: @bitcasted_inselt_to_FP( -; ANY-NEXT: [[I:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0 -; ANY-NEXT: [[B:%.*]] = bitcast <2 x i64> [[I]] to <4 x float> -; ANY-NEXT: [[R:%.*]] = extractelement <4 x float> [[B]], i32 1 -; ANY-NEXT: ret float [[R]] +; LE-LABEL: @bitcasted_inselt_to_FP( +; LE-NEXT: [[TMP1:%.*]] = lshr i64 [[X:%.*]], 32 +; LE-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; LE-NEXT: [[R:%.*]] = bitcast i32 [[TMP2]] to float +; LE-NEXT: ret float [[R]] +; +; BE-LABEL: @bitcasted_inselt_to_FP( +; BE-NEXT: [[TMP1:%.*]] = trunc i64 [[X:%.*]] to i32 +; BE-NEXT: [[R:%.*]] = bitcast i32 [[TMP1]] to float +; BE-NEXT: ret float [[R]] ; %i = insertelement <2 x i64> undef, i64 %x, i32 0 %b = bitcast <2 x i64> %i to <4 x float> @@ -210,11 +215,16 @@ define float @bitcasted_inselt_to_FP_uses2(i128 %x) { } define i32 @bitcasted_inselt_from_FP(double %x) { -; ANY-LABEL: @bitcasted_inselt_from_FP( -; ANY-NEXT: [[I:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0 -; ANY-NEXT: [[B:%.*]] = bitcast <2 x double> [[I]] to <4 x i32> -; ANY-NEXT: [[R:%.*]] = extractelement <4 x i32> [[B]], i32 1 -; ANY-NEXT: ret i32 [[R]] +; LE-LABEL: @bitcasted_inselt_from_FP( +; LE-NEXT: [[TMP1:%.*]] = bitcast double [[X:%.*]] to i64 +; LE-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 32 +; LE-NEXT: [[R:%.*]] = trunc i64 [[TMP2]] to i32 +; LE-NEXT: ret i32 [[R]] +; +; BE-LABEL: @bitcasted_inselt_from_FP( +; BE-NEXT: [[TMP1:%.*]] = bitcast double [[X:%.*]] to i64 +; BE-NEXT: [[R:%.*]] = trunc i64 [[TMP1]] to i32 +; BE-NEXT: ret i32 [[R]] ; %i = insertelement <2 x double> undef, double %x, i32 0 %b = bitcast <2 x double> %i to <4 x i32> |

