diff options
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 41 |
| -rw-r--r-- | llvm/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll | 284 |
2 files changed, 325 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index d8824b473a6..78ee9d5de3f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1561,6 +1561,47 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::x86_sse4a_insertqi: UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2); break; + case Intrinsic::amdgcn_buffer_load: + case Intrinsic::amdgcn_buffer_load_format: { + if (VWidth == 1 || !APIntOps::isMask(DemandedElts)) + return nullptr; + + // TODO: Handle 3 vectors when supported in code gen. + unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes()); + if (NewNumElts == VWidth) + return nullptr; + + Module *M = II->getParent()->getParent()->getParent(); + Type *EltTy = V->getType()->getVectorElementType(); + + Type *NewTy = (NewNumElts == 1) ? 
EltTy : + VectorType::get(EltTy, NewNumElts); + + Function *NewIntrin = Intrinsic::getDeclaration(M, II->getIntrinsicID(), + NewTy); + + SmallVector<Value *, 5> Args; + for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I) + Args.push_back(II->getArgOperand(I)); + + CallInst *NewCall = Builder->CreateCall(NewIntrin, Args); + NewCall->takeName(II); + NewCall->copyMetadata(*II); + if (NewNumElts == 1) { + return Builder->CreateInsertElement(UndefValue::get(V->getType()), + NewCall, static_cast<uint64_t>(0)); + } + + SmallVector<uint32_t, 8> EltMask; + for (unsigned I = 0; I < VWidth; ++I) + EltMask.push_back(I); + + Value *Shuffle = Builder->CreateShuffleVector( + NewCall, UndefValue::get(NewTy), EltMask); + + MadeChange = true; + return Shuffle; + } } break; } diff --git a/llvm/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll new file mode 100644 index 00000000000..642f537d110 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll @@ -0,0 +1,284 @@ +; RUN: opt -S -instcombine %s | FileCheck %s + +; -------------------------------------------------------------------- +; llvm.amdgcn.buffer.load +; -------------------------------------------------------------------- + +; CHECK-LABEL: @buffer_load_f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + ret float %data +} + +; CHECK-LABEL: @buffer_load_v1f32( +; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <1 x float> %data +define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { 
+ %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + ret <1 x float> %data +} + +; CHECK-LABEL: @buffer_load_v2f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> %data +define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + ret <2 x float> %data +} + +; CHECK-LABEL: @buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <4 x float> %data +define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + ret <4 x float> %data +} + +; CHECK-LABEL: @extract_elt0_buffer_load_v2f32( +; CHECK: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt0 = extractelement <2 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 
false) + %elt1 = extractelement <2 x float> %data, i32 1 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt0_buffer_load_v4f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt0 = extractelement <4 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <4 x float> %data, i32 1 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <4 x float> %data, i32 2 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float 
@extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <4 x float> %data, i32 3 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2> +; CHECK-NEXT: ret <2 x float> %shuf +define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: ret <2 x float> %shuf +define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> 
%rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> +; CHECK-NEXT: ret <3 x float> %shuf +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> + ret <3 x float> %shuf +} + +; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3> +; CHECK-NEXT: ret <3 x float> %shuf +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3> + ret <3 x float> %shuf +} + +; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3> +; CHECK-NEXT: ret <3 x float> %shuf +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data 
= call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3> + ret <3 x float> %shuf +} + +; FIXME: Not handled even though only 2 elts used +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt0 = extractelement <4 x float> %data, i32 0 +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 1 +; CHECK-NEXT: %ins0 = insertvalue { float, float } undef, float %elt0, 0 +; CHECK-NEXT: %ins1 = insertvalue { float, float } %ins0, float %elt1, 1 +; CHECK-NEXT: ret { float, float } %ins1 +define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt0 = extractelement <4 x float> %data, i32 0 + %elt1 = extractelement <4 x float> %data, i32 1 + %ins0 = insertvalue { float, float } undef, float %elt0, 0 + %ins1 = insertvalue { float, float } %ins0, float %elt1, 1 + ret { float, float } %ins1 +} + +; CHECK-LABEL: @extract_elt0_buffer_load_v3f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt0 = extractelement <3 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, 
i32 1 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <3 x float> %data, i32 1 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <3 x float> %data, i32 2 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2> +; CHECK-NEXT: ret <2 x float> %shuf +define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) 
+ %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 +; CHECK-NEXT: ret float %data +define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 + %elt0 = extractelement <2 x float> %data, i32 0 + ret float %elt0 +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.buffer.load.format +; -------------------------------------------------------------------- + +; CHECK-LABEL: @buffer_load_format_v1f32( +; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) +; CHECK-NEXT: ret <1 x float> %data +define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) + ret <1 x float> %data +} + +; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) + %elt0 = extractelement <2 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, 
i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> %data +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> %data +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1> + ret <2 x float> %shuf +} + +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 + +declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } + +!0 = !{float 2.500000e+00} |

