| author | Eric Christopher <echristo@gmail.com> | 2019-04-17 04:52:47 +0000 |
|---|---|---|
| committer | Eric Christopher <echristo@gmail.com> | 2019-04-17 04:52:47 +0000 |
| commit | cee313d288a4faf0355d76fb6e0e927e211d08a5 (patch) | |
| tree | d386075318d761197779a96e5d8fc0dc7b06342b /llvm/test/Transforms/StraightLineStrengthReduce | |
| parent | c3d6a929fdd92fd06d4304675ade8d7210ee711a (diff) | |
| download | bcm5719-llvm-cee313d288a4faf0355d76fb6e0e927e211d08a5.tar.gz bcm5719-llvm-cee313d288a4faf0355d76fb6e0e927e211d08a5.zip | |
Revert "Temporarily Revert "Add basic loop fusion pass.""
The reversion apparently deleted the test/Transforms directory.
Will be re-reverting again.
llvm-svn: 358552
Diffstat (limited to 'llvm/test/Transforms/StraightLineStrengthReduce')
11 files changed, 832 insertions, 0 deletions
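Each file in this patch is a lit test: the `; RUN:` line at the top of each file is the command lit executes, with `%s` substituted by the file's own path. As a usage sketch (assuming a built LLVM tree with `opt` and `llvm-lit` available), the whole directory can be checked with:

    llvm-lit llvm/test/Transforms/StraightLineStrengthReduce

or a single test can be driven by hand, mirroring its RUN line:

    opt < llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll -slsr -gvn -S | FileCheck llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll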
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/lit.local.cfg b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/lit.local.cfg
new file mode 100644
index 00000000000..2a665f06be7
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
new file mode 100644
index 00000000000..f587a93bf1e
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -slsr -S | FileCheck %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "amdgcn--"
+
+%struct.Matrix4x4 = type { [4 x [4 x float]] }
+
+; Function Attrs: nounwind
+define fastcc void @Accelerator_Intersect(%struct.Matrix4x4 addrspace(1)* nocapture readonly %leafTransformations) #0 {
+; CHECK-LABEL: @Accelerator_Intersect(
+entry:
+  %tmp = sext i32 undef to i64
+  %arrayidx114 = getelementptr inbounds %struct.Matrix4x4, %struct.Matrix4x4 addrspace(1)* %leafTransformations, i64 %tmp
+  %tmp1 = getelementptr %struct.Matrix4x4, %struct.Matrix4x4 addrspace(1)* %leafTransformations, i64 %tmp, i32 0, i64 0, i64 0
+; CHECK: %tmp1 = getelementptr %struct.Matrix4x4, %struct.Matrix4x4 addrspace(1)* %leafTransformations, i64 %tmp, i32 0, i64 0, i64 0
+  %tmp2 = load <4 x float>, <4 x float> addrspace(1)* undef, align 4
+  ret void
+}
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "target-cpu"="tahiti" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
new file mode 100644
index 00000000000..9554ae69031
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
@@ -0,0 +1,107 @@
+; RUN: opt -S -mtriple=amdgcn-- -separate-const-offset-from-gep -slsr -gvn < %s | FileCheck %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+
+; CHECK-LABEL: @slsr_after_reassociate_global_geps_mubuf_max_offset(
+; CHECK: [[b1:%[0-9]+]] = getelementptr float, float addrspace(1)* %arr, i64 [[bump:%[0-9]+]]
+; CHECK: [[b2:%[0-9]+]] = getelementptr float, float addrspace(1)* [[b1]], i64 [[bump]]
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+bb:
+  %i2 = shl nsw i32 %i, 1
+  %j1 = add nsw i32 %i, 1023
+  %tmp = sext i32 %j1 to i64
+  %p1 = getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp
+  %tmp3 = bitcast float addrspace(1)* %p1 to i32 addrspace(1)*
+  %v11 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp4 = bitcast float addrspace(1)* %out to i32 addrspace(1)*
+  store i32 %v11, i32 addrspace(1)* %tmp4, align 4
+
+  %j2 = add nsw i32 %i2, 1023
+  %tmp5 = sext i32 %j2 to i64
+  %p2 = getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp5
+  %tmp6 = bitcast float addrspace(1)* %p2 to i32 addrspace(1)*
+  %v22 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp7 = bitcast float addrspace(1)* %out to i32 addrspace(1)*
+  store i32 %v22, i32 addrspace(1)* %tmp7, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @slsr_after_reassociate_global_geps_over_mubuf_max_offset(
+; CHECK: %j1 = add nsw i32 %i, 1024
+; CHECK: %tmp = sext i32 %j1 to i64
+; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp
+; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp5
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+bb:
+  %i2 = shl nsw i32 %i, 1
+  %j1 = add nsw i32 %i, 1024
+  %tmp = sext i32 %j1 to i64
+  %p1 = getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp
+  %tmp3 = bitcast float addrspace(1)* %p1 to i32 addrspace(1)*
+  %v11 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp4 = bitcast float addrspace(1)* %out to i32 addrspace(1)*
+  store i32 %v11, i32 addrspace(1)* %tmp4, align 4
+
+  %j2 = add nsw i32 %i2, 1024
+  %tmp5 = sext i32 %j2 to i64
+  %p2 = getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp5
+  %tmp6 = bitcast float addrspace(1)* %p2 to i32 addrspace(1)*
+  %v22 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp7 = bitcast float addrspace(1)* %out to i32 addrspace(1)*
+  store i32 %v22, i32 addrspace(1)* %tmp7, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @slsr_after_reassociate_lds_geps_ds_max_offset(
+; CHECK: [[B1:%[0-9]+]] = getelementptr float, float addrspace(3)* %arr, i32 %i
+; CHECK: getelementptr inbounds float, float addrspace(3)* [[B1]], i32 16383
+
+; CHECK: [[B2:%[0-9]+]] = getelementptr float, float addrspace(3)* [[B1]], i32 %i
+; CHECK: getelementptr inbounds float, float addrspace(3)* [[B2]], i32 16383
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+bb:
+  %i2 = shl nsw i32 %i, 1
+  %j1 = add nsw i32 %i, 16383
+  %p1 = getelementptr inbounds float, float addrspace(3)* %arr, i32 %j1
+  %tmp3 = bitcast float addrspace(3)* %p1 to i32 addrspace(3)*
+  %v11 = load i32, i32 addrspace(3)* %tmp3, align 4
+  %tmp4 = bitcast float addrspace(1)* %out to i32 addrspace(1)*
+  store i32 %v11, i32 addrspace(1)* %tmp4, align 4
+
+  %j2 = add nsw i32 %i2, 16383
+  %p2 = getelementptr inbounds float, float addrspace(3)* %arr, i32 %j2
+  %tmp6 = bitcast float addrspace(3)* %p2 to i32 addrspace(3)*
+  %v22 = load i32, i32 addrspace(3)* %tmp6, align 4
+  %tmp7 = bitcast float addrspace(1)* %out to i32 addrspace(1)*
+  store i32 %v22, i32 addrspace(1)* %tmp7, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @slsr_after_reassociate_lds_geps_over_ds_max_offset(
+; CHECK: %j1 = add nsw i32 %i, 16384
+; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j1
+; CHECK: %j2 = add i32 %j1, %i
+; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j2
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+bb:
+  %i2 = shl nsw i32 %i, 1
+  %j1 = add nsw i32 %i, 16384
+  %p1 = getelementptr inbounds float, float addrspace(3)* %arr, i32 %j1
+  %tmp3 = bitcast float addrspace(3)* %p1 to i32 addrspace(3)*
+  %v11 = load i32, i32 addrspace(3)* %tmp3, align 4
+  %tmp4 = bitcast float addrspace(1)* %out to i32 addrspace(1)*
+  store i32 %v11, i32 addrspace(1)* %tmp4, align 4
+
+  %j2 = add nsw i32 %i2, 16384
+  %p2 = getelementptr inbounds float, float addrspace(3)* %arr, i32 %j2
+  %tmp6 = bitcast float addrspace(3)* %p2 to i32 addrspace(3)*
+  %v22 = load i32, i32 addrspace(3)* %tmp6, align 4
+  %tmp7 = bitcast float addrspace(1)* %out to i32 addrspace(1)*
+  store i32 %v22, i32 addrspace(1)* %tmp7, align 4
+
+  ret void
+}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/lit.local.cfg b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/lit.local.cfg
new file mode 100644
index 00000000000..2cb98eb371b
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll
new file mode 100644
index 00000000000..03c0356eb5b
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll
@@ -0,0 +1,74 @@
+; RUN: opt < %s -separate-const-offset-from-gep -slsr -gvn -S | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix=PTX
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; arr[i + 5]
+; arr[i * 2 + 5]
+; arr[i * 3 + 5]
+; arr[i * 4 + 5]
+;
+; => reassociate-geps
+;
+; *(&arr[i] + 5)
+; *(&arr[i * 2] + 5)
+; *(&arr[i * 3] + 5)
+; *(&arr[i * 4] + 5)
+;
+; => slsr
+;
+; p1 = &arr[i]
+; *(p1 + 5)
+; p2 = p1 + i
+; *(p2 + 5)
+; p3 = p2 + i
+; *(p3 + 5)
+; p4 = p3 + i
+; *(p4 + 5)
+define void @slsr_after_reassociate_geps(float* %arr, i32 %i) {
+; CHECK-LABEL: @slsr_after_reassociate_geps(
+; PTX-LABEL: .visible .func slsr_after_reassociate_geps(
+; PTX: ld.param.u64 [[arr:%rd[0-9]+]], [slsr_after_reassociate_geps_param_0];
+; PTX: ld.param.u32 [[i:%r[0-9]+]], [slsr_after_reassociate_geps_param_1];
+  %i2 = shl nsw i32 %i, 1
+  %i3 = mul nsw i32 %i, 3
+  %i4 = shl nsw i32 %i, 2
+
+  %j1 = add nsw i32 %i, 5
+  %p1 = getelementptr inbounds float, float* %arr, i32 %j1
+; CHECK: [[b1:%[0-9]+]] = getelementptr float, float* %arr, i64 [[bump:%[0-9]+]]
+; PTX: mul.wide.s32 [[i4:%rd[0-9]+]], [[i]], 4;
+; PTX: add.s64 [[base1:%rd[0-9]+]], [[arr]], [[i4]];
+  %v1 = load float, float* %p1, align 4
+; PTX: ld.f32 {{%f[0-9]+}}, {{\[}}[[base1]]+20];
+  call void @foo(float %v1)
+
+  %j2 = add nsw i32 %i2, 5
+  %p2 = getelementptr inbounds float, float* %arr, i32 %j2
+; CHECK: [[b2:%[0-9]+]] = getelementptr float, float* [[b1]], i64 [[bump]]
+; PTX: add.s64 [[base2:%rd[0-9]+]], [[base1]], [[i4]];
+  %v2 = load float, float* %p2, align 4
+; PTX: ld.f32 {{%f[0-9]+}}, {{\[}}[[base2]]+20];
+  call void @foo(float %v2)
+
+  %j3 = add nsw i32 %i3, 5
+  %p3 = getelementptr inbounds float, float* %arr, i32 %j3
+; CHECK: [[b3:%[0-9]+]] = getelementptr float, float* [[b2]], i64 [[bump]]
+; PTX: add.s64 [[base3:%rd[0-9]+]], [[base2]], [[i4]];
+  %v3 = load float, float* %p3, align 4
+; PTX: ld.f32 {{%f[0-9]+}}, {{\[}}[[base3]]+20];
+  call void @foo(float %v3)
+
+  %j4 = add nsw i32 %i4, 5
+  %p4 = getelementptr inbounds float, float* %arr, i32 %j4
+; CHECK: [[b4:%[0-9]+]] = getelementptr float, float* [[b3]], i64 [[bump]]
+; PTX: add.s64 [[base4:%rd[0-9]+]], [[base3]], [[i4]];
+  %v4 = load float, float* %p4, align 4
+; PTX: ld.f32 {{%f[0-9]+}}, {{\[}}[[base4]]+20];
+  call void @foo(float %v4)
+
+  ret void
+}
+
+declare void @foo(float)
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
new file mode 100644
index 00000000000..cb73565b152
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CUDA code
+; __global__ void foo(int b, int s) {
+;   #pragma unroll
+;   for (int i = 0; i < 4; ++i) {
+;     if (cond(i))
+;       use((b + i) * s);
+;   }
+; }
+define void @foo(i32 %b, i32 %s) {
+; CHECK-LABEL: .visible .entry foo(
+entry:
+; CHECK: ld.param.u32 [[s:%r[0-9]+]], [foo_param_1];
+; CHECK: ld.param.u32 [[b:%r[0-9]+]], [foo_param_0];
+  %call = tail call zeroext i1 @cond(i32 0)
+  br i1 %call, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %entry
+  %mul = mul nsw i32 %b, %s
+; CHECK: mul.lo.s32 [[a0:%r[0-9]+]], [[b]], [[s]]
+  tail call void @use(i32 %mul)
+  br label %for.inc
+
+for.inc:                                          ; preds = %entry, %if.then
+  %call.1 = tail call zeroext i1 @cond(i32 1)
+  br i1 %call.1, label %if.then.1, label %for.inc.1
+
+if.then.1:                                        ; preds = %for.inc
+  %add.1 = add nsw i32 %b, 1
+  %mul.1 = mul nsw i32 %add.1, %s
+; CHECK: add.s32 [[a1:%r[0-9]+]], [[a0]], [[s]]
+  tail call void @use(i32 %mul.1)
+  br label %for.inc.1
+
+for.inc.1:                                        ; preds = %if.then.1, %for.inc
+  %call.2 = tail call zeroext i1 @cond(i32 2)
+  br i1 %call.2, label %if.then.2, label %for.inc.2
+
+if.then.2:                                        ; preds = %for.inc.1
+  %add.2 = add nsw i32 %b, 2
+  %mul.2 = mul nsw i32 %add.2, %s
+; CHECK: add.s32 [[a2:%r[0-9]+]], [[a1]], [[s]]
+  tail call void @use(i32 %mul.2)
+  br label %for.inc.2
+
+for.inc.2:                                        ; preds = %if.then.2, %for.inc.1
+  %call.3 = tail call zeroext i1 @cond(i32 3)
+  br i1 %call.3, label %if.then.3, label %for.inc.3
+
+if.then.3:                                        ; preds = %for.inc.2
+  %add.3 = add nsw i32 %b, 3
+  %mul.3 = mul nsw i32 %add.3, %s
+; CHECK: add.s32 [[a3:%r[0-9]+]], [[a2]], [[s]]
+  tail call void @use(i32 %mul.3)
+  br label %for.inc.3
+
+for.inc.3:                                        ; preds = %if.then.3, %for.inc.2
+  ret void
+}
+
+declare zeroext i1 @cond(i32)
+
+declare void @use(i32)
+
+!nvvm.annotations = !{!0}
+
+!0 = !{void (i32, i32)* @foo, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/X86/lit.local.cfg b/llvm/test/Transforms/StraightLineStrengthReduce/X86/lit.local.cfg
new file mode 100644
index 00000000000..c8625f4d9d2
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/X86/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/X86/no-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/X86/no-slsr.ll
new file mode 100644
index 00000000000..f11cbc5897a
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/X86/no-slsr.ll
@@ -0,0 +1,44 @@
+; RUN: opt < %s -slsr -gvn -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Do not perform SLSR on &input[s] and &input[s * 2] which fit into addressing
+; modes of X86.
+define i32 @no_slsr_gep(i32* %input, i64 %s) {
+; CHECK-LABEL: @no_slsr_gep(
+  ; v0 = input[0];
+  %p0 = getelementptr inbounds i32, i32* %input, i64 0
+  %v0 = load i32, i32* %p0
+
+  ; v1 = input[s];
+  %p1 = getelementptr inbounds i32, i32* %input, i64 %s
+; CHECK: %p1 = getelementptr inbounds i32, i32* %input, i64 %s
+  %v1 = load i32, i32* %p1
+
+  ; v2 = input[s * 2];
+  %s2 = mul nsw i64 %s, 2
+  %p2 = getelementptr inbounds i32, i32* %input, i64 %s2
+; CHECK: %p2 = getelementptr inbounds i32, i32* %input, i64 %s2
+  %v2 = load i32, i32* %p2
+
+  ; return v0 + v1 + v2;
+  %1 = add i32 %v0, %v1
+  %2 = add i32 %1, %v2
+  ret i32 %2
+}
+
+define void @no_slsr_add(i32 %b, i32 %s) {
+; CHECK-LABEL: @no_slsr_add(
+  %1 = add i32 %b, %s
+; CHECK: add i32 %b, %s
+  call void @foo(i32 %1)
+  %s2 = mul i32 %s, 2
+; CHECK: %s2 = mul i32 %s, 2
+  %2 = add i32 %b, %s2
+; CHECK: add i32 %b, %s2
+  call void @foo(i32 %2)
+  ret void
+}
+
+declare void @foo(i32 %a)
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
new file mode 100644
index 00000000000..92af617dab8
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slsr -gvn -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+
+define void @shl(i32 %b, i32 %s) {
+; CHECK-LABEL: @shl(
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S:%.*]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[S]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    ret void
+;
+  %t1 = add i32 %b, %s
+  call void @foo(i32 %t1)
+  %s2 = shl i32 %s, 1
+  %t2 = add i32 %b, %s2
+  call void @foo(i32 %t2)
+  ret void
+}
+
+define void @stride_is_2s(i32 %b, i32 %s) {
+; CHECK-LABEL: @stride_is_2s(
+; CHECK-NEXT:    [[S2:%.*]] = shl i32 [[S:%.*]], 1
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S2]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[S2]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = add i32 [[T2]], [[S2]]
+; CHECK-NEXT:    call void @foo(i32 [[T3]])
+; CHECK-NEXT:    ret void
+;
+  %s2 = shl i32 %s, 1
+  %t1 = add i32 %b, %s2
+  call void @foo(i32 %t1)
+  %s4 = shl i32 %s, 2
+  %t2 = add i32 %b, %s4
+  call void @foo(i32 %t2)
+  %s6 = mul i32 %s, 6
+  %t3 = add i32 %b, %s6
+  call void @foo(i32 %t3)
+  ret void
+}
+
+define void @stride_is_3s(i32 %b, i32 %s) {
+; CHECK-LABEL: @stride_is_3s(
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[S:%.*]], [[B:%.*]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[S]], 3
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = add i32 [[T2]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T3]])
+; CHECK-NEXT:    ret void
+;
+  %t1 = add i32 %s, %b
+  call void @foo(i32 %t1)
+  %s4 = shl i32 %s, 2
+  %t2 = add i32 %s4, %b
+  call void @foo(i32 %t2)
+  %s7 = mul i32 %s, 7
+  %t3 = add i32 %s7, %b
+  call void @foo(i32 %t3)
+  ret void
+}
+
+; foo(b + 6 * s);
+; foo(b + 4 * s);
+; foo(b + 2 * s);
+; =>
+; t1 = b + 6 * s;
+; foo(t1);
+; s2 = 2 * s;
+; t2 = t1 - s2;
+; foo(t2);
+; t3 = t2 - s2;
+; foo(t3);
define void @stride_is_minus_2s(i32 %b, i32 %s) {
+; CHECK-LABEL: @stride_is_minus_2s(
+; CHECK-NEXT:    [[S6:%.*]] = mul i32 [[S:%.*]], 6
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S6]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[S]], 1
+; CHECK-NEXT:    [[T2:%.*]] = sub i32 [[T1]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = sub i32 [[T2]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T3]])
+; CHECK-NEXT:    ret void
+;
+  %s6 = mul i32 %s, 6
+  %t1 = add i32 %b, %s6
+  call void @foo(i32 %t1)
+  %s4 = shl i32 %s, 2
+  %t2 = add i32 %b, %s4
+  call void @foo(i32 %t2)
+  %s2 = shl i32 %s, 1
+  %t3 = add i32 %b, %s2
+  call void @foo(i32 %t3)
+  ret void
+}
+
+; TODO: This pass is targeted at simple address-calcs, so it is artificially limited to
+; match scalar values. The code could be modified to handle vector types too.
+
+define void @stride_is_minus_2s_vec(<2 x i32> %b, <2 x i32> %s) {
+; CHECK-LABEL: @stride_is_minus_2s_vec(
+; CHECK-NEXT:    [[S6:%.*]] = mul <2 x i32> [[S:%.*]], <i32 6, i32 6>
+; CHECK-NEXT:    [[T1:%.*]] = add <2 x i32> [[B:%.*]], [[S6]]
+; CHECK-NEXT:    call void @voo(<2 x i32> [[T1]])
+; CHECK-NEXT:    [[S4:%.*]] = shl <2 x i32> [[S]], <i32 2, i32 2>
+; CHECK-NEXT:    [[T2:%.*]] = add <2 x i32> [[B]], [[S4]]
+; CHECK-NEXT:    call void @voo(<2 x i32> [[T2]])
+; CHECK-NEXT:    [[S2:%.*]] = shl <2 x i32> [[S]], <i32 1, i32 1>
+; CHECK-NEXT:    [[T3:%.*]] = add <2 x i32> [[B]], [[S2]]
+; CHECK-NEXT:    call void @voo(<2 x i32> [[T3]])
+; CHECK-NEXT:    ret void
+;
+  %s6 = mul <2 x i32> %s, <i32 6, i32 6>
+  %t1 = add <2 x i32> %b, %s6
+  call void @voo(<2 x i32> %t1)
+  %s4 = shl <2 x i32> %s, <i32 2, i32 2>
+  %t2 = add <2 x i32> %b, %s4
+  call void @voo(<2 x i32> %t2)
+  %s2 = shl <2 x i32> %s, <i32 1, i32 1>
+  %t3 = add <2 x i32> %b, %s2
+  call void @voo(<2 x i32> %t3)
+  ret void
+}
+
+; t = b + (s << 3);
+; foo(t);
+; foo(b + s);
+;
+; do not rewrite b + s to t - 7 * s because the latter is more complicated.
+define void @simple_enough(i32 %b, i32 %s) {
+; CHECK-LABEL: @simple_enough(
+; CHECK-NEXT:    [[S8:%.*]] = shl i32 [[S:%.*]], 3
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S8]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[B]], [[S]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    ret void
+;
+  %s8 = shl i32 %s, 3
+  %t1 = add i32 %b, %s8
+  call void @foo(i32 %t1)
+  %t2 = add i32 %b, %s
+  call void @foo(i32 %t2)
+  ret void
+}
+
+define void @slsr_strided_add_128bit(i128 %b, i128 %s) {
+; CHECK-LABEL: @slsr_strided_add_128bit(
+; CHECK-NEXT:    [[S125:%.*]] = shl i128 [[S:%.*]], 125
+; CHECK-NEXT:    [[T1:%.*]] = add i128 [[B:%.*]], [[S125]]
+; CHECK-NEXT:    call void @bar(i128 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i128 [[T1]], [[S125]]
+; CHECK-NEXT:    call void @bar(i128 [[T2]])
+; CHECK-NEXT:    ret void
+;
+  %s125 = shl i128 %s, 125
+  %s126 = shl i128 %s, 126
+  %t1 = add i128 %b, %s125
+  call void @bar(i128 %t1)
+  %t2 = add i128 %b, %s126
+  call void @bar(i128 %t2)
+  ret void
+}
+
+declare void @foo(i32)
+declare void @voo(<2 x i32>)
+declare void @bar(i128)
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll
new file mode 100644
index 00000000000..b9bb4faf1b4
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll
@@ -0,0 +1,191 @@
+; RUN: opt < %s -slsr -gvn -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64-p:64:64:64-p1:32:32:32"
+
+; foo(input[0]);
+; foo(input[s]);
+; foo(input[s * 2]);
+; =>
+; p0 = &input[0];
+; foo(*p);
+; p1 = p0 + s;
+; foo(*p1);
+; p2 = p1 + s;
+; foo(*p2);
+define void @slsr_gep(i32* %input, i64 %s) {
+; CHECK-LABEL: @slsr_gep(
+  ; v0 = input[0];
+  %p0 = getelementptr inbounds i32, i32* %input, i64 0
+  call void @foo(i32* %p0)
+
+  ; v1 = input[s];
+  %p1 = getelementptr inbounds i32, i32* %input, i64 %s
+; CHECK: %p1 = getelementptr inbounds i32, i32* %input, i64 %s
+  call void @foo(i32* %p1)
+
+  ; v2 = input[s * 2];
+  %s2 = shl nsw i64 %s, 1
+  %p2 = getelementptr inbounds i32, i32* %input, i64 %s2
+; CHECK: %p2 = getelementptr inbounds i32, i32* %p1, i64 %s
+  call void @foo(i32* %p2)
+
+  ret void
+}
+
+; foo(input[0]);
+; foo(input[(long)s]);
+; foo(input[(long)(s * 2)]);
+; =>
+; p0 = &input[0];
+; foo(*p);
+; p1 = p0 + (long)s;
+; foo(*p1);
+; p2 = p1 + (long)s;
+; foo(*p2);
+define void @slsr_gep_sext(i32* %input, i32 %s) {
+; CHECK-LABEL: @slsr_gep_sext(
+  ; v0 = input[0];
+  %p0 = getelementptr inbounds i32, i32* %input, i64 0
+  call void @foo(i32* %p0)
+
+  ; v1 = input[s];
+  %t = sext i32 %s to i64
+  %p1 = getelementptr inbounds i32, i32* %input, i64 %t
+; CHECK: %p1 = getelementptr inbounds i32, i32* %input, i64 %t
+  call void @foo(i32* %p1)
+
+  ; v2 = input[s * 2];
+  %s2 = shl nsw i32 %s, 1
+  %t2 = sext i32 %s2 to i64
+  %p2 = getelementptr inbounds i32, i32* %input, i64 %t2
+; CHECK: %p2 = getelementptr inbounds i32, i32* %p1, i64 %t
+  call void @foo(i32* %p2)
+
+  ret void
+}
+
+; int input[10][5];
+; foo(input[s][t]);
+; foo(input[s * 2][t]);
+; foo(input[s * 3][t]);
+; =>
+; p0 = &input[s][t];
+; foo(*p0);
+; p1 = p0 + 5s;
+; foo(*p1);
+; p2 = p1 + 5s;
+; foo(*p2);
+define void @slsr_gep_2d([10 x [5 x i32]]* %input, i64 %s, i64 %t) {
+; CHECK-LABEL: @slsr_gep_2d(
+  ; v0 = input[s][t];
+  %p0 = getelementptr inbounds [10 x [5 x i32]], [10 x [5 x i32]]* %input, i64 0, i64 %s, i64 %t
+  call void @foo(i32* %p0)
+
+  ; v1 = input[s * 2][t];
+  %s2 = shl nsw i64 %s, 1
+; CHECK: [[BUMP:%[a-zA-Z0-9]+]] = mul i64 %s, 5
+  %p1 = getelementptr inbounds [10 x [5 x i32]], [10 x [5 x i32]]* %input, i64 0, i64 %s2, i64 %t
+; CHECK: %p1 = getelementptr inbounds i32, i32* %p0, i64 [[BUMP]]
+  call void @foo(i32* %p1)
+
+  ; v3 = input[s * 3][t];
+  %s3 = mul nsw i64 %s, 3
+  %p2 = getelementptr inbounds [10 x [5 x i32]], [10 x [5 x i32]]* %input, i64 0, i64 %s3, i64 %t
+; CHECK: %p2 = getelementptr inbounds i32, i32* %p1, i64 [[BUMP]]
+  call void @foo(i32* %p2)
+
+  ret void
+}
+
+%struct.S = type <{ i64, i32 }>
+
+; In this case, the bump
+; = (char *)&input[s * 2][t].f1 - (char *)&input[s][t].f1
+; = 60 * s
+; which may not be divisible by typeof(input[s][t].f1) = 8. Therefore, we
+; rewrite the candidates using byte offset instead of index offset as in
+; @slsr_gep_2d.
+define void @slsr_gep_uglygep([10 x [5 x %struct.S]]* %input, i64 %s, i64 %t) {
+; CHECK-LABEL: @slsr_gep_uglygep(
+  ; v0 = input[s][t].f1;
+  %p0 = getelementptr inbounds [10 x [5 x %struct.S]], [10 x [5 x %struct.S]]* %input, i64 0, i64 %s, i64 %t, i32 0
+  call void @bar(i64* %p0)
+
+  ; v1 = input[s * 2][t].f1;
+  %s2 = shl nsw i64 %s, 1
+; CHECK: [[BUMP:%[a-zA-Z0-9]+]] = mul i64 %s, 60
+  %p1 = getelementptr inbounds [10 x [5 x %struct.S]], [10 x [5 x %struct.S]]* %input, i64 0, i64 %s2, i64 %t, i32 0
+; CHECK: getelementptr inbounds i8, i8* %{{[0-9]+}}, i64 [[BUMP]]
+  call void @bar(i64* %p1)
+
+  ; v2 = input[s * 3][t].f1;
+  %s3 = mul nsw i64 %s, 3
+  %p2 = getelementptr inbounds [10 x [5 x %struct.S]], [10 x [5 x %struct.S]]* %input, i64 0, i64 %s3, i64 %t, i32 0
+; CHECK: getelementptr inbounds i8, i8* %{{[0-9]+}}, i64 [[BUMP]]
+  call void @bar(i64* %p2)
+
+  ret void
+}
+
+define void @slsr_out_of_bounds_gep(i32* %input, i32 %s) {
+; CHECK-LABEL: @slsr_out_of_bounds_gep(
+  ; v0 = input[0];
+  %p0 = getelementptr i32, i32* %input, i64 0
+  call void @foo(i32* %p0)
+
+  ; v1 = input[(long)s];
+  %t = sext i32 %s to i64
+  %p1 = getelementptr i32, i32* %input, i64 %t
+; CHECK: %p1 = getelementptr i32, i32* %input, i64 %t
+  call void @foo(i32* %p1)
+
+  ; v2 = input[(long)(s * 2)];
+  %s2 = shl nsw i32 %s, 1
+  %t2 = sext i32 %s2 to i64
+  %p2 = getelementptr i32, i32* %input, i64 %t2
+; CHECK: %p2 = getelementptr i32, i32* %p1, i64 %t
+  call void @foo(i32* %p2)
+
+  ret void
+}
+
+define void @slsr_gep_128bit_index(i32* %input, i128 %s) {
+; CHECK-LABEL: @slsr_gep_128bit_index(
+  ; p0 = &input[0]
+  %p0 = getelementptr inbounds i32, i32* %input, i128 0
+  call void @foo(i32* %p0)
+
+  ; p1 = &input[s << 125]
+  %s125 = shl nsw i128 %s, 125
+  %p1 = getelementptr inbounds i32, i32* %input, i128 %s125
+; CHECK: %p1 = getelementptr inbounds i32, i32* %input, i128 %s125
+  call void @foo(i32* %p1)
+
+  ; p2 = &input[s << 126]
+  %s126 = shl nsw i128 %s, 126
+  %p2 = getelementptr inbounds i32, i32* %input, i128 %s126
+; CHECK: %p2 = getelementptr inbounds i32, i32* %input, i128 %s126
+  call void @foo(i32* %p2)
+
+  ret void
+}
+
+define void @slsr_gep_32bit_pointer(i32 addrspace(1)* %input, i64 %s) {
+; CHECK-LABEL: @slsr_gep_32bit_pointer(
+  ; p1 = &input[s]
+  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %s
+  call void @baz(i32 addrspace(1)* %p1)
+
+  ; p2 = &input[s * 2]
+  %s2 = mul nsw i64 %s, 2
+  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %s2
+  ; %s2 is wider than the pointer size of addrspace(1), so do not factor it.
+; CHECK: %p2 = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %s2
+  call void @baz(i32 addrspace(1)* %p2)
+
+  ret void
+}
+
+declare void @foo(i32*)
+declare void @bar(i64*)
+declare void @baz(i32 addrspace(1)*)
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-mul.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-mul.ll
new file mode 100644
index 00000000000..56b1d1fc5cd
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-mul.ll
@@ -0,0 +1,147 @@
+; RUN: opt < %s -slsr -gvn -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+
+define void @slsr1(i32 %b, i32 %s) {
+; CHECK-LABEL: @slsr1(
+  ; foo(b * s);
+  %mul0 = mul i32 %b, %s
+; CHECK: mul i32
+; CHECK-NOT: mul i32
+  call void @foo(i32 %mul0)
+
+  ; foo((b + 1) * s);
+  %b1 = add i32 %b, 1
+  %mul1 = mul i32 %b1, %s
+  call void @foo(i32 %mul1)
+
+  ; foo((b + 2) * s);
+  %b2 = add i32 %b, 2
+  %mul2 = mul i32 %b2, %s
+  call void @foo(i32 %mul2)
+
+  ret void
+}
+
+define void @non_canonicalized(i32 %b, i32 %s) {
+; CHECK-LABEL: @non_canonicalized(
+  ; foo(b * s);
+  %mul0 = mul i32 %b, %s
+; CHECK: mul i32
+; CHECK-NOT: mul i32
+  call void @foo(i32 %mul0)
+
+  ; foo((1 + b) * s);
+  %b1 = add i32 1, %b
+  %mul1 = mul i32 %b1, %s
+  call void @foo(i32 %mul1)
+
+  ; foo((2 + b) * s);
+  %b2 = add i32 2, %b
+  %mul2 = mul i32 %b2, %s
+  call void @foo(i32 %mul2)
+
+  ret void
+}
+
+define void @or(i32 %a, i32 %s) {
+  %b = shl i32 %a, 1
+; CHECK-LABEL: @or(
+  ; foo(b * s);
+  %mul0 = mul i32 %b, %s
+; CHECK: [[base:[^ ]+]] = mul i32
+  call void @foo(i32 %mul0)
+
+  ; foo((b | 1) * s);
+  %b1 = or i32 %b, 1
+  %mul1 = mul i32 %b1, %s
+; CHECK: add i32 [[base]], %s
+  call void @foo(i32 %mul1)
+
+  ; foo((b | 2) * s);
+  %b2 = or i32 %b, 2
+  %mul2 = mul i32 %b2, %s
+; CHECK: mul i32 %b2, %s
+  call void @foo(i32 %mul2)
+
+  ret void
+}
+
+; foo(a * b)
+; foo((a + 1) * b)
+; foo(a * (b + 1))
+; foo((a + 1) * (b + 1))
+define void @slsr2(i32 %a, i32 %b) {
+; CHECK-LABEL: @slsr2(
+  %a1 = add i32 %a, 1
+  %b1 = add i32 %b, 1
+  %mul0 = mul i32 %a, %b
+; CHECK: mul i32
+; CHECK-NOT: mul i32
+  %mul1 = mul i32 %a1, %b
+  %mul2 = mul i32 %a, %b1
+  %mul3 = mul i32 %a1, %b1
+
+  call void @foo(i32 %mul0)
+  call void @foo(i32 %mul1)
+  call void @foo(i32 %mul2)
+  call void @foo(i32 %mul3)
+
+  ret void
+}
+
+; The bump is a multiple of the stride.
+;
+; foo(b * s);
+; foo((b + 2) * s);
+; foo((b + 4) * s);
+; =>
+; mul0 = b * s;
+; bump = s * 2;
+; mul1 = mul0 + bump; // GVN ensures mul1 and mul2 use the same bump.
+; mul2 = mul1 + bump;
+define void @slsr3(i32 %b, i32 %s) {
+; CHECK-LABEL: @slsr3(
+  %mul0 = mul i32 %b, %s
+; CHECK: mul i32
+  call void @foo(i32 %mul0)
+
+  %b1 = add i32 %b, 2
+  %mul1 = mul i32 %b1, %s
+; CHECK: [[BUMP:%[a-zA-Z0-9]+]] = shl i32 %s, 1
+; CHECK: %mul1 = add i32 %mul0, [[BUMP]]
+  call void @foo(i32 %mul1)
+
+  %b2 = add i32 %b, 4
+  %mul2 = mul i32 %b2, %s
+; CHECK: %mul2 = add i32 %mul1, [[BUMP]]
+  call void @foo(i32 %mul2)
+
+  ret void
+}
+
+; Do not rewrite a candidate if its potential basis does not dominate it.
+;
+; if (cond)
+;   foo(a * b);
+; foo((a + 1) * b);
+define void @not_dominate(i1 %cond, i32 %a, i32 %b) {
+; CHECK-LABEL: @not_dominate(
+entry:
+  %a1 = add i32 %a, 1
+  br i1 %cond, label %then, label %merge
+
+then:
+  %mul0 = mul i32 %a, %b
+; CHECK: %mul0 = mul i32 %a, %b
+  call void @foo(i32 %mul0)
+  br label %merge
+
+merge:
+  %mul1 = mul i32 %a1, %b
+; CHECK: %mul1 = mul i32 %a1, %b
+  call void @foo(i32 %mul1)
+  ret void
+}
+
+declare void @foo(i32)
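Taken together, the tests above all exercise one rewrite: when an instruction computes a value at a constant number of strides from a dominating "basis" instruction, SLSR re-expresses it as the basis plus a bump, and the `-gvn` run in most RUN lines then commons up the repeated bump. A minimal before/after sketch in the tests' own IR style (hypothetical snippet, not part of the patch):

    ; before -slsr: each product is computed from scratch
    %mul0 = mul i32 %b, %s      ; b * s, the basis
    %b1 = add i32 %b, 1
    %mul1 = mul i32 %b1, %s     ; (b + 1) * s

    ; after -slsr -gvn: the second product becomes a strided add
    %mul0 = mul i32 %b, %s
    %mul1 = add i32 %mul0, %s   ; (b + 1) * s == b * s + s

slsr-mul.ll checks exactly this shape (one `mul` followed by `add`s), while the GEP tests check the analogous rewrite on address computations, and the X86/AMDGPU tests check that it is skipped when the original form already fits the target's addressing modes.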

