path: root/llvm/test/Transforms/LoopStrengthReduce/AMDGPU
author    Eric Christopher <echristo@gmail.com>  2019-04-17 04:52:47 +0000
committer Eric Christopher <echristo@gmail.com>  2019-04-17 04:52:47 +0000
commit    cee313d288a4faf0355d76fb6e0e927e211d08a5 (patch)
tree      d386075318d761197779a96e5d8fc0dc7b06342b /llvm/test/Transforms/LoopStrengthReduce/AMDGPU
parent    c3d6a929fdd92fd06d4304675ade8d7210ee711a (diff)
Revert "Temporarily Revert "Add basic loop fusion pass.""
The reversion apparently deleted the test/Transforms directory. Will be re-reverting again.

llvm-svn: 358552
Diffstat (limited to 'llvm/test/Transforms/LoopStrengthReduce/AMDGPU')
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll                                   | 167
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll | 156
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll                 |  30
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lit.local.cfg                                |   3
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll                 | 131
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void.ll                                  |  37
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll                 |  95
7 files changed, 619 insertions, 0 deletions
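
Each restored file is a lit-driven FileCheck test: lit reads the RUN line at the top of the file and substitutes %s with the file's own path. As a minimal sketch for the atomics.ll test below, assuming opt and FileCheck from a built LLVM tree are on PATH, the same check can be reproduced by hand:

  opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce \
      llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll \
    | FileCheck -check-prefix=OPT llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll

The usual route is simply llvm-lit llvm/test/Transforms/LoopStrengthReduce/AMDGPU from the build directory, which performs the same substitution for every test in the directory.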
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
new file mode 100644
index 00000000000..f5846c5eebc
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
@@ -0,0 +1,167 @@
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; Make sure the pointer / address space of AtomicRMW is considered
+
+; OPT-LABEL: @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(
+
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = atomicrmw add i32 addrspace(3)* %scevgep4, i32 undef seq_cst
+; OPT: %tmp7 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 undef seq_cst
+; OPT: %0 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 %tmp8 seq_cst
+; OPT: br i1 %exitcond
+define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+ %tmp = icmp sgt i32 %n, 0
+ br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %bb
+ br label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %bb
+ ret void
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+ %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+ %tmp4 = atomicrmw add i32 addrspace(3)* %tmp3, i32 undef seq_cst
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+ %tmp7 = atomicrmw add i32 addrspace(3)* %tmp6, i32 undef seq_cst
+ %tmp8 = add nsw i32 %tmp7, %tmp4
+ atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %n
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = cmpxchg i32 addrspace(3)* %scevgep4, i32 undef, i32 undef seq_cst monotonic
+define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+ %tmp = icmp sgt i32 %n, 0
+ br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %bb
+ br label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %bb
+ ret void
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+ %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+ %tmp4 = cmpxchg i32 addrspace(3)* %tmp3, i32 undef, i32 undef seq_cst monotonic
+ %tmp4.0 = extractvalue { i32, i1 } %tmp4, 0
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+ %tmp7 = cmpxchg i32 addrspace(3)* %tmp6, i32 undef, i32 undef seq_cst monotonic
+ %tmp7.0 = extractvalue { i32, i1 } %tmp7, 0
+ %tmp8 = add nsw i32 %tmp7.0, %tmp4.0
+ atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %n
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %scevgep4, i32 undef, i32 0, i32 0, i1 false)
+; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+ %tmp = icmp sgt i32 %n, 0
+ br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %bb
+ br label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %bb
+ ret void
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+ %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+ %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false)
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+ %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false)
+ %tmp8 = add nsw i32 %tmp7, %tmp4
+ atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %n
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %scevgep4, i32 undef, i32 0, i32 0, i1 false)
+; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+ %tmp = icmp sgt i32 %n, 0
+ br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %bb
+ br label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %bb
+ ret void
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+ %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+ %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false)
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+ %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false)
+ %tmp8 = add nsw i32 %tmp7, %tmp4
+ atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %n
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind argmemonly }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
new file mode 100644
index 00000000000..568809b093c
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
@@ -0,0 +1,156 @@
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s
+
+; Test that loops with different maximum offsets for different address
+; spaces are correctly handled.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_i32(
+; OPT: {{^}}.lr.ph:
+; OPT: %lsr.iv2 = phi i8 addrspace(1)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv2, i64 4095
+; OPT: load i8, i8 addrspace(1)* %scevgep4, align 1
+define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+ %tmp = icmp sgt i32 %n, 0
+ br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %bb
+ br label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %bb
+ ret void
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %tmp1 = add nuw nsw i64 %indvars.iv, 4095
+ %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %tmp1
+ %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
+ %tmp4 = sext i8 %tmp3 to i32
+ %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
+ %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
+ %tmp7 = add nsw i32 %tmp6, %tmp4
+ store i32 %tmp7, i32 addrspace(1)* %tmp5, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_p1_i32(
+; OPT: {{^}}.lr.ph.preheader:
+; OPT: %scevgep2 = getelementptr i8, i8 addrspace(1)* %arg1, i64 4096
+; OPT: br label %.lr.ph
+
+; OPT: {{^}}.lr.ph:
+; OPT: %lsr.iv3 = phi i8 addrspace(1)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv3, i64 1
+define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+ %tmp = icmp sgt i32 %n, 0
+ br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %bb
+ br label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %bb
+ ret void
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %tmp1 = add nuw nsw i64 %indvars.iv, 4096
+ %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %tmp1
+ %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
+ %tmp4 = sext i8 %tmp3 to i32
+ %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
+ %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
+ %tmp7 = add nsw i32 %tmp6, %tmp4
+ store i32 %tmp7, i32 addrspace(1)* %tmp5, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_i32(
+; OPT: {{^}}.lr.ph
+; OPT: %lsr.iv2 = phi i8 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv2, i32 65535
+; OPT: %tmp4 = load i8, i8 addrspace(3)* %scevgep4, align 1
+define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+ %tmp = icmp sgt i32 %n, 0
+ br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %bb
+ br label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %bb
+ ret void
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %tmp1 = add nuw nsw i64 %indvars.iv, 65535
+ %tmp2 = trunc i64 %tmp1 to i32
+ %tmp3 = getelementptr inbounds i8, i8 addrspace(3)* %arg1, i32 %tmp2
+ %tmp4 = load i8, i8 addrspace(3)* %tmp3, align 1
+ %tmp5 = sext i8 %tmp4 to i32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = add nsw i32 %tmp7, %tmp5
+ store i32 %tmp8, i32 addrspace(1)* %tmp6, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_p1_i32(
+; OPT: {{^}}.lr.ph.preheader:
+; OPT: %scevgep2 = getelementptr i8, i8 addrspace(3)* %arg1, i32 65536
+; OPT: br label %.lr.ph
+
+; OPT: {{^}}.lr.ph:
+; OPT: %lsr.iv3 = phi i8 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv3, i32 1
+define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+ %tmp = icmp sgt i32 %n, 0
+ br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %bb
+ br label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %bb
+ ret void
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %tmp1 = add nuw nsw i64 %indvars.iv, 65536
+ %tmp2 = trunc i64 %tmp1 to i32
+ %tmp3 = getelementptr inbounds i8, i8 addrspace(3)* %arg1, i32 %tmp2
+ %tmp4 = load i8, i8 addrspace(3)* %tmp3, align 1
+ %tmp5 = sext i8 %tmp4 to i32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = add nsw i32 %tmp7, %tmp5
+ store i32 %tmp8, i32 addrspace(1)* %tmp6, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
new file mode 100644
index 00000000000..d558aa24304
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "amdgcn--"
+
+; We need to compile this for a target where we have different address spaces,
+; and where pointers in those address spaces have different size.
+; E.g. for amdgcn-- pointers in address space 0 are 32 bits and pointers in
+; address space 1 are 64 bits.
+
+; We shouldn't crash. Check that we get a loop with the two stores.
+;CHECK-LABEL: foo:
+;CHECK: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]:
+;CHECK: buffer_store_dword
+;CHECK: buffer_store_dword
+;CHECK: s_branch [[LOOP_LABEL]]
+
+define amdgpu_kernel void @foo() {
+entry:
+ br label %loop
+
+loop:
+ %idx0 = phi i32 [ %next_idx0, %loop ], [ 0, %entry ]
+ %0 = getelementptr inbounds i32, i32 addrspace(5)* null, i32 %idx0
+ %1 = getelementptr inbounds i32, i32 addrspace(1)* null, i32 %idx0
+ store i32 1, i32 addrspace(5)* %0
+ store i32 7, i32 addrspace(1)* %1
+ %next_idx0 = add nuw nsw i32 %idx0, 1
+ br label %loop
+}
+
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lit.local.cfg b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lit.local.cfg
new file mode 100644
index 00000000000..6baccf05fff
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'AMDGPU' in config.root.targets:
+ config.unsupported = True
+
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
new file mode 100644
index 00000000000..fdbd0dada2c
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
@@ -0,0 +1,131 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -print-lsr-output < %s 2>&1 | FileCheck %s
+
+; Test various conditions where OptimizeLoopTermCond doesn't look at a
+; memory instruction use and fails to find the address space.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; CHECK-LABEL: @local_cmp_user(
+; CHECK: bb11:
+; CHECK: %lsr.iv1 = phi i32 [ %lsr.iv.next2, %bb ], [ 2, %entry ]
+; CHECK: %lsr.iv = phi i32 [ %lsr.iv.next, %bb ], [ %{{[0-9]+}}, %entry ]
+; CHECK: %lsr.iv.next = add i32 %lsr.iv, -1
+; CHECK: %lsr.iv.next2 = add i32 %lsr.iv1, -2
+; CHECK: br i1
+
+; CHECK: bb:
+; CHECK: inttoptr i32 %lsr.iv.next2 to i8 addrspace(3)*
+; CHECK: %c1 = icmp ne i8 addrspace(3)*
+define amdgpu_kernel void @local_cmp_user(i32 %arg0) nounwind {
+entry:
+ br label %bb11
+
+bb11:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %bb ]
+ %ii = shl i32 %i, 1
+ %c0 = icmp eq i32 %i, %arg0
+ br i1 %c0, label %bb13, label %bb
+
+bb:
+ %t = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* undef
+ %p = getelementptr i8, i8 addrspace(3)* %t, i32 %ii
+ %c1 = icmp ne i8 addrspace(3)* %p, null
+ %i.next = add i32 %i, 1
+ br i1 %c1, label %bb11, label %bb13
+
+bb13:
+ unreachable
+}
+
+; CHECK-LABEL: @global_cmp_user(
+; CHECK: %lsr.iv1 = phi i64
+; CHECK: %lsr.iv = phi i64
+; CHECK: %lsr.iv.next = add i64 %lsr.iv, -1
+; CHECK: %lsr.iv.next2 = add i64 %lsr.iv1, -2
+; CHECK: br i1
+
+; CHECK: bb:
+; CHECK: inttoptr i64 %lsr.iv.next2 to i8 addrspace(1)*
+; CHECK: icmp ne i8 addrspace(1)* %t
+define amdgpu_kernel void @global_cmp_user(i64 %arg0) nounwind {
+entry:
+ br label %bb11
+
+bb11:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %bb ]
+ %ii = shl i64 %i, 1
+ %c0 = icmp eq i64 %i, %arg0
+ br i1 %c0, label %bb13, label %bb
+
+bb:
+ %t = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* undef
+ %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii
+ %c1 = icmp ne i8 addrspace(1)* %p, null
+ %i.next = add i64 %i, 1
+ br i1 %c1, label %bb11, label %bb13
+
+bb13:
+ unreachable
+}
+
+; CHECK-LABEL: @global_gep_user(
+; CHECK: %lsr.iv1 = phi i32 [ %lsr.iv.next2, %bb ], [ 0, %entry ]
+; CHECK: %lsr.iv = phi i32 [ %lsr.iv.next, %bb ], [ %{{[0-9]+}}, %entry ]
+; CHECK: %lsr.iv.next = add i32 %lsr.iv, -1
+; CHECK: %lsr.iv.next2 = add i32 %lsr.iv1, 2
+; CHECK: br i1
+
+; CHECK: bb:
+; CHECK: %idxprom = sext i32 %lsr.iv1 to i64
+; CHECK: getelementptr i8, i8 addrspace(1)* %t, i64 %idxprom
+define amdgpu_kernel void @global_gep_user(i32 %arg0) nounwind {
+entry:
+ br label %bb11
+
+bb11:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %bb ]
+ %ii = shl i32 %i, 1
+ %c0 = icmp eq i32 %i, %arg0
+ br i1 %c0, label %bb13, label %bb
+
+bb:
+ %t = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* undef
+ %p = getelementptr i8, i8 addrspace(1)* %t, i32 %ii
+ %c1 = icmp ne i8 addrspace(1)* %p, null
+ %i.next = add i32 %i, 1
+ br i1 %c1, label %bb11, label %bb13
+
+bb13:
+ unreachable
+}
+
+; CHECK-LABEL: @global_sext_scale_user(
+; CHECK: %lsr.iv1 = phi i32 [ %lsr.iv.next2, %bb ], [ 0, %entry ]
+; CHECK: %lsr.iv = phi i32 [ %lsr.iv.next, %bb ], [ %{{[0-9]+}}, %entry ]
+; CHECK: %lsr.iv.next = add i32 %lsr.iv, -1
+; CHECK: %lsr.iv.next2 = add i32 %lsr.iv1, 2
+; CHECK: br i1
+
+; CHECK: bb
+; CHECK: %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii.ext
+define amdgpu_kernel void @global_sext_scale_user(i32 %arg0) nounwind {
+entry:
+ br label %bb11
+
+bb11:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %bb ]
+ %ii = shl i32 %i, 1
+ %ii.ext = sext i32 %ii to i64
+ %c0 = icmp eq i32 %i, %arg0
+ br i1 %c0, label %bb13, label %bb
+
+bb:
+ %t = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* undef
+ %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii.ext
+ %c1 = icmp ne i8 addrspace(1)* %p, null
+ %i.next = add i32 %i, 1
+ br i1 %c1, label %bb11, label %bb13
+
+bb13:
+ unreachable
+}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void.ll
new file mode 100644
index 00000000000..9a32fbec4f5
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+
+@array = external addrspace(4) constant [32 x [800 x i32]], align 4
+
+; GCN-LABEL: {{^}}test_lsr_voidty:
+define amdgpu_kernel void @test_lsr_voidty() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body.i, %entry
+ br label %for.body.i
+
+for.body.i: ; preds = %for.body.i, %for.body
+ %ij = phi i32 [ 0, %for.body ], [ %inc14, %for.body.i ]
+ %tmp = load i32, i32 addrspace(5)* undef, align 4
+ %inc13 = or i32 %ij, 2
+ %shl = shl i32 1, 0
+ %and = and i32 %shl, %tmp
+ %tobool = icmp eq i32 %and, 0
+ %add = mul nuw nsw i32 %inc13, 5
+ %tmp1 = zext i32 %add to i64
+ %arrayidx8 = getelementptr inbounds [32 x [800 x i32]], [32 x [800 x i32]] addrspace(4)* @array, i64 0, i64 undef, i64 %tmp1
+ %tmp2 = load i32, i32 addrspace(4)* %arrayidx8, align 4
+ %and9 = select i1 %tobool, i32 0, i32 %tmp2
+ %xor = xor i32 undef, %and9
+ %inc1 = or i32 %ij, 3
+ %add2 = mul nuw nsw i32 %inc1, 5
+ %add6 = add nuw nsw i32 %add2, 1
+ %tmp3 = zext i32 %add6 to i64
+ %arrayidx9 = getelementptr inbounds [32 x [800 x i32]], [32 x [800 x i32]] addrspace(4)* @array, i64 0, i64 undef, i64 %tmp3
+ %tmp4 = bitcast i32 addrspace(4)* %arrayidx9 to <4 x i32> addrspace(4)*
+ %tmp5 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp4, align 4
+ %reorder_shuffle2 = shufflevector <4 x i32> %tmp5, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %tmp6 = select <4 x i1> undef, <4 x i32> zeroinitializer, <4 x i32> %reorder_shuffle2
+ %inc14 = add nuw nsw i32 %ij, 4
+ br i1 undef, label %for.body, label %for.body.i
+}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
new file mode 100644
index 00000000000..38ff1708f42
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
@@ -0,0 +1,95 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -loop-reduce %s | FileCheck %s
+
+; Test for assert resulting from inconsistent isLegalAddressingMode
+; answers when the address space was dropped from the query.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+%0 = type { i32, double, i32, float }
+
+; CHECK-LABEL: @lsr_crash_preserve_addrspace_unknown_type(
+; CHECK: %tmp4 = bitcast %0 addrspace(3)* %tmp to double addrspace(3)*
+; CHECK: %scevgep5 = getelementptr double, double addrspace(3)* %tmp4, i32 1
+; CHECK: load double, double addrspace(3)* %scevgep5
+
+; CHECK: %scevgep = getelementptr i32, i32 addrspace(3)* %tmp1, i32 4
+; CHECK:%tmp14 = load i32, i32 addrspace(3)* %scevgep
+define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb17, %bb
+ %tmp = phi %0 addrspace(3)* [ undef, %bb ], [ %tmp18, %bb17 ]
+ %tmp2 = getelementptr inbounds %0, %0 addrspace(3)* %tmp, i64 0, i32 1
+ %tmp3 = load double, double addrspace(3)* %tmp2, align 8
+ br label %bb4
+
+bb4: ; preds = %bb1
+ br i1 undef, label %bb8, label %bb5
+
+bb5: ; preds = %bb4
+ unreachable
+
+bb8: ; preds = %bb4
+ %tmp9 = getelementptr inbounds %0, %0 addrspace(3)* %tmp, i64 0, i32 0
+ %tmp10 = load i32, i32 addrspace(3)* %tmp9, align 4
+ %tmp11 = icmp eq i32 0, %tmp10
+ br i1 %tmp11, label %bb12, label %bb17
+
+bb12: ; preds = %bb8
+ %tmp13 = getelementptr inbounds %0, %0 addrspace(3)* %tmp, i64 0, i32 2
+ %tmp14 = load i32, i32 addrspace(3)* %tmp13, align 4
+ %tmp15 = icmp eq i32 0, %tmp14
+ br i1 %tmp15, label %bb16, label %bb17
+
+bb16: ; preds = %bb12
+ unreachable
+
+bb17: ; preds = %bb12, %bb8
+ %tmp18 = getelementptr inbounds %0, %0 addrspace(3)* %tmp, i64 2
+ br label %bb1
+}
+
+; CHECK-LABEL: @lsr_crash_preserve_addrspace_unknown_type2(
+; CHECK: %scevgep3 = getelementptr i8, i8 addrspace(5)* %array, i32 %j
+; CHECK: %scevgep2 = getelementptr i8, i8 addrspace(5)* %array, i32 %j
+; CHECK: %n8 = load i8, i8 addrspace(5)* %scevgep2, align 4
+; CHECK: call void @llvm.memcpy.p5i8.p3i8.i64(i8 addrspace(5)* %scevgep3, i8 addrspace(3)* %scevgep4, i64 42, i1 false)
+; CHECK: call void @llvm.memmove.p5i8.p3i8.i64(i8 addrspace(5)* %scevgep3, i8 addrspace(3)* %scevgep4, i64 42, i1 false)
+; CHECK: call void @llvm.memset.p5i8.i64(i8 addrspace(5)* %scevgep3, i8 42, i64 42, i1 false)
+define void @lsr_crash_preserve_addrspace_unknown_type2(i8 addrspace(5)* %array, i8 addrspace(3)* %array2) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.inc
+ %j = phi i32 [ %add, %for.inc ], [ 0, %entry ]
+ %idx = getelementptr inbounds i8, i8 addrspace(5)* %array, i32 %j
+ %idx1 = getelementptr inbounds i8, i8 addrspace(3)* %array2, i32 %j
+ %t = getelementptr inbounds i8, i8 addrspace(5)* %array, i32 %j
+ %n8 = load i8, i8 addrspace(5)* %t, align 4
+ %n7 = getelementptr inbounds i8, i8 addrspace(5)* %t, i32 42
+ %n9 = load i8, i8 addrspace(5)* %n7, align 4
+ %cmp = icmp sgt i32 %j, 42
+ %add = add nuw nsw i32 %j, 1
+ br i1 %cmp, label %if.then17, label %for.inc
+
+if.then17: ; preds = %for.body
+ call void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* %idx, i8 addrspace(3)* %idx1, i64 42, i1 false)
+ call void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* %idx, i8 addrspace(3)* %idx1, i64 42, i1 false)
+ call void @llvm.memset.p5i8.i64(i8 addrspace(5)* %idx, i8 42, i64 42, i1 false)
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then17
+ %exitcond = icmp eq i1 %cmp, 1
+ br i1 %exitcond, label %end, label %for.body
+
+end: ; preds = %for.inc
+ ret void
+}
+
+declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)*, i8 addrspace(3)*, i64, i1)
+declare void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)*, i8 addrspace(3)*, i64, i1)
+declare void @llvm.memset.p5i8.i64(i8 addrspace(5)*, i8, i64, i1)
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }