summary | refs | log | tree | commit | diff | stats
path: root/llvm/test/Transforms/SLPVectorizer/AMDGPU
diff options
context:
space:
mode:
author Eric Christopher <echristo@gmail.com> 2019-04-17 02:12:23 +0000
committer Eric Christopher <echristo@gmail.com> 2019-04-17 02:12:23 +0000
commit a86343512845c9c1fdbac865fea88aa5fce7142a (patch)
tree 666fc6353de19ad8b00e56b67edd33f24104e4a7 /llvm/test/Transforms/SLPVectorizer/AMDGPU
parent 7f8ca6e3679b3af951cb7a4b1377edfaa3244b93 (diff)
download bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.tar.gz
bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.zip
Temporarily Revert "Add basic loop fusion pass."
As it's causing some bot failures (and per request from kbarton).

This reverts commit r358543/ab70da07286e618016e78247e4a24fcb84077fda.

llvm-svn: 358546
Diffstat (limited to 'llvm/test/Transforms/SLPVectorizer/AMDGPU')
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll | 149
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll | 250
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg | 3
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll | 203
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll | 722
5 files changed, 0 insertions, 1327 deletions
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll
deleted file mode 100644
index 735ce651ed7..00000000000
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll
+++ /dev/null
@@ -1,149 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -slp-threshold=-18 < %s | FileCheck %s
-
-; Make sure there's no SCEV assert when the indexes are for different
-; sized address spaces
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-define void @slp_scev_assert(i32 %idx, i64 %tmp3) #0 {
-; CHECK-LABEL: @slp_scev_assert(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP:%.*]] = addrspacecast i8 addrspace(5)* undef to i8*
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* undef, i32 [[IDX:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP]], i64 [[TMP3:%.*]]
-; CHECK-NEXT: store i8 0, i8 addrspace(5)* [[TMP2]]
-; CHECK-NEXT: store i8 0, i8* [[TMP4]]
-; CHECK-NEXT: ret void
-;
-bb:
- %tmp = addrspacecast i8 addrspace(5)* undef to i8*
- %tmp2 = getelementptr inbounds i8, i8 addrspace(5)* undef, i32 %idx
- %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 %tmp3
- store i8 0, i8 addrspace(5)* %tmp2
- store i8 0, i8* %tmp4
- ret void
-}
-
-define void @multi_as_reduction_different_sized(i32 addrspace(3)* %lds, i32 %idx0, i64 %idx1) #0 {
-; CHECK-LABEL: @multi_as_reduction_different_sized(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[FLAT:%.*]] = addrspacecast i32 addrspace(3)* [[LDS:%.*]] to i32*
-; CHECK-NEXT: [[ADD0:%.*]] = add i32 [[IDX0:%.*]], 2
-; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[IDX1:%.*]], 1
-; CHECK-NEXT: [[LDS_1:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* [[LDS]], i32 [[ADD0]]
-; CHECK-NEXT: [[FLAT_1:%.*]] = getelementptr inbounds i32, i32* [[FLAT]], i64 [[ADD1]]
-; CHECK-NEXT: [[LOAD_LDS_0:%.*]] = load i32, i32 addrspace(3)* [[LDS]], align 4
-; CHECK-NEXT: [[LOAD_LDS_1:%.*]] = load i32, i32 addrspace(3)* [[LDS_1]], align 4
-; CHECK-NEXT: [[LOAD_FLAT_0:%.*]] = load i32, i32* [[FLAT]], align 4
-; CHECK-NEXT: [[LOAD_FLAT_1:%.*]] = load i32, i32* [[FLAT_1]], align 4
-; CHECK-NEXT: [[SUB0:%.*]] = sub i32 [[LOAD_FLAT_0]], [[LOAD_LDS_0]]
-; CHECK-NEXT: [[SUB1:%.*]] = sub i32 [[LOAD_FLAT_1]], [[LOAD_LDS_1]]
-; CHECK-NEXT: store i32 [[SUB0]], i32* undef
-; CHECK-NEXT: store i32 [[SUB1]], i32* undef
-; CHECK-NEXT: ret void
-;
-bb:
- %flat = addrspacecast i32 addrspace(3)* %lds to i32*
- %add0 = add i32 %idx0, 2
- %add1 = add i64 %idx1, 1
-
- %lds.1 = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 %add0
- %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
-
- %load.lds.0 = load i32, i32 addrspace(3)* %lds, align 4
- %load.lds.1 = load i32, i32 addrspace(3)* %lds.1, align 4
-
- %load.flat.0 = load i32, i32* %flat, align 4
- %load.flat.1 = load i32, i32* %flat.1, align 4
-
- %sub0 = sub i32 %load.flat.0, %load.lds.0
- %sub1 = sub i32 %load.flat.1, %load.lds.1
-
- store i32 %sub0, i32* undef
- store i32 %sub1, i32* undef
- ret void
-}
-
-; This should vectorize if using GetUnderlyingObject
-define void @multi_as_reduction_same_size(i32 addrspace(1)* %global, i64 %idx0, i64 %idx1) #0 {
-; CHECK-LABEL: @multi_as_reduction_same_size(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[FLAT:%.*]] = addrspacecast i32 addrspace(1)* [[GLOBAL:%.*]] to i32*
-; CHECK-NEXT: [[ADD0:%.*]] = add i64 [[IDX0:%.*]], 2
-; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[IDX1:%.*]], 1
-; CHECK-NEXT: [[GLOBAL_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[GLOBAL]], i64 [[ADD0]]
-; CHECK-NEXT: [[FLAT_1:%.*]] = getelementptr inbounds i32, i32* [[FLAT]], i64 [[ADD1]]
-; CHECK-NEXT: [[LOAD_GLOBAL_0:%.*]] = load i32, i32 addrspace(1)* [[GLOBAL]], align 4
-; CHECK-NEXT: [[LOAD_GLOBAL_1:%.*]] = load i32, i32 addrspace(1)* [[GLOBAL_1]], align 4
-; CHECK-NEXT: [[LOAD_FLAT_0:%.*]] = load i32, i32* [[FLAT]], align 4
-; CHECK-NEXT: [[LOAD_FLAT_1:%.*]] = load i32, i32* [[FLAT_1]], align 4
-; CHECK-NEXT: [[SUB0:%.*]] = sub i32 [[LOAD_FLAT_0]], [[LOAD_GLOBAL_0]]
-; CHECK-NEXT: [[SUB1:%.*]] = sub i32 [[LOAD_FLAT_1]], [[LOAD_GLOBAL_1]]
-; CHECK-NEXT: store i32 [[SUB0]], i32* undef
-; CHECK-NEXT: store i32 [[SUB1]], i32* undef
-; CHECK-NEXT: ret void
-;
-bb:
- %flat = addrspacecast i32 addrspace(1)* %global to i32*
- %add0 = add i64 %idx0, 2
- %add1 = add i64 %idx1, 1
-
- %global.1 = getelementptr inbounds i32, i32 addrspace(1)* %global, i64 %add0
- %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
-
- %load.global.0 = load i32, i32 addrspace(1)* %global, align 4
- %load.global.1 = load i32, i32 addrspace(1)* %global.1, align 4
-
- %load.flat.0 = load i32, i32* %flat, align 4
- %load.flat.1 = load i32, i32* %flat.1, align 4
-
- %sub0 = sub i32 %load.flat.0, %load.global.0
- %sub1 = sub i32 %load.flat.1, %load.global.1
-
- store i32 %sub0, i32* undef
- store i32 %sub1, i32* undef
- ret void
-}
-
-; This should vectorize if using GetUnderlyingObject
-; The add is done in the same width, even though the address space size is smaller
-define void @multi_as_reduction_different_sized_noncanon(i32 addrspace(3)* %lds, i64 %idx0, i64 %idx1) #0 {
-; CHECK-LABEL: @multi_as_reduction_different_sized_noncanon(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[FLAT:%.*]] = addrspacecast i32 addrspace(3)* [[LDS:%.*]] to i32*
-; CHECK-NEXT: [[ADD0:%.*]] = add i64 [[IDX0:%.*]], 2
-; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[IDX1:%.*]], 1
-; CHECK-NEXT: [[LDS_1:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* [[LDS]], i64 [[ADD0]]
-; CHECK-NEXT: [[FLAT_1:%.*]] = getelementptr inbounds i32, i32* [[FLAT]], i64 [[ADD1]]
-; CHECK-NEXT: [[LOAD_LDS_0:%.*]] = load i32, i32 addrspace(3)* [[LDS]], align 4
-; CHECK-NEXT: [[LOAD_LDS_1:%.*]] = load i32, i32 addrspace(3)* [[LDS_1]], align 4
-; CHECK-NEXT: [[LOAD_FLAT_0:%.*]] = load i32, i32* [[FLAT]], align 4
-; CHECK-NEXT: [[LOAD_FLAT_1:%.*]] = load i32, i32* [[FLAT_1]], align 4
-; CHECK-NEXT: [[SUB0:%.*]] = sub i32 [[LOAD_FLAT_0]], [[LOAD_LDS_0]]
-; CHECK-NEXT: [[SUB1:%.*]] = sub i32 [[LOAD_FLAT_1]], [[LOAD_LDS_1]]
-; CHECK-NEXT: store i32 [[SUB0]], i32* undef
-; CHECK-NEXT: store i32 [[SUB1]], i32* undef
-; CHECK-NEXT: ret void
-;
-bb:
- %flat = addrspacecast i32 addrspace(3)* %lds to i32*
- %add0 = add i64 %idx0, 2
- %add1 = add i64 %idx1, 1
-
- %lds.1 = getelementptr inbounds i32, i32 addrspace(3)* %lds, i64 %add0
- %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
-
- %load.lds.0 = load i32, i32 addrspace(3)* %lds, align 4
- %load.lds.1 = load i32, i32 addrspace(3)* %lds.1, align 4
-
- %load.flat.0 = load i32, i32* %flat, align 4
- %load.flat.1 = load i32, i32* %flat.1, align 4
-
- %sub0 = sub i32 %load.flat.0, %load.lds.0
- %sub1 = sub i32 %load.flat.1, %load.lds.1
-
- store i32 %sub0, i32* undef
- store i32 %sub1, i32* undef
- ret void
-}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
deleted file mode 100644
index 4007a0d30ed..00000000000
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
+++ /dev/null
@@ -1,250 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -S -slp-threshold=-100 -slp-vectorize-hor-store -dce | FileCheck %s --check-prefix=GFX9
-
-@arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
-@arr64 = local_unnamed_addr global [32 x i64] zeroinitializer, align 16
-@var = global i32 zeroinitializer, align 8
-@var64 = global i64 zeroinitializer, align 8
-
-@farr = local_unnamed_addr global [32 x float] zeroinitializer, align 16
-@fvar = global float zeroinitializer, align 8
-
-@darr = local_unnamed_addr global [32 x double] zeroinitializer, align 16
-@dvar = global double zeroinitializer, align 8
-
-; Tests whether the min/max reduction pattern is vectorized if SLP starts at the store.
-define i32 @smaxv6() {
-; GFX9-LABEL: @smaxv6(
-; GFX9-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
-; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
-; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
-; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i32 [[TMP5]], i32 [[SELECT1]]
-; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
-; GFX9-NEXT: store i32 [[STORE_SELECT]], i32* @var, align 8
-; GFX9-NEXT: ret i32 [[OP_EXTRA]]
-;
- %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
- %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
- %cmp1 = icmp sgt i32 %load1, %load2
- %select1 = select i1 %cmp1, i32 %load1, i32 %load2
-
- %load3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
- %cmp2 = icmp sgt i32 %select1, %load3
- %select2 = select i1 %cmp2, i32 %select1, i32 %load3
-
- %load4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
- %cmp3 = icmp sgt i32 %select2, %load4
- %select3 = select i1 %cmp3, i32 %select2, i32 %load4
-
- %load5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
- %cmp4 = icmp sgt i32 %select3, %load5
- %select4 = select i1 %cmp4, i32 %select3, i32 %load5
-
- %load6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
- %cmp5 = icmp sgt i32 %select4, %load6
- %select5 = select i1 %cmp5, i32 %select4, i32 %load6
-
- %store-select = select i1 %cmp1, i32 3, i32 4
- store i32 %store-select, i32* @var, align 8
- ret i32 %select5
-}
-
-define i64 @sminv6() {
-; GFX9-LABEL: @sminv6(
-; GFX9-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([32 x i64]* @arr64 to <2 x i64>*), align 16
-; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
-; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; GFX9-NEXT: [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]]
-; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
-; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP4]], <4 x i64> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i64> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT: [[TMP6:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
-; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 [[SELECT1]]
-; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
-; GFX9-NEXT: store i64 [[STORE_SELECT]], i64* @var64, align 8
-; GFX9-NEXT: ret i64 [[OP_EXTRA]]
-;
- %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
- %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
- %cmp1 = icmp slt i64 %load1, %load2
- %select1 = select i1 %cmp1, i64 %load1, i64 %load2
-
- %load3 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2), align 16
- %cmp2 = icmp slt i64 %select1, %load3
- %select2 = select i1 %cmp2, i64 %select1, i64 %load3
-
- %load4 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 3), align 8
- %cmp3 = icmp slt i64 %select2, %load4
- %select3 = select i1 %cmp3, i64 %select2, i64 %load4
-
- %load5 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 4), align 16
- %cmp4 = icmp slt i64 %select3, %load5
- %select4 = select i1 %cmp4, i64 %select3, i64 %load5
-
- %load6 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 5), align 8
- %cmp5 = icmp slt i64 %select4, %load6
- %select5 = select i1 %cmp5, i64 %select4, i64 %load6
-
- %store-select = select i1 %cmp1, i64 3, i64 4
- store i64 %store-select, i64* @var64, align 8
- ret i64 %select5
-}
-
-define float @fmaxv6() {
-; GFX9-LABEL: @fmaxv6(
-; GFX9-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16
-; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
-; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
-; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]
-; GFX9-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <4 x float>*), align 8
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP4]], <4 x float> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast ogt float [[TMP5]], [[SELECT1]]
-; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], float [[TMP5]], float [[SELECT1]]
-; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00
-; GFX9-NEXT: store float [[STORE_SELECT]], float* @fvar, align 8
-; GFX9-NEXT: ret float [[OP_EXTRA]]
-;
- %load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16
- %load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4
- %cmp1 = fcmp fast ogt float %load1, %load2
- %select1 = select i1 %cmp1, float %load1, float %load2
-
- %load3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
- %cmp2 = fcmp fast ogt float %select1, %load3
- %select2 = select i1 %cmp2, float %select1, float %load3
-
- %load4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
- %cmp3 = fcmp fast ogt float %select2, %load4
- %select3 = select i1 %cmp3, float %select2, float %load4
-
- %load5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
- %cmp4 = fcmp fast ogt float %select3, %load5
- %select4 = select i1 %cmp4, float %select3, float %load5
-
- %load6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
- %cmp5 = fcmp fast ogt float %select4, %load6
- %select5 = select i1 %cmp5, float %select4, float %load6
-
- %store-select = select i1 %cmp1, float 3.0, float 4.0
- store float %store-select, float* @fvar, align 8
- ret float %select5
-}
-
-define double @dminv6() {
-; GFX9-LABEL: @dminv6(
-; GFX9-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16
-; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]
-; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]
-; GFX9-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <4 x double>*), align 8
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x double> [[TMP4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x double> [[TMP4]], <4 x double> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x double> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast olt double [[TMP5]], [[SELECT1]]
-; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], double [[TMP5]], double [[SELECT1]]
-; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00
-; GFX9-NEXT: store double [[STORE_SELECT]], double* @dvar, align 8
-; GFX9-NEXT: ret double [[OP_EXTRA]]
-;
- %load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16
- %load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4
- %cmp1 = fcmp fast olt double %load1, %load2
- %select1 = select i1 %cmp1, double %load1, double %load2
-
- %load3 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
- %cmp2 = fcmp fast olt double %select1, %load3
- %select2 = select i1 %cmp2, double %select1, double %load3
-
- %load4 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
- %cmp3 = fcmp fast olt double %select2, %load4
- %select3 = select i1 %cmp3, double %select2, double %load4
-
- %load5 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
- %cmp4 = fcmp fast olt double %select3, %load5
- %select4 = select i1 %cmp4, double %select3, double %load5
-
- %load6 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
- %cmp5 = fcmp fast olt double %select4, %load6
- %select5 = select i1 %cmp5, double %select4, double %load6
-
- %store-select = select i1 %cmp1, double 3.0, double 4.0
- store double %store-select, double* @dvar, align 8
- ret double %select5
-}
-
-define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
-; GFX9-LABEL: @smax_wdiff_valuenum(
-; GFX9-NEXT: [[VLOAD:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
-; GFX9-NEXT: [[ELT1:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
-; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[ELT1]], [[V1:%.*]]
-; GFX9-NEXT: [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
-; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
-; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
-; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[SELECT1]]
-; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
-; GFX9-NEXT: store i32 [[STOREVAL]], i32* @var, align 8
-; GFX9-NEXT: ret i32 [[OP_EXTRA]]
-;
- %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
- %elt1 = extractelement <2 x i32> %vload, i32 0
- %cmp1 = icmp sgt i32 %elt1, %v1
- %ex0 = extractelement <2 x i32> %vload, i32 0
- %select1 = select i1 %cmp1, i32 %ex0, i32 %v1
-
- %load3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
- %cmp2 = icmp sgt i32 %select1, %load3
- %select2 = select i1 %cmp2, i32 %select1, i32 %load3
-
- %load4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
- %cmp3 = icmp sgt i32 %select2, %load4
- %select3 = select i1 %cmp3, i32 %select2, i32 %load4
-
- %load5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
- %cmp4 = icmp sgt i32 %select3, %load5
- %select4 = select i1 %cmp4, i32 %select3, i32 %load5
-
- %load6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
- %cmp5 = icmp sgt i32 %select4, %load6
- %select5 = select i1 %cmp5, i32 %select4, i32 %load6
-
- %storeval = select i1 %cmp1, i32 3, i32 4
- store i32 %storeval, i32* @var, align 8
- ret i32 %select5
-}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg
deleted file mode 100644
index 6baccf05fff..00000000000
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-if not 'AMDGPU' in config.root.targets:
- config.unsupported = True
-
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
deleted file mode 100644
index 55905a4c444..00000000000
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
+++ /dev/null
@@ -1,203 +0,0 @@
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s
-
-; FIXME: Should still like to vectorize the memory operations for VI
-
-; Simple 3-pair chain with loads and stores
-; GCN-LABEL: @test1_as_3_3_3_v2f16(
-; GFX89: load <2 x half>, <2 x half> addrspace(3)*
-; GFX89: load <2 x half>, <2 x half> addrspace(3)*
-; GFX89: fmul <2 x half>
-; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
-; GFX89: ret
-define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
- %i0 = load half, half addrspace(3)* %a, align 2
- %i1 = load half, half addrspace(3)* %b, align 2
- %mul = fmul half %i0, %i1
- %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
- %i3 = load half, half addrspace(3)* %arrayidx3, align 2
- %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
- %i4 = load half, half addrspace(3)* %arrayidx4, align 2
- %mul5 = fmul half %i3, %i4
- store half %mul, half addrspace(3)* %c, align 2
- %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
- store half %mul5, half addrspace(3)* %arrayidx5, align 2
- ret void
-}
-
-; GCN-LABEL: @test1_as_3_0_0(
-; GFX89: load <2 x half>, <2 x half> addrspace(3)*
-; GFX89: load <2 x half>, <2 x half>*
-; GFX89: fmul <2 x half>
-; GFX89: store <2 x half> %{{.*}}, <2 x half>* %
-; GFX89: ret
-define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
- %i0 = load half, half addrspace(3)* %a, align 2
- %i1 = load half, half* %b, align 2
- %mul = fmul half %i0, %i1
- %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
- %i3 = load half, half addrspace(3)* %arrayidx3, align 2
- %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
- %i4 = load half, half* %arrayidx4, align 2
- %mul5 = fmul half %i3, %i4
- store half %mul, half* %c, align 2
- %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
- store half %mul5, half* %arrayidx5, align 2
- ret void
-}
-
-; GCN-LABEL: @test1_as_0_0_3_v2f16(
-; GFX89: load <2 x half>, <2 x half>*
-; GFX89: load <2 x half>, <2 x half>*
-; GFX89: fmul <2 x half>
-; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
-; GFX89: ret
-define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
- %i0 = load half, half* %a, align 2
- %i1 = load half, half* %b, align 2
- %mul = fmul half %i0, %i1
- %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
- %i3 = load half, half* %arrayidx3, align 2
- %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
- %i4 = load half, half* %arrayidx4, align 2
- %mul5 = fmul half %i3, %i4
- store half %mul, half addrspace(3)* %c, align 2
- %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
- store half %mul5, half addrspace(3)* %arrayidx5, align 2
- ret void
-}
-
-; GCN-LABEL: @test1_fma_v2f16(
-; GFX9: load <2 x half>
-; GFX9: load <2 x half>
-; GFX9: load <2 x half>
-; GFX9: call <2 x half> @llvm.fma.v2f16(
-; GFX9: store <2 x half>
-define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
- %i0 = load half, half addrspace(3)* %a, align 2
- %i1 = load half, half addrspace(3)* %b, align 2
- %i2 = load half, half addrspace(3)* %c, align 2
- %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
- %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
- %i3 = load half, half addrspace(3)* %arrayidx3, align 2
- %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
- %i4 = load half, half addrspace(3)* %arrayidx4, align 2
- %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
- %i5 = load half, half addrspace(3)* %arrayidx5, align 2
- %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
- store half %fma0, half addrspace(3)* %d, align 2
- %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
- store half %fma1, half addrspace(3)* %arrayidx6, align 2
- ret void
-}
-
-; GCN-LABEL: @mul_scalar_v2f16(
-; GFX9: load <2 x half>
-; GFX9: fmul <2 x half>
-; GFX9: store <2 x half>
-define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
- %i0 = load half, half addrspace(3)* %a, align 2
- %mul = fmul half %i0, %scalar
- %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
- %i3 = load half, half addrspace(3)* %arrayidx3, align 2
- %mul5 = fmul half %i3, %scalar
- store half %mul, half addrspace(3)* %c, align 2
- %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
- store half %mul5, half addrspace(3)* %arrayidx5, align 2
- ret void
-}
-
-; GCN-LABEL: @fabs_v2f16
-; GFX9: load <2 x half>
-; GFX9: call <2 x half> @llvm.fabs.v2f16(
-; GFX9: store <2 x half>
-define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
- %i0 = load half, half addrspace(3)* %a, align 2
- %fabs0 = call half @llvm.fabs.f16(half %i0)
- %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
- %i3 = load half, half addrspace(3)* %arrayidx3, align 2
- %fabs1 = call half @llvm.fabs.f16(half %i3)
- store half %fabs0, half addrspace(3)* %c, align 2
- %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
- store half %fabs1, half addrspace(3)* %arrayidx5, align 2
- ret void
-}
-
-; GCN-LABEL: @test1_fabs_fma_v2f16(
-; GFX9: load <2 x half>
-; GFX9: call <2 x half> @llvm.fabs.v2f16(
-; GFX9: call <2 x half> @llvm.fma.v2f16(
-; GFX9: store <2 x half>
-define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
- %i0 = load half, half addrspace(3)* %a, align 2
- %i1 = load half, half addrspace(3)* %b, align 2
- %i2 = load half, half addrspace(3)* %c, align 2
- %i0.fabs = call half @llvm.fabs.f16(half %i0)
-
- %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
- %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
- %i3 = load half, half addrspace(3)* %arrayidx3, align 2
- %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
- %i4 = load half, half addrspace(3)* %arrayidx4, align 2
- %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
- %i5 = load half, half addrspace(3)* %arrayidx5, align 2
- %i3.fabs = call half @llvm.fabs.f16(half %i3)
-
- %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
- store half %fma0, half addrspace(3)* %d, align 2
- %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
- store half %fma1, half addrspace(3)* %arrayidx6, align 2
- ret void
-}
-
-; FIXME: Should do vector load and extract component for fabs
-; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
-; GFX9: load half
-; GFX9: call half @llvm.fabs.f16(
-; GFX9: load <2 x half>
-; GFX9: load half
-; GFX9: load <2 x half>
-; GFX9: call <2 x half> @llvm.fma.v2f16(
-; GFX9: store <2 x half>
-define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
- %i0 = load half, half addrspace(3)* %a, align 2
- %i1 = load half, half addrspace(3)* %b, align 2
- %i2 = load half, half addrspace(3)* %c, align 2
- %i1.fabs = call half @llvm.fabs.f16(half %i1)
-
- %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
- %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
- %i3 = load half, half addrspace(3)* %arrayidx3, align 2
- %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
- %i4 = load half, half addrspace(3)* %arrayidx4, align 2
- %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
- %i5 = load half, half addrspace(3)* %arrayidx5, align 2
- %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
- store half %fma0, half addrspace(3)* %d, align 2
- %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
- store half %fma1, half addrspace(3)* %arrayidx6, align 2
- ret void
-}
-
-; GCN-LABEL: @canonicalize_v2f16
-; GFX9: load <2 x half>
-; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
-; GFX9: store <2 x half>
-define amdgpu_kernel void @canonicalize_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
- %i0 = load half, half addrspace(3)* %a, align 2
- %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
- %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
- %i3 = load half, half addrspace(3)* %arrayidx3, align 2
- %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
- store half %canonicalize0, half addrspace(3)* %c, align 2
- %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
- store half %canonicalize1, half addrspace(3)* %arrayidx5, align 2
- ret void
-}
-
-declare half @llvm.fabs.f16(half) #1
-declare half @llvm.fma.f16(half, half, half) #1
-declare half @llvm.canonicalize.f16(half) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
deleted file mode 100644
index d7434394dcc..00000000000
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ /dev/null
@@ -1,722 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s
-
-define half @reduction_half4(<4 x half> %a) {
-; GFX9-LABEL: @reduction_half4(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x half> [[A]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[BIN_RDX]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x half> [[BIN_RDX]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[BIN_RDX2]], i32 0
-; GFX9-NEXT: ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_half4(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3
-; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
-; VI-NEXT: ret half [[ADD3]]
-;
-entry:
- %elt0 = extractelement <4 x half> %a, i64 0
- %elt1 = extractelement <4 x half> %a, i64 1
- %elt2 = extractelement <4 x half> %a, i64 2
- %elt3 = extractelement <4 x half> %a, i64 3
-
- %add1 = fadd fast half %elt1, %elt0
- %add2 = fadd fast half %elt2, %add1
- %add3 = fadd fast half %elt3, %add2
-
- ret half %add3
-}
-
-define half @reduction_half8(<8 x half> %vec8) {
-; GFX9-LABEL: @reduction_half8(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x half> [[VEC8:%.*]], <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x half> [[VEC8]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x half> [[BIN_RDX]], <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x half> [[BIN_RDX]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x half> [[BIN_RDX2]], <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[BIN_RDX4]], i32 0
-; GFX9-NEXT: ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_half8(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x half> [[VEC8:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x half> [[VEC8]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x half> [[VEC8]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x half> [[VEC8]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
-; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
-; VI-NEXT: [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
-; VI-NEXT: [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
-; VI-NEXT: [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
-; VI-NEXT: [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
-; VI-NEXT: ret half [[ADD7]]
-;
-entry:
- %elt0 = extractelement <8 x half> %vec8, i64 0
- %elt1 = extractelement <8 x half> %vec8, i64 1
- %elt2 = extractelement <8 x half> %vec8, i64 2
- %elt3 = extractelement <8 x half> %vec8, i64 3
- %elt4 = extractelement <8 x half> %vec8, i64 4
- %elt5 = extractelement <8 x half> %vec8, i64 5
- %elt6 = extractelement <8 x half> %vec8, i64 6
- %elt7 = extractelement <8 x half> %vec8, i64 7
-
- %add1 = fadd fast half %elt1, %elt0
- %add2 = fadd fast half %elt2, %add1
- %add3 = fadd fast half %elt3, %add2
- %add4 = fadd fast half %elt4, %add3
- %add5 = fadd fast half %elt5, %add4
- %add6 = fadd fast half %elt6, %add5
- %add7 = fadd fast half %elt7, %add6
-
- ret half %add7
-}
-
-define half @reduction_half16(<16 x half> %vec16) {
-; GFX9-LABEL: @reduction_half16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x half> [[VEC16]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x half> [[BIN_RDX]], <16 x half> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x half> [[BIN_RDX]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x half> [[BIN_RDX2]], <16 x half> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
-; GFX9-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x half> [[BIN_RDX4]], <16 x half> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x half> [[BIN_RDX4]], [[RDX_SHUF5]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x half> [[BIN_RDX6]], i32 0
-; GFX9-NEXT: ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_half16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <16 x half> [[VEC16]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <16 x half> [[VEC16]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <16 x half> [[VEC16]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7
-; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x half> [[VEC16]], i64 8
-; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9
-; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10
-; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11
-; VI-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
-; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
-; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
-; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
-; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
-; VI-NEXT: [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
-; VI-NEXT: [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
-; VI-NEXT: [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
-; VI-NEXT: [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
-; VI-NEXT: [[ADD8:%.*]] = fadd fast half [[ELT8]], [[ADD7]]
-; VI-NEXT: [[ADD9:%.*]] = fadd fast half [[ELT9]], [[ADD8]]
-; VI-NEXT: [[ADD10:%.*]] = fadd fast half [[ELT10]], [[ADD9]]
-; VI-NEXT: [[ADD11:%.*]] = fadd fast half [[ELT11]], [[ADD10]]
-; VI-NEXT: [[ADD12:%.*]] = fadd fast half [[ELT12]], [[ADD11]]
-; VI-NEXT: [[ADD13:%.*]] = fadd fast half [[ELT13]], [[ADD12]]
-; VI-NEXT: [[ADD14:%.*]] = fadd fast half [[ELT14]], [[ADD13]]
-; VI-NEXT: [[ADD15:%.*]] = fadd fast half [[ELT15]], [[ADD14]]
-; VI-NEXT: ret half [[ADD15]]
-;
-entry:
- %elt0 = extractelement <16 x half> %vec16, i64 0
- %elt1 = extractelement <16 x half> %vec16, i64 1
- %elt2 = extractelement <16 x half> %vec16, i64 2
- %elt3 = extractelement <16 x half> %vec16, i64 3
- %elt4 = extractelement <16 x half> %vec16, i64 4
- %elt5 = extractelement <16 x half> %vec16, i64 5
- %elt6 = extractelement <16 x half> %vec16, i64 6
- %elt7 = extractelement <16 x half> %vec16, i64 7
- %elt8 = extractelement <16 x half> %vec16, i64 8
- %elt9 = extractelement <16 x half> %vec16, i64 9
- %elt10 = extractelement <16 x half> %vec16, i64 10
- %elt11 = extractelement <16 x half> %vec16, i64 11
- %elt12 = extractelement <16 x half> %vec16, i64 12
- %elt13 = extractelement <16 x half> %vec16, i64 13
- %elt14 = extractelement <16 x half> %vec16, i64 14
- %elt15 = extractelement <16 x half> %vec16, i64 15
-
- %add1 = fadd fast half %elt1, %elt0
- %add2 = fadd fast half %elt2, %add1
- %add3 = fadd fast half %elt3, %add2
- %add4 = fadd fast half %elt4, %add3
- %add5 = fadd fast half %elt5, %add4
- %add6 = fadd fast half %elt6, %add5
- %add7 = fadd fast half %elt7, %add6
- %add8 = fadd fast half %elt8, %add7
- %add9 = fadd fast half %elt9, %add8
- %add10 = fadd fast half %elt10, %add9
- %add11 = fadd fast half %elt11, %add10
- %add12 = fadd fast half %elt12, %add11
- %add13 = fadd fast half %elt13, %add12
- %add14 = fadd fast half %elt14, %add13
- %add15 = fadd fast half %elt15, %add14
-
- ret half %add15
-}
-
-; FIXME: support vectorization;
-define half @reduction_sub_half4(<4 x half> %a) {
-; GCN-LABEL: @reduction_sub_half4(
-; GCN-NEXT: entry:
-; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0
-; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1
-; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2
-; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3
-; GCN-NEXT: [[ADD1:%.*]] = fsub fast half [[ELT1]], [[ELT0]]
-; GCN-NEXT: [[ADD2:%.*]] = fsub fast half [[ELT2]], [[ADD1]]
-; GCN-NEXT: [[ADD3:%.*]] = fsub fast half [[ELT3]], [[ADD2]]
-; GCN-NEXT: ret half [[ADD3]]
-;
-entry:
- %elt0 = extractelement <4 x half> %a, i64 0
- %elt1 = extractelement <4 x half> %a, i64 1
- %elt2 = extractelement <4 x half> %a, i64 2
- %elt3 = extractelement <4 x half> %a, i64 3
-
- %add1 = fsub fast half %elt1, %elt0
- %add2 = fsub fast half %elt2, %add1
- %add3 = fsub fast half %elt3, %add2
-
- ret half %add3
-}
-
-define i16 @reduction_v4i16(<4 x i16> %a) {
-; GFX9-LABEL: @reduction_v4i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[A:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX:%.*]] = add <4 x i16> [[A]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[BIN_RDX]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX2:%.*]] = add <4 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[BIN_RDX2]], i32 0
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_v4i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[A]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[A]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[A]], i64 3
-; VI-NEXT: [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
-; VI-NEXT: ret i16 [[ADD3]]
-;
-entry:
- %elt0 = extractelement <4 x i16> %a, i64 0
- %elt1 = extractelement <4 x i16> %a, i64 1
- %elt2 = extractelement <4 x i16> %a, i64 2
- %elt3 = extractelement <4 x i16> %a, i64 3
-
- %add1 = add i16 %elt1, %elt0
- %add2 = add i16 %elt2, %add1
- %add3 = add i16 %elt3, %add2
-
- ret i16 %add3
-}
-
-define i16 @reduction_v8i16(<8 x i16> %vec8) {
-; GFX9-LABEL: @reduction_v8i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[VEC8]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX2:%.*]] = add <8 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i16> [[BIN_RDX2]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX4:%.*]] = add <8 x i16> [[BIN_RDX2]], [[RDX_SHUF3]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[BIN_RDX4]], i32 0
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_v8i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x i16> [[VEC8]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7
-; VI-NEXT: [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
-; VI-NEXT: [[ADD4:%.*]] = add i16 [[ELT4]], [[ADD3]]
-; VI-NEXT: [[ADD5:%.*]] = add i16 [[ELT5]], [[ADD4]]
-; VI-NEXT: [[ADD6:%.*]] = add i16 [[ELT6]], [[ADD5]]
-; VI-NEXT: [[ADD7:%.*]] = add i16 [[ELT7]], [[ADD6]]
-; VI-NEXT: ret i16 [[ADD7]]
-;
-entry:
- %elt0 = extractelement <8 x i16> %vec8, i64 0
- %elt1 = extractelement <8 x i16> %vec8, i64 1
- %elt2 = extractelement <8 x i16> %vec8, i64 2
- %elt3 = extractelement <8 x i16> %vec8, i64 3
- %elt4 = extractelement <8 x i16> %vec8, i64 4
- %elt5 = extractelement <8 x i16> %vec8, i64 5
- %elt6 = extractelement <8 x i16> %vec8, i64 6
- %elt7 = extractelement <8 x i16> %vec8, i64 7
-
- %add1 = add i16 %elt1, %elt0
- %add2 = add i16 %elt2, %add1
- %add3 = add i16 %elt3, %add2
- %add4 = add i16 %elt4, %add3
- %add5 = add i16 %elt5, %add4
- %add6 = add i16 %elt6, %add5
- %add7 = add i16 %elt7, %add6
-
- ret i16 %add7
-}
-
-define i16 @reduction_umin_v4i16(<4 x i16> %vec4) {
-; GFX9-LABEL: @reduction_umin_v4i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <4 x i16> [[VEC4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_umin_v4i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3
-; VI-NEXT: [[CMP1:%.*]] = icmp ult i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]]
-; VI-NEXT: [[CMP2:%.*]] = icmp ult i16 [[ELT2]], [[MIN1]]
-; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MIN1]]
-; VI-NEXT: [[CMP3:%.*]] = icmp ult i16 [[ELT3]], [[MIN2]]
-; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MIN2]]
-; VI-NEXT: ret i16 [[MIN3]]
-;
-entry:
- %elt0 = extractelement <4 x i16> %vec4, i64 0
- %elt1 = extractelement <4 x i16> %vec4, i64 1
- %elt2 = extractelement <4 x i16> %vec4, i64 2
- %elt3 = extractelement <4 x i16> %vec4, i64 3
-
- %cmp1 = icmp ult i16 %elt1, %elt0
- %min1 = select i1 %cmp1, i16 %elt1, i16 %elt0
- %cmp2 = icmp ult i16 %elt2, %min1
- %min2 = select i1 %cmp2, i16 %elt2, i16 %min1
- %cmp3 = icmp ult i16 %elt3, %min2
- %min3 = select i1 %cmp3, i16 %elt3, i16 %min2
-
- ret i16 %min3
-}
-
-define i16 @reduction_icmp_v8i16(<8 x i16> %vec8) {
-; GFX9-LABEL: @reduction_icmp_v8i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i16> [[VEC8]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i16> [[VEC8]], <8 x i16> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> [[RDX_SHUF4]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[RDX_MINMAX_SELECT6]], i32 0
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_icmp_v8i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x i16> [[VEC8]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7
-; VI-NEXT: [[CMP0:%.*]] = icmp ult i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP0]], i16 [[ELT1]], i16 [[ELT0]]
-; VI-NEXT: [[CMP1:%.*]] = icmp ult i16 [[ELT2]], [[MIN1]]
-; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP1]], i16 [[ELT2]], i16 [[MIN1]]
-; VI-NEXT: [[CMP2:%.*]] = icmp ult i16 [[ELT3]], [[MIN2]]
-; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP2]], i16 [[ELT3]], i16 [[MIN2]]
-; VI-NEXT: [[CMP3:%.*]] = icmp ult i16 [[ELT4]], [[MIN3]]
-; VI-NEXT: [[MIN4:%.*]] = select i1 [[CMP3]], i16 [[ELT4]], i16 [[MIN3]]
-; VI-NEXT: [[CMP4:%.*]] = icmp ult i16 [[ELT5]], [[MIN4]]
-; VI-NEXT: [[MIN5:%.*]] = select i1 [[CMP4]], i16 [[ELT5]], i16 [[MIN4]]
-; VI-NEXT: [[CMP5:%.*]] = icmp ult i16 [[ELT6]], [[MIN5]]
-; VI-NEXT: [[MIN6:%.*]] = select i1 [[CMP5]], i16 [[ELT6]], i16 [[MIN5]]
-; VI-NEXT: [[CMP6:%.*]] = icmp ult i16 [[ELT7]], [[MIN6]]
-; VI-NEXT: [[MIN7:%.*]] = select i1 [[CMP6]], i16 [[ELT7]], i16 [[MIN6]]
-; VI-NEXT: ret i16 [[MIN7]]
-;
-entry:
- %elt0 = extractelement <8 x i16> %vec8, i64 0
- %elt1 = extractelement <8 x i16> %vec8, i64 1
- %elt2 = extractelement <8 x i16> %vec8, i64 2
- %elt3 = extractelement <8 x i16> %vec8, i64 3
- %elt4 = extractelement <8 x i16> %vec8, i64 4
- %elt5 = extractelement <8 x i16> %vec8, i64 5
- %elt6 = extractelement <8 x i16> %vec8, i64 6
- %elt7 = extractelement <8 x i16> %vec8, i64 7
-
- %cmp0 = icmp ult i16 %elt1, %elt0
- %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
- %cmp1 = icmp ult i16 %elt2, %min1
- %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
- %cmp2 = icmp ult i16 %elt3, %min2
- %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
-
- %cmp3 = icmp ult i16 %elt4, %min3
- %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
- %cmp4 = icmp ult i16 %elt5, %min4
- %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
-
- %cmp5 = icmp ult i16 %elt6, %min5
- %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
- %cmp6 = icmp ult i16 %elt7, %min6
- %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
-
- ret i16 %min7
-}
-
-define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
-; GFX9-LABEL: @reduction_smin_v16i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i16> [[VEC16:%.*]], <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <16 x i16> [[VEC16]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i16> [[VEC16]], <16 x i16> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> [[RDX_SHUF4]]
-; GFX9-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> [[RDX_SHUF7]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x i16> [[RDX_MINMAX_SELECT9]], i32 0
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_smin_v16i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <16 x i16> [[VEC16:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <16 x i16> [[VEC16]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <16 x i16> [[VEC16]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <16 x i16> [[VEC16]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <16 x i16> [[VEC16]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <16 x i16> [[VEC16]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <16 x i16> [[VEC16]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <16 x i16> [[VEC16]], i64 7
-; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x i16> [[VEC16]], i64 8
-; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x i16> [[VEC16]], i64 9
-; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x i16> [[VEC16]], i64 10
-; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x i16> [[VEC16]], i64 11
-; VI-NEXT: [[ELT12:%.*]] = extractelement <16 x i16> [[VEC16]], i64 12
-; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x i16> [[VEC16]], i64 13
-; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x i16> [[VEC16]], i64 14
-; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x i16> [[VEC16]], i64 15
-; VI-NEXT: [[CMP0:%.*]] = icmp slt i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP0]], i16 [[ELT1]], i16 [[ELT0]]
-; VI-NEXT: [[CMP1:%.*]] = icmp slt i16 [[ELT2]], [[MIN1]]
-; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP1]], i16 [[ELT2]], i16 [[MIN1]]
-; VI-NEXT: [[CMP2:%.*]] = icmp slt i16 [[ELT3]], [[MIN2]]
-; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP2]], i16 [[ELT3]], i16 [[MIN2]]
-; VI-NEXT: [[CMP3:%.*]] = icmp slt i16 [[ELT4]], [[MIN3]]
-; VI-NEXT: [[MIN4:%.*]] = select i1 [[CMP3]], i16 [[ELT4]], i16 [[MIN3]]
-; VI-NEXT: [[CMP4:%.*]] = icmp slt i16 [[ELT5]], [[MIN4]]
-; VI-NEXT: [[MIN5:%.*]] = select i1 [[CMP4]], i16 [[ELT5]], i16 [[MIN4]]
-; VI-NEXT: [[CMP5:%.*]] = icmp slt i16 [[ELT6]], [[MIN5]]
-; VI-NEXT: [[MIN6:%.*]] = select i1 [[CMP5]], i16 [[ELT6]], i16 [[MIN5]]
-; VI-NEXT: [[CMP6:%.*]] = icmp slt i16 [[ELT7]], [[MIN6]]
-; VI-NEXT: [[MIN7:%.*]] = select i1 [[CMP6]], i16 [[ELT7]], i16 [[MIN6]]
-; VI-NEXT: [[CMP7:%.*]] = icmp slt i16 [[ELT8]], [[MIN7]]
-; VI-NEXT: [[MIN8:%.*]] = select i1 [[CMP7]], i16 [[ELT8]], i16 [[MIN7]]
-; VI-NEXT: [[CMP8:%.*]] = icmp slt i16 [[ELT9]], [[MIN8]]
-; VI-NEXT: [[MIN9:%.*]] = select i1 [[CMP8]], i16 [[ELT9]], i16 [[MIN8]]
-; VI-NEXT: [[CMP9:%.*]] = icmp slt i16 [[ELT10]], [[MIN9]]
-; VI-NEXT: [[MIN10:%.*]] = select i1 [[CMP9]], i16 [[ELT10]], i16 [[MIN9]]
-; VI-NEXT: [[CMP10:%.*]] = icmp slt i16 [[ELT11]], [[MIN10]]
-; VI-NEXT: [[MIN11:%.*]] = select i1 [[CMP10]], i16 [[ELT11]], i16 [[MIN10]]
-; VI-NEXT: [[CMP11:%.*]] = icmp slt i16 [[ELT12]], [[MIN11]]
-; VI-NEXT: [[MIN12:%.*]] = select i1 [[CMP11]], i16 [[ELT12]], i16 [[MIN11]]
-; VI-NEXT: [[CMP12:%.*]] = icmp slt i16 [[ELT13]], [[MIN12]]
-; VI-NEXT: [[MIN13:%.*]] = select i1 [[CMP12]], i16 [[ELT13]], i16 [[MIN12]]
-; VI-NEXT: [[CMP13:%.*]] = icmp slt i16 [[ELT14]], [[MIN13]]
-; VI-NEXT: [[MIN14:%.*]] = select i1 [[CMP13]], i16 [[ELT14]], i16 [[MIN13]]
-; VI-NEXT: [[CMP14:%.*]] = icmp slt i16 [[ELT15]], [[MIN14]]
-; VI-NEXT: [[MIN15:%.*]] = select i1 [[CMP14]], i16 [[ELT15]], i16 [[MIN14]]
-; VI-NEXT: ret i16 [[MIN15]]
-;
-entry:
- %elt0 = extractelement <16 x i16> %vec16, i64 0
- %elt1 = extractelement <16 x i16> %vec16, i64 1
- %elt2 = extractelement <16 x i16> %vec16, i64 2
- %elt3 = extractelement <16 x i16> %vec16, i64 3
- %elt4 = extractelement <16 x i16> %vec16, i64 4
- %elt5 = extractelement <16 x i16> %vec16, i64 5
- %elt6 = extractelement <16 x i16> %vec16, i64 6
- %elt7 = extractelement <16 x i16> %vec16, i64 7
-
- %elt8 = extractelement <16 x i16> %vec16, i64 8
- %elt9 = extractelement <16 x i16> %vec16, i64 9
- %elt10 = extractelement <16 x i16> %vec16, i64 10
- %elt11 = extractelement <16 x i16> %vec16, i64 11
- %elt12 = extractelement <16 x i16> %vec16, i64 12
- %elt13 = extractelement <16 x i16> %vec16, i64 13
- %elt14 = extractelement <16 x i16> %vec16, i64 14
- %elt15 = extractelement <16 x i16> %vec16, i64 15
-
- %cmp0 = icmp slt i16 %elt1, %elt0
- %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
- %cmp1 = icmp slt i16 %elt2, %min1
- %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
- %cmp2 = icmp slt i16 %elt3, %min2
- %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
-
- %cmp3 = icmp slt i16 %elt4, %min3
- %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
- %cmp4 = icmp slt i16 %elt5, %min4
- %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
-
- %cmp5 = icmp slt i16 %elt6, %min5
- %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
- %cmp6 = icmp slt i16 %elt7, %min6
- %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
-
- %cmp7 = icmp slt i16 %elt8, %min7
- %min8 = select i1 %cmp7, i16 %elt8, i16 %min7
- %cmp8 = icmp slt i16 %elt9, %min8
- %min9 = select i1 %cmp8, i16 %elt9, i16 %min8
-
- %cmp9 = icmp slt i16 %elt10, %min9
- %min10 = select i1 %cmp9, i16 %elt10, i16 %min9
- %cmp10 = icmp slt i16 %elt11, %min10
- %min11 = select i1 %cmp10, i16 %elt11, i16 %min10
-
- %cmp11 = icmp slt i16 %elt12, %min11
- %min12 = select i1 %cmp11, i16 %elt12, i16 %min11
- %cmp12 = icmp slt i16 %elt13, %min12
- %min13 = select i1 %cmp12, i16 %elt13, i16 %min12
-
- %cmp13 = icmp slt i16 %elt14, %min13
- %min14 = select i1 %cmp13, i16 %elt14, i16 %min13
- %cmp14 = icmp slt i16 %elt15, %min14
- %min15 = select i1 %cmp14, i16 %elt15, i16 %min14
-
-
- ret i16 %min15
-}
-
-define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
-; GFX9-LABEL: @reduction_umax_v4i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i16> [[VEC4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ugt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_umax_v4i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3
-; VI-NEXT: [[CMP1:%.*]] = icmp ugt i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]]
-; VI-NEXT: [[CMP2:%.*]] = icmp ugt i16 [[ELT2]], [[MAX1]]
-; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MAX1]]
-; VI-NEXT: [[CMP3:%.*]] = icmp ugt i16 [[ELT3]], [[MAX2]]
-; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MAX2]]
-; VI-NEXT: ret i16 [[MAX3]]
-;
-entry:
- %elt0 = extractelement <4 x i16> %vec4, i64 0
- %elt1 = extractelement <4 x i16> %vec4, i64 1
- %elt2 = extractelement <4 x i16> %vec4, i64 2
- %elt3 = extractelement <4 x i16> %vec4, i64 3
-
- %cmp1 = icmp ugt i16 %elt1, %elt0
- %max1 = select i1 %cmp1, i16 %elt1, i16 %elt0
- %cmp2 = icmp ugt i16 %elt2, %max1
- %max2 = select i1 %cmp2, i16 %elt2, i16 %max1
- %cmp3 = icmp ugt i16 %elt3, %max2
- %max3 = select i1 %cmp3, i16 %elt3, i16 %max2
-
- ret i16 %max3
-}
-
-define i16 @reduction_smax_v4i16(<4 x i16> %vec4) {
-; GFX9-LABEL: @reduction_smax_v4i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i16> [[VEC4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_smax_v4i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3
-; VI-NEXT: [[CMP1:%.*]] = icmp sgt i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]]
-; VI-NEXT: [[CMP2:%.*]] = icmp sgt i16 [[ELT2]], [[MAX1]]
-; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MAX1]]
-; VI-NEXT: [[CMP3:%.*]] = icmp sgt i16 [[ELT3]], [[MAX2]]
-; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MAX2]]
-; VI-NEXT: ret i16 [[MAX3]]
-;
-entry:
- %elt0 = extractelement <4 x i16> %vec4, i64 0
- %elt1 = extractelement <4 x i16> %vec4, i64 1
- %elt2 = extractelement <4 x i16> %vec4, i64 2
- %elt3 = extractelement <4 x i16> %vec4, i64 3
-
- %cmp1 = icmp sgt i16 %elt1, %elt0
- %max1 = select i1 %cmp1, i16 %elt1, i16 %elt0
- %cmp2 = icmp sgt i16 %elt2, %max1
- %max2 = select i1 %cmp2, i16 %elt2, i16 %max1
- %cmp3 = icmp sgt i16 %elt3, %max2
- %max3 = select i1 %cmp3, i16 %elt3, i16 %max2
-
- ret i16 %max3
-}
-
-define half @reduction_fmax_v4half(<4 x half> %vec4) {
-; GFX9-LABEL: @reduction_fmax_v4half(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x half> [[VEC4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT: ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_fmax_v4half(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
-; VI-NEXT: [[CMP1:%.*]] = fcmp fast ogt half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
-; VI-NEXT: [[CMP2:%.*]] = fcmp fast ogt half [[ELT2]], [[MAX1]]
-; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MAX1]]
-; VI-NEXT: [[CMP3:%.*]] = fcmp fast ogt half [[ELT3]], [[MAX2]]
-; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MAX2]]
-; VI-NEXT: ret half [[MAX3]]
-;
-entry:
- %elt0 = extractelement <4 x half> %vec4, i64 0
- %elt1 = extractelement <4 x half> %vec4, i64 1
- %elt2 = extractelement <4 x half> %vec4, i64 2
- %elt3 = extractelement <4 x half> %vec4, i64 3
-
- %cmp1 = fcmp fast ogt half %elt1, %elt0
- %max1 = select i1 %cmp1, half %elt1, half %elt0
- %cmp2 = fcmp fast ogt half %elt2, %max1
- %max2 = select i1 %cmp2, half %elt2, half %max1
- %cmp3 = fcmp fast ogt half %elt3, %max2
- %max3 = select i1 %cmp3, half %elt3, half %max2
-
- ret half %max3
-}
-
-define half @reduction_fmin_v4half(<4 x half> %vec4) {
-; GFX9-LABEL: @reduction_fmin_v4half(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x half> [[VEC4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT: ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_fmin_v4half(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
-; VI-NEXT: [[CMP1:%.*]] = fcmp fast olt half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
-; VI-NEXT: [[CMP2:%.*]] = fcmp fast olt half [[ELT2]], [[MIN1]]
-; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MIN1]]
-; VI-NEXT: [[CMP3:%.*]] = fcmp fast olt half [[ELT3]], [[MIN2]]
-; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MIN2]]
-; VI-NEXT: ret half [[MIN3]]
-;
-entry:
- %elt0 = extractelement <4 x half> %vec4, i64 0
- %elt1 = extractelement <4 x half> %vec4, i64 1
- %elt2 = extractelement <4 x half> %vec4, i64 2
- %elt3 = extractelement <4 x half> %vec4, i64 3
-
- %cmp1 = fcmp fast olt half %elt1, %elt0
- %min1 = select i1 %cmp1, half %elt1, half %elt0
- %cmp2 = fcmp fast olt half %elt2, %min1
- %min2 = select i1 %cmp2, half %elt2, half %min1
- %cmp3 = fcmp fast olt half %elt3, %min2
- %min3 = select i1 %cmp3, half %elt3, half %min2
-
- ret half %min3
-}
-
-; Tests to make sure reduction does not kick in. vega does not support packed math for types larger than 16 bits.
-define float @reduction_v4float(<4 x float> %a) {
-; GCN-LABEL: @reduction_v4float(
-; GCN-NEXT: entry:
-; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
-; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[A]], i64 1
-; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x float> [[A]], i64 2
-; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x float> [[A]], i64 3
-; GCN-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
-; GCN-NEXT: [[ADD2:%.*]] = fadd fast float [[ELT2]], [[ADD1]]
-; GCN-NEXT: [[ADD3:%.*]] = fadd fast float [[ELT3]], [[ADD2]]
-; GCN-NEXT: ret float [[ADD3]]
-;
-entry:
- %elt0 = extractelement <4 x float> %a, i64 0
- %elt1 = extractelement <4 x float> %a, i64 1
- %elt2 = extractelement <4 x float> %a, i64 2
- %elt3 = extractelement <4 x float> %a, i64 3
-
- %add1 = fadd fast float %elt1, %elt0
- %add2 = fadd fast float %elt2, %add1
- %add3 = fadd fast float %elt3, %add2
-
- ret float %add3
-} \ No newline at end of file
OpenPOWER on IntegriCloud