2 files changed, 126 insertions, 65 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 81878b7abb1..dd654b7b493 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5469,6 +5469,77 @@ class HorizontalReduction {
         return OperationData(
             Instruction::FCmp, LHS, RHS, RK_Max,
             cast<Instruction>(Select->getCondition())->hasNoNaNs());
+      } else {
+        // Try harder: look for min/max pattern based on instructions producing
+        // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
+        // During the intermediate stages of SLP, it's very common to have
+        // pattern like this (since optimizeGatherSequence is run only once
+        // at the end):
+        // %1 = extractelement <2 x i32> %a, i32 0
+        // %2 = extractelement <2 x i32> %a, i32 1
+        // %cond = icmp sgt i32 %1, %2
+        // %3 = extractelement <2 x i32> %a, i32 0
+        // %4 = extractelement <2 x i32> %a, i32 1
+        // %select = select i1 %cond, i32 %3, i32 %4
+        CmpInst::Predicate Pred;
+        Instruction *L1;
+        Instruction *L2;
+
+        LHS = Select->getTrueValue();
+        RHS = Select->getFalseValue();
+        Value *Cond = Select->getCondition();
+
+        // TODO: Support inverse predicates.
+        if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
+          if (!isa<ExtractElementInst>(RHS) ||
+              !L2->isIdenticalTo(cast<Instruction>(RHS)))
+            return OperationData(V);
+        } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
+          if (!isa<ExtractElementInst>(LHS) ||
+              !L1->isIdenticalTo(cast<Instruction>(LHS)))
+            return OperationData(V);
+        } else {
+          if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
+            return OperationData(V);
+          if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
+              !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
+              !L2->isIdenticalTo(cast<Instruction>(RHS)))
+            return OperationData(V);
+        }
+        switch (Pred) {
+        default:
+          return OperationData(V);
+
+        case CmpInst::ICMP_ULT:
+        case CmpInst::ICMP_ULE:
+          return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+
+        case CmpInst::ICMP_SLT:
+        case CmpInst::ICMP_SLE:
+          return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+
+        case CmpInst::FCMP_OLT:
+        case CmpInst::FCMP_OLE:
+        case CmpInst::FCMP_ULT:
+        case CmpInst::FCMP_ULE:
+          return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
+                               cast<Instruction>(Cond)->hasNoNaNs());
+
+        case CmpInst::ICMP_UGT:
+        case CmpInst::ICMP_UGE:
+          return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+
+        case CmpInst::ICMP_SGT:
+        case CmpInst::ICMP_SGE:
+          return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+
+        case CmpInst::FCMP_OGT:
+        case CmpInst::FCMP_OGE:
+        case CmpInst::FCMP_UGT:
+        case CmpInst::FCMP_UGE:
+          return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
+                               cast<Instruction>(Cond)->hasNoNaNs());
+        }
       }
     }
     return OperationData(V);
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
index 6851fb9266a..4007a0d30ed 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
@@ -20,21 +20,19 @@ define i32 @smaxv6() {
 ; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; GFX9-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
 ; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
-; GFX9-NEXT:    [[LOAD3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; GFX9-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[SELECT1]], [[LOAD3]]
-; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], i32 [[SELECT1]], i32 [[LOAD3]]
-; GFX9-NEXT:    [[LOAD4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; GFX9-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[SELECT2]], [[LOAD4]]
-; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], i32 [[SELECT2]], i32 [[LOAD4]]
-; GFX9-NEXT:    [[LOAD5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; GFX9-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[SELECT3]], [[LOAD5]]
-; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], i32 [[SELECT3]], i32 [[LOAD5]]
-; GFX9-NEXT:    [[LOAD6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; GFX9-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[SELECT4]], [[LOAD6]]
-; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], i32 [[SELECT4]], i32 [[LOAD6]]
+; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    [[TMP6:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i32 [[TMP5]], i32 [[SELECT1]]
 ; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
 ; GFX9-NEXT:    store i32 [[STORE_SELECT]], i32* @var, align 8
-; GFX9-NEXT:    ret i32 [[SELECT5]]
+; GFX9-NEXT:    ret i32 [[OP_EXTRA]]
 ;
   %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -69,21 +67,19 @@ define i64 @sminv6() {
 ; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
 ; GFX9-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]]
 ; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
-; GFX9-NEXT:    [[LOAD3:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2), align 16
-; GFX9-NEXT:    [[CMP2:%.*]] = icmp slt i64 [[SELECT1]], [[LOAD3]]
-; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], i64 [[SELECT1]], i64 [[LOAD3]]
-; GFX9-NEXT:    [[LOAD4:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 3), align 8
-; GFX9-NEXT:    [[CMP3:%.*]] = icmp slt i64 [[SELECT2]], [[LOAD4]]
-; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], i64 [[SELECT2]], i64 [[LOAD4]]
-; GFX9-NEXT:    [[LOAD5:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 4), align 16
-; GFX9-NEXT:    [[CMP4:%.*]] = icmp slt i64 [[SELECT3]], [[LOAD5]]
-; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], i64 [[SELECT3]], i64 [[LOAD5]]
-; GFX9-NEXT:    [[LOAD6:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 5), align 8
-; GFX9-NEXT:    [[CMP5:%.*]] = icmp slt i64 [[SELECT4]], [[LOAD6]]
-; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], i64 [[SELECT4]], i64 [[LOAD6]]
+; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i64> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP4]], <4 x i64> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i64> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    [[TMP6:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 [[SELECT1]]
 ; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
 ; GFX9-NEXT:    store i64 [[STORE_SELECT]], i64* @var64, align 8
-; GFX9-NEXT:    ret i64 [[SELECT5]]
+; GFX9-NEXT:    ret i64 [[OP_EXTRA]]
 ;
   %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
   %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
@@ -118,21 +114,19 @@ define float @fmaxv6() {
 ; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
 ; GFX9-NEXT:    [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
 ; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]
-; GFX9-NEXT:    [[LOAD3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
-; GFX9-NEXT:    [[CMP2:%.*]] = fcmp fast ogt float [[SELECT1]], [[LOAD3]]
-; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], float [[SELECT1]], float [[LOAD3]]
-; GFX9-NEXT:    [[LOAD4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
-; GFX9-NEXT:    [[CMP3:%.*]] = fcmp fast ogt float [[SELECT2]], [[LOAD4]]
-; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], float [[SELECT2]], float [[LOAD4]]
-; GFX9-NEXT:    [[LOAD5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
-; GFX9-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[SELECT3]], [[LOAD5]]
-; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], float [[SELECT3]], float [[LOAD5]]
-; GFX9-NEXT:    [[LOAD6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
-; GFX9-NEXT:    [[CMP5:%.*]] = fcmp fast ogt float [[SELECT4]], [[LOAD6]]
-; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], float [[SELECT4]], float [[LOAD6]]
+; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <4 x float>*), align 8
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP4]], <4 x float> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    [[TMP6:%.*]] = fcmp fast ogt float [[TMP5]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP6]], float [[TMP5]], float [[SELECT1]]
 ; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00
 ; GFX9-NEXT:    store float [[STORE_SELECT]], float* @fvar, align 8
-; GFX9-NEXT:    ret float [[SELECT5]]
+; GFX9-NEXT:    ret float [[OP_EXTRA]]
 ;
   %load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16
   %load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4
@@ -167,21 +161,19 @@ define double @dminv6() {
 ; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
 ; GFX9-NEXT:    [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]
 ; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]
-; GFX9-NEXT:    [[LOAD3:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
-; GFX9-NEXT:    [[CMP2:%.*]] = fcmp fast olt double [[SELECT1]], [[LOAD3]]
-; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], double [[SELECT1]], double [[LOAD3]]
-; GFX9-NEXT:    [[LOAD4:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
-; GFX9-NEXT:    [[CMP3:%.*]] = fcmp fast olt double [[SELECT2]], [[LOAD4]]
-; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], double [[SELECT2]], double [[LOAD4]]
-; GFX9-NEXT:    [[LOAD5:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
-; GFX9-NEXT:    [[CMP4:%.*]] = fcmp fast olt double [[SELECT3]], [[LOAD5]]
-; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], double [[SELECT3]], double [[LOAD5]]
-; GFX9-NEXT:    [[LOAD6:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
-; GFX9-NEXT:    [[CMP5:%.*]] = fcmp fast olt double [[SELECT4]], [[LOAD6]]
-; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], double [[SELECT4]], double [[LOAD6]]
+; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <4 x double>*), align 8
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x double> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x double> [[TMP4]], <4 x double> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x double> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    [[TMP6:%.*]] = fcmp fast olt double [[TMP5]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP6]], double [[TMP5]], double [[SELECT1]]
 ; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00
 ; GFX9-NEXT:    store double [[STORE_SELECT]], double* @dvar, align 8
-; GFX9-NEXT:    ret double [[SELECT5]]
+; GFX9-NEXT:    ret double [[OP_EXTRA]]
 ;
   %load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16
   %load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4
@@ -216,21 +208,19 @@ define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
 ; GFX9-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[ELT1]], [[V1:%.*]]
 ; GFX9-NEXT:    [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
 ; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
-; GFX9-NEXT:    [[LOAD3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; GFX9-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[SELECT1]], [[LOAD3]]
-; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], i32 [[SELECT1]], i32 [[LOAD3]]
-; GFX9-NEXT:    [[LOAD4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; GFX9-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[SELECT2]], [[LOAD4]]
-; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], i32 [[SELECT2]], i32 [[LOAD4]]
-; GFX9-NEXT:    [[LOAD5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; GFX9-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[SELECT3]], [[LOAD5]]
-; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], i32 [[SELECT3]], i32 [[LOAD5]]
-; GFX9-NEXT:    [[LOAD6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; GFX9-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[SELECT4]], [[LOAD6]]
-; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], i32 [[SELECT4]], i32 [[LOAD6]]
+; GFX9-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[SELECT1]]
 ; GFX9-NEXT:    [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
 ; GFX9-NEXT:    store i32 [[STOREVAL]], i32* @var, align 8
-; GFX9-NEXT:    ret i32 [[SELECT5]]
+; GFX9-NEXT:    ret i32 [[OP_EXTRA]]
 ;
   %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
   %elt1 = extractelement <2 x i32> %vload, i32 0