-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp           48
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll  156
2 files changed, 152 insertions, 52 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4b396495390..974eff9974d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -618,6 +618,15 @@ public:
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable() const;
+ /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
+ /// can be load combined in the backend. Load combining may not be allowed in
+ /// the IR optimizer, so we do not want to alter the pattern. For example,
+ /// partially transforming a scalar bswap() pattern into vector code is
+ /// effectively impossible for the backend to undo.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
+ bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
+
OptimizationRemarkEmitter *getORE() { return ORE; }
/// This structure holds any data we need about the edges being traversed
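For reference (a minimal sketch, not taken from the patch): the kind of pattern the new comment guards is a scalar or-reduction of zexted, shifted byte loads. A 4-byte illustration is below; the in-tree tests updated further down use the 8-byte form. The backend can fold the whole chain into a single i32 load plus a byte swap, which is why partially vectorizing it would be hard to undo. The function name and 4-byte width are illustrative assumptions only.

; Illustrative only: big-endian-style assembly of 4 loaded bytes.
; The backend can fold this to "bswap(load i32)".
define i32 @be_load32(i8* %p) {
  %g1 = getelementptr inbounds i8, i8* %p, i64 1
  %g2 = getelementptr inbounds i8, i8* %p, i64 2
  %g3 = getelementptr inbounds i8, i8* %p, i64 3
  %t0 = load i8, i8* %p, align 1
  %t1 = load i8, i8* %g1, align 1
  %t2 = load i8, i8* %g2, align 1
  %t3 = load i8, i8* %g3, align 1
  %z0 = zext i8 %t0 to i32
  %z1 = zext i8 %t1 to i32
  %z2 = zext i8 %t2 to i32
  %z3 = zext i8 %t3 to i32
  %s0 = shl nuw i32 %z0, 24
  %s1 = shl nuw nsw i32 %z1, 16
  %s2 = shl nuw nsw i32 %z2, 8
  %o1 = or i32 %s0, %s1
  %o2 = or i32 %o1, %s2
  %o3 = or i32 %o2, %z3
  ret i32 %o3
}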
@@ -3284,6 +3293,43 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
return true;
}
+bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
+ if (RdxOpcode != Instruction::Or)
+ return false;
+
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+
+ // Look past the reduction to find a source value. Arbitrarily follow the
+ // path through operand 0 of any 'or'. Also, peek through optional
+ // shift-left-by-constant.
+ Value *ZextLoad = FirstReduced;
+ while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+ match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+ ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
+
+ // Check if the input to the reduction is an extended load.
+ Value *LoadPtr;
+ if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ return false;
+
+ // Require that the total load bit width is a legal integer type.
+ // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
+ // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
+ Type *SrcTy = LoadPtr->getType()->getPointerElementType();
+ unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
+ LLVMContext &Context = FirstReduced->getContext();
+ if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth)))
+ return false;
+
+ // Everything matched - assume that we can fold the whole sequence using
+ // load combining.
+ LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of "
+ << *(cast<Instruction>(FirstReduced)) << "\n");
+
+ return true;
+}
+
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
@@ -6374,6 +6420,8 @@ public:
}
if (V.isTreeTinyAndNotFullyVectorizable())
break;
+ if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode()))
+ break;
V.computeMinimumValueSizes();
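Note on scope (an illustrative sketch, not from the patch): the new check is called with the reduction opcode and bails out unless it is 'or', so reductions over the same zexted loads with a different opcode are unaffected. For example, an add reduction like the hypothetical function below is still eligible for the normal SLP reduction path.

; Illustrative only: not an 'or' reduction, so isLoadCombineReductionCandidate()
; returns false and SLP may still vectorize this as before.
define i32 @add_reduce(i8* %p) {
  %g1 = getelementptr inbounds i8, i8* %p, i64 1
  %g2 = getelementptr inbounds i8, i8* %p, i64 2
  %g3 = getelementptr inbounds i8, i8* %p, i64 3
  %t0 = load i8, i8* %p, align 1
  %t1 = load i8, i8* %g1, align 1
  %t2 = load i8, i8* %g2, align 1
  %t3 = load i8, i8* %g3, align 1
  %z0 = zext i8 %t0 to i32
  %z1 = zext i8 %t1 to i32
  %z2 = zext i8 %t2 to i32
  %z3 = zext i8 %t3 to i32
  %a1 = add i32 %z0, %z1
  %a2 = add i32 %a1, %z2
  %a3 = add i32 %a2, %z3
  ret i32 %a3
}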
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
index e3452e194db..c44a8524edf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
@@ -15,31 +15,37 @@ define i64 @load_bswap(%v8i8* %p) {
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G0]] to <4 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32>
+; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP5]], [[SH4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[SH5]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], [[SH6]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z7]]
-; CHECK-NEXT: ret i64 [[OP_EXTRA]]
+; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
+; CHECK-NEXT: ret i64 [[OR01234567]]
;
%g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
%g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -97,18 +103,38 @@ define i64 @load_bswap_nop_shift(%v8i8* %p) {
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G0]] to <8 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: ret i64 [[TMP5]]
+; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
+; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
+; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
+; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
+; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
+; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
+; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
+; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
+; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
+; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
+; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
+; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
+; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
+; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
+; CHECK-NEXT: [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
+; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
+; CHECK-NEXT: ret i64 [[OR01234567]]
;
%g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
%g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -168,30 +194,36 @@ define i64 @load64le(i8* %arg) {
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G1]] to <4 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw <4 x i64> [[TMP3]], <i64 8, i64 16, i64 24, i64 32>
+; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP5]], [[S5]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[S6]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], [[S7]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z0]]
-; CHECK-NEXT: ret i64 [[OP_EXTRA]]
+; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[Z0]]
+; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT: ret i64 [[O7]]
;
%g1 = getelementptr inbounds i8, i8* %arg, i64 1
%g2 = getelementptr inbounds i8, i8* %arg, i64 2
@@ -247,18 +279,38 @@ define i64 @load64le_nop_shift(i8* %arg) {
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[ARG]] to <8 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: ret i64 [[TMP5]]
+; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
+; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
+; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
+; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
+; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
+; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
+; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
+; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
+; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
+; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
+; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
+; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
+; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
+; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
+; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[S0]]
+; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT: ret i64 [[O7]]
;
%g1 = getelementptr inbounds i8, i8* %arg, i64 1
%g2 = getelementptr inbounds i8, i8* %arg, i64 2