| author | Sanjay Patel <spatel@rotateright.com> | 2019-10-05 18:03:58 +0000 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2019-10-05 18:03:58 +0000 |
| commit | e2321bb4488a81b87742f3343e3bdf8e161aa35b (patch) | |
| tree | 48e6260a743b8adf2a2866d6250955e09c2ce8a6 /llvm/test/Transforms/SLPVectorizer | |
| parent | 9ecacb0d54fb89dc7e6da66d9ecae934ca5c01d4 (diff) | |
| download | bcm5719-llvm-e2321bb4488a81b87742f3343e3bdf8e161aa35b.tar.gz bcm5719-llvm-e2321bb4488a81b87742f3343e3bdf8e161aa35b.zip | |
[SLP] avoid reduction transform on patterns that the backend can load-combine
I don't see an ideal solution to these 2 related, potentially large, perf regressions:
https://bugs.llvm.org/show_bug.cgi?id=42708
https://bugs.llvm.org/show_bug.cgi?id=43146
We decided that load combining was unsuitable for IR because it could obscure other
optimizations in IR. So we removed the LoadCombiner pass and deferred to the backend.
Therefore, preventing SLP from destroying load-combine opportunities requires that it
recognize patterns that could be combined later, without doing the optimization itself
(it's not a vector combine anyway, so it's probably out of scope for SLP).
Here, we add a scalar cost model adjustment with a conservative pattern match and cost
summation for a multi-instruction sequence that can probably be reduced later.
This should prevent SLP from creating a vector reduction unless that sequence is
extremely cheap.
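For illustration only (not part of this patch), the multi-instruction sequence in question is the classic manual little-endian load written below as hypothetical C; the function name load_u64_le is made up, but the load/zext/shl/or chain mirrors the load64le test in the diff:
#include <stdint.h>
/* Hypothetical sketch: eight byte loads, each zero-extended, shifted into
   position, and or'ed together. The x86 backend's load combining can turn
   this whole chain into a single 64-bit load, so SLP should leave it alone. */
uint64_t load_u64_le(const uint8_t *p) {
    return  (uint64_t)p[0]
          | (uint64_t)p[1] << 8
          | (uint64_t)p[2] << 16
          | (uint64_t)p[3] << 24
          | (uint64_t)p[4] << 32
          | (uint64_t)p[5] << 40
          | (uint64_t)p[6] << 48
          | (uint64_t)p[7] << 56;
}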
In the x86 tests shown (and discussed in more detail in the bug reports), SDAG combining
will produce a single instruction such as:
movbe rax, qword ptr [rdi]
or:
mov rax, qword ptr [rdi]
Not the (half-)vector monstrosity that we currently produce using SLP:
vpmovzxbq ymm0, dword ptr [rdi + 1] # ymm0 = mem[0],zero,zero,..
vpsllvq ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
movzx eax, byte ptr [rdi]
movzx ecx, byte ptr [rdi + 5]
shl rcx, 40
movzx edx, byte ptr [rdi + 6]
shl rdx, 48
or rdx, rcx
movzx ecx, byte ptr [rdi + 7]
shl rcx, 56
or rcx, rdx
or rcx, rax
vextracti128 xmm1, ymm0, 1
vpor xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1]
vpor xmm0, xmm0, xmm1
vmovq rax, xmm0
or rax, rcx
vzeroupper
ret
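As a companion sketch (again hypothetical C, mirroring the load_bswap test in the diff below), the byte-reversed variant of the same idiom is the pattern that SDAG reduces to the single movbe shown above:
#include <stdint.h>
/* Hypothetical big-endian counterpart: the same load/zext/shl/or chain with
   reversed shift amounts. SDAG combining can fold it into one byte-swapping
   load (movbe on x86 targets that support it). */
uint64_t load_u64_be(const uint8_t *p) {
    return  (uint64_t)p[0] << 56
          | (uint64_t)p[1] << 48
          | (uint64_t)p[2] << 40
          | (uint64_t)p[3] << 32
          | (uint64_t)p[4] << 24
          | (uint64_t)p[5] << 16
          | (uint64_t)p[6] << 8
          | (uint64_t)p[7];
}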
Differential Revision: https://reviews.llvm.org/D67841
llvm-svn: 373833
Diffstat (limited to 'llvm/test/Transforms/SLPVectorizer')
| -rw-r--r-- | llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll | 156 |
1 file changed, 104 insertions, 52 deletions
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
index e3452e194db..c44a8524edf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
@@ -15,31 +15,37 @@ define i64 @load_bswap(%v8i8* %p) {
 ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G0]] to <4 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
 ; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
 ; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
 ; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
 ; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
 ; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
 ; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
 ; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
 ; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32>
+; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
 ; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
 ; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
 ; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP5]], [[SH4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[SH5]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], [[SH6]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z7]]
-; CHECK-NEXT: ret i64 [[OP_EXTRA]]
+; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
+; CHECK-NEXT: ret i64 [[OR01234567]]
 ;
 %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
 %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -97,18 +103,38 @@ define i64 @load_bswap_nop_shift(%v8i8* %p) {
 ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G0]] to <8 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: ret i64 [[TMP5]]
+; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
+; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
+; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
+; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
+; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
+; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
+; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
+; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
+; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
+; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
+; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
+; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
+; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
+; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
+; CHECK-NEXT: [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
+; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
+; CHECK-NEXT: ret i64 [[OR01234567]]
 ;
 %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
 %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -168,30 +194,36 @@ define i64 @load64le(i8* %arg) {
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
 ; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G1]] to <4 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
 ; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
 ; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
 ; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
 ; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
 ; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
 ; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
 ; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw <4 x i64> [[TMP3]], <i64 8, i64 16, i64 24, i64 32>
+; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
 ; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
 ; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
 ; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP5]], [[S5]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[S6]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], [[S7]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z0]]
-; CHECK-NEXT: ret i64 [[OP_EXTRA]]
+; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[Z0]]
+; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT: ret i64 [[O7]]
 ;
 %g1 = getelementptr inbounds i8, i8* %arg, i64 1
 %g2 = getelementptr inbounds i8, i8* %arg, i64 2
@@ -247,18 +279,38 @@ define i64 @load64le_nop_shift(i8* %arg) {
 ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[ARG]] to <8 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: ret i64 [[TMP5]]
+; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
+; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
+; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
+; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
+; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
+; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
+; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
+; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
+; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
+; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
+; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
+; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
+; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
+; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
+; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[S0]]
+; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT: ret i64 [[O7]]
 ;
 %g1 = getelementptr inbounds i8, i8* %arg, i64 1
 %g2 = getelementptr inbounds i8, i8* %arg, i64 2

