Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/include/llvm/Target/TargetInstrInfo.h       |   5
-rw-r--r-- | llvm/test/CodeGen/AArch64/atomic-ops.ll          |   7
-rw-r--r-- | llvm/test/CodeGen/ARM/2012-08-30-select.ll       |   4
-rw-r--r-- | llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll |   2
-rw-r--r-- | llvm/test/CodeGen/X86/MachineSink-SubReg.ll      |  37
-rw-r--r-- | llvm/test/CodeGen/X86/clz.ll                     |  18
-rw-r--r-- | llvm/test/CodeGen/X86/half.ll                    |   8
-rw-r--r-- | llvm/test/CodeGen/X86/machine-cse.ll             |   9
-rw-r--r-- | llvm/test/CodeGen/X86/uint64-to-float.ll         |   6
-rw-r--r-- | llvm/test/CodeGen/X86/vec_int_to_fp.ll           | 560
10 files changed, 348 insertions, 308 deletions
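
The functional change is in the TargetInstrInfo.h hunk below: the default shouldSink() hook now lets MachineSink move every instruction, instead of pinning INSERT_SUBREG, SUBREG_TO_REG and REG_SEQUENCE next to their definitions. A minimal sketch of how a backend could keep the old behavior by overriding the hook (XYZInstrInfo is a hypothetical name, not an in-tree target):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;

// Hypothetical backend hook; real targets derive from their TableGen'd
// <Target>GenInstrInfo subclass rather than from TargetInstrInfo directly.
class XYZInstrInfo : public TargetInstrInfo {
public:
  // Restore the pre-patch default: keep INSERT_SUBREG, SUBREG_TO_REG and
  // REG_SEQUENCE close to the source so the coalescer can see them.
  bool shouldSink(const MachineInstr &MI) const override {
    return !MI.isInsertSubreg() && !MI.isSubregToReg() && !MI.isRegSequence();
  }
};
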
diff --git a/llvm/include/llvm/Target/TargetInstrInfo.h b/llvm/include/llvm/Target/TargetInstrInfo.h index 2de98acccbe..af0fc803a75 100644 --- a/llvm/include/llvm/Target/TargetInstrInfo.h +++ b/llvm/include/llvm/Target/TargetInstrInfo.h @@ -270,11 +270,8 @@ public: /// MachineSink determines on its own whether the instruction is safe to sink; /// this gives the target a hook to override the default behavior with regards /// to which instructions should be sunk. - /// The default behavior is to not sink insert_subreg, subreg_to_reg, and - /// reg_sequence. These are meant to be close to the source to make it easier - /// to coalesce. virtual bool shouldSink(const MachineInstr &MI) const { - return !MI.isInsertSubreg() && !MI.isSubregToReg() && !MI.isRegSequence(); + return true; } /// Re-issue the specified 'original' instruction at the diff --git a/llvm/test/CodeGen/AArch64/atomic-ops.ll b/llvm/test/CodeGen/AArch64/atomic-ops.ll index 9fac8d8a868..b763e065200 100644 --- a/llvm/test/CodeGen/AArch64/atomic-ops.ll +++ b/llvm/test/CodeGen/AArch64/atomic-ops.ll @@ -452,20 +452,19 @@ define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind { define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind { ; CHECK-LABEL: test_atomic_load_xchg_i32: +; CHECK: mov {{[xw]}}8, w[[OLD:[0-9]+]] %old = atomicrmw xchg i32* @var32, i32 %offset release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 ; CHECK: .LBB{{[0-9]+}}_1: -; ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] +; ; CHECK: ldxr {{[xw]}}[[OLD]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w0, [x[[ADDR]]] +; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w8, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb - -; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]] ret i32 %old } diff --git a/llvm/test/CodeGen/ARM/2012-08-30-select.ll b/llvm/test/CodeGen/ARM/2012-08-30-select.ll index e78bbdea01f..dbedad2637b 100644 --- a/llvm/test/CodeGen/ARM/2012-08-30-select.ll +++ b/llvm/test/CodeGen/ARM/2012-08-30-select.ll @@ -2,8 +2,10 @@ ; rdar://12201387 ;CHECK-LABEL: select_s_v_v: -;CHECK: it ne +;CHECK: itee ne ;CHECK-NEXT: vmovne.i32 +;CHECK-NEXT: vmoveq +;CHECK-NEXT: vmoveq ;CHECK: bx define <16 x i8> @select_s_v_v(<16 x i8> %vec, i32 %avail) { entry: diff --git a/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll b/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll index c687b6905b7..151f0ffc751 100644 --- a/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll +++ b/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 | grep mov | count 2 +; RUN: llc < %s -march=x86-64 | grep mov | count 1 ; rdar://6806252 define i64 @test(i32* %tmp13) nounwind { diff --git a/llvm/test/CodeGen/X86/MachineSink-SubReg.ll b/llvm/test/CodeGen/X86/MachineSink-SubReg.ll new file mode 100644 index 00000000000..f78d4d9b527 --- /dev/null +++ b/llvm/test/CodeGen/X86/MachineSink-SubReg.ll @@ -0,0 +1,37 @@ +; PR28852: Check machine code sinking is not stopped by SUBREG_TO_REG. 
+; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: foo +; CHECK-NOT: imull +; CHECK: retq +; CHECK: imull + +define void @foo(i64 %value, i32 %kLengthBits, i32* nocapture %bits, i64* nocapture %bit_buffer_64, i32 %x) local_unnamed_addr { +entry: + %mul = mul i32 %x, %kLengthBits + %add = add i32 %mul, 3 + %conv = zext i32 %add to i64 + %mul2 = mul nuw nsw i64 %conv, 5 + %sub = sub i64 64, %value + %conv4 = trunc i64 %sub to i32 + %tmp0 = load i32, i32* %bits, align 4 + %cmp = icmp ult i32 %tmp0, %conv4 + br i1 %cmp, label %if.then, label %if.end, !prof !0 + +if.then: ; preds = %entry + %add7 = add i64 %mul2, %value + %tmp1 = load i64, i64* %bit_buffer_64, align 8 + %add8 = add i64 %add7, %tmp1 + store i64 %add8, i64* %bit_buffer_64, align 8 + %conv9 = trunc i64 %mul2 to i32 + store i32 %conv9, i32* %bits, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 2000} diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll index 685b2588bf5..40bbac220ec 100644 --- a/llvm/test/CodeGen/X86/clz.ll +++ b/llvm/test/CodeGen/X86/clz.ll @@ -427,13 +427,14 @@ define i64 @ctlz_i64_zero_test(i64 %n) { ; ; X64-LABEL: ctlz_i64_zero_test: ; X64: # BB#0: -; X64-NEXT: movl $64, %eax ; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB11_2 -; X64-NEXT: # BB#1: # %cond.false +; X64-NEXT: je .LBB11_1 +; X64-NEXT: # BB#2: # %cond.false ; X64-NEXT: bsrq %rdi, %rax ; X64-NEXT: xorq $63, %rax -; X64-NEXT: .LBB11_2: # %cond.end +; X64-NEXT: retq +; X64-NEXT: .LBB11_1: +; X64-NEXT: movl $64, %eax ; X64-NEXT: retq ; ; X32-CLZ-LABEL: ctlz_i64_zero_test: @@ -601,12 +602,13 @@ define i64 @cttz_i64_zero_test(i64 %n) { ; ; X64-LABEL: cttz_i64_zero_test: ; X64: # BB#0: -; X64-NEXT: movl $64, %eax ; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB15_2 -; X64-NEXT: # BB#1: # %cond.false +; X64-NEXT: je .LBB15_1 +; X64-NEXT: # BB#2: # %cond.false ; X64-NEXT: bsfq %rdi, %rax -; X64-NEXT: .LBB15_2: # %cond.end +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: +; X64-NEXT: movl $64, %eax ; X64-NEXT: retq ; ; X32-CLZ-LABEL: cttz_i64_zero_test: diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 739bb146e3a..4c8003f0c51 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -157,8 +157,6 @@ define void @test_uitofp_i64(i64 %a, half* %p) #0 { ; CHECK-LABEL: test_uitofp_i64: ; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z0-9]+]] ; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]] -; CHECK-NEXT: movl %edi, [[REG0:%[a-z0-9]+]] -; CHECK-NEXT: andl $1, [[REG0]] ; CHECK-NEXT: testq %rdi, %rdi ; CHECK-NEXT: js [[LABEL1:.LBB[0-9_]+]] @@ -169,8 +167,10 @@ define void @test_uitofp_i64(i64 %a, half* %p) #0 { ; convert using shift+or if negative ; CHECK-NEXT: [[LABEL1]]: -; CHECK-NEXT: shrq %rdi -; CHECK-NEXT: orq %rdi, [[REG2:%[a-z0-9]+]] +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: orq %rax, [[REG2:%[a-z0-9]+]] ; CHECK-LIBCALL-NEXT: cvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]] ; CHECK-LIBCALL-NEXT: addss [[REG3]], [[REG1]] ; CHECK-F16C-NEXT: vcvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]], [[REG3]] diff --git a/llvm/test/CodeGen/X86/machine-cse.ll b/llvm/test/CodeGen/X86/machine-cse.ll index c6876d29dfc..9853a7cde67 100644 --- a/llvm/test/CodeGen/X86/machine-cse.ll +++ b/llvm/test/CodeGen/X86/machine-cse.ll @@ -53,15 +53,15 @@ entry: sw.bb: ; preds = %entry, %entry, %entry ; CHECK: %sw.bb -; 
CHECK: imull +; CHECK-NOT: imull %mul = mul nsw i32 %test_case, 3 %mul20 = mul nsw i32 %mul, %scale br i1 undef, label %if.end34, label %sw.bb307 if.end34: ; preds = %sw.bb ; CHECK: %if.end34 +; CHECK: imull ; CHECK: leal -; CHECK-NOT: imull tail call void (...) @printf(i32 %test_case, i32 %mul20) nounwind %tmp = mul i32 %scale, %test_case %tmp752 = mul i32 %tmp, 3 @@ -104,12 +104,13 @@ return: ; preds = %if.end, %entry ; rdar://11393714 define i8* @bsd_memchr(i8* %s, i32 %a, i32 %c, i64 %n) nounwind ssp { ; CHECK: %entry -; CHECK: xorl +; CHECK-NOT: xorl ; CHECK: %preheader +; CHECK-NOT: xorl ; CHECK: %do.body ; CHECK-NOT: xorl ; CHECK: %do.cond -; CHECK-NOT: xorl +; CHECK: xorl ; CHECK: %return entry: %cmp = icmp eq i64 %n, 0 diff --git a/llvm/test/CodeGen/X86/uint64-to-float.ll b/llvm/test/CodeGen/X86/uint64-to-float.ll index a1074a6d698..8a8d29bfda5 100644 --- a/llvm/test/CodeGen/X86/uint64-to-float.ll +++ b/llvm/test/CodeGen/X86/uint64-to-float.ll @@ -6,13 +6,15 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-darwin10.0.0" -; CHECK: andl +; CHECK: %entry ; CHECK-NEXT: testq %rdi, %rdi ; CHECK-NEXT: js LBB0_1 ; CHECK: cvtsi2ss ; CHECK-NEXT: ret ; CHECK: LBB0_1 -; CHECK: shrq +; CHECK: movq +; CHECK-NEXT: shrq +; CHECK-NEXT: andl ; CHECK-NEXT: orq ; CHECK-NEXT: cvtsi2ss define float @test(i64 %a) { diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 91657c317a2..68d55cb871e 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -1325,8 +1325,6 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; SSE: # BB#0: ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB38_1 ; SSE-NEXT: # BB#2: @@ -1334,16 +1332,16 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: jmp .LBB38_3 ; SSE-NEXT: .LBB38_1: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB38_3: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE-NEXT: movd %xmm1, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB38_4 ; SSE-NEXT: # BB#5: @@ -1352,10 +1350,12 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: retq ; SSE-NEXT: .LBB38_4: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 +; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: addss %xmm1, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: retq @@ -1363,31 +1363,31 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; VEX-LABEL: uitofp_2i64_to_4f32: ; VEX: # BB#0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: movl %eax, %ecx -; VEX-NEXT: andl $1, %ecx ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB38_1 ; VEX-NEXT: # BB#2: ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 
; VEX-NEXT: jmp .LBB38_3 ; VEX-NEXT: .LBB38_1: -; VEX-NEXT: shrq %rax -; VEX-NEXT: orq %rax, %rcx -; VEX-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; VEX-NEXT: .LBB38_3: ; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: movl %eax, %ecx -; VEX-NEXT: andl $1, %ecx ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB38_4 ; VEX-NEXT: # BB#5: ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: jmp .LBB38_6 ; VEX-NEXT: .LBB38_4: -; VEX-NEXT: shrq %rax -; VEX-NEXT: orq %rax, %rcx -; VEX-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0 +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB38_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1429,8 +1429,6 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: .LBB39_2: ; SSE-NEXT: movd %xmm1, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB39_3 ; SSE-NEXT: # BB#4: @@ -1438,17 +1436,17 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: jmp .LBB39_5 ; SSE-NEXT: .LBB39_3: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB39_5: ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE-NEXT: movd %xmm1, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB39_6 ; SSE-NEXT: # BB#7: @@ -1456,10 +1454,12 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: jmp .LBB39_8 ; SSE-NEXT: .LBB39_6: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 +; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: addss %xmm1, %xmm1 ; SSE-NEXT: .LBB39_8: ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -1469,31 +1469,31 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; VEX-LABEL: uitofp_4i64_to_4f32_undef: ; VEX: # BB#0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: movl %eax, %ecx -; VEX-NEXT: andl $1, %ecx ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_1 ; VEX-NEXT: # BB#2: ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: jmp .LBB39_3 ; VEX-NEXT: .LBB39_1: -; VEX-NEXT: shrq %rax -; VEX-NEXT: orq %rax, %rcx -; VEX-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; VEX-NEXT: .LBB39_3: ; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: movl %eax, %ecx -; VEX-NEXT: andl $1, %ecx ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_4 ; VEX-NEXT: # BB#5: ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: jmp .LBB39_6 ; VEX-NEXT: .LBB39_4: -; VEX-NEXT: shrq %rax -; VEX-NEXT: 
orq %rax, %rcx -; VEX-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0 +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB39_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1694,37 +1694,35 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE-LABEL: uitofp_4i64_to_4f32: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm1, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB45_1 ; SSE-NEXT: # BB#2: ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: jmp .LBB45_3 ; SSE-NEXT: .LBB45_1: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx -; SSE-NEXT: cvtsi2ssq %rcx, %xmm3 +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: addss %xmm3, %xmm3 ; SSE-NEXT: .LBB45_3: ; SSE-NEXT: movd %xmm0, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB45_4 ; SSE-NEXT: # BB#5: ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: jmp .LBB45_6 ; SSE-NEXT: .LBB45_4: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx -; SSE-NEXT: cvtsi2ssq %rcx, %xmm2 +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: addss %xmm2, %xmm2 ; SSE-NEXT: .LBB45_6: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE-NEXT: movd %xmm1, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB45_7 ; SSE-NEXT: # BB#8: @@ -1732,17 +1730,17 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: jmp .LBB45_9 ; SSE-NEXT: .LBB45_7: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 +; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: addss %xmm1, %xmm1 ; SSE-NEXT: .LBB45_9: ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE-NEXT: movd %xmm0, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB45_10 ; SSE-NEXT: # BB#11: @@ -1750,10 +1748,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: jmp .LBB45_12 ; SSE-NEXT: .LBB45_10: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB45_12: ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1764,53 +1764,51 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX1-LABEL: uitofp_4i64_to_4f32: ; AVX1: # BB#0: ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB45_1 ; AVX1-NEXT: # BB#2: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: jmp .LBB45_3 ; AVX1-NEXT: .LBB45_1: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; 
AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB45_3: ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB45_4 ; AVX1-NEXT: # BB#5: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB45_6 ; AVX1-NEXT: .LBB45_4: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB45_6: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB45_7 ; AVX1-NEXT: # BB#8: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: jmp .LBB45_9 ; AVX1-NEXT: .LBB45_7: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB45_9: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB45_10 ; AVX1-NEXT: # BB#11: @@ -1819,9 +1817,11 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; AVX1-NEXT: .LBB45_10: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper @@ -1830,53 +1830,51 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX2-LABEL: uitofp_4i64_to_4f32: ; AVX2: # BB#0: ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB45_1 ; AVX2-NEXT: # BB#2: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: jmp .LBB45_3 ; AVX2-NEXT: .LBB45_1: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB45_3: ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB45_4 ; AVX2-NEXT: # BB#5: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB45_6 ; AVX2-NEXT: .LBB45_4: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB45_6: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, 
%xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB45_7 ; AVX2-NEXT: # BB#8: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: jmp .LBB45_9 ; AVX2-NEXT: .LBB45_7: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB45_9: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB45_10 ; AVX2-NEXT: # BB#11: @@ -1885,9 +1883,11 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; AVX2-NEXT: .LBB45_10: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper @@ -3083,37 +3083,35 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movd %xmm3, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB74_1 ; SSE-NEXT: # BB#2: ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: jmp .LBB74_3 ; SSE-NEXT: .LBB74_1: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx -; SSE-NEXT: cvtsi2ssq %rcx, %xmm2 +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: addss %xmm2, %xmm2 ; SSE-NEXT: .LBB74_3: ; SSE-NEXT: movd %xmm1, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB74_4 ; SSE-NEXT: # BB#5: ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: jmp .LBB74_6 ; SSE-NEXT: .LBB74_4: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx -; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB74_6: ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE-NEXT: movd %xmm3, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB74_7 ; SSE-NEXT: # BB#8: @@ -3121,17 +3119,17 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: jmp .LBB74_9 ; SSE-NEXT: .LBB74_7: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm3, %xmm3 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm3 +; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: addss %xmm3, %xmm3 ; SSE-NEXT: .LBB74_9: ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE-NEXT: movd %xmm1, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB74_10 ; SSE-NEXT: # BB#11: @@ -3139,10 +3137,12 @@ define <4 x float> 
@uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: jmp .LBB74_12 ; SSE-NEXT: .LBB74_10: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 +; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: addss %xmm1, %xmm1 ; SSE-NEXT: .LBB74_12: ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] @@ -3153,53 +3153,51 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; AVX1: # BB#0: ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB74_1 ; AVX1-NEXT: # BB#2: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: jmp .LBB74_3 ; AVX1-NEXT: .LBB74_1: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB74_3: ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB74_4 ; AVX1-NEXT: # BB#5: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB74_6 ; AVX1-NEXT: .LBB74_4: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB74_6: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB74_7 ; AVX1-NEXT: # BB#8: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: jmp .LBB74_9 ; AVX1-NEXT: .LBB74_7: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB74_9: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB74_10 ; AVX1-NEXT: # BB#11: @@ -3208,9 +3206,11 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; AVX1-NEXT: .LBB74_10: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper @@ -3220,53 +3220,51 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; AVX2: # BB#0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB74_1 ; AVX2-NEXT: # BB#2: ; 
AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: jmp .LBB74_3 ; AVX2-NEXT: .LBB74_1: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB74_3: ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB74_4 ; AVX2-NEXT: # BB#5: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB74_6 ; AVX2-NEXT: .LBB74_4: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB74_6: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB74_7 ; AVX2-NEXT: # BB#8: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: jmp .LBB74_9 ; AVX2-NEXT: .LBB74_7: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB74_9: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB74_10 ; AVX2-NEXT: # BB#11: @@ -3275,9 +3273,11 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; AVX2-NEXT: .LBB74_10: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper @@ -3408,52 +3408,50 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movd %xmm5, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB78_1 ; SSE-NEXT: # BB#2: ; SSE-NEXT: cvtsi2ssq %rax, %xmm4 ; SSE-NEXT: jmp .LBB78_3 ; SSE-NEXT: .LBB78_1: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx -; SSE-NEXT: cvtsi2ssq %rcx, %xmm4 +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: cvtsi2ssq %rax, %xmm4 ; SSE-NEXT: addss %xmm4, %xmm4 ; SSE-NEXT: .LBB78_3: ; SSE-NEXT: movd %xmm1, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB78_4 ; SSE-NEXT: # BB#5: ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: jmp .LBB78_6 ; SSE-NEXT: .LBB78_4: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx -; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: cvtsi2ssq 
%rax, %xmm0 ; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB78_6: ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] ; SSE-NEXT: movd %xmm5, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB78_7 ; SSE-NEXT: # BB#8: ; SSE-NEXT: cvtsi2ssq %rax, %xmm6 ; SSE-NEXT: jmp .LBB78_9 ; SSE-NEXT: .LBB78_7: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx -; SSE-NEXT: cvtsi2ssq %rcx, %xmm6 +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: cvtsi2ssq %rax, %xmm6 ; SSE-NEXT: addss %xmm6, %xmm6 ; SSE-NEXT: .LBB78_9: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE-NEXT: movd %xmm1, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB78_10 ; SSE-NEXT: # BB#11: @@ -3461,29 +3459,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm5 ; SSE-NEXT: jmp .LBB78_12 ; SSE-NEXT: .LBB78_10: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm5, %xmm5 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm5 +; SSE-NEXT: cvtsi2ssq %rax, %xmm5 ; SSE-NEXT: addss %xmm5, %xmm5 ; SSE-NEXT: .LBB78_12: ; SSE-NEXT: movd %xmm3, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB78_13 ; SSE-NEXT: # BB#14: ; SSE-NEXT: cvtsi2ssq %rax, %xmm7 ; SSE-NEXT: jmp .LBB78_15 ; SSE-NEXT: .LBB78_13: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx -; SSE-NEXT: cvtsi2ssq %rcx, %xmm7 +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: cvtsi2ssq %rax, %xmm7 ; SSE-NEXT: addss %xmm7, %xmm7 ; SSE-NEXT: .LBB78_15: ; SSE-NEXT: movd %xmm2, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB78_16 ; SSE-NEXT: # BB#17: @@ -3491,18 +3489,18 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: jmp .LBB78_18 ; SSE-NEXT: .LBB78_16: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 +; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: addss %xmm1, %xmm1 ; SSE-NEXT: .LBB78_18: ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE-NEXT: movd %xmm3, %rax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB78_19 ; SSE-NEXT: # BB#20: @@ -3510,18 +3508,18 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: jmp .LBB78_21 ; SSE-NEXT: .LBB78_19: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm3, %xmm3 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm3 +; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: addss %xmm3, %xmm3 ; SSE-NEXT: .LBB78_21: ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE-NEXT: movd %xmm2, %rax -; SSE-NEXT: movl 
%eax, %ecx -; SSE-NEXT: andl $1, %ecx ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB78_22 ; SSE-NEXT: # BB#23: @@ -3529,10 +3527,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: jmp .LBB78_24 ; SSE-NEXT: .LBB78_22: -; SSE-NEXT: shrq %rax -; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax ; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: cvtsi2ssq %rcx, %xmm2 +; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: addss %xmm2, %xmm2 ; SSE-NEXT: .LBB78_24: ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -3544,122 +3544,122 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_1 ; AVX1-NEXT: # BB#2: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: jmp .LBB78_3 ; AVX1-NEXT: .LBB78_1: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB78_3: ; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_4 ; AVX1-NEXT: # BB#5: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX1-NEXT: jmp .LBB78_6 ; AVX1-NEXT: .LBB78_4: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB78_6: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_7 ; AVX1-NEXT: # BB#8: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX1-NEXT: jmp .LBB78_9 ; AVX1-NEXT: .LBB78_7: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: .LBB78_9: ; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_10 ; AVX1-NEXT: # BB#11: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 ; AVX1-NEXT: jmp .LBB78_12 ; AVX1-NEXT: .LBB78_10: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB78_12: ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_13 ; AVX1-NEXT: # BB#14: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX1-NEXT: jmp .LBB78_15 ; AVX1-NEXT: .LBB78_13: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5 +; AVX1-NEXT: movq 
%rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: .LBB78_15: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_16 ; AVX1-NEXT: # BB#17: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 ; AVX1-NEXT: jmp .LBB78_18 ; AVX1-NEXT: .LBB78_16: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB78_18: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vmovq %xmm4, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_19 ; AVX1-NEXT: # BB#20: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 ; AVX1-NEXT: jmp .LBB78_21 ; AVX1-NEXT: .LBB78_19: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 ; AVX1-NEXT: .LBB78_21: ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] ; AVX1-NEXT: vpextrq $1, %xmm4, %rax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_22 ; AVX1-NEXT: # BB#23: ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 ; AVX1-NEXT: jmp .LBB78_24 ; AVX1-NEXT: .LBB78_22: -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB78_24: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] @@ -3671,122 +3671,122 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_1 ; AVX2-NEXT: # BB#2: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: jmp .LBB78_3 ; AVX2-NEXT: .LBB78_1: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB78_3: ; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_4 ; AVX2-NEXT: # BB#5: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX2-NEXT: jmp .LBB78_6 ; AVX2-NEXT: .LBB78_4: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq 
%rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB78_6: ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_7 ; AVX2-NEXT: # BB#8: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX2-NEXT: jmp .LBB78_9 ; AVX2-NEXT: .LBB78_7: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: .LBB78_9: ; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_10 ; AVX2-NEXT: # BB#11: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 ; AVX2-NEXT: jmp .LBB78_12 ; AVX2-NEXT: .LBB78_10: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB78_12: ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_13 ; AVX2-NEXT: # BB#14: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX2-NEXT: jmp .LBB78_15 ; AVX2-NEXT: .LBB78_13: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: .LBB78_15: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_16 ; AVX2-NEXT: # BB#17: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 ; AVX2-NEXT: jmp .LBB78_18 ; AVX2-NEXT: .LBB78_16: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB78_18: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vmovq %xmm4, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_19 ; AVX2-NEXT: # BB#20: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 ; AVX2-NEXT: jmp .LBB78_21 ; AVX2-NEXT: .LBB78_19: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5 ; AVX2-NEXT: .LBB78_21: ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] ; AVX2-NEXT: vpextrq $1, %xmm4, %rax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: 
js .LBB78_22 ; AVX2-NEXT: # BB#23: ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 ; AVX2-NEXT: jmp .LBB78_24 ; AVX2-NEXT: .LBB78_22: -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB78_24: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] |
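
Most of the X86 test churn above (half.ll, uint64-to-float.ll, vec_int_to_fp.ll) is the same uitofp i64-to-float expansion re-scheduled after sinking: when the top bit is set, the value is halved with a rounding bit kept, converted as signed, and the result doubled. A C++ sketch of that lowering for reference (illustrative; the function name is not LLVM's):

#include <cstdint>

// Scalar equivalent of the shrq / andl $1 / orq / cvtsi2ssq / addss
// sequence the CHECK lines above expect.
float UIntToFloat(uint64_t X) {
  if ((int64_t)X >= 0)
    return (float)(int64_t)X;         // Sign bit clear: plain signed convert.
  uint64_t Half = (X >> 1) | (X & 1); // Halve, keep the low bit for rounding.
  float F = (float)(int64_t)Half;     // Convert the halved value as signed.
  return F + F;                       // Double it back.
}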