| author    | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-12-01 12:08:55 +0000 |
|-----------|----------------------------------------|---------------------------|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-12-01 12:08:55 +0000 |
| commit    | e017ed32450573c16e981b91e16a883299fee59a | |
| tree      | afaa1f7a4e237f62145dd9384a5d029ac10ce9ca /llvm/test/CodeGen | |
| parent    | 0c5d6ccbfc089b3335a78d910febff44c3dd622c | |
| download  | bcm5719-llvm-e017ed32450573c16e981b91e16a883299fee59a.tar.gz, bcm5719-llvm-e017ed32450573c16e981b91e16a883299fee59a.zip | |
[SelectionDAG] Improve SimplifyDemandedBits to SimplifyDemandedVectorElts simplification
D52935 introduced the ability for SimplifyDemandedBits to call SimplifyDemandedVectorElts through BITCASTs, but only when the demanded bit mask entirely covered each sub-element.
This patch relaxes that restriction: an element is now demanded whenever any of its bits are demanded, so the vector-element simplification can fire on partially demanded elements as well.
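
To make the rule change concrete, here is a minimal standalone sketch of the old and new mappings from a demanded-bits mask to demanded vector elements. This is not the actual LLVM code (which works on `APInt` masks inside `TargetLowering::SimplifyDemandedBits`); the helper names, the plain 64-bit masks, and the exact bail-out shape of the old rule are assumptions for illustration only.

```cpp
// Illustrative sketch only -- not LLVM's APInt-based implementation.
// Assumes the bitcast source has NumElts sub-elements of EltBits bits each,
// with NumElts * EltBits <= 64 and EltBits < 64 (little-endian layout).
#include <cassert>
#include <cstdint>
#include <optional>

// Old rule (D52935, as described above): the mapping is only usable when the
// demanded bits cover each touched sub-element entirely; a partially covered
// element makes the whole simplification give up (nullopt).
std::optional<uint64_t> demandedEltsOld(uint64_t DemandedBits, unsigned NumElts,
                                        unsigned EltBits) {
  uint64_t DemandedElts = 0;
  for (unsigned i = 0; i != NumElts; ++i) {
    uint64_t SubMask = ((uint64_t{1} << EltBits) - 1) << (i * EltBits);
    uint64_t Overlap = DemandedBits & SubMask;
    if (Overlap == SubMask)
      DemandedElts |= uint64_t{1} << i;  // fully covered -> demanded
    else if (Overlap != 0)
      return std::nullopt;               // partially covered -> bail out
  }
  return DemandedElts;
}

// New rule (this patch): demand an element as soon as any of its bits are
// demanded, so SimplifyDemandedVectorElts can always be attempted.
uint64_t demandedEltsNew(uint64_t DemandedBits, unsigned NumElts,
                         unsigned EltBits) {
  uint64_t DemandedElts = 0;
  for (unsigned i = 0; i != NumElts; ++i) {
    uint64_t SubMask = ((uint64_t{1} << EltBits) - 1) << (i * EltBits);
    if (DemandedBits & SubMask)
      DemandedElts |= uint64_t{1} << i;  // any overlap -> demanded
  }
  return DemandedElts;
}

int main() {
  // A <4 x i16> viewed as i64, demanding bits 0-15 plus bit 40 only.
  uint64_t Bits = 0xFFFFu | (uint64_t{1} << 40);
  assert(!demandedEltsOld(Bits, 4, 16));           // element 2 only partial -> bail
  assert(demandedEltsNew(Bits, 4, 16) == 0b0101);  // elements 0 and 2 demanded
  return 0;
}
```

Demanding a superset of the touched elements is always safe, which is why the relaxed mapping can fire in cases like the pr35918.ll test below, where only the low bits of a bitcast vector survive an `and i32 %16, 65535`.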
Differential Revision: https://reviews.llvm.org/D54761
llvm-svn: 348073
Diffstat (limited to 'llvm/test/CodeGen')
| Mode | Path | Lines changed |
|------|------|---------------|
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/idot8.ll | 402 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 15 |
| -rw-r--r-- | llvm/test/CodeGen/X86/combine-sdiv.ll | 230 |
| -rw-r--r-- | llvm/test/CodeGen/X86/pr35918.ll | 200 |
| -rw-r--r-- | llvm/test/CodeGen/X86/rotate-extract-vector.ll | 2 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-rotate-128.ll | 18 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll | 4 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 18 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll | 36 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll | 79 |
10 files changed, 472 insertions, 532 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/idot8.ll b/llvm/test/CodeGen/AMDGPU/idot8.ll index e0cd2ad506b..edb88f1912e 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8.ll @@ -3656,67 +3656,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s0, 0 -; GFX7-NEXT: s_mov_b32 s12, s0 -; GFX7-NEXT: s_mov_b32 s14, s0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s38, s[4:5], 0x0 -; GFX7-NEXT: s_mov_b32 s16, s0 -; GFX7-NEXT: s_mov_b32 s18, s0 +; GFX7-NEXT: s_load_dword s9, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s13, s1, 4 -; GFX7-NEXT: s_lshl_b32 s15, s1, 12 -; GFX7-NEXT: s_lshl_b32 s17, s1, 16 -; GFX7-NEXT: s_lshl_b32 s19, s1, 20 -; GFX7-NEXT: s_ashr_i64 s[10:11], s[12:13], 60 +; GFX7-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 +; GFX7-NEXT: s_lshl_b32 s11, s1, 4 +; GFX7-NEXT: s_ashr_i64 s[14:15], s[10:11], 60 +; GFX7-NEXT: s_lshl_b32 s11, s1, 12 +; GFX7-NEXT: s_ashr_i64 s[16:17], s[10:11], 60 +; GFX7-NEXT: s_lshl_b32 s11, s1, 16 +; GFX7-NEXT: s_ashr_i64 s[18:19], s[10:11], 60 +; GFX7-NEXT: s_lshl_b32 s11, s1, 20 ; GFX7-NEXT: s_lshl_b32 s13, s1, 8 -; GFX7-NEXT: s_ashr_i64 s[8:9], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s21, s1, 24 +; GFX7-NEXT: s_ashr_i64 s[20:21], s[10:11], 60 +; GFX7-NEXT: s_lshl_b32 s11, s1, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 28 -; GFX7-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX7-NEXT: s_mov_b32 s1, s2 -; GFX7-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 4 +; GFX7-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX7-NEXT: s_lshl_b32 s1, s9, 4 ; GFX7-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 8 +; GFX7-NEXT: s_lshl_b32 s1, s9, 8 ; GFX7-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 12 +; GFX7-NEXT: s_lshl_b32 s1, s9, 12 ; GFX7-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 16 +; GFX7-NEXT: s_lshl_b32 s1, s9, 16 ; GFX7-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 20 +; GFX7-NEXT: s_lshl_b32 s1, s9, 20 ; GFX7-NEXT: s_ashr_i64 s[34:35], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 24 +; GFX7-NEXT: s_lshl_b32 s1, s9, 24 ; GFX7-NEXT: s_ashr_i64 s[36:37], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 28 -; GFX7-NEXT: s_mov_b32 s20, s0 -; GFX7-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s38 -; GFX7-NEXT: v_mad_i32_i24 v0, s22, v0, v1 -; GFX7-NEXT: s_ashr_i64 s[20:21], s[20:21], 60 +; GFX7-NEXT: s_lshl_b32 s1, s9, 28 +; GFX7-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 +; GFX7-NEXT: s_ashr_i64 s[8:9], s[0:1], 60 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_i32_i24 v0, s0, v0, v1 +; GFX7-NEXT: s_ashr_i64 s[22:23], s[10:11], 60 ; GFX7-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0 -; GFX7-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 +; GFX7-NEXT: v_mad_i32_i24 v0, s22, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s34 -; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX7-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 +; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s32 -; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; 
GFX7-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s30 -; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX7-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX7-NEXT: v_mov_b32_e32 v1, s28 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s26 -; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s24 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3724,67 +3717,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s8, 0 -; GFX8-NEXT: s_mov_b32 s10, s8 -; GFX8-NEXT: s_mov_b32 s12, s8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s9, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s36, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, s8 -; GFX8-NEXT: s_mov_b32 s16, s8 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s11, s9, 4 -; GFX8-NEXT: s_lshl_b32 s13, s9, 8 -; GFX8-NEXT: s_lshl_b32 s15, s9, 16 -; GFX8-NEXT: s_lshl_b32 s17, s9, 20 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], 60 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[12:13], 60 -; GFX8-NEXT: s_lshl_b32 s13, s9, 12 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s19, s9, 24 -; GFX8-NEXT: s_lshl_b32 s9, s9, 28 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX8-NEXT: s_mov_b32 s9, s2 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 4 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 8 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 12 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 16 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 20 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 24 -; GFX8-NEXT: s_ashr_i64 s[34:35], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 28 -; GFX8-NEXT: s_mov_b32 s18, s8 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s36 -; GFX8-NEXT: v_mad_i32_i24 v0, s20, v0, v1 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 -; GFX8-NEXT: v_mov_b32_e32 v1, s34 -; GFX8-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 -; GFX8-NEXT: v_mov_b32_e32 v1, s32 -; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX8-NEXT: v_mov_b32_e32 v1, s30 -; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX8-NEXT: v_mov_b32_e32 v1, s28 -; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s26 -; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s24 -; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s22 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_ashr_i64 s[0:1], 
s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s1, s5, 4 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s5, 16 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s5, 20 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s5, 24 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s5, 28 +; GFX8-NEXT: s_lshl_b32 s9, s5, 8 +; GFX8-NEXT: s_lshl_b32 s11, s5, 12 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 4 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 8 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 12 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 16 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 20 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 24 +; GFX8-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 28 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s32 +; GFX8-NEXT: v_mad_i32_i24 v2, s18, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s30 +; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s26 +; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s24 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -3792,67 +3778,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s9, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s36, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s16, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s11, s9, 4 -; GFX9-NEXT: s_lshl_b32 s13, s9, 8 -; GFX9-NEXT: s_lshl_b32 s15, s9, 16 -; GFX9-NEXT: s_lshl_b32 s17, s9, 20 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], 60 -; GFX9-NEXT: s_ashr_i64 s[10:11], s[12:13], 60 -; GFX9-NEXT: s_lshl_b32 s13, s9, 12 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s19, s9, 24 -; GFX9-NEXT: s_lshl_b32 s9, s9, 28 -; GFX9-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX9-NEXT: s_mov_b32 s9, s2 -; GFX9-NEXT: s_ashr_i64 s[22:23], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 4 -; GFX9-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 8 -; GFX9-NEXT: s_ashr_i64 s[26:27], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 12 -; GFX9-NEXT: s_ashr_i64 s[28:29], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 16 -; GFX9-NEXT: s_ashr_i64 s[30:31], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 20 -; GFX9-NEXT: s_ashr_i64 s[32:33], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 24 -; GFX9-NEXT: s_ashr_i64 s[34:35], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 
28 -; GFX9-NEXT: s_mov_b32 s18, s8 -; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s36 -; GFX9-NEXT: v_mad_i32_i24 v0, s20, v0, v1 -; GFX9-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 -; GFX9-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 -; GFX9-NEXT: v_mov_b32_e32 v1, s32 -; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX9-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX9-NEXT: v_mov_b32_e32 v1, s28 -; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX9-NEXT: s_load_dword s5, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s7, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s1, s5, 4 +; GFX9-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s5, 16 +; GFX9-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s5, 20 +; GFX9-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s5, 24 +; GFX9-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s5, 28 +; GFX9-NEXT: s_lshl_b32 s9, s5, 8 +; GFX9-NEXT: s_lshl_b32 s11, s5, 12 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 4 +; GFX9-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 8 +; GFX9-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 12 +; GFX9-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 16 +; GFX9-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 20 +; GFX9-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 24 +; GFX9-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 28 +; GFX9-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mad_i32_i24 v2, s4, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s32 +; GFX9-NEXT: v_mad_i32_i24 v2, s18, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s30 +; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX9-NEXT: v_mov_b32_e32 v3, s26 +; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2 +; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX9-NEXT: v_mov_b32_e32 v3, s24 +; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s20 +; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -3860,67 +3839,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0 -; GFX9-DL-NEXT: s_mov_b32 s10, s8 -; GFX9-DL-NEXT: s_mov_b32 s12, s8 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword 
s9, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s36, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, s8 -; GFX9-DL-NEXT: s_mov_b32 s16, s8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshl_b32 s11, s9, 4 -; GFX9-DL-NEXT: s_lshl_b32 s13, s9, 8 -; GFX9-DL-NEXT: s_lshl_b32 s15, s9, 16 -; GFX9-DL-NEXT: s_lshl_b32 s17, s9, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[10:11], 60 -; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[12:13], 60 -; GFX9-DL-NEXT: s_lshl_b32 s13, s9, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s19, s9, 24 -; GFX9-DL-NEXT: s_lshl_b32 s9, s9, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX9-DL-NEXT: s_mov_b32 s9, s2 -; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 8 -; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[34:35], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 28 -; GFX9-DL-NEXT: s_mov_b32 s18, s8 -; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s36 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s20, v0, v1 -; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s32 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s28 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX9-DL-NEXT: s_load_dword s5, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s7, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 20 +; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 24 +; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 28 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 8 +; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 8 +; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 20 +; 
GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 24 +; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s32 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s18, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s30 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s26 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v3, v2 +; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s24 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s20 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 23341aab56b..13380e03e32 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -76,20 +76,19 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 +; GFX9-NEXT: global_load_ubyte_d16_hi v0, v[0:1], off offset:14 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_write_b16 v1, v2 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7f0000, v2 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f0000, v0 +; GFX9-NEXT: ds_write_b8_d16_hi v1, v0 offset:6 +; GFX9-NEXT: ds_write_b32 v1, v3 ; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index a10d9c57e41..5d3bb5a1ab8 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -1531,9 +1531,8 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) { define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: psrlq $62, %xmm2 ; SSE2-NEXT: paddq %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 @@ -1548,18 +1547,18 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 
= xmm1[1,1,3,3] -; SSE41-NEXT: psrlq $62, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlq $2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,2305843009213693952] -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: psrlq $62, %xmm2 +; SSE41-NEXT: paddq %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlq $2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952] +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: psubq %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64: @@ -1637,10 +1636,10 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: psrlq $62, %xmm3 -; SSE2-NEXT: paddq %xmm2, %xmm3 +; SSE2-NEXT: paddq %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: psrlq $2, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] @@ -1669,17 +1668,17 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE41-NEXT: psrlq $62, %xmm2 -; SSE41-NEXT: paddq %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlq $2, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952] -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: psubq %xmm2, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: psrlq $62, %xmm3 +; SSE41-NEXT: paddq %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: psrlq $2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952] +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: psubq %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] @@ -1783,119 +1782,118 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: psrlq $62, %xmm5 -; SSE2-NEXT: paddq %xmm1, %xmm5 +; SSE2-NEXT: paddq %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm0 ; SSE2-NEXT: psrlq $2, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; 
SSE2-NEXT: movapd {{.*#+}} xmm5 = [9223372036854775808,2305843009213693952] ; SSE2-NEXT: xorpd %xmm5, %xmm0 ; SSE2-NEXT: psubq %xmm5, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psrlq $61, %xmm6 -; SSE2-NEXT: psrlq $60, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1] -; SSE2-NEXT: paddq %xmm4, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: psrlq $62, %xmm6 +; SSE2-NEXT: paddq %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: psrlq $2, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm6[0],xmm2[1] +; SSE2-NEXT: xorpd %xmm5, %xmm2 +; SSE2-NEXT: psubq %xmm5, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrlq $3, %xmm4 -; SSE2-NEXT: psrlq $4, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1152921504606846976,576460752303423488] -; SSE2-NEXT: xorpd %xmm6, %xmm1 -; SSE2-NEXT: psubq %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSE2-NEXT: psrlq $62, %xmm7 -; SSE2-NEXT: paddq %xmm2, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: psrlq $2, %xmm4 -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] -; SSE2-NEXT: xorpd %xmm5, %xmm4 -; SSE2-NEXT: psubq %xmm5, %xmm4 -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrlq $61, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: psrlq $61, %xmm5 +; SSE2-NEXT: psrlq $60, %xmm4 +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE2-NEXT: paddq %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrlq $3, %xmm1 +; SSE2-NEXT: psrlq $4, %xmm4 +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] +; SSE2-NEXT: xorpd %xmm1, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: psrlq $61, %xmm6 ; SSE2-NEXT: psrlq $60, %xmm5 -; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] ; SSE2-NEXT: paddq %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrlq $3, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: psrlq $3, %xmm3 ; SSE2-NEXT: psrlq $4, %xmm5 -; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] -; SSE2-NEXT: xorpd %xmm6, %xmm5 -; SSE2-NEXT: psubq %xmm6, %xmm5 -; SSE2-NEXT: movapd %xmm4, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE2-NEXT: xorpd %xmm1, %xmm5 +; SSE2-NEXT: psubq %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: psrlq $62, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: psrlq $2, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} 
xmm6 = xmm1[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: psrlq $62, %xmm5 +; SSE41-NEXT: paddq %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: psrlq $2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,2305843009213693952] -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: psubq %xmm5, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: psrlq $60, %xmm6 -; SSE41-NEXT: psrlq $61, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: paddq %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: psubq %xmm5, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: psrad $31, %xmm6 +; SSE41-NEXT: psrlq $62, %xmm6 +; SSE41-NEXT: paddq %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: psrlq $2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: psubq %xmm5, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psrlq $4, %xmm4 -; SSE41-NEXT: psrlq $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [1152921504606846976,576460752303423488] -; SSE41-NEXT: pxor %xmm6, %xmm1 -; SSE41-NEXT: psubq %xmm6, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: psrlq $62, %xmm4 -; SSE41-NEXT: paddq %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: psrlq $2, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm4[0,1,2,3],xmm7[4,5,6,7] -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: psubq %xmm5, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4,5,6,7] -; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: psrad $31, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrlq $60, %xmm5 ; SSE41-NEXT: psrlq $61, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: paddq %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: psrlq $4, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: psrlq $4, %xmm1 ; SSE41-NEXT: psrlq $3, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: pxor %xmm6, %xmm4 -; SSE41-NEXT: psubq %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] +; SSE41-NEXT: pxor %xmm1, %xmm4 +; SSE41-NEXT: psubq %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psrlq $60, %xmm6 +; SSE41-NEXT: psrlq $61, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: paddq %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: psrlq $4, %xmm3 +; SSE41-NEXT: psrlq $3, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = 
xmm5[0,1,2,3],xmm3[4,5,6,7] +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: psubq %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm5, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64: diff --git a/llvm/test/CodeGen/X86/pr35918.ll b/llvm/test/CodeGen/X86/pr35918.ll index 5c84bd946fd..f3678e5e5c5 100644 --- a/llvm/test/CodeGen/X86/pr35918.ll +++ b/llvm/test/CodeGen/X86/pr35918.ll @@ -1,101 +1,99 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skylake | FileCheck %s --check-prefixes=X86,X86-SKYLAKE -; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=X86,X86-SKX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake | FileCheck %s --check-prefixes=X64,X64-SKYLAKE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=X64,X64-SKX - -define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32], [128 x i64] }*) nounwind { -; X86-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8: -; X86-SKYLAKE: # %bb.0: # %entry -; X86-SKYLAKE-NEXT: subl $8, %esp -; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X86-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0 -; X86-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X86-SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; X86-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0 -; X86-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; X86-SKYLAKE-NEXT: vmovd %xmm0, %ecx -; X86-SKYLAKE-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 -; X86-SKYLAKE-NEXT: movl %ecx, (%eax) -; X86-SKYLAKE-NEXT: addl $8, %esp -; X86-SKYLAKE-NEXT: retl -; -; X86-SKX-LABEL: fetch_r16g16_snorm_unorm8: -; X86-SKX: # %bb.0: # %entry -; X86-SKX-NEXT: subl $8, %esp -; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u] -; X86-SKX-NEXT: vpsrad $16, %xmm0, %xmm0 -; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X86-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; X86-SKX-NEXT: vpsrld $7, %xmm0, %xmm0 -; X86-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; X86-SKX-NEXT: vpmovqw %xmm0, {{[0-9]+}}(%esp) -; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-SKX-NEXT: vpmovdb %xmm0, (%esp) -; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SKX-NEXT: movzwl (%esp), %ecx -; X86-SKX-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 -; X86-SKX-NEXT: movl %ecx, (%eax) -; X86-SKX-NEXT: addl $8, %esp -; X86-SKX-NEXT: retl -; -; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8: -; X64-SKYLAKE: # %bb.0: # %entry -; X64-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X64-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0 -; X64-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X64-SKYLAKE-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; X64-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0 -; X64-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; X64-SKYLAKE-NEXT: vmovd %xmm0, %eax -; X64-SKYLAKE-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; X64-SKYLAKE-NEXT: movl %eax, (%rdi) -; X64-SKYLAKE-NEXT: retq -; -; X64-SKX-LABEL: fetch_r16g16_snorm_unorm8: -; X64-SKX: # %bb.0: # %entry -; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u] -; X64-SKX-NEXT: vpsrad $16, %xmm0, %xmm0 -; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X64-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; X64-SKX-NEXT: vpsrld $7, %xmm0, %xmm0 -; X64-SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-SKX-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp) -; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-SKX-NEXT: vpmovdb %xmm0, -{{[0-9]+}}(%rsp) -; X64-SKX-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax -; X64-SKX-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; X64-SKX-NEXT: movl %eax, (%rdi) -; X64-SKX-NEXT: retq -entry: - %5 = bitcast i8* %1 to <2 x i16>* - %6 = load <2 x i16>, <2 x i16>* %5, align 2 - %7 = shufflevector <2 x i16> %6, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - %8 = icmp sgt <4 x i16> %7, zeroinitializer - %9 = select <4 x i1> %8, <4 x i16> %7, <4 x i16> zeroinitializer - %10 = lshr <4 x i16> %9, <i16 7, i16 7, i16 7, i16 7> - %11 = shufflevector <4 x i16> %10, <4 x i16> undef, <2 x i32> <i32 0, i32 1> - %12 = shufflevector <4 x i16> %10, <4 x i16> undef, <2 x i32> <i32 2, i32 3> - %13 = bitcast <2 x i16> %11 to <4 x i8> - %14 = bitcast <2 x i16> %12 to <4 x i8> - %15 = shufflevector <4 x i8> %13, <4 x i8> %14, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %16 = bitcast <4 x i8> %15 to i32 - %17 = and i32 %16, 65535 - %18 = or i32 %17, -16777216 - %19 = bitcast <4 x i8>* %0 to i32* - store i32 %18, i32* %19, align 4 - ret void -} +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skylake | FileCheck %s --check-prefixes=X86,X86-SKYLAKE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=X86,X86-SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake | FileCheck %s --check-prefixes=X64,X64-SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=X64,X64-SKX
+
+define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32], [128 x i64] }*) nounwind {
+; X86-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
+; X86-SKYLAKE: # %bb.0: # %entry
+; X86-SKYLAKE-NEXT: subl $8, %esp
+; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X86-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0
+; X86-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0
+; X86-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-SKYLAKE-NEXT: vmovd %xmm0, %ecx
+; X86-SKYLAKE-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
+; X86-SKYLAKE-NEXT: movl %ecx, (%eax)
+; X86-SKYLAKE-NEXT: addl $8, %esp
+; X86-SKYLAKE-NEXT: retl
+;
+; X86-SKX-LABEL: fetch_r16g16_snorm_unorm8:
+; X86-SKX: # %bb.0: # %entry
+; X86-SKX-NEXT: subl $8, %esp
+; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
+; X86-SKX-NEXT: vpsrad $16, %xmm0, %xmm0
+; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X86-SKX-NEXT: vpsrld $7, %xmm0, %xmm0
+; X86-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-SKX-NEXT: vpmovqw %xmm0, {{[0-9]+}}(%esp)
+; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-SKX-NEXT: vpmovdb %xmm0, (%esp)
+; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT: movzwl (%esp), %ecx
+; X86-SKX-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
+; X86-SKX-NEXT: movl %ecx, (%eax)
+; X86-SKX-NEXT: addl $8, %esp
+; X86-SKX-NEXT: retl
+;
+; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
+; X64-SKYLAKE: # %bb.0: # %entry
+; X64-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X64-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0
+; X64-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0
+; X64-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-SKYLAKE-NEXT: vmovd %xmm0, %eax
+; X64-SKYLAKE-NEXT: orl $-16777216, %eax # imm = 0xFF000000
+; X64-SKYLAKE-NEXT: movl %eax, (%rdi)
+; X64-SKYLAKE-NEXT: retq
+;
+; X64-SKX-LABEL: fetch_r16g16_snorm_unorm8:
+; X64-SKX: # %bb.0: # %entry
+; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
+; X64-SKX-NEXT: vpsrad $16, %xmm0, %xmm0
+; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-SKX-NEXT: vpsrld $7, %xmm0, %xmm0
+; X64-SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-SKX-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SKX-NEXT: vpmovdb %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SKX-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; X64-SKX-NEXT: orl $-16777216, %eax # imm = 0xFF000000
+; X64-SKX-NEXT: movl %eax, (%rdi)
+; X64-SKX-NEXT: retq
+entry:
+ %5 = bitcast i8* %1 to <2 x i16>*
+ %6 = load <2 x i16>, <2 x i16>* %5, align 2
+ %7 = shufflevector <2 x i16> %6, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %8 = icmp sgt <4 x i16> %7, zeroinitializer
+ %9 = select <4 x i1> %8, <4 x i16> %7, <4 x i16> zeroinitializer
+ %10 = lshr <4 x i16> %9, <i16 7, i16 7, i16 7, i16 7>
+ %11 = shufflevector <4 x i16> %10, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+ %12 = shufflevector <4 x i16> %10, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+ %13 = bitcast <2 x i16> %11 to <4 x i8>
+ %14 = bitcast <2 x i16> %12 to <4 x i8>
+ %15 = shufflevector <4 x i8> %13, <4 x i8> %14, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %16 = bitcast <4 x i8> %15 to i32
+ %17 = and i32 %16, 65535
+ %18 = or i32 %17, -16777216
+ %19 = bitcast <4 x i8>* %0 to i32*
+ store i32 %18, i32* %19, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll index e2679dded8b..6301f3bf747 100644 --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -221,12 +221,10 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $3, {{[0-9]+}}(%esp) ; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __udivdi3 ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 256da3500c1..45a7e55e519 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -281,16 +281,15 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: var_rotate_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm3, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm3 -; SSE41-NEXT: paddd %xmm2, %xmm3 -; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmulhuw %xmm2, %xmm1 @@ -300,8 +299,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX1-LABEL: var_rotate_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index b7c08f96d8e..374429c9644 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -453,7 +453,6 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE2-NEXT: psllw $8, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psllw $12, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psraw $15, %xmm0 @@ -613,7 +612,6 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 
x i8> %b) nounwind { ; X32-SSE-NEXT: psllw $8, %xmm2 ; X32-SSE-NEXT: movdqa %xmm2, %xmm3 ; X32-SSE-NEXT: psraw $8, %xmm3 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: psllw $12, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: psraw $15, %xmm0 @@ -1401,7 +1399,6 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psllw $12, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psraw $15, %xmm0 @@ -1563,7 +1560,6 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; X32-SSE-NEXT: psraw $8, %xmm3 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: psllw $12, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: psraw $15, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index a8041446a50..2c8a2842bf2 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -180,24 +180,22 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: var_shift_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm3, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm3 -; SSE41-NEXT: paddd %xmm2, %xmm3 -; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll index 213cfd16acf..1323f66bdc4 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll @@ -116,24 +116,22 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; ; SSE41-LABEL: var_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm3, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm3 -; SSE41-NEXT: paddd %xmm2, %xmm3 -; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 @@ -247,24 +245,22 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { ; ; SSE41-LABEL: var_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm3, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm3 -; SSE41-NEXT: paddd %xmm2, %xmm3 -; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index ca44d8a3bc6..962479ecb30 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -102,7 +102,6 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE2-LABEL: var_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pslld $23, %xmm1 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 ; SSE2-NEXT: 
cvttps2dq %xmm1, %xmm1 @@ -117,18 +116,14 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; ; SSE41-LABEL: var_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE41-NEXT: pslld $23, %xmm1 +; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 @@ -172,7 +167,6 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; ; X32-SSE-LABEL: var_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: pslld $23, %xmm1 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 @@ -295,35 +289,36 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; ; SSE41-LABEL: var_shift_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm3, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm3 -; SSE41-NEXT: paddd %xmm2, %xmm3 -; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -663,7 +658,6 @@ define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v4i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pslld $23, %xmm1 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 @@ -679,19 +673,15 @@ define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE41-LABEL: splatvar_shift_v4i16: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE41-NEXT: pslld $23, %xmm1 +; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 @@ -741,7 +731,6 @@ define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; X32-SSE-LABEL: splatvar_shift_v4i16: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: pslld $23, %xmm1 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 @@ -878,16 +867,15 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE41-LABEL: splatvar_shift_v8i8: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm3, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm3 -; SSE41-NEXT: paddd %xmm2, %xmm3 -; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -895,8 +883,7 @@ define <8 x i8> 
@splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 |