 llvm/test/CodeGen/X86/avx2-arith.ll            | 293
 llvm/test/CodeGen/X86/avx2-cmp.ll              | 101
 llvm/test/CodeGen/X86/avx2-conversions.ll      | 239
 llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll |  93
 llvm/test/CodeGen/X86/avx2-gather.ll           |  84
 llvm/test/CodeGen/X86/avx2-logic.ll            | 102
 llvm/test/CodeGen/X86/avx2-phaddsub.ll         | 107
 llvm/test/CodeGen/X86/avx2-shift.ll            | 546
 llvm/test/CodeGen/X86/avx2-vector-shifts.ll    | 722
 llvm/test/CodeGen/X86/avx2-vperm.ll            |  61
 10 files changed, 1687 insertions(+), 661 deletions(-)
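
The new check lines throughout this diff are autogenerated (see the NOTE header added to each test). A minimal sketch of how such assertions are typically regenerated, assuming an in-tree checkout with a built llc at build/bin/llc (both paths are illustrative):

  # Rewrite the FileCheck assertions in one of the updated tests;
  # --llc-binary points the script at the llc to run (defaults to 'llc' on PATH).
  python llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
      llvm/test/CodeGen/X86/avx2-arith.ll

The script executes each RUN line, captures the generated assembly, and emits one check block per prefix (X32 and X64 here), which is why every function below gains parallel retl/retq check sequences.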
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index a205be1c0cd..e1341624cad 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -1,211 +1,326 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64 -; CHECK: vpaddq %ymm define <4 x i64> @test_vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +; X32-LABEL: test_vpaddq: +; X32: ## BB#0: +; X32-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpaddq: +; X64: ## BB#0: +; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %x = add <4 x i64> %i, %j ret <4 x i64> %x } -; CHECK: vpaddd %ymm define <8 x i32> @test_vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone { +; X32-LABEL: test_vpaddd: +; X32: ## BB#0: +; X32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpaddd: +; X64: ## BB#0: +; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %x = add <8 x i32> %i, %j ret <8 x i32> %x } -; CHECK: vpaddw %ymm define <16 x i16> @test_vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone { +; X32-LABEL: test_vpaddw: +; X32: ## BB#0: +; X32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpaddw: +; X64: ## BB#0: +; X64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %x = add <16 x i16> %i, %j ret <16 x i16> %x } -; CHECK: vpaddb %ymm define <32 x i8> @test_vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +; X32-LABEL: test_vpaddb: +; X32: ## BB#0: +; X32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpaddb: +; X64: ## BB#0: +; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %x = add <32 x i8> %i, %j ret <32 x i8> %x } -; CHECK: vpsubq %ymm define <4 x i64> @test_vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +; X32-LABEL: test_vpsubq: +; X32: ## BB#0: +; X32-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpsubq: +; X64: ## BB#0: +; X64-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %x = sub <4 x i64> %i, %j ret <4 x i64> %x } -; CHECK: vpsubd %ymm define <8 x i32> @test_vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone { +; X32-LABEL: test_vpsubd: +; X32: ## BB#0: +; X32-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpsubd: +; X64: ## BB#0: +; X64-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %x = sub <8 x i32> %i, %j ret <8 x i32> %x } -; CHECK: vpsubw %ymm define <16 x i16> @test_vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone { +; X32-LABEL: test_vpsubw: +; X32: ## BB#0: +; X32-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpsubw: +; X64: ## BB#0: +; X64-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %x = sub <16 x i16> %i, %j ret <16 x i16> %x } -; CHECK: vpsubb %ymm define <32 x i8> @test_vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +; X32-LABEL: test_vpsubb: +; X32: ## BB#0: +; X32-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpsubb: +; X64: ## BB#0: +; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %x = sub <32 x i8> %i, %j ret <32 x i8> %x } -; CHECK: vpmulld %ymm define <8 x i32> @test_vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone { +; 
X32-LABEL: test_vpmulld: +; X32: ## BB#0: +; X32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpmulld: +; X64: ## BB#0: +; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %x = mul <8 x i32> %i, %j ret <8 x i32> %x } -; CHECK: vpmullw %ymm define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone { +; X32-LABEL: test_vpmullw: +; X32: ## BB#0: +; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpmullw: +; X64: ## BB#0: +; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %x = mul <16 x i16> %i, %j ret <16 x i16> %x } -; CHECK: mul-v16i8 -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxbw %xmm1, %ymm1 -; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 -; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq define <16 x i8> @mul-v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone { %x = mul <16 x i8> %i, %j ret <16 x i8> %x } -; CHECK: mul-v32i8 -; CHECK: # BB#0: -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vpmovsxbw %xmm2, %ymm2 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpmovsxbw %xmm3, %ymm3 -; CHECK-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; CHECK-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; CHECK-NEXT: vpmovsxbw %xmm1, %ymm1 -; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 -; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; CHECK-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; CHECK-NEXT: retq define <32 x i8> @mul-v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { %x = mul <32 x i8> %i, %j ret <32 x i8> %x } -; CHECK: mul-v4i64 -; CHECK: vpmuludq %ymm -; CHECK-NEXT: vpsrlq $32, %ymm -; CHECK-NEXT: vpmuludq %ymm -; CHECK-NEXT: vpsllq $32, %ymm -; CHECK-NEXT: vpaddq %ymm -; CHECK-NEXT: vpsrlq $32, %ymm -; CHECK-NEXT: vpmuludq %ymm -; CHECK-NEXT: vpsllq $32, %ymm -; CHECK-NEXT: vpaddq %ymm define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone { %x = mul <4 x i64> %i, %j ret <4 x i64> %x } -; CHECK: mul_const1 -; CHECK: vpaddd -; CHECK: ret define <8 x i32> @mul_const1(<8 x i32> %x) { +; X32-LABEL: mul_const1: +; X32: ## BB#0: +; X32-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_const1: +; X64: ## BB#0: +; X64-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq %y = mul <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ret <8 x i32> %y } -; CHECK: mul_const2 -; CHECK: vpsllq $2 -; CHECK: ret define <4 x i64> @mul_const2(<4 x i64> %x) { +; X32-LABEL: mul_const2: +; X32: ## BB#0: +; X32-NEXT: vpsllq $2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_const2: +; X64: ## BB#0: +; X64-NEXT: vpsllq $2, %ymm0, %ymm0 +; X64-NEXT: retq %y = mul <4 x i64> %x, <i64 4, i64 4, i64 4, i64 4> ret <4 x i64> %y } -; CHECK: mul_const3 -; CHECK: vpsllw $3 -; CHECK: ret define <16 x i16> @mul_const3(<16 x i16> %x) { +; X32-LABEL: mul_const3: +; X32: ## BB#0: +; X32-NEXT: 
vpsllw $3, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_const3: +; X64: ## BB#0: +; X64-NEXT: vpsllw $3, %ymm0, %ymm0 +; X64-NEXT: retq %y = mul <16 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> ret <16 x i16> %y } -; CHECK: mul_const4 -; CHECK: vpxor -; CHECK: vpsubq -; CHECK: ret define <4 x i64> @mul_const4(<4 x i64> %x) { +; X32-LABEL: mul_const4: +; X32: ## BB#0: +; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X32-NEXT: vpsubq %ymm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_const4: +; X64: ## BB#0: +; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X64-NEXT: vpsubq %ymm0, %ymm1, %ymm0 +; X64-NEXT: retq %y = mul <4 x i64> %x, <i64 -1, i64 -1, i64 -1, i64 -1> ret <4 x i64> %y } -; CHECK: mul_const5 -; CHECK: vxorps -; CHECK-NEXT: ret define <8 x i32> @mul_const5(<8 x i32> %x) { +; X32-LABEL: mul_const5: +; X32: ## BB#0: +; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_const5: +; X64: ## BB#0: +; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq %y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %y } -; CHECK: mul_const6 -; CHECK: vpmulld -; CHECK: ret define <8 x i32> @mul_const6(<8 x i32> %x) { +; X32-LABEL: mul_const6: +; X32: ## BB#0: +; X32-NEXT: vpmulld LCPI18_0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_const6: +; X64: ## BB#0: +; X64-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: retq %y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 2, i32 0, i32 2, i32 0, i32 0> ret <8 x i32> %y } -; CHECK: mul_const7 -; CHECK: vpaddq -; CHECK: vpaddq -; CHECK: ret define <8 x i64> @mul_const7(<8 x i64> %x) { +; X32-LABEL: mul_const7: +; X32: ## BB#0: +; X32-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; X32-NEXT: vpaddq %ymm1, %ymm1, %ymm1 +; X32-NEXT: retl +; +; X64-LABEL: mul_const7: +; X64: ## BB#0: +; X64-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; X64-NEXT: vpaddq %ymm1, %ymm1, %ymm1 +; X64-NEXT: retq %y = mul <8 x i64> %x, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> ret <8 x i64> %y } -; CHECK: mul_const8 -; CHECK: vpsllw $3 -; CHECK: ret define <8 x i16> @mul_const8(<8 x i16> %x) { +; X32-LABEL: mul_const8: +; X32: ## BB#0: +; X32-NEXT: vpsllw $3, %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_const8: +; X64: ## BB#0: +; X64-NEXT: vpsllw $3, %xmm0, %xmm0 +; X64-NEXT: retq %y = mul <8 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> ret <8 x i16> %y } -; CHECK: mul_const9 -; CHECK: vpmulld -; CHECK: ret define <8 x i32> @mul_const9(<8 x i32> %x) { +; X32-LABEL: mul_const9: +; X32: ## BB#0: +; X32-NEXT: movl $2, %eax +; X32-NEXT: vmovd %eax, %xmm1 +; X32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_const9: +; X64: ## BB#0: +; X64-NEXT: movl $2, %eax +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %y } -; CHECK: mul_const10 -; CHECK: vpmulld -; CHECK: ret define <4 x i32> @mul_const10(<4 x i32> %x) { ; %x * 0x01010101 +; X32-LABEL: mul_const10: +; X32: ## BB#0: +; X32-NEXT: vpbroadcastd LCPI22_0, %xmm1 +; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_const10: +; X64: ## BB#0: +; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq %m = mul <4 x i32> %x, <i32 16843009, i32 16843009, i32 16843009, i32 16843009> ret <4 x i32> %m } -; CHECK: mul_const11 -; CHECK: 
vpmulld -; CHECK: ret define <4 x i32> @mul_const11(<4 x i32> %x) { ; %x * 0x80808080 +; X32-LABEL: mul_const11: +; X32: ## BB#0: +; X32-NEXT: vpbroadcastd LCPI23_0, %xmm1 +; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_const11: +; X64: ## BB#0: +; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq %m = mul <4 x i32> %x, <i32 2155905152, i32 2155905152, i32 2155905152, i32 2155905152> ret <4 x i32> %m } diff --git a/llvm/test/CodeGen/X86/avx2-cmp.ll b/llvm/test/CodeGen/X86/avx2-cmp.ll index df30d9efed1..e2b550383c8 100644 --- a/llvm/test/CodeGen/X86/avx2-cmp.ll +++ b/llvm/test/CodeGen/X86/avx2-cmp.ll @@ -1,58 +1,123 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64 -; CHECK: vpcmpgtd %ymm -define <8 x i32> @int256-cmp(<8 x i32> %i, <8 x i32> %j) nounwind readnone { +define <8 x i32> @v8i32_cmpgt(<8 x i32> %i, <8 x i32> %j) nounwind readnone { +; X32-LABEL: v8i32_cmpgt: +; X32: ## BB#0: +; X32-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: v8i32_cmpgt: +; X64: ## BB#0: +; X64-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; X64-NEXT: retq %bincmp = icmp slt <8 x i32> %i, %j %x = sext <8 x i1> %bincmp to <8 x i32> ret <8 x i32> %x } -; CHECK: vpcmpgtq %ymm -define <4 x i64> @v4i64-cmp(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +define <4 x i64> @v4i64_cmpgt(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +; X32-LABEL: v4i64_cmpgt: +; X32: ## BB#0: +; X32-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: v4i64_cmpgt: +; X64: ## BB#0: +; X64-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; X64-NEXT: retq %bincmp = icmp slt <4 x i64> %i, %j %x = sext <4 x i1> %bincmp to <4 x i64> ret <4 x i64> %x } -; CHECK: vpcmpgtw %ymm -define <16 x i16> @v16i16-cmp(<16 x i16> %i, <16 x i16> %j) nounwind readnone { +define <16 x i16> @v16i16_cmpgt(<16 x i16> %i, <16 x i16> %j) nounwind readnone { +; X32-LABEL: v16i16_cmpgt: +; X32: ## BB#0: +; X32-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: v16i16_cmpgt: +; X64: ## BB#0: +; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; X64-NEXT: retq %bincmp = icmp slt <16 x i16> %i, %j %x = sext <16 x i1> %bincmp to <16 x i16> ret <16 x i16> %x } -; CHECK: vpcmpgtb %ymm -define <32 x i8> @v32i8-cmp(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +define <32 x i8> @v32i8_cmpgt(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +; X32-LABEL: v32i8_cmpgt: +; X32: ## BB#0: +; X32-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: v32i8_cmpgt: +; X64: ## BB#0: +; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; X64-NEXT: retq %bincmp = icmp slt <32 x i8> %i, %j %x = sext <32 x i1> %bincmp to <32 x i8> ret <32 x i8> %x } -; CHECK: vpcmpeqd %ymm -define <8 x i32> @int256-cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind readnone { +define <8 x i32> @int256_cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind readnone { +; X32-LABEL: int256_cmpeq: +; X32: ## BB#0: +; X32-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: int256_cmpeq: +; X64: ## BB#0: +; X64-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %bincmp = icmp eq <8 x i32> %i, %j %x = sext <8 x i1> %bincmp to <8 x i32> ret <8 x i32> %x } -; 
CHECK: vpcmpeqq %ymm -define <4 x i64> @v4i64-cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +define <4 x i64> @v4i64_cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +; X32-LABEL: v4i64_cmpeq: +; X32: ## BB#0: +; X32-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: v4i64_cmpeq: +; X64: ## BB#0: +; X64-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %bincmp = icmp eq <4 x i64> %i, %j %x = sext <4 x i1> %bincmp to <4 x i64> ret <4 x i64> %x } -; CHECK: vpcmpeqw %ymm -define <16 x i16> @v16i16-cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind readnone { +define <16 x i16> @v16i16_cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind readnone { +; X32-LABEL: v16i16_cmpeq: +; X32: ## BB#0: +; X32-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: v16i16_cmpeq: +; X64: ## BB#0: +; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %bincmp = icmp eq <16 x i16> %i, %j %x = sext <16 x i1> %bincmp to <16 x i16> ret <16 x i16> %x } -; CHECK: vpcmpeqb %ymm -define <32 x i8> @v32i8-cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +define <32 x i8> @v32i8_cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +; X32-LABEL: v32i8_cmpeq: +; X32: ## BB#0: +; X32-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: v32i8_cmpeq: +; X64: ## BB#0: +; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %bincmp = icmp eq <32 x i8> %i, %j %x = sext <32 x i1> %bincmp to <32 x i8> ret <32 x i8> %x } - diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll index 7c16ec800a5..f0fb58ff7c8 100644 --- a/llvm/test/CodeGen/X86/avx2-conversions.ll +++ b/llvm/test/CodeGen/X86/avx2-conversions.ll @@ -1,153 +1,246 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64 define <4 x i32> @trunc4(<4 x i64> %A) nounwind { -; CHECK-LABEL: trunc4: -; CHECK: ## BB#0: -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: trunc4: +; X32: ## BB#0: +; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: trunc4: +; X64: ## BB#0: +; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X64-NEXT: vzeroupper +; X64-NEXT: retq %B = trunc <4 x i64> %A to <4 x i32> ret <4 x i32>%B } define <8 x i16> @trunc8(<8 x i32> %A) nounwind { -; CHECK-LABEL: trunc8: -; CHECK: ## BB#0: -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: trunc8: +; X32: ## BB#0: +; X32-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: trunc8: +; X64: ## BB#0: +; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X64-NEXT: vzeroupper +; X64-NEXT: retq %B = trunc <8 x i32> %A to <8 x i16> ret <8 x i16>%B } define <4 x i64> @sext4(<4 x i32> %A) nounwind { -; CHECK-LABEL: sext4: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: sext4: +; X32: ## BB#0: +; X32-NEXT: vpmovsxdq %xmm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: sext4: +; X64: ## BB#0: +; X64-NEXT: vpmovsxdq %xmm0, %ymm0 +; X64-NEXT: retq %B = sext <4 x i32> %A to <4 x i64> ret <4 x i64>%B } define <8 x i32> @sext8(<8 x i16> %A) nounwind { -; CHECK-LABEL: sext8: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: sext8: +; X32: ## BB#0: +; X32-NEXT: vpmovsxwd %xmm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: sext8: +; X64: ## BB#0: +; X64-NEXT: vpmovsxwd %xmm0, %ymm0 +; X64-NEXT: retq %B = sext <8 x i16> %A to <8 x i32> ret <8 x i32>%B } define <4 x i64> @zext4(<4 x i32> %A) nounwind { -; CHECK-LABEL: zext4: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: retq +; X32-LABEL: zext4: +; X32: ## BB#0: +; X32-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X32-NEXT: retl +; +; X64-LABEL: zext4: +; X64: ## BB#0: +; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: retq %B = zext <4 x i32> %A to <4 x i64> ret <4 x i64>%B } define <8 x i32> @zext8(<8 x i16> %A) nounwind { -; CHECK-LABEL: zext8: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: retq +; X32-LABEL: zext8: +; X32: ## BB#0: +; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X32-NEXT: retl +; +; X64-LABEL: zext8: +; X64: ## BB#0: +; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: retq %B = zext <8 x i16> %A to <8 x i32> ret <8 x i32>%B } define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind { -; CHECK-LABEL: zext_8i8_8i32: -; CHECK: ## BB#0: -; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: retq +; X32-LABEL: zext_8i8_8i32: +; X32: ## BB#0: +; X32-NEXT: vpand LCPI6_0, %xmm0, %xmm0 +; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X32-NEXT: retl +; +; X64-LABEL: zext_8i8_8i32: +; X64: ## BB#0: +; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; 
X64-NEXT: retq %B = zext <8 x i8> %A to <8 x i32> ret <8 x i32>%B } define <16 x i16> @zext_16i8_16i16(<16 x i8> %z) { -; CHECK-LABEL: zext_16i8_16i16: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; CHECK-NEXT: retq +; X32-LABEL: zext_16i8_16i16: +; X32: ## BB#0: +; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X32-NEXT: retl +; +; X64-LABEL: zext_16i8_16i16: +; X64: ## BB#0: +; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X64-NEXT: retq %t = zext <16 x i8> %z to <16 x i16> ret <16 x i16> %t } define <16 x i16> @sext_16i8_16i16(<16 x i8> %z) { -; CHECK-LABEL: sext_16i8_16i16: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: sext_16i8_16i16: +; X32: ## BB#0: +; X32-NEXT: vpmovsxbw %xmm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: sext_16i8_16i16: +; X64: ## BB#0: +; X64-NEXT: vpmovsxbw %xmm0, %ymm0 +; X64-NEXT: retq %t = sext <16 x i8> %z to <16 x i16> ret <16 x i16> %t } define <16 x i8> @trunc_16i16_16i8(<16 x i16> %z) { -; CHECK-LABEL: trunc_16i16_16i8: -; CHECK: ## BB#0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: trunc_16i16_16i8: +; X32: ## BB#0: +; X32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X32-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: trunc_16i16_16i8: +; X64: ## BB#0: +; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: vzeroupper +; X64-NEXT: retq %t = trunc <16 x i16> %z to <16 x i8> ret <16 x i8> %t } define <4 x i64> @load_sext_test1(<4 x i32> *%ptr) { -; CHECK-LABEL: load_sext_test1: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxdq (%rdi), %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: load_sext_test1: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpmovsxdq (%eax), %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: load_sext_test1: +; X64: ## BB#0: +; X64-NEXT: vpmovsxdq (%rdi), %ymm0 +; X64-NEXT: retq %X = load <4 x i32>, <4 x i32>* %ptr %Y = sext <4 x i32> %X to <4 x i64> ret <4 x i64>%Y } define <4 x i64> @load_sext_test2(<4 x i8> *%ptr) { -; CHECK-LABEL: load_sext_test2: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbq (%rdi), %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: load_sext_test2: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpmovsxbq (%eax), %ymm0 +; X32-NEXT: retl +; 
+; X64-LABEL: load_sext_test2: +; X64: ## BB#0: +; X64-NEXT: vpmovsxbq (%rdi), %ymm0 +; X64-NEXT: retq %X = load <4 x i8>, <4 x i8>* %ptr %Y = sext <4 x i8> %X to <4 x i64> ret <4 x i64>%Y } define <4 x i64> @load_sext_test3(<4 x i16> *%ptr) { -; CHECK-LABEL: load_sext_test3: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwq (%rdi), %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: load_sext_test3: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpmovsxwq (%eax), %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: load_sext_test3: +; X64: ## BB#0: +; X64-NEXT: vpmovsxwq (%rdi), %ymm0 +; X64-NEXT: retq %X = load <4 x i16>, <4 x i16>* %ptr %Y = sext <4 x i16> %X to <4 x i64> ret <4 x i64>%Y } define <8 x i32> @load_sext_test4(<8 x i16> *%ptr) { -; CHECK-LABEL: load_sext_test4: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwd (%rdi), %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: load_sext_test4: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpmovsxwd (%eax), %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: load_sext_test4: +; X64: ## BB#0: +; X64-NEXT: vpmovsxwd (%rdi), %ymm0 +; X64-NEXT: retq %X = load <8 x i16>, <8 x i16>* %ptr %Y = sext <8 x i16> %X to <8 x i32> ret <8 x i32>%Y } define <8 x i32> @load_sext_test5(<8 x i8> *%ptr) { -; CHECK-LABEL: load_sext_test5: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbd (%rdi), %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: load_sext_test5: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpmovsxbd (%eax), %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: load_sext_test5: +; X64: ## BB#0: +; X64-NEXT: vpmovsxbd (%rdi), %ymm0 +; X64-NEXT: retq %X = load <8 x i8>, <8 x i8>* %ptr %Y = sext <8 x i8> %X to <8 x i32> ret <8 x i32>%Y diff --git a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll index 8c52b8439a2..345943bd730 100644 --- a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll +++ b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll @@ -1,13 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -mattr=+fma | FileCheck %s +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64 ; This test checks combinations of FNEG and FMA intrinsics define <8 x float> @test1(<8 x float> %a, <8 x float> %b, <8 x float> %c) { -; CHECK-LABEL: test1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test1: +; X32: ## BB#0: ## %entry +; X32-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test1: +; X64: ## BB#0: ## %entry +; X64-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 +; X64-NEXT: retq entry: %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c %0 = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.i) #2 @@ -17,10 +23,15 @@ entry: declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x float> @test2(<4 x float> %a, <4 x float> %b, <4 x float> %c) { -; CHECK-LABEL: test2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: retq +; X32-LABEL: test2: +; X32: ## BB#0: ## %entry +; X32-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 +; 
X32-NEXT: retl +; +; X64-LABEL: test2: +; X64: ## BB#0: ## %entry +; X64-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 +; X64-NEXT: retq entry: %0 = tail call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0 @@ -30,12 +41,19 @@ entry: declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) define <4 x float> @test3(<4 x float> %a, <4 x float> %b, <4 x float> %c) { -; CHECK-LABEL: test3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 -; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq +; X32-LABEL: test3: +; X32: ## BB#0: ## %entry +; X32-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 +; X32-NEXT: vbroadcastss LCPI2_0, %xmm1 +; X32-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test3: +; X64: ## BB#0: ## %entry +; X64-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 +; X64-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq entry: %0 = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0 @@ -45,10 +63,15 @@ entry: declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) define <8 x float> @test4(<8 x float> %a, <8 x float> %b, <8 x float> %c) { -; CHECK-LABEL: test4: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test4: +; X32: ## BB#0: ## %entry +; X32-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test4: +; X64: ## BB#0: ## %entry +; X64-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 +; X64-NEXT: retq entry: %0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0 @@ -56,12 +79,19 @@ entry: } define <8 x float> @test5(<8 x float> %a, <8 x float> %b, <8 x float> %c) { -; CHECK-LABEL: test5: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 -; CHECK-NEXT: vxorps %ymm3, %ymm2, %ymm2 -; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test5: +; X32: ## BB#0: ## %entry +; X32-NEXT: vbroadcastss LCPI4_0, %ymm3 +; X32-NEXT: vxorps %ymm3, %ymm2, %ymm2 +; X32-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test5: +; X64: ## BB#0: ## %entry +; X64-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 +; X64-NEXT: vxorps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 +; X64-NEXT: retq entry: %sub.c = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c %0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.c) #2 @@ -72,10 +102,15 @@ declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x f define <2 x double> @test6(<2 x double> %a, <2 x double> %b, <2 x double> %c) { -; CHECK-LABEL: test6: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: retq +; 
X32-LABEL: test6: +; X32: ## BB#0: ## %entry +; X32-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test6: +; X64: ## BB#0: ## %entry +; X64-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 +; X64-NEXT: retq entry: %0 = tail call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) #2 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %0 diff --git a/llvm/test/CodeGen/X86/avx2-gather.ll b/llvm/test/CodeGen/X86/avx2-gather.ll index 91fa20bc0af..cd8c354e996 100644 --- a/llvm/test/CodeGen/X86/avx2-gather.ll +++ b/llvm/test/CodeGen/X86/avx2-gather.ll @@ -1,61 +1,87 @@ -; RUN: not llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly -define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1, - <4 x i32> %idx, <4 x float> %mask) { +define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1, <4 x i32> %idx, <4 x float> %mask) { +; X32-LABEL: test_x86_avx2_gather_d_ps: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vgatherdps %xmm1, (%eax,%xmm0,2), %xmm2 +; X32-NEXT: vmovaps %xmm2, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_x86_avx2_gather_d_ps: +; X64: ## BB#0: +; X64-NEXT: vgatherdps %xmm1, (%rdi,%xmm0,2), %xmm2 +; X64-NEXT: vmovaps %xmm2, %xmm0 +; X64-NEXT: retq %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ; ret <4 x float> %res } -; CHECK: test_x86_avx2_gather_d_ps -; CHECK: vgatherdps -; CHECK-NOT: [[DST]] -; CHECK: [[DST:%xmm[0-9]+]]{{$}} -; CHECK: vmovaps -; CHECK: ret - declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly -define <2 x double> @test_x86_avx2_gather_d_pd(i8* %a1, - <4 x i32> %idx, <2 x double> %mask) { +define <2 x double> @test_x86_avx2_gather_d_pd(i8* %a1, <4 x i32> %idx, <2 x double> %mask) { +; X32-LABEL: test_x86_avx2_gather_d_pd: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vgatherdpd %xmm1, (%eax,%xmm0,2), %xmm2 +; X32-NEXT: vmovapd %xmm2, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_x86_avx2_gather_d_pd: +; X64: ## BB#0: +; X64-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,2), %xmm2 +; X64-NEXT: vmovapd %xmm2, %xmm0 +; X64-NEXT: retq %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ; ret <2 x double> %res } -; CHECK: test_x86_avx2_gather_d_pd -; CHECK: vgatherdpd -; CHECK: vmovapd -; CHECK: ret - declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly -define <8 x float> @test_x86_avx2_gather_d_ps_256(i8* %a1, - <8 x i32> %idx, <8 x float> %mask) { +define <8 x float> @test_x86_avx2_gather_d_ps_256(i8* %a1, <8 x i32> %idx, <8 x float> %mask) { +; X32-LABEL: test_x86_avx2_gather_d_ps_256: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vgatherdps %ymm1, (%eax,%ymm0,4), %ymm2 +; X32-NEXT: vmovaps %ymm2, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_x86_avx2_gather_d_ps_256: +; X64: ## BB#0: +; 
X64-NEXT: vgatherdps %ymm1, (%rdi,%ymm0,4), %ymm2 +; X64-NEXT: vmovaps %ymm2, %ymm0 +; X64-NEXT: retq %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 4) ; ret <8 x float> %res } -; CHECK-LABEL: @test_x86_avx2_gather_d_ps_256 -; CHECK: vgatherdps %ymm -; CHECK: ret declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly -define <4 x double> @test_x86_avx2_gather_d_pd_256(i8* %a1, - <4 x i32> %idx, <4 x double> %mask) { +define <4 x double> @test_x86_avx2_gather_d_pd_256(i8* %a1, <4 x i32> %idx, <4 x double> %mask) { +; X32-LABEL: test_x86_avx2_gather_d_pd_256: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vgatherdpd %ymm1, (%eax,%xmm0,8), %ymm2 +; X32-NEXT: vmovapd %ymm2, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_x86_avx2_gather_d_pd_256: +; X64: ## BB#0: +; X64-NEXT: vgatherdpd %ymm1, (%rdi,%xmm0,8), %ymm2 +; X64-NEXT: vmovapd %ymm2, %ymm0 +; X64-NEXT: retq %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 8) ; ret <4 x double> %res } - -; CHECK-LABEL: test_x86_avx2_gather_d_pd_256 -; CHECK: vgatherdpd %ymm -; CHECK: ret diff --git a/llvm/test/CodeGen/X86/avx2-logic.ll b/llvm/test/CodeGen/X86/avx2-logic.ll index e187933f66b..fd4d2cbf927 100644 --- a/llvm/test/CodeGen/X86/avx2-logic.ll +++ b/llvm/test/CodeGen/X86/avx2-logic.ll @@ -1,9 +1,20 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64 -; CHECK: vpandn -; CHECK: vpandn %ymm -; CHECK: ret define <4 x i64> @vpandn(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +; X32-LABEL: vpandn: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpaddq LCPI0_0, %ymm0, %ymm1 +; X32-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vpandn: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; X64-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; X64-NEXT: retq entry: ; Force the execution domain with an add. %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> @@ -12,10 +23,19 @@ entry: ret <4 x i64> %x } -; CHECK: vpand -; CHECK: vpand %ymm -; CHECK: ret define <4 x i64> @vpand(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +; X32-LABEL: vpand: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpaddq LCPI1_0, %ymm0, %ymm0 +; X32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vpand: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq entry: ; Force the execution domain with an add. 
%a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> @@ -23,10 +43,19 @@ entry: ret <4 x i64> %x } -; CHECK: vpor -; CHECK: vpor %ymm -; CHECK: ret define <4 x i64> @vpor(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +; X32-LABEL: vpor: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpaddq LCPI2_0, %ymm0, %ymm0 +; X32-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vpor: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq entry: ; Force the execution domain with an add. %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> @@ -34,10 +63,19 @@ entry: ret <4 x i64> %x } -; CHECK: vpxor -; CHECK: vpxor %ymm -; CHECK: ret define <4 x i64> @vpxor(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +; X32-LABEL: vpxor: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpaddq LCPI3_0, %ymm0, %ymm0 +; X32-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vpxor: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X64-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq entry: ; Force the execution domain with an add. %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> @@ -45,22 +83,46 @@ entry: ret <4 x i64> %x } -; CHECK: vpblendvb -; CHECK: vpblendvb %ymm -; CHECK: ret define <32 x i8> @vpblendvb(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y) { +; X32-LABEL: vpblendvb: +; X32: ## BB#0: +; X32-NEXT: vpsllw $7, %ymm0, %ymm0 +; X32-NEXT: vpand LCPI4_0, %ymm0, %ymm0 +; X32-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vpblendvb: +; X64: ## BB#0: +; X64-NEXT: vpsllw $7, %ymm0, %ymm0 +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; X64-NEXT: retq %min = select <32 x i1> %cond, <32 x i8> %x, <32 x i8> %y ret <32 x i8> %min } define <8 x i32> @allOnes() nounwind { -; CHECK: vpcmpeqd -; CHECK-NOT: vinsert +; X32-LABEL: allOnes: +; X32: ## BB#0: +; X32-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: allOnes: +; X64: ## BB#0: +; X64-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq ret <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> } define <16 x i16> @allOnes2() nounwind { -; CHECK: vpcmpeqd -; CHECK-NOT: vinsert +; X32-LABEL: allOnes2: +; X32: ## BB#0: +; X32-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: allOnes2: +; X64: ## BB#0: +; X64-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq ret <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> } diff --git a/llvm/test/CodeGen/X86/avx2-phaddsub.ll b/llvm/test/CodeGen/X86/avx2-phaddsub.ll index 88c70ad84fa..9eafac902b8 100644 --- a/llvm/test/CodeGen/X86/avx2-phaddsub.ll +++ b/llvm/test/CodeGen/X86/avx2-phaddsub.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64 define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) { -; CHECK-LABEL: phaddw1: -; CHECK: # BB#0: -; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: phaddw1: +; X32: ## BB#0: +; X32-NEXT: 
vphaddw %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: phaddw1: +; X64: ## BB#0: +; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30> %b = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31> %r = add <16 x i16> %a, %b @@ -13,10 +19,15 @@ define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) { } define <16 x i16> @phaddw2(<16 x i16> %x, <16 x i16> %y) { -; CHECK-LABEL: phaddw2: -; CHECK: # BB#0: -; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: phaddw2: +; X32: ## BB#0: +; X32-NEXT: vphaddw %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: phaddw2: +; X64: ## BB#0: +; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31> %b = shufflevector <16 x i16> %y, <16 x i16> %x, <16 x i32> <i32 16, i32 18, i32 20, i32 22, i32 0, i32 2, i32 4, i32 6, i32 24, i32 26, i32 28, i32 30, i32 8, i32 10, i32 12, i32 14> %r = add <16 x i16> %a, %b @@ -24,10 +35,15 @@ define <16 x i16> @phaddw2(<16 x i16> %x, <16 x i16> %y) { } define <8 x i32> @phaddd1(<8 x i32> %x, <8 x i32> %y) { -; CHECK-LABEL: phaddd1: -; CHECK: # BB#0: -; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: phaddd1: +; X32: ## BB#0: +; X32-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: phaddd1: +; X64: ## BB#0: +; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14> %b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15> %r = add <8 x i32> %a, %b @@ -35,10 +51,15 @@ define <8 x i32> @phaddd1(<8 x i32> %x, <8 x i32> %y) { } define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) { -; CHECK-LABEL: phaddd2: -; CHECK: # BB#0: -; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: phaddd2: +; X32: ## BB#0: +; X32-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: phaddd2: +; X64: ## BB#0: +; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14> %b = shufflevector <8 x i32> %y, <8 x i32> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7> %r = add <8 x i32> %a, %b @@ -46,10 +67,15 @@ define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) { } define <8 x i32> @phaddd3(<8 x i32> %x) { -; CHECK-LABEL: phaddd3: -; CHECK: # BB#0: -; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: phaddd3: +; X32: ## BB#0: +; X32-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: phaddd3: +; X64: ## BB#0: +; X64-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq %a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14> %b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15> %r = add <8 x i32> %a, %b @@ -57,10 +83,15 @@ define <8 x i32> @phaddd3(<8 x i32> %x) { 
} define <16 x i16> @phsubw1(<16 x i16> %x, <16 x i16> %y) { -; CHECK-LABEL: phsubw1: -; CHECK: # BB#0: -; CHECK-NEXT: vphsubw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: phsubw1: +; X32: ## BB#0: +; X32-NEXT: vphsubw %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: phsubw1: +; X64: ## BB#0: +; X64-NEXT: vphsubw %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30> %b = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31> %r = sub <16 x i16> %a, %b @@ -68,10 +99,15 @@ define <16 x i16> @phsubw1(<16 x i16> %x, <16 x i16> %y) { } define <8 x i32> @phsubd1(<8 x i32> %x, <8 x i32> %y) { -; CHECK-LABEL: phsubd1: -; CHECK: # BB#0: -; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: phsubd1: +; X32: ## BB#0: +; X32-NEXT: vphsubd %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: phsubd1: +; X64: ## BB#0: +; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14> %b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15> %r = sub <8 x i32> %a, %b @@ -79,10 +115,15 @@ define <8 x i32> @phsubd1(<8 x i32> %x, <8 x i32> %y) { } define <8 x i32> @phsubd2(<8 x i32> %x, <8 x i32> %y) { -; CHECK-LABEL: phsubd2: -; CHECK: # BB#0: -; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: phsubd2: +; X32: ## BB#0: +; X32-NEXT: vphsubd %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: phsubd2: +; X64: ## BB#0: +; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 undef, i32 8, i32 undef, i32 4, i32 6, i32 12, i32 14> %b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 undef, i32 9, i32 11, i32 5, i32 7, i32 undef, i32 15> %r = sub <8 x i32> %a, %b diff --git a/llvm/test/CodeGen/X86/avx2-shift.ll b/llvm/test/CodeGen/X86/avx2-shift.ll index 5adbb2ef665..887fef113e7 100644 --- a/llvm/test/CodeGen/X86/avx2-shift.ll +++ b/llvm/test/CodeGen/X86/avx2-shift.ll @@ -1,301 +1,603 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64 -; CHECK: variable_shl0 -; CHECK: psllvd -; CHECK: ret define <4 x i32> @variable_shl0(<4 x i32> %x, <4 x i32> %y) { +; X32-LABEL: variable_shl0: +; X32: ## BB#0: +; X32-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_shl0: +; X64: ## BB#0: +; X64-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq %k = shl <4 x i32> %x, %y ret <4 x i32> %k } -; CHECK: variable_shl1 -; CHECK: psllvd -; CHECK: ret + define <8 x i32> @variable_shl1(<8 x i32> %x, <8 x i32> %y) { +; X32-LABEL: variable_shl1: +; X32: ## BB#0: +; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_shl1: +; X64: ## BB#0: +; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %k = shl <8 x i32> %x, %y ret <8 x i32> %k } -; 
CHECK: variable_shl2 -; CHECK: psllvq -; CHECK: ret + define <2 x i64> @variable_shl2(<2 x i64> %x, <2 x i64> %y) { +; X32-LABEL: variable_shl2: +; X32: ## BB#0: +; X32-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_shl2: +; X64: ## BB#0: +; X64-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq %k = shl <2 x i64> %x, %y ret <2 x i64> %k } -; CHECK: variable_shl3 -; CHECK: psllvq -; CHECK: ret + define <4 x i64> @variable_shl3(<4 x i64> %x, <4 x i64> %y) { +; X32-LABEL: variable_shl3: +; X32: ## BB#0: +; X32-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_shl3: +; X64: ## BB#0: +; X64-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %k = shl <4 x i64> %x, %y ret <4 x i64> %k } -; CHECK: variable_srl0 -; CHECK: psrlvd -; CHECK: ret + define <4 x i32> @variable_srl0(<4 x i32> %x, <4 x i32> %y) { +; X32-LABEL: variable_srl0: +; X32: ## BB#0: +; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_srl0: +; X64: ## BB#0: +; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq %k = lshr <4 x i32> %x, %y ret <4 x i32> %k } -; CHECK: variable_srl1 -; CHECK: psrlvd -; CHECK: ret + define <8 x i32> @variable_srl1(<8 x i32> %x, <8 x i32> %y) { +; X32-LABEL: variable_srl1: +; X32: ## BB#0: +; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_srl1: +; X64: ## BB#0: +; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %k = lshr <8 x i32> %x, %y ret <8 x i32> %k } -; CHECK: variable_srl2 -; CHECK: psrlvq -; CHECK: ret + define <2 x i64> @variable_srl2(<2 x i64> %x, <2 x i64> %y) { +; X32-LABEL: variable_srl2: +; X32: ## BB#0: +; X32-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_srl2: +; X64: ## BB#0: +; X64-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq %k = lshr <2 x i64> %x, %y ret <2 x i64> %k } -; CHECK: variable_srl3 -; CHECK: psrlvq -; CHECK: ret + define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) { +; X32-LABEL: variable_srl3: +; X32: ## BB#0: +; X32-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_srl3: +; X64: ## BB#0: +; X64-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %k = lshr <4 x i64> %x, %y ret <4 x i64> %k } -; CHECK: variable_sra0 -; CHECK: vpsravd -; CHECK: ret define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) { +; X32-LABEL: variable_sra0: +; X32: ## BB#0: +; X32-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_sra0: +; X64: ## BB#0: +; X64-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq %k = ashr <4 x i32> %x, %y ret <4 x i32> %k } -; CHECK: variable_sra1 -; CHECK: vpsravd -; CHECK: ret + define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) { +; X32-LABEL: variable_sra1: +; X32: ## BB#0: +; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_sra1: +; X64: ## BB#0: +; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %k = ashr <8 x i32> %x, %y ret <8 x i32> %k } ;;; Shift left -; CHECK: vpslld + define <8 x i32> @vshift00(<8 x i32> %a) nounwind readnone { - %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 -2> +; X32-LABEL: vshift00: +; X32: ## BB#0: +; X32-NEXT: vpslld $2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vshift00: +; X64: ## BB#0: +; X64-NEXT: vpslld $2, %ymm0, %ymm0 +; X64-NEXT: retq + %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ret <8 x i32> %s } -; CHECK: vpsllw define <16 x i16> @vshift01(<16 x i16> %a) 
nounwind readnone { +; X32-LABEL: vshift01: +; X32: ## BB#0: +; X32-NEXT: vpsllw $2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vshift01: +; X64: ## BB#0: +; X64-NEXT: vpsllw $2, %ymm0, %ymm0 +; X64-NEXT: retq %s = shl <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> ret <16 x i16> %s } -; CHECK: vpsllq define <4 x i64> @vshift02(<4 x i64> %a) nounwind readnone { +; X32-LABEL: vshift02: +; X32: ## BB#0: +; X32-NEXT: vpsllq $2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vshift02: +; X64: ## BB#0: +; X64-NEXT: vpsllq $2, %ymm0, %ymm0 +; X64-NEXT: retq %s = shl <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2> ret <4 x i64> %s } ;;; Logical Shift right -; CHECK: vpsrld + define <8 x i32> @vshift03(<8 x i32> %a) nounwind readnone { - %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 -2> +; X32-LABEL: vshift03: +; X32: ## BB#0: +; X32-NEXT: vpsrld $2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vshift03: +; X64: ## BB#0: +; X64-NEXT: vpsrld $2, %ymm0, %ymm0 +; X64-NEXT: retq + %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ret <8 x i32> %s } -; CHECK: vpsrlw define <16 x i16> @vshift04(<16 x i16> %a) nounwind readnone { +; X32-LABEL: vshift04: +; X32: ## BB#0: +; X32-NEXT: vpsrlw $2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vshift04: +; X64: ## BB#0: +; X64-NEXT: vpsrlw $2, %ymm0, %ymm0 +; X64-NEXT: retq %s = lshr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> ret <16 x i16> %s } -; CHECK: vpsrlq define <4 x i64> @vshift05(<4 x i64> %a) nounwind readnone { +; X32-LABEL: vshift05: +; X32: ## BB#0: +; X32-NEXT: vpsrlq $2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vshift05: +; X64: ## BB#0: +; X64-NEXT: vpsrlq $2, %ymm0, %ymm0 +; X64-NEXT: retq %s = lshr <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2> ret <4 x i64> %s } ;;; Arithmetic Shift right -; CHECK: vpsrad + define <8 x i32> @vshift06(<8 x i32> %a) nounwind readnone { - %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 -2> +; X32-LABEL: vshift06: +; X32: ## BB#0: +; X32-NEXT: vpsrad $2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vshift06: +; X64: ## BB#0: +; X64-NEXT: vpsrad $2, %ymm0, %ymm0 +; X64-NEXT: retq + %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ret <8 x i32> %s } -; CHECK: vpsraw define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone { +; X32-LABEL: vshift07: +; X32: ## BB#0: +; X32-NEXT: vpsraw $2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: vshift07: +; X64: ## BB#0: +; X64-NEXT: vpsraw $2, %ymm0, %ymm0 +; X64-NEXT: retq %s = ashr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> ret <16 x i16> %s } -; CHECK: variable_sra0_load -; CHECK: vpsravd (% -; CHECK: ret define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) { +; X32-LABEL: variable_sra0_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpsravd (%eax), %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_sra0_load: +; X64: ## BB#0: +; X64-NEXT: vpsravd (%rdi), %xmm0, %xmm0 +; X64-NEXT: retq %y1 = load <4 x i32>, <4 x i32>* %y %k = ashr <4 x i32> %x, %y1 ret <4 x i32> %k } -; CHECK: variable_sra1_load -; CHECK: vpsravd (% -; CHECK: ret define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) { +; X32-LABEL: variable_sra1_load: +; X32: 
## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpsravd (%eax), %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_sra1_load: +; X64: ## BB#0: +; X64-NEXT: vpsravd (%rdi), %ymm0, %ymm0 +; X64-NEXT: retq %y1 = load <8 x i32>, <8 x i32>* %y %k = ashr <8 x i32> %x, %y1 ret <8 x i32> %k } -; CHECK: variable_shl0_load -; CHECK: vpsllvd (% -; CHECK: ret define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) { +; X32-LABEL: variable_shl0_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpsllvd (%eax), %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_shl0_load: +; X64: ## BB#0: +; X64-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 +; X64-NEXT: retq %y1 = load <4 x i32>, <4 x i32>* %y %k = shl <4 x i32> %x, %y1 ret <4 x i32> %k } -; CHECK: variable_shl1_load -; CHECK: vpsllvd (% -; CHECK: ret + define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) { +; X32-LABEL: variable_shl1_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpsllvd (%eax), %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_shl1_load: +; X64: ## BB#0: +; X64-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 +; X64-NEXT: retq %y1 = load <8 x i32>, <8 x i32>* %y %k = shl <8 x i32> %x, %y1 ret <8 x i32> %k } -; CHECK: variable_shl2_load -; CHECK: vpsllvq (% -; CHECK: ret + define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) { +; X32-LABEL: variable_shl2_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpsllvq (%eax), %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_shl2_load: +; X64: ## BB#0: +; X64-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 +; X64-NEXT: retq %y1 = load <2 x i64>, <2 x i64>* %y %k = shl <2 x i64> %x, %y1 ret <2 x i64> %k } -; CHECK: variable_shl3_load -; CHECK: vpsllvq (% -; CHECK: ret + define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) { +; X32-LABEL: variable_shl3_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpsllvq (%eax), %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_shl3_load: +; X64: ## BB#0: +; X64-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 +; X64-NEXT: retq %y1 = load <4 x i64>, <4 x i64>* %y %k = shl <4 x i64> %x, %y1 ret <4 x i64> %k } -; CHECK: variable_srl0_load -; CHECK: vpsrlvd (% -; CHECK: ret + define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) { +; X32-LABEL: variable_srl0_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpsrlvd (%eax), %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_srl0_load: +; X64: ## BB#0: +; X64-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 +; X64-NEXT: retq %y1 = load <4 x i32>, <4 x i32>* %y %k = lshr <4 x i32> %x, %y1 ret <4 x i32> %k } -; CHECK: variable_srl1_load -; CHECK: vpsrlvd (% -; CHECK: ret + define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) { +; X32-LABEL: variable_srl1_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpsrlvd (%eax), %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_srl1_load: +; X64: ## BB#0: +; X64-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 +; X64-NEXT: retq %y1 = load <8 x i32>, <8 x i32>* %y %k = lshr <8 x i32> %x, %y1 ret <8 x i32> %k } -; CHECK: variable_srl2_load -; CHECK: vpsrlvq (% -; CHECK: ret + define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) { +; X32-LABEL: variable_srl2_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpsrlvq (%eax), %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_srl2_load: +; X64: ## BB#0: +; X64-NEXT: 
vpsrlvq (%rdi), %xmm0, %xmm0 +; X64-NEXT: retq %y1 = load <2 x i64>, <2 x i64>* %y %k = lshr <2 x i64> %x, %y1 ret <2 x i64> %k } -; CHECK: variable_srl3_load -; CHECK: vpsrlvq (% -; CHECK: ret + define <4 x i64> @variable_srl3_load(<4 x i64> %x, <4 x i64>* %y) { +; X32-LABEL: variable_srl3_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vpsrlvq (%eax), %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: variable_srl3_load: +; X64: ## BB#0: +; X64-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 +; X64-NEXT: retq %y1 = load <4 x i64>, <4 x i64>* %y %k = lshr <4 x i64> %x, %y1 ret <4 x i64> %k } define <32 x i8> @shl9(<32 x i8> %A) nounwind { +; X32-LABEL: shl9: +; X32: ## BB#0: +; X32-NEXT: vpsllw $3, %ymm0, %ymm0 +; X32-NEXT: vpand LCPI28_0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: shl9: +; X64: ## BB#0: +; X64-NEXT: vpsllw $3, %ymm0, %ymm0 +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: retq %B = shl <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %B -; CHECK-LABEL: shl9: -; CHECK: vpsllw $3 -; CHECK: vpand -; CHECK: ret } define <32 x i8> @shr9(<32 x i8> %A) nounwind { +; X32-LABEL: shr9: +; X32: ## BB#0: +; X32-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-NEXT: vpand LCPI29_0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: shr9: +; X64: ## BB#0: +; X64-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: retq %B = lshr <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %B -; CHECK-LABEL: shr9: -; CHECK: vpsrlw $3 -; CHECK: vpand -; CHECK: ret } define <32 x i8> @sra_v32i8_7(<32 x i8> %A) nounwind { +; X32-LABEL: sra_v32i8_7: +; X32: ## BB#0: +; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X32-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: sra_v32i8_7: +; X64: ## BB#0: +; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; X64-NEXT: retq %B = ashr <32 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> ret <32 x i8> %B -; CHECK-LABEL: sra_v32i8_7: -; CHECK: vpxor -; CHECK: vpcmpgtb -; CHECK: ret } define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind { +; X32-LABEL: sra_v32i8: +; X32: ## BB#0: +; X32-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-NEXT: vpand LCPI31_0, %ymm0, %ymm0 +; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: sra_v32i8: +; X64: ## BB#0: +; X64-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X64-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq %B = ashr <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %B -; CHECK-LABEL: 
sra_v32i8: -; CHECK: vpsrlw $3 -; CHECK: vpand -; CHECK: vpxor -; CHECK: vpsubb -; CHECK: ret -} - -; CHECK: _sext_v16i16 -; CHECK: vpsllw -; CHECK: vpsraw -; CHECK-NOT: vinsertf128 +} + define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind { +; X32-LABEL: sext_v16i16: +; X32: ## BB#0: +; X32-NEXT: vpsllw $8, %ymm0, %ymm0 +; X32-NEXT: vpsraw $8, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: sext_v16i16: +; X64: ## BB#0: +; X64-NEXT: vpsllw $8, %ymm0, %ymm0 +; X64-NEXT: vpsraw $8, %ymm0, %ymm0 +; X64-NEXT: retq %b = trunc <16 x i16> %a to <16 x i8> %c = sext <16 x i8> %b to <16 x i16> ret <16 x i16> %c } -; CHECK: _sext_v8i32 -; CHECK: vpslld -; CHECK: vpsrad -; CHECK-NOT: vinsertf128 define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind { +; X32-LABEL: sext_v8i32: +; X32: ## BB#0: +; X32-NEXT: vpslld $16, %ymm0, %ymm0 +; X32-NEXT: vpsrad $16, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: sext_v8i32: +; X64: ## BB#0: +; X64-NEXT: vpslld $16, %ymm0, %ymm0 +; X64-NEXT: vpsrad $16, %ymm0, %ymm0 +; X64-NEXT: retq %b = trunc <8 x i32> %a to <8 x i16> %c = sext <8 x i16> %b to <8 x i32> ret <8 x i32> %c } define <8 x i16> @variable_shl16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK-LABEL: variable_shl16: -; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]] -; CHECK-DAG: vpmovzxwd %xmm0, [[LHS:%ymm[0-9]+]] -; CHECK: vpsllvd [[AMT]], [[LHS]], {{%ymm[0-9]+}} -; CHECK: vpshufb -; CHECK: vpermq +; X32-LABEL: variable_shl16: +; X32: ## BB#0: +; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: variable_shl16: +; X64: ## BB#0: +; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X64-NEXT: vzeroupper +; X64-NEXT: retq %res = shl <8 x i16> %lhs, %rhs ret <8 x i16> %res } define <8 x i16> @variable_ashr16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK-LABEL: variable_ashr16: -; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]] -; CHECK-DAG: vpmovsxwd %xmm0, [[LHS:%ymm[0-9]+]] -; CHECK: vpsravd [[AMT]], [[LHS]], {{%ymm[0-9]+}} -; CHECK: vpshufb -; CHECK: vpermq +; X32-LABEL: variable_ashr16: +; X32: ## BB#0: +; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X32-NEXT: vpmovsxwd %xmm0, %ymm0 +; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: variable_ashr16: +; X64: ## BB#0: +; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X64-NEXT: vpmovsxwd %xmm0, %ymm0 +; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X64-NEXT: vzeroupper +; X64-NEXT: retq %res = ashr <8 x i16> %lhs, %rhs ret <8 x i16> %res } define <8 x i16> @variable_lshr16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK-LABEL: variable_lshr16: -; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]] -; CHECK-DAG: vpmovzxwd %xmm0, [[LHS:%ymm[0-9]+]] -; CHECK: vpsrlvd [[AMT]], [[LHS]], {{%ymm[0-9]+}} -; CHECK: vpshufb -; CHECK: vpermq +; X32-LABEL: variable_lshr16: +; X32: ## BB#0: +; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: variable_lshr16: +; X64: ## BB#0: +; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X64-NEXT: vzeroupper +; X64-NEXT: retq %res = lshr <8 x i16> %lhs, %rhs ret <8 x i16> %res -}
\ No newline at end of file +} diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll index c9ab80bc549..d509046cccd 100644 --- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll @@ -1,102 +1,152 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64 ; AVX2 Logical Shift Left define <16 x i16> @test_sllw_1(<16 x i16> %InVec) { -; CHECK-LABEL: test_sllw_1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: retq +; X32-LABEL: test_sllw_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: retl +; +; X64-LABEL: test_sllw_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: retq entry: %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> ret <16 x i16> %shl } define <16 x i16> @test_sllw_2(<16 x i16> %InVec) { -; CHECK-LABEL: test_sllw_2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddw %ymm0, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_sllw_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpaddw %ymm0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_sllw_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpaddw %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> ret <16 x i16> %shl } define <16 x i16> @test_sllw_3(<16 x i16> %InVec) { -; CHECK-LABEL: test_sllw_3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsllw $15, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_sllw_3: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsllw $15, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_sllw_3: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsllw $15, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> ret <16 x i16> %shl } define <8 x i32> @test_slld_1(<8 x i32> %InVec) { -; CHECK-LABEL: test_slld_1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: retq +; X32-LABEL: test_slld_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: retl +; +; X64-LABEL: test_slld_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: retq entry: %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %shl } define <8 x i32> @test_slld_2(<8 x i32> %InVec) { -; CHECK-LABEL: test_slld_2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_slld_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_slld_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ret <8 x i32> %shl } define <8 x i32> @test_vpslld_var(i32 %shift) { -; CHECK-LABEL: test_vpslld_var: -; CHECK: # BB#0: -; CHECK-NEXT: vmovd %edi, %xmm0 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] -; CHECK-NEXT: vpslld %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_vpslld_var: +; X32: ## BB#0: +; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 
+; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] +; X32-NEXT: vpslld %xmm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_vpslld_var: +; X64: ## BB#0: +; X64-NEXT: vmovd %edi, %xmm0 +; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] +; X64-NEXT: vpslld %xmm0, %ymm1, %ymm0 +; X64-NEXT: retq %amt = insertelement <8 x i32> undef, i32 %shift, i32 0 %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt ret <8 x i32> %tmp } define <8 x i32> @test_slld_3(<8 x i32> %InVec) { -; CHECK-LABEL: test_slld_3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpslld $31, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_slld_3: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpslld $31, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_slld_3: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpslld $31, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> ret <8 x i32> %shl } define <4 x i64> @test_sllq_1(<4 x i64> %InVec) { -; CHECK-LABEL: test_sllq_1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: retq +; X32-LABEL: test_sllq_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: retl +; +; X64-LABEL: test_sllq_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: retq entry: %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0> ret <4 x i64> %shl } define <4 x i64> @test_sllq_2(<4 x i64> %InVec) { -; CHECK-LABEL: test_sllq_2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_sllq_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_sllq_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1> ret <4 x i64> %shl } define <4 x i64> @test_sllq_3(<4 x i64> %InVec) { -; CHECK-LABEL: test_sllq_3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsllq $63, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_sllq_3: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsllq $63, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_sllq_3: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsllq $63, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63> ret <4 x i64> %shl @@ -105,58 +155,86 @@ entry: ; AVX2 Arithmetic Shift define <16 x i16> @test_sraw_1(<16 x i16> %InVec) { -; CHECK-LABEL: test_sraw_1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: retq +; X32-LABEL: test_sraw_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: retl +; +; X64-LABEL: test_sraw_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: retq entry: %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> ret <16 x i16> %shl } define <16 x i16> @test_sraw_2(<16 x i16> %InVec) { -; CHECK-LABEL: test_sraw_2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsraw $1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_sraw_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsraw $1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_sraw_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsraw $1, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> ret <16 x i16> %shl } define <16 x i16> @test_sraw_3(<16 x i16> %InVec) { -; CHECK-LABEL: test_sraw_3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsraw $15, %ymm0, %ymm0 
-; CHECK-NEXT: retq +; X32-LABEL: test_sraw_3: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsraw $15, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_sraw_3: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsraw $15, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> ret <16 x i16> %shl } define <8 x i32> @test_srad_1(<8 x i32> %InVec) { -; CHECK-LABEL: test_srad_1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: retq +; X32-LABEL: test_srad_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: retl +; +; X64-LABEL: test_srad_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: retq entry: %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %shl } define <8 x i32> @test_srad_2(<8 x i32> %InVec) { -; CHECK-LABEL: test_srad_2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsrad $1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_srad_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsrad $1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_srad_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsrad $1, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ret <8 x i32> %shl } define <8 x i32> @test_srad_3(<8 x i32> %InVec) { -; CHECK-LABEL: test_srad_3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsrad $31, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_srad_3: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsrad $31, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_srad_3: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsrad $31, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> ret <8 x i32> %shl @@ -165,102 +243,154 @@ entry: ; SSE Logical Shift Right define <16 x i16> @test_srlw_1(<16 x i16> %InVec) { -; CHECK-LABEL: test_srlw_1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: retq +; X32-LABEL: test_srlw_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: retl +; +; X64-LABEL: test_srlw_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: retq entry: %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> ret <16 x i16> %shl } define <16 x i16> @test_srlw_2(<16 x i16> %InVec) { -; CHECK-LABEL: test_srlw_2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_srlw_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsrlw $1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_srlw_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsrlw $1, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> ret <16 x i16> %shl } define <16 x i16> @test_srlw_3(<16 x i16> %InVec) { -; CHECK-LABEL: test_srlw_3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsrlw $15, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_srlw_3: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsrlw $15, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_srlw_3: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsrlw $15, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> ret <16 x i16> %shl } define <8 x i32> @test_srld_1(<8 x i32> %InVec) { -; CHECK-LABEL: 
test_srld_1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: retq +; X32-LABEL: test_srld_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: retl +; +; X64-LABEL: test_srld_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: retq entry: %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %shl } define <8 x i32> @test_srld_2(<8 x i32> %InVec) { -; CHECK-LABEL: test_srld_2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsrld $1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_srld_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsrld $1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_srld_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsrld $1, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ret <8 x i32> %shl } define <8 x i32> @test_srld_3(<8 x i32> %InVec) { -; CHECK-LABEL: test_srld_3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_srld_3: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsrld $31, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_srld_3: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsrld $31, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> ret <8 x i32> %shl } define <4 x i64> @test_srlq_1(<4 x i64> %InVec) { -; CHECK-LABEL: test_srlq_1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: retq +; X32-LABEL: test_srlq_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: retl +; +; X64-LABEL: test_srlq_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: retq entry: %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0> ret <4 x i64> %shl } define <4 x i64> @test_srlq_2(<4 x i64> %InVec) { -; CHECK-LABEL: test_srlq_2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsrlq $1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_srlq_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsrlq $1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_srlq_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsrlq $1, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1> ret <4 x i64> %shl } define <4 x i64> @test_srlq_3(<4 x i64> %InVec) { -; CHECK-LABEL: test_srlq_3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: test_srlq_3: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpsrlq $63, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_srlq_3: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpsrlq $63, %ymm0, %ymm0 +; X64-NEXT: retq entry: %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63> ret <4 x i64> %shl } define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { -; CHECK-LABEL: srl_trunc_and_v4i64: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 -; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: srl_trunc_and_v4i64: +; X32: ## BB#0: +; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; X32-NEXT: vpbroadcastd LCPI25_0, %xmm2 +; X32-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: srl_trunc_and_v4i64: +; X64: ## BB#0: +; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; X64-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[0,2,2,3] +; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X64-NEXT: vzeroupper +; X64-NEXT: retq %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8> %trunc = trunc <4 x i64> %and to <4 x i32> %sra = lshr <4 x i32> %x, %trunc @@ -272,171 +402,305 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind { -; CHECK-LABEL: shl_8i16: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: shl_8i16: +; X32: ## BB#0: +; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: shl_8i16: +; X64: ## BB#0: +; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X64-NEXT: vzeroupper +; X64-NEXT: retq %shl = shl <8 x i16> %r, %a ret <8 x i16> %shl } define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind { -; CHECK-LABEL: shl_16i16: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; CHECK-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 -; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3 -; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0 -; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: shl_16i16: +; X32: ## BB#0: +; X32-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 +; X32-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: shl_16i16: +; X64: ## BB#0: +; X64-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X64-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 +; X64-NEXT: vpsrld $16, %ymm3, %ymm3 +; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpsrld $16, %ymm0, %ymm0 +; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X64-NEXT: retq %shl = shl <16 x i16> %r, %a ret <16 x i16> %shl } define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind { -; CHECK-LABEL: shl_32i8: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1 -; CHECK-NEXT: vpsllw $4, %ymm0, %ymm2 -; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpsllw $2, %ymm0, %ymm2 -; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: shl_32i8: +; X32: ## BB#0: +; X32-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-NEXT: vpsllw $4, %ymm0, %ymm2 +; X32-NEXT: vpand LCPI28_0, %ymm2, %ymm2 +; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-NEXT: vpsllw $2, %ymm0, %ymm2 +; X32-NEXT: vpand LCPI28_1, %ymm2, %ymm2 +; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: shl_32i8: +; X64: ## BB#0: +; X64-NEXT: vpsllw $5, %ymm1, 
%ymm1 +; X64-NEXT: vpsllw $4, %ymm0, %ymm2 +; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X64-NEXT: vpsllw $2, %ymm0, %ymm2 +; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X64-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X64-NEXT: retq %shl = shl <32 x i8> %r, %a ret <32 x i8> %shl } define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind { -; CHECK-LABEL: ashr_8i16: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0 -; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: ashr_8i16: +; X32: ## BB#0: +; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X32-NEXT: vpmovsxwd %xmm0, %ymm0 +; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: ashr_8i16: +; X64: ## BB#0: +; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X64-NEXT: vpmovsxwd %xmm0, %ymm0 +; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X64-NEXT: vzeroupper +; X64-NEXT: retq %ashr = ashr <8 x i16> %r, %a ret <8 x i16> %ashr } define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind { -; CHECK-LABEL: ashr_16i16: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; CHECK-NEXT: vpsravd %ymm3, %ymm4, %ymm3 -; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3 -; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0 -; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: ashr_16i16: +; X32: ## BB#0: +; X32-NEXT: vpxor 
%ymm2, %ymm2, %ymm2 +; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-NEXT: vpsravd %ymm3, %ymm4, %ymm3 +; X32-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: ashr_16i16: +; X64: ## BB#0: +; X64-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X64-NEXT: vpsravd %ymm3, %ymm4, %ymm3 +; X64-NEXT: vpsrld $16, %ymm3, %ymm3 +; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpsrld $16, %ymm0, %ymm0 +; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X64-NEXT: retq %ashr = ashr <16 x i16> %r, %a ret <16 x i16> %ashr } define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind { -; CHECK-LABEL: ashr_32i8: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1 -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-NEXT: vpsraw $4, %ymm3, %ymm4 -; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; CHECK-NEXT: vpsraw $2, %ymm3, %ymm4 -; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; CHECK-NEXT: vpsraw $1, %ymm3, %ymm4 -; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm2 -; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-NEXT: vpsraw $4, %ymm0, %ymm3 -; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; CHECK-NEXT: 
vpsraw $2, %ymm0, %ymm3 -; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; CHECK-NEXT: vpsraw $1, %ymm0, %ymm3 -; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0 -; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: ashr_32i8: +; X32: ## BB#0: +; X32-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X32-NEXT: vpsraw $4, %ymm3, %ymm4 +; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-NEXT: vpsraw $2, %ymm3, %ymm4 +; X32-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-NEXT: vpsraw $1, %ymm3, %ymm4 +; X32-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X32-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X32-NEXT: vpsraw $4, %ymm0, %ymm3 +; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-NEXT: vpsraw $2, %ymm0, %ymm3 +; X32-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-NEXT: vpsraw $1, %ymm0, %ymm3 +; X32-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X32-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: ashr_32i8: +; X64: ## BB#0: +; X64-NEXT: vpsllw $5, %ymm1, %ymm1 +; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X64-NEXT: vpsraw $4, %ymm3, %ymm4 +; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X64-NEXT: vpsraw $2, %ymm3, %ymm4 +; X64-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X64-NEXT: vpsraw $1, %ymm3, %ymm4 +; X64-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X64-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X64-NEXT: vpsraw $4, %ymm0, %ymm3 +; 
X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X64-NEXT: vpsraw $2, %ymm0, %ymm3 +; X64-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X64-NEXT: vpsraw $1, %ymm0, %ymm3 +; X64-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X64-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X64-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X64-NEXT: retq %ashr = ashr <32 x i8> %r, %a ret <32 x i8> %ashr } define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind { -; CHECK-LABEL: lshr_8i16: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: lshr_8i16: +; X32: ## BB#0: +; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: lshr_8i16: +; X64: ## BB#0: +; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; X64-NEXT: vzeroupper +; X64-NEXT: retq %lshr = lshr <8 x i16> %r, %a ret <8 x i16> %lshr } define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind { -; CHECK-LABEL: lshr_16i16: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; CHECK-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 -; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3 -; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0 -; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: lshr_16i16: +; X32: ## BB#0: +; X32-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; X32-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: lshr_16i16: +; X64: ## BB#0: +; X64-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X64-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; X64-NEXT: vpsrld $16, %ymm3, %ymm3 +; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpsrld $16, %ymm0, %ymm0 +; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X64-NEXT: retq %lshr = lshr <16 x i16> %r, %a ret <16 x i16> %lshr } define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind { -; CHECK-LABEL: lshr_32i8: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1 -; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm2 -; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpsrlw $2, %ymm0, %ymm2 -; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpsrlw $1, %ymm0, %ymm2 -; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: lshr_32i8: +; X32: ## BB#0: +; X32-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-NEXT: vpsrlw $4, %ymm0, %ymm2 +; X32-NEXT: vpand LCPI34_0, %ymm2, %ymm2 +; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-NEXT: vpsrlw $2, %ymm0, %ymm2 +; X32-NEXT: vpand LCPI34_1, %ymm2, %ymm2 +; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-NEXT: vpsrlw $1, %ymm0, %ymm2 +; X32-NEXT: vpand LCPI34_2, %ymm2, %ymm2 +; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; 
X32-NEXT: retl +; +; X64-LABEL: lshr_32i8: +; X64: ## BB#0: +; X64-NEXT: vpsllw $5, %ymm1, %ymm1 +; X64-NEXT: vpsrlw $4, %ymm0, %ymm2 +; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X64-NEXT: vpsrlw $2, %ymm0, %ymm2 +; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X64-NEXT: vpsrlw $1, %ymm0, %ymm2 +; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X64-NEXT: retq %lshr = lshr <32 x i8> %r, %a ret <32 x i8> %lshr } diff --git a/llvm/test/CodeGen/X86/avx2-vperm.ll b/llvm/test/CodeGen/X86/avx2-vperm.ll index cba8bbe4af4..d0e18550f6a 100644 --- a/llvm/test/CodeGen/X86/avx2-vperm.ll +++ b/llvm/test/CodeGen/X86/avx2-vperm.ll @@ -1,12 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64 define <8 x i32> @perm_cl_int_8x32(<8 x i32> %A) nounwind readnone { -; CHECK-LABEL: perm_cl_int_8x32: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] -; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: perm_cl_int_8x32: +; X32: ## BB#0: ## %entry +; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] +; X32-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: perm_cl_int_8x32: +; X64: ## BB#0: ## %entry +; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] +; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; X64-NEXT: retq entry: %B = shufflevector <8 x i32> %A, <8 x i32> undef, <8 x i32> <i32 0, i32 7, i32 2, i32 1, i32 2, i32 7, i32 6, i32 0> ret <8 x i32> %B @@ -14,31 +21,47 @@ entry: define <8 x float> @perm_cl_fp_8x32(<8 x float> %A) nounwind readnone { -; CHECK-LABEL: perm_cl_fp_8x32: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6> -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; X32-LABEL: perm_cl_fp_8x32: +; X32: ## BB#0: ## %entry +; X32-NEXT: vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6> +; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: perm_cl_fp_8x32: +; X64: ## BB#0: ## %entry +; X64-NEXT: vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6> +; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X64-NEXT: retq entry: %B = shufflevector <8 x float> %A, <8 x float> undef, <8 x i32> <i32 undef, i32 7, i32 2, i32 undef, i32 4, i32 undef, i32 1, i32 6> ret <8 x float> %B } define <4 x i64> @perm_cl_int_4x64(<4 x i64> %A) nounwind readnone { -; CHECK-LABEL: perm_cl_int_4x64: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1] -; CHECK-NEXT: retq +; X32-LABEL: perm_cl_int_4x64: +; X32: ## BB#0: ## %entry +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1] +; X32-NEXT: retl +; +; X64-LABEL: perm_cl_int_4x64: +; X64: ## BB#0: ## %entry +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1] +; X64-NEXT: retq entry: %B = shufflevector <4 x i64> %A, <4 x i64> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1> ret <4 x i64> %B } define <4 x double> @perm_cl_fp_4x64(<4 x double> %A) nounwind readnone { -; CHECK-LABEL: perm_cl_fp_4x64: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1] -; 
CHECK-NEXT: retq
+; X32-LABEL: perm_cl_fp_4x64:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: perm_cl_fp_4x64:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; X64-NEXT: retq
entry:
%B = shufflevector <4 x double> %A, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
ret <4 x double> %B
}