| field | value | |
|---|---|---|
| author | Craig Topper <craig.topper@gmail.com> | 2017-01-14 07:50:52 +0000 |
| committer | Craig Topper <craig.topper@gmail.com> | 2017-01-14 07:50:52 +0000 |
| commit | 63e2cd6caa14e1ed5b383c4f68a7f913a2c834eb (patch) | |
| tree | 77a421301749186159d75d4716bc9869a4b6cb0c | |
| parent | 09b7e0f01d0505b2ba1e3340486685c0393f2b55 (diff) | |
| download | bcm5719-llvm-63e2cd6caa14e1ed5b383c4f68a7f913a2c834eb.tar.gz bcm5719-llvm-63e2cd6caa14e1ed5b383c4f68a7f913a2c834eb.zip | |
[AVX-512] Teach the two-address instruction pass to replace masked move instructions with blendm instructions when it's beneficial.
Isel now selects masked move instructions for vselect instead of blendm, but it is sometimes beneficial for register allocation to remove the tied-register constraint by using blendm instructions instead.
This also picks up cases where the masked move was created due to a masked load intrinsic.
Differential Revision: https://reviews.llvm.org/D28454
llvm-svn: 292005
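The payoff is easiest to see in the test updates below. The sketch that follows is reduced from test1 in llvm/test/CodeGen/X86/avx512-vec-cmp.ll (changed in this commit): the masked move ties its pass-through operand to the destination ($src0 = $dst), so when the result has to land in a different register an extra vmovaps copy was emitted, whereas vblendmps takes the pass-through as an ordinary source and can write any destination.

```llvm
; Reduced from test1 in llvm/test/CodeGen/X86/avx512-vec-cmp.ll (updated below).
; Before (masked move, destination tied to the pass-through register):
;   vcmpleps %zmm1, %zmm0, %k1
;   vmovaps  %zmm0, %zmm1 {%k1}
;   vmovaps  %zmm1, %zmm0
; After (blendm, no tied operand, so the trailing copy disappears):
;   vcmpleps  %zmm1, %zmm0, %k1
;   vblendmps %zmm0, %zmm1, %zmm0 {%k1}
define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
  ; Select %x where %x <= %y, otherwise %y; isel matches this to a masked move.
  %mask = fcmp ole <16 x float> %x, %y
  %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
  ret <16 x float> %max
}
```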
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 2 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.cpp | 125 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-bugfix-26264.ll | 26 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll | 53 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll | 9 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll | 18 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-vec-cmp.ll | 127 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512bw-vec-cmp.ll | 36 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512bwvl-vec-cmp.ll | 72 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll | 144 |
| -rw-r--r-- | llvm/test/CodeGen/X86/masked_memop.ll | 29 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll | 9 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll | 12 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 3 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll | 3 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-masked.ll | 3 |
16 files changed, 314 insertions, 357 deletions
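For the masked load intrinsic cases mentioned in the commit message, the memory (rmk) forms are converted the same way, which also lets the load stay folded into the blend. The sketch below is reduced from test4 in llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll (changed below); the declare line is the standard declaration implied by the call and is not part of the visible diff.

```llvm
; Before: vmovups (%rdi), %zmm1 {%k1}  followed by  vmovaps %zmm1, %zmm0
; After:  vblendmps (%rdi), %zmm1, %zmm0 {%k1}
declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)

define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
  ; Lanes where %trigger is zero load from %addr; all other lanes keep %dst.
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1> %mask, <16 x float> %dst)
  ret <16 x float> %res
}
```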
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 5ef943c2c76..ddf269d3505 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2738,7 +2738,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                     [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))],
                     _.ExeDomain>, EVEX;
-  let Constraints = "$src0 = $dst" in {
+  let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
   def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
                      (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
                      !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index d30a0683cf6..11aa5444b50 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4044,6 +4044,131 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
         BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src),
         MI.getOperand(2));
     break;
+
+  case X86::VMOVDQU8Z128rmk:
+  case X86::VMOVDQU8Z256rmk:
+  case X86::VMOVDQU8Zrmk:
+  case X86::VMOVDQU16Z128rmk:
+  case X86::VMOVDQU16Z256rmk:
+  case X86::VMOVDQU16Zrmk:
+  case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
+  case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
+  case X86::VMOVDQU32Zrmk:    case X86::VMOVDQA32Zrmk:
+  case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
+  case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
+  case X86::VMOVDQU64Zrmk:    case X86::VMOVDQA64Zrmk:
+  case X86::VMOVUPDZ128rmk:   case X86::VMOVAPDZ128rmk:
+  case X86::VMOVUPDZ256rmk:   case X86::VMOVAPDZ256rmk:
+  case X86::VMOVUPDZrmk:      case X86::VMOVAPDZrmk:
+  case X86::VMOVUPSZ128rmk:   case X86::VMOVAPSZ128rmk:
+  case X86::VMOVUPSZ256rmk:   case X86::VMOVAPSZ256rmk:
+  case X86::VMOVUPSZrmk:      case X86::VMOVAPSZrmk: {
+    unsigned Opc;
+    switch (MIOpc) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::VMOVDQU8Z128rmk:  Opc = X86::VPBLENDMBZ128rmk; break;
+    case X86::VMOVDQU8Z256rmk:  Opc = X86::VPBLENDMBZ256rmk; break;
+    case X86::VMOVDQU8Zrmk:     Opc = X86::VPBLENDMBZrmk;    break;
+    case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
+    case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
+    case X86::VMOVDQU16Zrmk:    Opc = X86::VPBLENDMWZrmk;    break;
+    case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+    case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+    case X86::VMOVDQU32Zrmk:    Opc = X86::VPBLENDMDZrmk;    break;
+    case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+    case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+    case X86::VMOVDQU64Zrmk:    Opc = X86::VPBLENDMQZrmk;    break;
+    case X86::VMOVUPDZ128rmk:   Opc = X86::VBLENDMPDZ128rmk; break;
+    case X86::VMOVUPDZ256rmk:   Opc = X86::VBLENDMPDZ256rmk; break;
+    case X86::VMOVUPDZrmk:      Opc = X86::VBLENDMPDZrmk;    break;
+    case X86::VMOVUPSZ128rmk:   Opc = X86::VBLENDMPSZ128rmk; break;
+    case X86::VMOVUPSZ256rmk:   Opc = X86::VBLENDMPSZ256rmk; break;
+    case X86::VMOVUPSZrmk:      Opc = X86::VBLENDMPSZrmk;    break;
+    case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+    case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+    case X86::VMOVDQA32Zrmk:    Opc = X86::VPBLENDMDZrmk;    break;
+    case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+    case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+    case X86::VMOVDQA64Zrmk:    Opc = X86::VPBLENDMQZrmk;    break;
+    case X86::VMOVAPDZ128rmk:   Opc = X86::VBLENDMPDZ128rmk; break;
+    case X86::VMOVAPDZ256rmk:   Opc = X86::VBLENDMPDZ256rmk; break;
+    case X86::VMOVAPDZrmk:      Opc = X86::VBLENDMPDZrmk;    break;
+    case X86::VMOVAPSZ128rmk:   Opc = X86::VBLENDMPSZ128rmk; break;
+    case X86::VMOVAPSZ256rmk:   Opc = X86::VBLENDMPSZ256rmk; break;
+    case X86::VMOVAPSZrmk:      Opc = X86::VBLENDMPSZrmk;    break;
+    }
+
+    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+              .add(Dest)
+              .add(MI.getOperand(2))
+              .add(Src)
+              .add(MI.getOperand(3))
+              .add(MI.getOperand(4))
+              .add(MI.getOperand(5))
+              .add(MI.getOperand(6))
+              .add(MI.getOperand(7));
+    break;
+  }
+  case X86::VMOVDQU8Z128rrk:
+  case X86::VMOVDQU8Z256rrk:
+  case X86::VMOVDQU8Zrrk:
+  case X86::VMOVDQU16Z128rrk:
+  case X86::VMOVDQU16Z256rrk:
+  case X86::VMOVDQU16Zrrk:
+  case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
+  case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
+  case X86::VMOVDQU32Zrrk:    case X86::VMOVDQA32Zrrk:
+  case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
+  case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
+  case X86::VMOVDQU64Zrrk:    case X86::VMOVDQA64Zrrk:
+  case X86::VMOVUPDZ128rrk:   case X86::VMOVAPDZ128rrk:
+  case X86::VMOVUPDZ256rrk:   case X86::VMOVAPDZ256rrk:
+  case X86::VMOVUPDZrrk:      case X86::VMOVAPDZrrk:
+  case X86::VMOVUPSZ128rrk:   case X86::VMOVAPSZ128rrk:
+  case X86::VMOVUPSZ256rrk:   case X86::VMOVAPSZ256rrk:
+  case X86::VMOVUPSZrrk:      case X86::VMOVAPSZrrk: {
+    unsigned Opc;
+    switch (MIOpc) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::VMOVDQU8Z128rrk:  Opc = X86::VPBLENDMBZ128rrk; break;
+    case X86::VMOVDQU8Z256rrk:  Opc = X86::VPBLENDMBZ256rrk; break;
+    case X86::VMOVDQU8Zrrk:     Opc = X86::VPBLENDMBZrrk;    break;
+    case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
+    case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
+    case X86::VMOVDQU16Zrrk:    Opc = X86::VPBLENDMWZrrk;    break;
+    case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+    case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+    case X86::VMOVDQU32Zrrk:    Opc = X86::VPBLENDMDZrrk;    break;
+    case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+    case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+    case X86::VMOVDQU64Zrrk:    Opc = X86::VPBLENDMQZrrk;    break;
+    case X86::VMOVUPDZ128rrk:   Opc = X86::VBLENDMPDZ128rrk; break;
+    case X86::VMOVUPDZ256rrk:   Opc = X86::VBLENDMPDZ256rrk; break;
+    case X86::VMOVUPDZrrk:      Opc = X86::VBLENDMPDZrrk;    break;
+    case X86::VMOVUPSZ128rrk:   Opc = X86::VBLENDMPSZ128rrk; break;
+    case X86::VMOVUPSZ256rrk:   Opc = X86::VBLENDMPSZ256rrk; break;
+    case X86::VMOVUPSZrrk:      Opc = X86::VBLENDMPSZrrk;    break;
+    case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+    case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+    case X86::VMOVDQA32Zrrk:    Opc = X86::VPBLENDMDZrrk;    break;
+    case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+    case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+    case X86::VMOVDQA64Zrrk:    Opc = X86::VPBLENDMQZrrk;    break;
+    case X86::VMOVAPDZ128rrk:   Opc = X86::VBLENDMPDZ128rrk; break;
+    case X86::VMOVAPDZ256rrk:   Opc = X86::VBLENDMPDZ256rrk; break;
+    case X86::VMOVAPDZrrk:      Opc = X86::VBLENDMPDZrrk;    break;
+    case X86::VMOVAPSZ128rrk:   Opc = X86::VBLENDMPSZ128rrk; break;
+    case X86::VMOVAPSZ256rrk:   Opc = X86::VBLENDMPSZ256rrk; break;
+    case X86::VMOVAPSZrrk:      Opc = X86::VBLENDMPSZrrk;    break;
+    }
+
+    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+              .add(Dest)
+              .add(MI.getOperand(2))
+              .add(Src)
+              .add(MI.getOperand(3));
+    break;
+  }
   }

   if (!NewMI) return nullptr;
diff --git
a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll index b15d28a649b..b29b6ee0658 100644 --- a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll +++ b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll @@ -6,17 +6,14 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; AVX512BW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovupd 128(%rdi), %zmm3 {%k2} +; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k2} ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} +; AVX512BW-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: kshiftrw $8, %k2, %k1 -; AVX512BW-NEXT: vmovupd 192(%rdi), %zmm4 {%k1} -; AVX512BW-NEXT: vmovapd %zmm1, %zmm0 -; AVX512BW-NEXT: vmovapd %zmm2, %zmm1 -; AVX512BW-NEXT: vmovapd %zmm3, %zmm2 -; AVX512BW-NEXT: vmovapd %zmm4, %zmm3 +; AVX512BW-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovapd %zmm5, %zmm2 ; AVX512BW-NEXT: retq %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) ret <32 x double> %res @@ -27,17 +24,14 @@ define <32 x i64> @test_load_32i64(<32 x i64>* %ptrs, <32 x i1> %mask, <32 x i64 ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} +; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm3 {%k2} +; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm5 {%k2} ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} +; AVX512BW-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: kshiftrw $8, %k2, %k1 -; AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 ; AVX512BW-NEXT: retq %res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0) ret <32 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll index 95ce212ab5a..09754ad2620 100644 --- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll +++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll @@ -43,8 +43,7 @@ define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float ; AVX512: ## BB#0: ; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; AVX512-NEXT: vmovups (%rdi), %zmm1 {%k1} -; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst) @@ -189,22 +188,18 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} +; AVX512F-NEXT: vpblendmq 
(%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: kshiftrw $8, %k1, %k1 -; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: retq ; ; SKX-LABEL: test_load_16i64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ; SKX-NEXT: vpmovb2m %xmm0, %k1 -; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} +; SKX-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} ; SKX-NEXT: kshiftrw $8, %k1, %k1 -; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 -; SKX-NEXT: vmovdqa64 %zmm2, %zmm1 +; SKX-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1} ; SKX-NEXT: retq %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) ret <16 x i64> %res @@ -217,22 +212,18 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: kshiftrw $8, %k1, %k1 -; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} -; AVX512F-NEXT: vmovapd %zmm1, %zmm0 -; AVX512F-NEXT: vmovapd %zmm2, %zmm1 +; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: retq ; ; SKX-LABEL: test_load_16f64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ; SKX-NEXT: vpmovb2m %xmm0, %k1 -; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} ; SKX-NEXT: kshiftrw $8, %k1, %k1 -; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} -; SKX-NEXT: vmovapd %zmm1, %zmm0 -; SKX-NEXT: vmovapd %zmm2, %zmm1 +; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1} ; SKX-NEXT: retq %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) ret <16 x double> %res @@ -246,36 +237,30 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 ; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5 ; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5 ; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-NEXT: vmovupd 128(%rdi), %zmm3 {%k1} +; AVX512F-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k1} ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k2} +; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k2} ; AVX512F-NEXT: kshiftrw $8, %k1, %k1 -; AVX512F-NEXT: vmovupd 192(%rdi), %zmm4 {%k1} +; AVX512F-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1} ; AVX512F-NEXT: kshiftrw $8, %k2, %k1 -; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} -; AVX512F-NEXT: vmovapd %zmm1, %zmm0 -; AVX512F-NEXT: vmovapd %zmm2, %zmm1 -; AVX512F-NEXT: vmovapd %zmm3, %zmm2 -; AVX512F-NEXT: vmovapd %zmm4, %zmm3 +; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: vmovapd %zmm5, %zmm2 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test_load_32f64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 ; SKX-NEXT: vpmovb2m %ymm0, %k1 -; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} ; SKX-NEXT: kshiftrd $16, %k1, %k2 -; SKX-NEXT: vmovupd 128(%rdi), %zmm3 {%k2} +; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k2} ; SKX-NEXT: kshiftrw $8, %k1, %k1 -; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} +; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1} ; SKX-NEXT: kshiftrw $8, %k2, %k1 -; SKX-NEXT: vmovupd 192(%rdi), 
%zmm4 {%k1} -; SKX-NEXT: vmovapd %zmm1, %zmm0 -; SKX-NEXT: vmovapd %zmm2, %zmm1 -; SKX-NEXT: vmovapd %zmm3, %zmm2 -; SKX-NEXT: vmovapd %zmm4, %zmm3 +; SKX-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1} +; SKX-NEXT: vmovapd %zmm5, %zmm2 ; SKX-NEXT: retq %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) ret <32 x double> %res diff --git a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll index bab8b96d9b8..72c3451a685 100644 --- a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll +++ b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll @@ -20,8 +20,7 @@ define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x ; CHECK: ## BB#0: ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0 ; CHECK-NEXT: vpmovb2m %ymm0, %k1 -; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> %val) ret <32 x i8> %res @@ -33,8 +32,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; CHECK: ## BB#0: ; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0 ; CHECK-NEXT: vpmovb2m %zmm0, %k1 -; CHECK-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmb (%rdi), %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val) ret <64 x i8> %res @@ -70,8 +68,7 @@ define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0 ; CHECK-NEXT: vpmovb2m %ymm0, %k1 -; CHECK-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmw (%rdi), %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val) ret <32 x i16> %res diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll index a29c1e4628a..a2b058d0a40 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -325,13 +325,11 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) { } ; X32-LABEL: test_argRet128Vector: -; X32: vmovdqa{{.*}} %xmm0, %xmm1 -; X32: vmovdqa{{.*}} %xmm1, %xmm0 +; X32: vpblend{{.*}} %xmm0, %xmm1, %xmm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_argRet128Vector: -; WIN64: vmovdqa{{.*}} %xmm0, %xmm1 -; WIN64: vmovdqa{{.*}} %xmm1, %xmm0 +; WIN64: vpblend{{.*}} %xmm0, %xmm1, %xmm0 ; WIN64: ret{{.*}} ; Test regcall when receiving/returning 128 bit vector @@ -360,13 +358,11 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) { } ; X32-LABEL: test_argRet256Vector: -; X32: vmovdqa{{.*}} %ymm0, %ymm1 -; X32: vmovdqa{{.*}} %ymm1, %ymm0 +; X32: vpblend{{.*}} %ymm0, %ymm1, %ymm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_argRet256Vector: -; WIN64: vmovdqa{{.*}} %ymm0, %ymm1 -; WIN64: vmovdqa{{.*}} %ymm1, %ymm0 +; WIN64: vpblend{{.*}} %ymm0, %ymm1, %ymm0 ; WIN64: ret{{.*}} ; Test regcall when receiving/returning 256 bit vector @@ -395,13 +391,11 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) { } ; X32-LABEL: test_argRet512Vector: -; X32: vmovdqa{{.*}} %zmm0, %zmm1 -; X32: vmovdqa{{.*}} %zmm1, %zmm0 +; X32: vpblend{{.*}} %zmm0, 
%zmm1, %zmm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_argRet512Vector: -; WIN64: vmovdqa{{.*}} %zmm0, %zmm1 -; WIN64: vmovdqa{{.*}} %zmm1, %zmm0 +; WIN64: vpblend{{.*}} %zmm0, %zmm1, %zmm0 ; WIN64: ret{{.*}} ; Test regcall when receiving/returning 512 bit vector diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll index 361ee1ddbf9..d06b15ead4b 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -6,8 +6,7 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind { ; CHECK-LABEL: test1: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = fcmp ole <16 x float> %x, %y %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y @@ -18,8 +17,7 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind { ; CHECK-LABEL: test2: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1 -; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = fcmp ole <8 x double> %x, %y %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y @@ -30,8 +28,7 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin ; CHECK-LABEL: test3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1 -; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %yp, align 4 %mask = icmp eq <16 x i32> %x, %y @@ -43,8 +40,7 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) ; CHECK-LABEL: test4_unsigned: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1 -; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp uge <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y @@ -55,8 +51,7 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: test5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp eq <8 x i64> %x, %y %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y @@ -67,8 +62,7 @@ define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) noun ; CHECK-LABEL: test6_unsigned: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ugt <8 x i64> %x, %y %max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y @@ -87,8 +81,7 @@ define <4 x float> @test7(<4 x float> %a, <4 x float> %b) { ; SKX: ## BB#0: ; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vcmpltps %xmm2, %xmm0, %k1 -; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; SKX-NEXT: vmovaps %xmm1, %xmm0 +; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = fcmp olt <4 x float> %a, zeroinitializer @@ -108,8 +101,7 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) { ; SKX: ## BB#0: ; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vcmpltpd %xmm2, %xmm0, %k1 -; 
SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1} -; SKX-NEXT: vmovapd %xmm1, %xmm0 +; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = fcmp olt <2 x double> %a, zeroinitializer %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b @@ -122,15 +114,14 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind { ; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; KNL-NEXT: vmovdqa %ymm1, %ymm0 +; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq ; ; SKX-LABEL: test9: ; SKX: ## BB#0: ; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 -; SKX-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} -; SKX-NEXT: vmovdqa %ymm1, %ymm0 +; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y @@ -143,15 +134,14 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind { ; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1 -; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; KNL-NEXT: vmovaps %ymm1, %ymm0 +; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq ; ; SKX-LABEL: test10: ; SKX: ## BB#0: ; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1 -; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; SKX-NEXT: vmovaps %ymm1, %ymm0 +; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %mask = fcmp oeq <8 x float> %x, %y @@ -699,8 +689,7 @@ define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind ; CHECK-LABEL: test16: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1 -; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp sge <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y @@ -711,8 +700,7 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou ; CHECK-LABEL: test17: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1 -; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp sgt <16 x i32> %x, %y @@ -724,8 +712,7 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou ; CHECK-LABEL: test18: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1 -; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp sle <16 x i32> %x, %y @@ -737,8 +724,7 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou ; CHECK-LABEL: test19: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 -; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp ule <16 x i32> %x, %y @@ -751,8 +737,7 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 ; CHECK-NEXT: 
vpcmpeqd %zmm3, %zmm2, %k1 {%k1} -; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp eq <16 x i32> %x1, %y1 %mask0 = icmp eq <16 x i32> %x, %y @@ -766,8 +751,7 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i64> %x1, %y1 %mask0 = icmp sle <8 x i64> %x, %y @@ -781,8 +765,7 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sgt <8 x i64> %x1, %y1 %y = load <8 x i64>, <8 x i64>* %y.ptr, align 4 @@ -797,8 +780,7 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i32> %x1, %y1 %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 @@ -812,8 +794,7 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind { ; CHECK-LABEL: test24: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1 -; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 @@ -827,8 +808,7 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind ; CHECK-LABEL: test25: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1 -; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 @@ -843,8 +823,7 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1} -; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 @@ -861,8 +840,7 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 @@ -932,8 +910,7 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind { ; SKX-LABEL: test30: ; SKX: ## BB#0: ; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1 -; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1} -; SKX-NEXT: 
vmovapd %ymm1, %ymm0 +; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %mask = fcmp oeq <4 x double> %x, %y @@ -951,8 +928,7 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp ; SKX-LABEL: test31: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1 -; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1} -; SKX-NEXT: vmovapd %xmm1, %xmm0 +; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %y = load <2 x double>, <2 x double>* %yp, align 4 @@ -971,8 +947,7 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp ; SKX-LABEL: test32: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1 -; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1} -; SKX-NEXT: vmovapd %ymm1, %ymm0 +; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %y = load <4 x double>, <4 x double>* %yp, align 4 @@ -985,8 +960,7 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp ; CHECK-LABEL: test33: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1 -; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %y = load <8 x double>, <8 x double>* %yp, align 4 %mask = fcmp olt <8 x double> %x, %y @@ -1004,8 +978,7 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no ; SKX-LABEL: test34: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1 -; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; SKX-NEXT: vmovaps %xmm1, %xmm0 +; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %y = load <4 x float>, <4 x float>* %yp, align 4 %mask = fcmp olt <4 x float> %x, %y @@ -1020,15 +993,14 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; KNL-NEXT: vmovups (%rdi), %ymm2 ; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1 -; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; KNL-NEXT: vmovaps %ymm1, %ymm0 +; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq ; ; SKX-LABEL: test35: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1 -; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; SKX-NEXT: vmovaps %ymm1, %ymm0 +; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %y = load <8 x float>, <8 x float>* %yp, align 4 @@ -1041,8 +1013,7 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp ; CHECK-LABEL: test36: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %y = load <16 x float>, <16 x float>* %yp, align 4 %mask = fcmp olt <16 x float> %x, %y @@ -1054,8 +1025,7 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou ; CHECK-LABEL: test37: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 -; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %a = load double, double* %ptr @@ -1078,8 +1048,7 @@ define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nou ; SKX-LABEL: test38: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1 -; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1} -; SKX-NEXT: vmovapd %ymm1, %ymm0 +; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %a = load double, double* %ptr @@ -1102,8 +1071,7 @@ 
define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou ; SKX-LABEL: test39: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1 -; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1} -; SKX-NEXT: vmovapd %xmm1, %xmm0 +; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %a = load double, double* %ptr @@ -1120,8 +1088,7 @@ define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) n ; CHECK-LABEL: test40: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %a = load float, float* %ptr @@ -1140,15 +1107,14 @@ define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) noun ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; KNL-NEXT: vbroadcastss (%rdi), %ymm2 ; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1 -; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; KNL-NEXT: vmovaps %ymm1, %ymm0 +; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq ; ; SKX-LABEL: test41: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1 -; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; SKX-NEXT: vmovaps %ymm1, %ymm0 +; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %a = load float, float* %ptr @@ -1171,8 +1137,7 @@ define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) noun ; SKX-LABEL: test42: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1 -; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; SKX-NEXT: vmovaps %xmm1, %xmm0 +; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %a = load float, float* %ptr @@ -1191,8 +1156,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x ; KNL-NEXT: vpsllq $63, %zmm2, %zmm2 ; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} -; KNL-NEXT: vmovapd %zmm0, %zmm1 {%k1} -; KNL-NEXT: vmovapd %zmm1, %zmm0 +; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test43: @@ -1200,8 +1164,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x ; SKX-NEXT: vpsllw $15, %xmm2, %xmm2 ; SKX-NEXT: vpmovw2m %xmm2, %k1 ; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} -; SKX-NEXT: vmovapd %zmm0, %zmm1 {%k1} -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq %a = load double, double* %ptr diff --git a/llvm/test/CodeGen/X86/avx512bw-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512bw-vec-cmp.ll index 34432468921..016837e6130 100644 --- a/llvm/test/CodeGen/X86/avx512bw-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512bw-vec-cmp.ll @@ -5,8 +5,7 @@ define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind { ; CHECK-LABEL: test1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 -; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp eq <64 x i8> %x, %y %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y @@ -17,8 +16,7 @@ define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind { ; CHECK-LABEL: test2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 -; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp sgt <64 x i8> %x, %y %max = select <64 x i1> 
%mask, <64 x i8> %x1, <64 x i8> %y @@ -29,8 +27,7 @@ define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind ; CHECK-LABEL: test3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k1 -; CHECK-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmw %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp sge <32 x i16> %x, %y %max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y @@ -41,8 +38,7 @@ define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind { ; CHECK-LABEL: test4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 -; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ugt <64 x i8> %x, %y %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y @@ -53,8 +49,7 @@ define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwin ; CHECK-LABEL: test5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k1 -; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %yp, align 4 %mask = icmp eq <32 x i16> %x, %y @@ -66,8 +61,7 @@ define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun ; CHECK-LABEL: test6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k1 -; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 %mask = icmp sgt <32 x i16> %x, %y @@ -79,8 +73,7 @@ define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun ; CHECK-LABEL: test7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew (%rdi), %zmm0, %k1 -; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 %mask = icmp sle <32 x i16> %x, %y @@ -92,8 +85,7 @@ define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun ; CHECK-LABEL: test8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 -; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 %mask = icmp ule <32 x i16> %x, %y @@ -106,8 +98,7 @@ define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 {%k1} -; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp eq <32 x i16> %x1, %y1 %mask0 = icmp eq <32 x i16> %x, %y @@ -121,8 +112,7 @@ define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleb %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpleb %zmm2, %zmm3, %k1 {%k1} -; CHECK-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: vpblendmb %zmm0, %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sge <64 x i8> %x1, %y1 %mask0 = icmp sle <64 x i8> %x, %y @@ -136,8 +126,7 @@ define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i ; CHECK: ## BB#0: ; CHECK-NEXT: 
vpcmpgtb %zmm2, %zmm1, %k1 ; CHECK-NEXT: vpcmpgtb (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sgt <64 x i8> %x1, %y1 %y = load <64 x i8>, <64 x i8>* %y.ptr, align 4 @@ -152,8 +141,7 @@ define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sge <32 x i16> %x1, %y1 %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 diff --git a/llvm/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512bwvl-vec-cmp.ll index 3e7f0acae78..17e581bbb50 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-vec-cmp.ll @@ -5,8 +5,7 @@ define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind { ; CHECK-LABEL: test256_1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 -; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %mask = icmp eq <32 x i8> %x, %y %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y @@ -17,8 +16,7 @@ define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind ; CHECK-LABEL: test256_2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k1 -; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %mask = icmp sgt <32 x i8> %x, %y %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 @@ -29,8 +27,7 @@ define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounw ; CHECK-LABEL: test256_3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpblendmw %ymm2, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %mask = icmp sge <16 x i16> %x, %y %max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y @@ -41,8 +38,7 @@ define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind ; CHECK-LABEL: test256_4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k1 -; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ugt <32 x i8> %x, %y %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 @@ -53,8 +49,7 @@ define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nou ; CHECK-LABEL: test256_5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k1 -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %yp, align 4 %mask = icmp eq <16 x i16> %x, %y @@ -66,8 +61,7 @@ define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) ; CHECK-LABEL: test256_6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k1 -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 %mask = icmp sgt <16 x i16> %x, %y @@ -79,8 +73,7 @@ define <16 
x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) ; CHECK-LABEL: test256_7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew (%rdi), %ymm0, %k1 -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 %mask = icmp sle <16 x i16> %x, %y @@ -92,8 +85,7 @@ define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) ; CHECK-LABEL: test256_8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 %mask = icmp ule <16 x i16> %x, %y @@ -106,8 +98,7 @@ define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 ; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 {%k1} -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp eq <16 x i16> %x1, %y1 %mask0 = icmp eq <16 x i16> %x, %y @@ -121,8 +112,7 @@ define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k1 ; CHECK-NEXT: vpcmpleb %ymm2, %ymm3, %k1 {%k1} -; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sge <32 x i8> %x1, %y1 %mask0 = icmp sle <32 x i8> %x, %y @@ -136,8 +126,7 @@ define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %ymm2, %ymm1, %k1 ; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sgt <32 x i8> %x1, %y1 %y = load <32 x i8>, <32 x i8>* %y.ptr, align 4 @@ -152,8 +141,7 @@ define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %ymm1, %ymm2, %k1 ; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i16> %x1, %y1 %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 @@ -167,8 +155,7 @@ define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: test128_1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 -; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp eq <16 x i8> %x, %y %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y @@ -179,8 +166,7 @@ define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind ; CHECK-LABEL: test128_2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 -; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp sgt <16 x i8> %x, %y %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 @@ -191,8 +177,7 @@ define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind ; CHECK-LABEL: test128_3: ; CHECK: ## BB#0: ; CHECK-NEXT: 
vpcmplew %xmm0, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmw %xmm2, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp sge <8 x i16> %x, %y %max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y @@ -203,8 +188,7 @@ define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind ; CHECK-LABEL: test128_4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 -; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ugt <16 x i8> %x, %y %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 @@ -215,8 +199,7 @@ define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwin ; CHECK-LABEL: test128_5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k1 -; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %yp, align 4 %mask = icmp eq <8 x i16> %x, %y @@ -228,8 +211,7 @@ define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun ; CHECK-LABEL: test128_6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k1 -; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 %mask = icmp sgt <8 x i16> %x, %y @@ -241,8 +223,7 @@ define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun ; CHECK-LABEL: test128_7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew (%rdi), %xmm0, %k1 -; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 %mask = icmp sle <8 x i16> %x, %y @@ -254,8 +235,7 @@ define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun ; CHECK-LABEL: test128_8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 -; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 %mask = icmp ule <8 x i16> %x, %y @@ -268,8 +248,7 @@ define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 {%k1} -; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp eq <8 x i16> %x1, %y1 %mask0 = icmp eq <8 x i16> %x, %y @@ -283,8 +262,7 @@ define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k1 ; CHECK-NEXT: vpcmpleb %xmm2, %xmm3, %k1 {%k1} -; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i8> %x1, %y1 %mask0 = icmp sle <16 x i8> %x, %y @@ -298,8 +276,7 @@ define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %xmm2, %xmm1, %k1 ; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmb %xmm0, %xmm1, 
%xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sgt <16 x i8> %x1, %y1
%y = load <16 x i8>, <16 x i8>* %y.ptr, align 4
@@ -314,8 +291,7 @@ define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i16> %x1, %y1
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll
index 25b9cc79096..e0acf2be653 100644
--- a/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -5,8 +4,7 @@ define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
; CHECK-LABEL: test256_1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp eq <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
@@ -17,8 +16,7 @@ define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind
; CHECK-LABEL: test256_2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sgt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
@@ -29,8 +27,7 @@ define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind
; CHECK-LABEL: test256_3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sge <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
@@ -41,8 +38,7 @@ define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind
; CHECK-LABEL: test256_4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ugt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
@@ -53,8 +49,7 @@ define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwin
; CHECK-LABEL: test256_5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %x, %y
@@ -66,8 +61,7 @@ define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
; CHECK-LABEL: test256_5b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %y, %x
@@ -79,8 +73,7 @@ define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
; CHECK-LABEL: test256_6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sgt <8 x i32> %x, %y
@@ -92,8 +85,7 @@ define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
; CHECK-LABEL: test256_6b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp slt <8 x i32> %y, %x
@@ -105,8 +97,7 @@ define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
; CHECK-LABEL: test256_7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sle <8 x i32> %x, %y
@@ -118,8 +109,7 @@ define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
; CHECK-LABEL: test256_7b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sge <8 x i32> %y, %x
@@ -131,8 +121,7 @@ define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
; CHECK-LABEL: test256_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp ule <8 x i32> %x, %y
@@ -144,8 +133,7 @@ define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
; CHECK-LABEL: test256_8b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp uge <8 x i32> %y, %x
@@ -158,8 +146,7 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32>
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp eq <8 x i32> %x1, %y1
%mask0 = icmp eq <8 x i32> %x, %y
@@ -173,8 +160,7 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%mask0 = icmp sle <4 x i64> %x, %y
@@ -188,8 +174,7 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sgt <4 x i64> %x1, %y1
%y = load <4 x i64>, <4 x i64>* %y.ptr, align 4
@@ -204,8 +189,7 @@ define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
@@ -219,8 +203,7 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind
; CHECK-LABEL: test256_13:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
@@ -234,8 +217,7 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind
; CHECK-LABEL: test256_14:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
@@ -250,8 +232,7 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
@@ -268,8 +249,7 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
@@ -285,8 +265,7 @@ define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
; CHECK-LABEL: test256_17:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %x, %y
@@ -298,8 +277,7 @@ define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
; CHECK-LABEL: test256_18:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %y, %x
@@ -311,8 +289,7 @@ define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
; CHECK-LABEL: test256_19:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnltud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %x, %y
@@ -324,8 +301,7 @@ define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
; CHECK-LABEL: test256_20:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %y, %x
@@ -337,8 +313,7 @@ define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
; CHECK-LABEL: test128_1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp eq <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
@@ -349,8 +324,7 @@ define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind
; CHECK-LABEL: test128_2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sgt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
@@ -361,8 +335,7 @@ define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind
; CHECK-LABEL: test128_3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp sge <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
@@ -373,8 +346,7 @@ define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind
; CHECK-LABEL: test128_4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ugt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
@@ -385,8 +357,7 @@ define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwin
; CHECK-LABEL: test128_5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %x, %y
@@ -398,8 +369,7 @@ define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwi
; CHECK-LABEL: test128_5b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %y, %x
@@ -411,8 +381,7 @@ define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
; CHECK-LABEL: test128_6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sgt <4 x i32> %x, %y
@@ -424,8 +393,7 @@ define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_6b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp slt <4 x i32> %y, %x
@@ -437,8 +405,7 @@ define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
; CHECK-LABEL: test128_7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sle <4 x i32> %x, %y
@@ -450,8 +417,7 @@ define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_7b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sge <4 x i32> %y, %x
@@ -463,8 +429,7 @@ define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
; CHECK-LABEL: test128_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ule <4 x i32> %x, %y
@@ -476,8 +441,7 @@ define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_8b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %y, %x
@@ -490,8 +454,7 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32>
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp eq <4 x i32> %x1, %y1
%mask0 = icmp eq <4 x i32> %x, %y
@@ -505,8 +468,7 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%mask0 = icmp sle <2 x i64> %x, %y
@@ -520,8 +482,7 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sgt <2 x i64> %x1, %y1
%y = load <2 x i64>, <2 x i64>* %y.ptr, align 4
@@ -536,8 +497,7 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
@@ -551,8 +511,7 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind
; CHECK-LABEL: test128_13:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
@@ -566,8 +525,7 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind
; CHECK-LABEL: test128_14:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
@@ -582,8 +540,7 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
@@ -600,8 +557,7 @@ define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
@@ -617,8 +573,7 @@ define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_17:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ne <4 x i32> %x, %y
@@ -630,8 +585,7 @@ define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_18:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ne <4 x i32> %y, %x
@@ -643,8 +597,7 @@ define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_19:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnltud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %x, %y
@@ -656,8 +609,7 @@ define <4 x i32> @test128_20(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_20:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %y, %x
diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll
index 4e65b169c7e..15ba276e14b 100644
--- a/llvm/test/CodeGen/X86/masked_memop.ll
+++ b/llvm/test/CodeGen/X86/masked_memop.ll
@@ -29,8 +29,7 @@ define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double>
; SKX: ## BB#0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovupd (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovapd %xmm1, %xmm0
+; SKX-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
@@ -58,8 +57,7 @@ define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %d
; SKX: ## BB#0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovups (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
@@ -95,8 +93,7 @@ define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; SKX: ## BB#0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovdqa %xmm1, %xmm0
+; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
@@ -171,8 +168,7 @@ define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double
; SKX: ## BB#0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovapd (%rdi), %ymm1 {%k1}
-; SKX-NEXT: vmovapd %ymm1, %ymm0
+; SKX-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
@@ -246,16 +242,15 @@ define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float>
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: vmovups (%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vmovaps %ymm1, %ymm0
+; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11a:
; SKX: ## BB#0:
; SKX-NEXT: vpxor %ymm2, %ymm2, %ymm2
; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
-; SKX-NEXT: vmovaps (%rdi), %ymm1 {%k1}
-; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
@@ -293,16 +288,15 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11b:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1}
-; SKX-NEXT: vmovdqa %ymm1, %ymm0
+; SKX-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
ret <8 x i32> %res
@@ -557,8 +551,7 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
; SKX-NEXT: kshiftlw $14, %k0, %k0
; SKX-NEXT: kshiftrw $14, %k0, %k1
-; SKX-NEXT: vmovups (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index f47669d1e26..b5290414b98 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -425,8 +425,7 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x i8> %shuffle
@@ -466,8 +465,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-30584, %ax # imm = 0x8888
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
@@ -526,8 +524,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-28528, %ax # imm = 0x9090
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 49da853aca6..a1cf97d787a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -719,8 +719,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-32768, %ax # imm = 0x8000
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
ret <16 x i16> %shuffle
@@ -745,8 +744,7 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $1, %ax
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
@@ -771,8 +769,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $21930, %ax # imm = 0x55AA
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
ret <16 x i16> %shuffle
@@ -797,8 +794,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_3
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-21931, %ax # imm = 0xAA55
; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x i16> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 4876aa1526a..390ab16699d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1036,8 +1036,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63>
ret <32 x i8> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 0b9a2c6a977..628cb935828 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -254,8 +254,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
; ALL: # BB#0:
; ALL-NEXT: movw $8, %ax
; ALL-NEXT: kmovw %eax, %k1
-; ALL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; ALL-NEXT: vmovdqa64 %zmm1, %zmm0
+; ALL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i32> %c
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-masked.ll b/llvm/test/CodeGen/X86/vector-shuffle-masked.ll
index 37fd022999e..e30ee2f4c71 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-masked.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-masked.ll
@@ -216,8 +216,7 @@ define <8 x i32> @mask_shuffle_v8i32_23456701(<8 x i32> %a, <8 x i32> %passthru,
; CHECK: # BB#0:
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
%mask.cast = bitcast i8 %mask to <8 x i1>
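
The test updates above all show the same codegen change: where a compare-driven select or a masked load intrinsic used to produce a masked move tied to its pass-through register plus a separate register copy, the pass now emits a single blendm. A minimal sketch of the pattern, modelled directly on test256_1 from avx512vl-vec-cmp.ll in the diff above (the function name blend_sketch and the trailing ret are illustrative additions, not part of the patch):

define <4 x i64> @blend_sketch(<4 x i64> %x, <4 x i64> %y) nounwind {
  %mask = icmp eq <4 x i64> %x, %y
  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
  ret <4 x i64> %max
}

; Before: vpcmpeqq %ymm1, %ymm0, %k1
;         vmovdqa64 %ymm0, %ymm1 {%k1}   ; masked move tied to the pass-through register
;         vmovdqa %ymm1, %ymm0           ; extra copy into the return register
; After:  vpcmpeqq %ymm1, %ymm0, %k1
;         vpblendmq %ymm0, %ymm1, %ymm0 {%k1}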

