 llvm/lib/Target/X86/X86InstrAVX512.td                      |  44
 llvm/lib/Target/X86/X86InstrFormats.td                     |   1
 llvm/test/CodeGen/X86/avx512-intrinsics.ll                 |  40
 llvm/test/CodeGen/X86/avx512vl-intrinsics.ll               |  26
 llvm/test/CodeGen/X86/vector-half-conversions.ll           | 265
 llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll             |  44
 llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll            |   2
 llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll             |  76
 llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll |   8
 9 files changed, 195 insertions(+), 311 deletions(-)
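The change is mechanical: each affected multiclass gains a `let ExeDomain = ... in` that forwards the execution domain of its `X86VectorVTInfo` parameter (or hardcodes `SSEPackedSingle`/`SSEPackedDouble` for VPERMILPS/VPERMILPD), and the `SSEPackedInt` default is dropped from `AVX512AIi8Base` in X86InstrFormats.td so it no longer clobbers the per-instruction value. The payoff shows up in the test diffs below: the execution-domain fixup pass can now pick floating-point register moves and zeroing idioms (`vmovaps`/`vmovapd`/`vxorps`/`vxorpd`) instead of integer-domain ones (`vmovdqa64`/`vpxor`) around FP instructions. A standalone sketch of the TableGen pattern follows; `Domain` mirrors the real class in X86InstrFormats.td, while the `VTInfo`/`Inst` records are illustrative stand-ins for the actual X86 definitions, not the real ones:

    // Toy version of the pattern this patch applies, runnable with llvm-tblgen.
    class Domain<bits<2> val> { bits<2> Value = val; }
    def GenericDomain   : Domain<0>;
    def SSEPackedSingle : Domain<1>;
    def SSEPackedDouble : Domain<2>;
    def SSEPackedInt    : Domain<3>;

    // Stand-in for X86VectorVTInfo, which carries a per-type ExeDomain.
    class VTInfo<Domain d> { Domain ExeDomain = d; }
    def f32x16_info : VTInfo<SSEPackedSingle>;
    def f64x8_info  : VTInfo<SSEPackedDouble>;

    // Stand-in for an instruction base class whose default domain is
    // integer (what AVX512AIi8Base used to force on every user).
    class Inst { Domain ExeDomain = SSEPackedInt; }

    // The fix: forward the VT info's domain so float instantiations
    // stop being tagged as integer-domain.
    multiclass domain_aware<VTInfo _> {
      let ExeDomain = _.ExeDomain in
      def rr : Inst;
    }
    defm PS : domain_aware<f32x16_info>; // PSrr.ExeDomain = SSEPackedSingle
    defm PD : domain_aware<f64x8_info>;  // PDrr.ExeDomain = SSEPackedDouble

Under the old default both `defm` instances would have come out as `SSEPackedInt`; with the forwarded field, a register copy feeding the single-precision instruction can be rewritten as `vmovaps` rather than `vmovdqa64`, which is exactly the churn in the CHECK lines below.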
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 98527cd728c..38304917548 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -4224,6 +4224,7 @@ defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8X
 //===----------------------------------------------------------------------===//
 multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
                             string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
   defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, u8imm:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
@@ -4235,10 +4236,12 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
                    (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
                          (i8 imm:$src2))),
                    SSE_INTSHIFT_ITINS_P.rm>;
+  }
 }
 
 multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
                              string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in
   defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
                    (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
       "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
@@ -4249,6 +4252,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
 multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             ValueType SrcVT, PatFrag bc_frag,
                             X86VectorVTInfo _> {
   // src2 is always 128-bit
+  let ExeDomain = _.ExeDomain in {
   defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, VR128X:$src2), OpcodeStr,
                    "$src2, $src1", "$src1, $src2",
@@ -4260,6 +4264,7 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                    (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
                    SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
                    EVEX_4V;
+  }
 }
 
 multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4353,6 +4358,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
 //===-------------------------------------------------------------------===//
 multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
   defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                    "$src2, $src1", "$src1, $src2",
@@ -4365,10 +4371,12 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                    (_.VT (bitconvert (_.LdFrag addr:$src2))))),
                    SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
                    EVEX_CD8<_.EltSize, CD8VF>;
+  }
 }
 
 multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in
   defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
                    "${src2}"##_.BroadcastStr##", $src1",
@@ -4635,8 +4643,10 @@ multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
                              EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
 }
 
+let ExeDomain = SSEPackedSingle in
 defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C,
                                avx512vl_f32_info, avx512vl_i32_info>;
+let ExeDomain = SSEPackedDouble in
 defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D,
                                avx512vl_f64_info, avx512vl_i64_info>, VEX_W;
 //===----------------------------------------------------------------------===//
@@ -7196,7 +7206,8 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
 //  op(broadcast(eltVt),imm)
 //all instruction created with FROUND_CURRENT
 multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                      X86VectorVTInfo _>{
+                                      X86VectorVTInfo _>{
+  let ExeDomain = _.ExeDomain in {
   defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                       (ins _.RC:$src1, i32u8imm:$src2),
                       OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
@@ -7216,11 +7227,13 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNo
                       (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
                               (i32 imm:$src2),
                               (i32 FROUND_CURRENT))>, EVEX_B;
+  }
 }
 
 //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
 multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
                                           SDNode OpNode, X86VectorVTInfo _>{
+  let ExeDomain = _.ExeDomain in
   defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                       (ins _.RC:$src1, i32u8imm:$src2),
                       OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
@@ -7250,7 +7263,8 @@ multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
 //  op(reg_vec2,broadcast(eltVt),imm)
 //all instruction created with FROUND_CURRENT
 multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                X86VectorVTInfo _>{
+                                X86VectorVTInfo _>{
+  let ExeDomain = _.ExeDomain in {
   defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                       (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
                       OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
@@ -7273,13 +7287,14 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                       (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
                       (i32 imm:$src3),
                       (i32 FROUND_CURRENT))>, EVEX_B;
+  }
 }
 
 //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
 //                    op(reg_vec2,mem_vec,imm)
 multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
                               X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{
-
+  let ExeDomain = DestInfo.ExeDomain in {
   defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
                   (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3),
                   OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
@@ -7293,6 +7308,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
                     (SrcInfo.VT (bitconvert
                                        (SrcInfo.LdFrag addr:$src2))),
                     (i8 imm:$src3)))>;
+  }
 }
 
 //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
@@ -7302,6 +7318,7 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
                            X86VectorVTInfo _>:
   avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{
+  let ExeDomain = _.ExeDomain in
   defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
                    OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
@@ -7315,8 +7332,8 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
 //                    op(reg_vec2,mem_scalar,imm)
 //all instruction created with FROUND_CURRENT
 multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                X86VectorVTInfo _> {
-
+                                X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
   defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                       (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
                       OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
@@ -7339,11 +7356,13 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                      OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
                      []>;
   }
+  }
 }
 
 //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
 multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
                                     SDNode OpNode, X86VectorVTInfo _>{
+  let ExeDomain = _.ExeDomain in
   defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                       (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
                       OpcodeStr, "$src3, {sae}, $src2, $src1",
@@ -7907,8 +7926,8 @@ defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
                                         HasBWI>, EVEX_4V;
 
 multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                          X86VectorVTInfo _>{
-  let Constraints = "$src1 = $dst" in {
+                          X86VectorVTInfo _>{
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
   defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
                       (ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
                       OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
@@ -7953,8 +7972,8 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W;
 //===----------------------------------------------------------------------===//
 multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                  X86VectorVTInfo _>{
-  let Constraints = "$src1 = $dst" in {
+                                  X86VectorVTInfo _>{
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
   defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
                       (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
                       OpcodeStr##_.Suffix, "$src4, $src3, $src2",
                       "$src2, $src3, $src4",
@@ -7984,8 +8003,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
 }
 
 multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
-                                      SDNode OpNode, X86VectorVTInfo _>{
-let Constraints = "$src1 = $dst" in {
+                                      SDNode OpNode, X86VectorVTInfo _>{
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
   defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
                       (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
                       OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
@@ -8000,7 +8019,8 @@ let Constraints = "$src1 = $dst" in {
 
 multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                   X86VectorVTInfo _, X86VectorVTInfo _src3VT> {
-  let Constraints = "$src1 = $dst" , Predicates = [HasAVX512] in {
+  let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
+      ExeDomain = _.ExeDomain in {
   defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                       (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
                       OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 5183adc834b..8cbadd38faf 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -785,7 +785,6 @@ class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
         Requires<[HasAVX512]>;
 class AVX512AIi8Base : TAPD {
-  Domain ExeDomain = SSEPackedInt;
   ImmType ImmT = Imm8;
 }
 class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index d0dfb621b54..63f50ef7036 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4956,7 +4956,7 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm3
+; CHECK-NEXT:    vmovapd %zmm2, %zmm3
 ; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
 ; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm5
@@ -5995,7 +5995,7 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
 ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vmovapd %zmm0, %zmm3
 ; CHECK-NEXT:    vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
 ; CHECK-NEXT:    vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
@@ -6017,10 +6017,10 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0,
 ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vmovapd %zmm0, %zmm3
 ; CHECK-NEXT:    vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm5
+; CHECK-NEXT:    vmovapd %zmm0, %zmm5
 ; CHECK-NEXT:    vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
 ; CHECK-NEXT:    vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm5, %zmm3, %zmm1
@@ -6041,10 +6041,10 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm5
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vmovaps %zmm0, %zmm5
 ; CHECK-NEXT:    vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
 ; CHECK-NEXT:    vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
 ; CHECK-NEXT:    vaddps %xmm5, %xmm3, %xmm1
@@ -6065,11 +6065,11 @@ define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x f
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT:    vmovaps %zmm0, %zmm4
 ; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT:    vaddps %xmm4, %xmm0, %xmm0
@@ -6088,10 +6088,10 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <
 ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm5
+; CHECK-NEXT:    vmovaps %zmm0, %zmm5
 ; CHECK-NEXT:    vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
 ; CHECK-NEXT:    vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    vaddps %zmm5, %zmm3, %zmm1
@@ -6111,9 +6111,9 @@ define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0,
 ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT:    vmovaps %zmm0, %zmm4
 ; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm4
 ; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT:    vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
@@ -6135,11 +6135,11 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vmovapd %zmm0, %zmm3
 ; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT:    vmovapd %zmm0, %zmm4
 ; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT:    vaddpd %xmm4, %xmm0, %xmm0
@@ -6159,10 +6159,10 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vmovapd %zmm0, %zmm3
 ; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm5
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vmovapd %zmm0, %zmm5
 ; CHECK-NEXT:    vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
 ; CHECK-NEXT:    vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vaddpd %xmm5, %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index 3079f0ece1d..f0c7a3eb354 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -6889,7 +6889,7 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <
 ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0xd8]
+; CHECK-NEXT:    vmovapd %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x28,0xd8]
 ; CHECK-NEXT:    vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
 ; CHECK-NEXT:    vpxord %xmm4, %xmm4, %xmm4 ## encoding: [0x62,0xf1,0x5d,0x08,0xef,0xe4]
 ; CHECK-NEXT:    vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
@@ -6911,7 +6911,7 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_pd_128(<2 x double> %x0,
 ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0xd8]
+; CHECK-NEXT:    vmovapd %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x28,0xd8]
 ; CHECK-NEXT:    vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xda,0x05]
 ; CHECK-NEXT:    vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
 ; CHECK-NEXT:    vfixupimmpd $3, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xc2,0x03]
@@ -6931,7 +6931,7 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <
 ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0xd8]
+; CHECK-NEXT:    vmovapd %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x28,0xd8]
 ; CHECK-NEXT:    vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
 ; CHECK-NEXT:    vpxord %ymm4, %ymm4, %ymm4 ## encoding: [0x62,0xf1,0x5d,0x28,0xef,0xe4]
 ; CHECK-NEXT:    vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
@@ -6953,10 +6953,10 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0,
 ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0xd8]
+; CHECK-NEXT:    vmovapd %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x28,0xd8]
 ; CHECK-NEXT:    vfixupimmpd $5, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xda,0x05]
 ; CHECK-NEXT:    vpxord %ymm4, %ymm4, %ymm4 ## encoding: [0x62,0xf1,0x5d,0x28,0xef,0xe4]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm5 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0xe8]
+; CHECK-NEXT:    vmovapd %ymm0, %ymm5 ## encoding: [0x62,0xf1,0xfd,0x28,0x28,0xe8]
 ; CHECK-NEXT:    vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
 ; CHECK-NEXT:    vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
 ; CHECK-NEXT:    vaddpd %ymm5, %ymm3, %ymm1 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xcd]
@@ -6976,9 +6976,9 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ps_128(<4 x float> %x0, <4
 ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0xd8]
+; CHECK-NEXT:    vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
 ; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xda,0x05]
-; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm4 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0xe0]
+; CHECK-NEXT:    vmovaps %xmm0, %xmm4 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xe0]
 ; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xe2,0x05]
 ; CHECK-NEXT:    vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
 ; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xc2,0x05]
@@ -6999,9 +6999,9 @@ define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ps_128(<4 x float> %x0, <4
 ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0xd8]
+; CHECK-NEXT:    vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
 ; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xda,0x05]
-; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm4 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0xe0]
+; CHECK-NEXT:    vmovaps %xmm0, %xmm4 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xe0]
 ; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xe2,0x05]
 ; CHECK-NEXT:    vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
 ; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xc2,0x05]
@@ -7022,9 +7022,9 @@ define <8 x float>@test_int_x86_avx512_mask_fixupimm_ps_256(<8 x float> %x0, <8
 ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0xd8]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
 ; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xda,0x05]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm4 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0xe0]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm4 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xe0]
 ; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm4 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xe2,0x05]
 ; CHECK-NEXT:    vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
 ; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xc2,0x05]
@@ -7045,9 +7045,9 @@ define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8
 ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0xd8]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
 ; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xda,0x05]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm4 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0xe0]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm4 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xe0]
 ; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm4 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xe2,0x05]
 ; CHECK-NEXT:    vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
 ; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xc2,0x05]
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index aeb93a217d1..6fc63957c71 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -2997,12 +2997,12 @@ define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
 ; AVX512VL-NEXT:    orl %ebx, %r14d
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -3109,12 +3109,12 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
 ; AVX512VL-NEXT:    orl %ebx, %r14d
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -3225,12 +3225,12 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
 ; AVX512VL-NEXT:    orl %ebx, %r14d
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -3410,12 +3410,12 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r15d
 ; AVX512VL-NEXT:    orl %ebx, %r15d
-; AVX512VL-NEXT:    vmovdqu64 (%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovdqu64 (%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
@@ -3433,12 +3433,12 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r15d
 ; AVX512VL-NEXT:    orl %ebx, %r15d
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -3677,7 +3677,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
 ; AVX512VL-NEXT:    movl %eax, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
@@ -3691,12 +3691,12 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
 ; AVX512VL-NEXT:    movl %eax, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
 ; AVX512VL-NEXT:    movl %eax, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512VL-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
@@ -3720,7 +3720,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
 ; AVX512VL-NEXT:    movl %eax, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %r13w
@@ -3731,11 +3731,11 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bp
-; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %r14w
-; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %r15w
@@ -3888,11 +3888,11 @@ define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r14d
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r15d
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %ebp
@@ -4009,12 +4009,12 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %ebx
 ; AVX512VL-NEXT:    orl %ebp, %ebx
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bp
 ; AVX512VL-NEXT:    shll $16, %ebp
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -4133,12 +4133,12 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %ebx
 ; AVX512VL-NEXT:    orl %ebp, %ebx
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bp
 ; AVX512VL-NEXT:    shll $16, %ebp
-; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -4286,11 +4286,11 @@ define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r12d
@@ -4300,11 +4300,11 @@ define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r13d
-; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %ebp
-; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r14d
@@ -4554,11 +4554,11 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512VL-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
@@ -4568,11 +4568,11 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
@@ -4582,11 +4582,11 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
@@ -4596,11 +4596,11 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovapd {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r14d
-; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r15d
@@ -4671,77 +4671,23 @@ define i16 @cvt_f64_to_i16(double %a0) nounwind {
 }
 
 define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
-; AVX1-LABEL: cvt_2f64_to_2i16:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    subq $16, %rsp
-; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT:    callq __truncdfhf2
-; AVX1-NEXT:    movw %ax, %bx
-; AVX1-NEXT:    shll $16, %ebx
-; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT:    callq __truncdfhf2
-; AVX1-NEXT:    movzwl %ax, %eax
-; AVX1-NEXT:    orl %ebx, %eax
-; AVX1-NEXT:    vmovd %eax, %xmm0
-; AVX1-NEXT:    addq $16, %rsp
-; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: cvt_2f64_to_2i16:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    subq $16, %rsp
-; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT:    callq __truncdfhf2
-; AVX2-NEXT:    movw %ax, %bx
-; AVX2-NEXT:    shll $16, %ebx
-; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX2-NEXT:    callq __truncdfhf2
-; AVX2-NEXT:    movzwl %ax, %eax
-; AVX2-NEXT:    orl %ebx, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    addq $16, %rsp
-; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: cvt_2f64_to_2i16:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    subq $16, %rsp
-; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT:    callq __truncdfhf2
-; AVX512F-NEXT:    movw %ax, %bx
-; AVX512F-NEXT:    shll $16, %ebx
-; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT:    callq __truncdfhf2
-; AVX512F-NEXT:    movzwl %ax, %eax
-; AVX512F-NEXT:    orl %ebx, %eax
-; AVX512F-NEXT:    vmovd %eax, %xmm0
-; AVX512F-NEXT:    addq $16, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: cvt_2f64_to_2i16:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    subq $16, %rsp
-; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT:    callq __truncdfhf2
-; AVX512VL-NEXT:    movw %ax, %bx
-; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT:    callq __truncdfhf2
-; AVX512VL-NEXT:    movzwl %ax, %eax
-; AVX512VL-NEXT:    orl %ebx, %eax
-; AVX512VL-NEXT:    vmovd %eax, %xmm0
-; AVX512VL-NEXT:    addq $16, %rsp
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: cvt_2f64_to_2i16:
+; ALL:       # BB#0:
+; ALL-NEXT:    pushq %rbx
+; ALL-NEXT:    subq $16, %rsp
+; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT:    callq __truncdfhf2
+; ALL-NEXT:    movw %ax, %bx
+; ALL-NEXT:    shll $16, %ebx
+; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; ALL-NEXT:    callq __truncdfhf2
+; ALL-NEXT:    movzwl %ax, %eax
+; ALL-NEXT:    orl %ebx, %eax
+; ALL-NEXT:    vmovd %eax, %xmm0
+; ALL-NEXT:    addq $16, %rsp
+; ALL-NEXT:    popq %rbx
+; ALL-NEXT:    retq
   %1 = fptrunc <2 x double> %a0 to <2 x half>
   %2 = bitcast <2 x half> %1 to <2 x i16>
   ret <2 x i16> %2
@@ -4861,7 +4807,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $40, %rsp
-; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -5011,7 +4957,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $40, %rsp
-; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -5165,7 +5111,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $40, %rsp
-; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -5410,7 +5356,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $96, %rsp
-; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512VL-NEXT:    vmovupd %zmm0, (%rsp) # 64-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -5491,81 +5437,24 @@ define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
 }
 
 define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_2f64_to_2i16:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    subq $24, %rsp
-; AVX1-NEXT:    movq %rdi, %rbx
-; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT:    callq __truncdfhf2
-; AVX1-NEXT:    movl %eax, %ebp
-; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT:    callq __truncdfhf2
-; AVX1-NEXT:    movw %ax, (%rbx)
-; AVX1-NEXT:    movw %bp, 2(%rbx)
-; AVX1-NEXT:    addq $24, %rsp
-; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: store_cvt_2f64_to_2i16:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    subq $24, %rsp
-; AVX2-NEXT:    movq %rdi, %rbx
-; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT:    callq __truncdfhf2
-; AVX2-NEXT:    movl %eax, %ebp
-; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX2-NEXT:    callq __truncdfhf2
-; AVX2-NEXT:    movw %ax, (%rbx)
-; AVX2-NEXT:    movw %bp, 2(%rbx)
-; AVX2-NEXT:    addq $24, %rsp
-; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: store_cvt_2f64_to_2i16:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    subq $24, %rsp
-; AVX512F-NEXT:    movq %rdi, %rbx
-; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT:    callq __truncdfhf2
-; AVX512F-NEXT:    movl %eax, %ebp
-; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT:    callq __truncdfhf2
-; AVX512F-NEXT:    movw %ax, (%rbx)
-; AVX512F-NEXT:    movw %bp, 2(%rbx)
-; AVX512F-NEXT:    addq $24, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %rbp
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: store_cvt_2f64_to_2i16:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    pushq %rbp
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    subq $24, %rsp
-; AVX512VL-NEXT:    movq %rdi, %rbx
-; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT:    callq __truncdfhf2
-; AVX512VL-NEXT:    movl %eax, %ebp
-; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT:    callq __truncdfhf2
-; AVX512VL-NEXT:    movw %ax, (%rbx)
-; AVX512VL-NEXT:    movw %bp, 2(%rbx)
-; AVX512VL-NEXT:    addq $24, %rsp
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %rbp
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: store_cvt_2f64_to_2i16:
+; ALL:       # BB#0:
+; ALL-NEXT:    pushq %rbp
+; ALL-NEXT:    pushq %rbx
+; ALL-NEXT:    subq $24, %rsp
+; ALL-NEXT:    movq %rdi, %rbx
+; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT:    callq __truncdfhf2
+; ALL-NEXT:    movl %eax, %ebp
+; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; ALL-NEXT:    callq __truncdfhf2
+; ALL-NEXT:    movw %ax, (%rbx)
+; ALL-NEXT:    movw %bp, 2(%rbx)
+; ALL-NEXT:    addq $24, %rsp
+; ALL-NEXT:    popq %rbx
+; ALL-NEXT:    popq %rbp
+; ALL-NEXT:    retq
   %1 = fptrunc <2 x double> %a0 to <2 x half>
   %2 = bitcast <2 x half> %1 to <2 x i16>
   store <2 x i16> %2, <2 x i16>* %a1
@@ -5692,7 +5581,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $88, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %rbx
-; AVX512VL-NEXT:    vmovdqu64 %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movl %eax, %r14d
@@ -5855,7 +5744,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $32, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %r14
-; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bp
@@ -6026,7 +5915,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $32, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %r14
-; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bp
@@ -6281,7 +6170,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $200, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %rbx
-; AVX512VL-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
+; AVX512VL-NEXT:    vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index d853655dea7..22b6e28fae1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -389,45 +389,21 @@ define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_3254:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v4f64_3254:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_3254:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v4f64_3254:
+; ALL:       # BB#0:
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
   ret <4 x double> %shuffle
 }
 
 define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_3276:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v4f64_3276:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_3276:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v4f64_3276:
+; ALL:       # BB#0:
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
   ret <4 x double> %shuffle
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 4d441aa67ce..6cd0366d5ad 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -190,7 +190,7 @@ define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_2
 define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a) {
 ; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1>
+; ALL-NEXT:    vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1>
 ; ALL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index da8ea83ee94..6e2df62d5ed 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -51,13 +51,13 @@ define <8 x double> @shuffle_v8f64_44444444(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_00000010:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0]
+; AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0]
 ; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_00000010:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]
+; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]
 ; AVX512F-32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
@@ -67,13 +67,13 @@ define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_00000200:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0]
+; AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0]
 ; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_00000200:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0]
+; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0]
 ; AVX512F-32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
@@ -83,13 +83,13 @@ define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_00003000:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0]
+; AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0]
 ; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_00003000:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
 ; AVX512F-32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
@@ -99,13 +99,13 @@ define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_00040000:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0]
+; AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0]
 ; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_00040000:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
 ; AVX512F-32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
@@ -115,13 +115,13 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_00500000:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0]
+; AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0]
 ; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_00500000:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX512F-32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -131,13 +131,13 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_06000000:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0]
+; AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0]
 ; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_06000000:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX512F-32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -184,13 +184,13 @@ define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_00112233:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
+; AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
 ; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_00112233:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0]
+; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0]
 ; AVX512F-32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -200,13 +200,13 @@ define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_00001111:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
+; AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
 ; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_00001111:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0]
+; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0]
 ; AVX512F-32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
@@ -624,13 +624,13 @@ define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
 ;
 ; AVX512F-LABEL: shuffle_v8f64_00015444:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4]
+; AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4]
 ; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_00015444:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0]
vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4> @@ -641,13 +641,13 @@ define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_00204644: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] +; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00204644: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4> @@ -658,13 +658,13 @@ define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_03004474: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] +; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_03004474: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4> @@ -675,13 +675,13 @@ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_10004444: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] +; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_10004444: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> @@ -692,13 +692,13 @@ define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_22006446: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] +; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_22006446: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6> @@ -709,13 +709,13 @@ define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: 
shuffle_v8f64_33307474: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] +; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_33307474: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4> @@ -726,13 +726,13 @@ define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_32104567: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] +; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_32104567: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> @@ -743,13 +743,13 @@ define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_00236744: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] +; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00236744: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4> @@ -760,13 +760,13 @@ define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_00226644: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] +; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00226644: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4> @@ -837,13 +837,13 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_002u6u44: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> +; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_002u6u44: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> ; 
AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> @@ -854,13 +854,13 @@ define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_00uu66uu: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> +; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00uu66uu: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index 3e0c3e4d8f3..d57e1fe4255 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -29,11 +29,11 @@ define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x d ; CHECK-LABEL: combine_permvar_8f64_identity_mask: ; CHECK: # BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; CHECK-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %res0 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 %m) %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 %m) @@ -433,7 +433,7 @@ define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x ; CHECK: # BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 %m) ret <8 x double> %1 |
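Note: a minimal repro sketch for the check changes above. The llc invocation below is an assumption, not copied from the tests' RUN lines, and the `ret` is supplied to make the function complete; the function body itself is taken verbatim from the diff. Since vpermpd consumes its operands in the floating-point domain, the index vector is now expected to be materialized with the double-domain vmovapd rather than the integer-domain vmovdqa64:

; repro.ll -- try: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f repro.ll -o -
; Expected per the AVX512F checks above:
;   vmovapd {{.*}} zmm1 = [0,0,1,1,2,2,3,3]
;   vpermpd %zmm0, %zmm1, %zmm0
define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  ret <8 x double> %shuffle
}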

