summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
authorCraig Topper <craig.topper@gmail.com>2016-08-28 06:06:28 +0000
committerCraig Topper <craig.topper@gmail.com>2016-08-28 06:06:28 +0000
commitabe80cc04df000d51895d7532c748be2f25fbd86 (patch)
tree2e1bd49eafd6e3481cde380221656168279b7ff1 /llvm
parent8046e2033e725a811f5078652ce21e26e736375d (diff)
downloadbcm5719-llvm-abe80cc04df000d51895d7532c748be2f25fbd86.tar.gz
bcm5719-llvm-abe80cc04df000d51895d7532c748be2f25fbd86.zip
[AVX-512] Promote AND/OR/XOR to v2i64/v4i64/v8i64 even when we have AVX512F/AVX512VL.
Previously we weren't creating masked logical operations if bitcasts appeared between the logic operation and the select. The IR optimizers can move bitcasts across logic operations and create these cases. To minimize the number of cases we need to handle, this change promotes all logic ops to an i64 vector type just like when only SSE or AVX is available. Unfortunately, this also has the consequence of making it difficult to select unmasked VPANDD/VPORD/VPXORD in all the cases it was previously used. This is the cause of most of the test change. This shouldn't result in any functional change though. llvm-svn: 279929
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp20
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td122
-rw-r--r--llvm/test/CodeGen/X86/avx512-arith.ll12
-rw-r--r--llvm/test/CodeGen/X86/avx512-logic.ll51
-rw-r--r--llvm/test/CodeGen/X86/avx512-select.ll2
-rw-r--r--llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll8
-rw-r--r--llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-bitreverse.ll6
8 files changed, 177 insertions, 56 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bf82dc646cf..896ce13667d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1345,13 +1345,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::AND, VT, Legal);
- setOperationAction(ISD::OR, VT, Legal);
- setOperationAction(ISD::XOR, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
}
+ // Need to promote to 64-bit even though we have 32-bit masked instructions
+ // because the IR optimizers rearrange bitcasts around logic ops leaving
+ // too many variations to handle if we don't promote them.
+ setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
+ setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
+ setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
+
if (Subtarget.hasCDI()) {
setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
@@ -1561,12 +1565,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
- for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
- setOperationAction(ISD::AND, VT, Legal);
- setOperationAction(ISD::OR, VT, Legal);
- setOperationAction(ISD::XOR, VT, Legal);
- }
-
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
@@ -28479,9 +28477,7 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- if (VT != MVT::v2i64 && VT != MVT::v4i64 &&
- VT != MVT::v8i64 && VT != MVT::v16i32 &&
- VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX
+ if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
return SDValue();
// Canonicalize XOR to the left.
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 2db7ad35611..6dcb4627ec4 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -122,6 +122,10 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
+ // A vector tye of the same width with element type i64. This is used to
+ // create patterns for logic ops.
+ ValueType i64VT = !cast<ValueType>("v" # !srl(Size, 6) # "i64");
+
// A vector type of the same width with element type i32. This is used to
// create the canonical constant zero node ImmAllZerosV.
ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
@@ -387,6 +391,27 @@ multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
AttSrcAsm, IntelSrcAsm, [],[]>;
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction. In the masking case, the
+// perserved vector elements come from a new dummy input operand tied to $dst.
+multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskedRHS,
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0, SDNode Select = vselect> :
+ AVX512_maskable_custom<O, F, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, MaskedRHS, _.RC:$src0))],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, MaskedRHS,
+ _.ImmAllZerosV))],
+ "$src0 = $dst", itin, IsCommutable>;
+
// Bitcasts between 512-bit vector types. Return the original type since
// no instruction is needed for the conversion.
def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
@@ -3860,17 +3885,102 @@ defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
//===----------------------------------------------------------------------===//
// AVX-512 Logical Instructions
//===----------------------------------------------------------------------===//
-defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable = 0> {
+ defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
+ (bitconvert (_.VT _.RC:$src2)))),
+ (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ _.RC:$src2)))),
+ itins.rr, IsCommutable>,
+ AVX512BIBase, EVEX_4V;
+
+ defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
+ (bitconvert (_.LdFrag addr:$src2)))),
+ (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)))))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V;
+}
+
+multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable = 0> :
+ avx512_logic_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+ defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))),
+ (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+ Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+}
+
+multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ itins, prd, IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+ SDNode OpNode, OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
+ IsCommutable>;
+
+ defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
+ IsCommutable>;
+}
+
+defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and,
SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or,
SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
SSE_INTALU_ITINS_P, HasAVX512, 0>;
//===----------------------------------------------------------------------===//
@@ -7715,8 +7825,8 @@ multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>;
def : Pat<(xor
- (bc_v16i32 (v16i1sextv16i32)),
- (bc_v16i32 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
+ (bc_v8i64 (v16i1sextv16i32)),
+ (bc_v8i64 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
(VPABSDZrr VR512:$src)>;
def : Pat<(xor
(bc_v8i64 (v8i1sextv8i64)),
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index 72219a8413e..783983344cf 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -945,17 +945,17 @@ define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
define <16 x float> @test_fxor(<16 x float> %a) {
; AVX512F-LABEL: test_fxor:
; AVX512F: ## BB#0:
-; AVX512F-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_fxor:
; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_fxor:
; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_fxor:
@@ -1015,17 +1015,17 @@ declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
define <16 x float> @fabs_v16f32(<16 x float> %p)
; AVX512F-LABEL: fabs_v16f32:
; AVX512F: ## BB#0:
-; AVX512F-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fabs_v16f32:
; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: fabs_v16f32:
; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: fabs_v16f32:
diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll
index 543ce12a924..e4ee454dd00 100644
--- a/llvm/test/CodeGen/X86/avx512-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512-logic.ll
@@ -430,12 +430,17 @@ define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou
}
define <8 x i64> @test_mm512_mask_and_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
-; ALL-LABEL: test_mm512_mask_and_epi32:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vpandq %zmm2, %zmm1, %zmm1
-; ALL-NEXT: kmovw %edi, %k1
-; ALL-NEXT: vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
-; ALL-NEXT: retq
+; KNL-LABEL: test_mm512_mask_and_epi32:
+; KNL: ## BB#0: ## %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vpandd %zmm2, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_mm512_mask_and_epi32:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
entry:
%and1.i.i = and <8 x i64> %__a, %__b
%0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
@@ -447,12 +452,17 @@ entry:
}
define <8 x i64> @test_mm512_mask_or_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
-; ALL-LABEL: test_mm512_mask_or_epi32:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vporq %zmm2, %zmm1, %zmm1
-; ALL-NEXT: kmovw %edi, %k1
-; ALL-NEXT: vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
-; ALL-NEXT: retq
+; KNL-LABEL: test_mm512_mask_or_epi32:
+; KNL: ## BB#0: ## %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vpord %zmm2, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_mm512_mask_or_epi32:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
entry:
%or1.i.i = or <8 x i64> %__a, %__b
%0 = bitcast <8 x i64> %or1.i.i to <16 x i32>
@@ -464,12 +474,17 @@ entry:
}
define <8 x i64> @test_mm512_mask_xor_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
-; ALL-LABEL: test_mm512_mask_xor_epi32:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vpxorq %zmm2, %zmm1, %zmm1
-; ALL-NEXT: kmovw %edi, %k1
-; ALL-NEXT: vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
-; ALL-NEXT: retq
+; KNL-LABEL: test_mm512_mask_xor_epi32:
+; KNL: ## BB#0: ## %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vpxord %zmm2, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_mm512_mask_xor_epi32:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
entry:
%xor1.i.i = xor <8 x i64> %__a, %__b
%0 = bitcast <8 x i64> %xor1.i.i to <16 x i32>
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index d010a83277d..ee9be946c76 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -10,7 +10,7 @@ define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
; CHECK-NEXT: ## BB#1:
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT: LBB0_2:
-; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
index 86fd2dd5ad5..52faee31995 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -78,7 +78,7 @@ define <8 x double> @stack_fold_andnpd_zmm(<8 x double> %a0, <8 x double> %a1) {
define <16 x float> @stack_fold_andnps_zmm(<16 x float> %a0, <16 x float> %a1) {
;CHECK-LABEL: stack_fold_andnps_zmm
- ;CHECK: vpandnd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ ;CHECK: vpandnq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <16 x float> %a0 to <16 x i32>
%3 = bitcast <16 x float> %a1 to <16 x i32>
@@ -105,7 +105,7 @@ define <8 x double> @stack_fold_andpd_zmm(<8 x double> %a0, <8 x double> %a1) {
define <16 x float> @stack_fold_andps_zmm(<16 x float> %a0, <16 x float> %a1) {
;CHECK-LABEL: stack_fold_andps_zmm
- ;CHECK: vpandd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ ;CHECK: vpandq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <16 x float> %a0 to <16 x i32>
%3 = bitcast <16 x float> %a1 to <16 x i32>
@@ -295,7 +295,7 @@ define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) {
define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) {
;CHECK-LABEL: stack_fold_orps_zmm
- ;CHECK: vpord {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ ;CHECK: vporq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <16 x float> %a0 to <16 x i32>
%3 = bitcast <16 x float> %a1 to <16 x i32>
@@ -375,7 +375,7 @@ define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) {
define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) {
;CHECK-LABEL: stack_fold_xorps_zmm
- ;CHECK: vpxord {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ ;CHECK: vpxorq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <16 x float> %a0 to <16 x i32>
%3 = bitcast <16 x float> %a1 to <16 x i32>
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index f551e33383b..0f499c268e7 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -124,7 +124,7 @@ define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) {
define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
;CHECK-LABEL: stack_fold_andps
- ;CHECK: vpandd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vpandq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <4 x float> %a0 to <4 x i32>
%3 = bitcast <4 x float> %a1 to <4 x i32>
@@ -137,7 +137,7 @@ define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) {
;CHECK-LABEL: stack_fold_andps_ymm
- ;CHECK: vpandd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vpandq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <8 x float> %a0 to <8 x i32>
%3 = bitcast <8 x float> %a1 to <8 x i32>
@@ -314,7 +314,7 @@ define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
;CHECK-LABEL: stack_fold_orps
- ;CHECK: vpord {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vporq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <4 x float> %a0 to <4 x i32>
%3 = bitcast <4 x float> %a1 to <4 x i32>
@@ -327,7 +327,7 @@ define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) {
;CHECK-LABEL: stack_fold_orps_ymm
- ;CHECK: vpord {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vporq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <8 x float> %a0 to <8 x i32>
%3 = bitcast <8 x float> %a1 to <8 x i32>
@@ -398,7 +398,7 @@ define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
;CHECK-LABEL: stack_fold_xorps
- ;CHECK: vpxord {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vpxorq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <4 x float> %a0 to <4 x i32>
%3 = bitcast <4 x float> %a1 to <4 x i32>
@@ -411,7 +411,7 @@ define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
;CHECK-LABEL: stack_fold_xorps_ymm
- ;CHECK: vpxord {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vpxorq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <8 x float> %a0 to <8 x i32>
%3 = bitcast <8 x float> %a1 to <8 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index b4b52a60b9c..f9746bcfcde 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -2041,12 +2041,12 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1
; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm2
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512F-NEXT: vpord %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpslld $24, %zmm0, %zmm2
; AVX512F-NEXT: vpslld $8, %zmm0, %zmm0
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
OpenPOWER on IntegriCloud