author     Craig Topper <craig.topper@intel.com>   2017-09-24 17:28:14 +0000
committer  Craig Topper <craig.topper@intel.com>   2017-09-24 17:28:14 +0000
commit     23f18307489216db737eb4d7d0e4dec3c79e0761
tree       b113c7d83a83c8270ce5565795ef6b096121fef8
parent     554ab538dcbf7e49e7d028026ca495057972b261
[X86] Add IFMA instructions to the load folding tables and make them commutable for the multiply operands.
llvm-svn: 314080
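
Making the two multiply operands commutable matters for load folding because the memory form of VPMADD52H/LUQ takes the loaded value only in the $src3 slot; with the commute handling added in findCommutedOpIndices, the backend can swap $src2 and $src3 when the load feeds $src2 and still fold it. A minimal sketch of the user-visible effect (not part of this commit), assuming a compiler with AVX-512 IFMA enabled (-mavx512ifma) and the _mm512_madd52hi_epu64 intrinsic from <immintrin.h>:

#include <immintrin.h>

/* Sketch only: with the folding-table entries added below, the load of *b
 * can be folded directly into the multiply-add, roughly
 *   vpmadd52huq (%rdi), %zmm1, %zmm0
 * instead of a separate vector load followed by the register form. */
__m512i madd52hi_from_mem(__m512i acc, __m512i a, const __m512i *b) {
  return _mm512_madd52hi_epu64(acc, a, *b);
}

The new tests in avx512ifma-intrinsics.ll check both operand orders (the _load and _load_commute variants) so the commuted case is covered.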
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td            2
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.cpp            53
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll  70
3 files changed, 124 insertions, 1 deletion
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 1155b6dcb49..a8b7c80cdab 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -6484,7 +6484,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+          (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1, 1>,
           AVX512FMA3Base;
 
   defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 4b56807cffc..0561bcd8d0a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2528,6 +2528,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMT2PDrr,        X86::VPERMT2PDrm,        0 },
     { X86::VPERMT2Qrr,         X86::VPERMT2Qrm,         0 },
     { X86::VPERMT2Wrr,         X86::VPERMT2Wrm,         0 },
+    { X86::VPMADD52HUQZr,      X86::VPMADD52HUQZm,      0 },
+    { X86::VPMADD52LUQZr,      X86::VPMADD52LUQZm,      0 },
     { X86::VPTERNLOGDZrri,     X86::VPTERNLOGDZrmi,     0 },
     { X86::VPTERNLOGQZrri,     X86::VPTERNLOGQZrmi,     0 },
@@ -2544,6 +2546,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMT2PS256rr,     X86::VPERMT2PS256rm,     0 },
     { X86::VPERMT2Q256rr,      X86::VPERMT2Q256rm,      0 },
     { X86::VPERMT2W256rr,      X86::VPERMT2W256rm,      0 },
+    { X86::VPMADD52HUQZ256r,   X86::VPMADD52HUQZ256m,   0 },
+    { X86::VPMADD52LUQZ256r,   X86::VPMADD52LUQZ256m,   0 },
     { X86::VPTERNLOGDZ256rri,  X86::VPTERNLOGDZ256rmi,  0 },
     { X86::VPTERNLOGQZ256rri,  X86::VPTERNLOGQZ256rmi,  0 },
@@ -2560,6 +2564,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMT2PS128rr,     X86::VPERMT2PS128rm,     0 },
     { X86::VPERMT2Q128rr,      X86::VPERMT2Q128rm,      0 },
     { X86::VPERMT2W128rr,      X86::VPERMT2W128rm,      0 },
+    { X86::VPMADD52HUQZ128r,   X86::VPMADD52HUQZ128m,   0 },
+    { X86::VPMADD52LUQZ128r,   X86::VPMADD52LUQZ128m,   0 },
     { X86::VPTERNLOGDZ128rri,  X86::VPTERNLOGDZ128rmi,  0 },
     { X86::VPTERNLOGQZ128rri,  X86::VPTERNLOGQZ128rmi,  0 },
@@ -3234,6 +3240,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMT2Qrrk,        X86::VPERMT2Qrmk,        0 },
     { X86::VPERMT2Wrrk,        X86::VPERMT2Wrmk,        0 },
     { X86::VPERMWZrrk,         X86::VPERMWZrmk,         0 },
+    { X86::VPMADD52HUQZrk,     X86::VPMADD52HUQZmk,     0 },
+    { X86::VPMADD52LUQZrk,     X86::VPMADD52LUQZmk,     0 },
     { X86::VPMADDUBSWZrrk,     X86::VPMADDUBSWZrmk,     0 },
     { X86::VPMADDWDZrrk,       X86::VPMADDWDZrmk,       0 },
     { X86::VPMAXSBZrrk,        X86::VPMAXSBZrmk,        0 },
@@ -3376,6 +3384,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMT2Q256rrk,     X86::VPERMT2Q256rmk,     0 },
     { X86::VPERMT2W256rrk,     X86::VPERMT2W256rmk,     0 },
     { X86::VPERMWZ256rrk,      X86::VPERMWZ256rmk,      0 },
+    { X86::VPMADD52HUQZ256rk,  X86::VPMADD52HUQZ256mk,  0 },
+    { X86::VPMADD52LUQZ256rk,  X86::VPMADD52LUQZ256mk,  0 },
     { X86::VPMADDUBSWZ256rrk,  X86::VPMADDUBSWZ256rmk,  0 },
     { X86::VPMADDWDZ256rrk,    X86::VPMADDWDZ256rmk,    0 },
     { X86::VPMAXSBZ256rrk,     X86::VPMAXSBZ256rmk,     0 },
@@ -3509,6 +3519,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMT2Q128rrk,     X86::VPERMT2Q128rmk,     0 },
     { X86::VPERMT2W128rrk,     X86::VPERMT2W128rmk,     0 },
     { X86::VPERMWZ128rrk,      X86::VPERMWZ128rmk,      0 },
+    { X86::VPMADD52HUQZ128rk,  X86::VPMADD52HUQZ128mk,  0 },
+    { X86::VPMADD52LUQZ128rk,  X86::VPMADD52LUQZ128mk,  0 },
     { X86::VPMADDUBSWZ128rrk,  X86::VPMADDUBSWZ128rmk,  0 },
     { X86::VPMADDWDZ128rrk,    X86::VPMADDWDZ128rmk,    0 },
     { X86::VPMAXSBZ128rrk,     X86::VPMAXSBZ128rmk,     0 },
@@ -3597,6 +3609,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMT2PDrrkz,      X86::VPERMT2PDrmkz,      0 },
     { X86::VPERMT2Qrrkz,       X86::VPERMT2Qrmkz,       0 },
     { X86::VPERMT2Wrrkz,       X86::VPERMT2Wrmkz,       0 },
+    { X86::VPMADD52HUQZrkz,    X86::VPMADD52HUQZmkz,    0 },
+    { X86::VPMADD52LUQZrkz,    X86::VPMADD52LUQZmkz,    0 },
     { X86::VPTERNLOGDZrrikz,   X86::VPTERNLOGDZrmikz,   0 },
     { X86::VPTERNLOGQZrrikz,   X86::VPTERNLOGQZrmikz,   0 },
@@ -3613,6 +3627,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMT2PS256rrkz,   X86::VPERMT2PS256rmkz,   0 },
     { X86::VPERMT2Q256rrkz,    X86::VPERMT2Q256rmkz,    0 },
     { X86::VPERMT2W256rrkz,    X86::VPERMT2W256rmkz,    0 },
+    { X86::VPMADD52HUQZ256rkz, X86::VPMADD52HUQZ256mkz, 0 },
+    { X86::VPMADD52LUQZ256rkz, X86::VPMADD52LUQZ256mkz, 0 },
     { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
     { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
@@ -3629,6 +3645,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMT2PS128rrkz,   X86::VPERMT2PS128rmkz,   0 },
     { X86::VPERMT2Q128rrkz,    X86::VPERMT2Q128rmkz,    0 },
     { X86::VPERMT2W128rrkz,    X86::VPERMT2W128rmkz,    0 },
+    { X86::VPMADD52HUQZ128rkz, X86::VPMADD52HUQZ128mkz, 0 },
+    { X86::VPMADD52LUQZ128rkz, X86::VPMADD52LUQZ128mkz, 0 },
     { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
     { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
   };
@@ -5631,6 +5649,41 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
   case X86::VPTERNLOGQZ256rmbikz:
   case X86::VPTERNLOGQZrmbikz:
     return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+  case X86::VPMADD52HUQZ128r:
+  case X86::VPMADD52HUQZ128rk:
+  case X86::VPMADD52HUQZ128rkz:
+  case X86::VPMADD52HUQZ256r:
+  case X86::VPMADD52HUQZ256rk:
+  case X86::VPMADD52HUQZ256rkz:
+  case X86::VPMADD52HUQZr:
+  case X86::VPMADD52HUQZrk:
+  case X86::VPMADD52HUQZrkz:
+  case X86::VPMADD52LUQZ128r:
+  case X86::VPMADD52LUQZ128rk:
+  case X86::VPMADD52LUQZ128rkz:
+  case X86::VPMADD52LUQZ256r:
+  case X86::VPMADD52LUQZ256rk:
+  case X86::VPMADD52LUQZ256rkz:
+  case X86::VPMADD52LUQZr:
+  case X86::VPMADD52LUQZrk:
+  case X86::VPMADD52LUQZrkz: {
+    unsigned CommutableOpIdx1 = 2;
+    unsigned CommutableOpIdx2 = 3;
+    if (Desc.TSFlags & X86II::EVEX_K) {
+      // Skip the mask register.
+      ++CommutableOpIdx1;
+      ++CommutableOpIdx2;
+    }
+    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+                              CommutableOpIdx1, CommutableOpIdx2))
+      return false;
+    if (!MI.getOperand(SrcOpIdx1).isReg() ||
+        !MI.getOperand(SrcOpIdx2).isReg())
+      // No idea.
+      return false;
+    return true;
+  }
+
   default:
     const X86InstrFMA3Group *FMA3Group =
         X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 2df2d528dca..3a27d9dbf00 100644
--- a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -112,3 +112,73 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x
   %res6 = add <8 x i64> %res5, %res4
   ret <8 x i64> %res6
 }
+
+define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr) {
+; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmadd52huq (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    retq
+
+  %x2 = load <8 x i64>, <8 x i64>* %x2ptr
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmadd52huq (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    retq
+
+  %x1 = load <8 x i64>, <8 x i64>* %x1ptr
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+
+  %x2 = load <8 x i64>, <8 x i64>* %x2ptr
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+  ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+
+  %x1 = load <8 x i64>, <8 x i64>* %x1ptr
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+  ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+
+  %x2 = load <8 x i64>, <8 x i64>* %x2ptr
+  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+  ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+
+  %x1 = load <8 x i64>, <8 x i64>* %x1ptr
+  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+  ret <8 x i64> %res
+}