diff options
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r-- | llvm/lib/Target/X86/X86InstrFoldTables.cpp | 74 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrFoldTables.h | 13 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.cpp | 78 |
3 files changed, 155 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index d42fec3770c..4f98fdf5ea4 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -5245,6 +5245,69 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 }, }; +static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = { + { X86::VADDPDZ128rr, X86::VADDPDZ128rmb, TB_BCAST_SD }, + { X86::VADDPDZ256rr, X86::VADDPDZ256rmb, TB_BCAST_SD }, + { X86::VADDPDZrr, X86::VADDPDZrmb, TB_BCAST_SD }, + { X86::VADDPSZ128rr, X86::VADDPSZ128rmb, TB_BCAST_SS }, + { X86::VADDPSZ256rr, X86::VADDPSZ256rmb, TB_BCAST_SS }, + { X86::VADDPSZrr, X86::VADDPSZrmb, TB_BCAST_SS }, + { X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD }, + { X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD }, + { X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD }, + { X86::VDIVPSZ128rr, X86::VDIVPSZ128rmb, TB_BCAST_SS }, + { X86::VDIVPSZ256rr, X86::VDIVPSZ256rmb, TB_BCAST_SS }, + { X86::VDIVPSZrr, X86::VDIVPSZrmb, TB_BCAST_SS }, + { X86::VMULPDZ128rr, X86::VMULPDZ128rmb, TB_BCAST_SD }, + { X86::VMULPDZ256rr, X86::VMULPDZ256rmb, TB_BCAST_SD }, + { X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD }, + { X86::VMULPSZ128rr, X86::VMULPSZ128rmb, TB_BCAST_SS }, + { X86::VMULPSZ256rr, X86::VMULPSZ256rmb, TB_BCAST_SS }, + { X86::VMULPSZrr, X86::VMULPSZrmb, TB_BCAST_SS }, + { X86::VPADDDZ128rr, X86::VPADDDZ128rmb, TB_BCAST_D }, + { X86::VPADDDZ256rr, X86::VPADDDZ256rmb, TB_BCAST_D }, + { X86::VPADDDZrr, X86::VPADDDZrmb, TB_BCAST_D }, + { X86::VPADDQZ128rr, X86::VPADDQZ128rmb, TB_BCAST_Q }, + { X86::VPADDQZ256rr, X86::VPADDQZ256rmb, TB_BCAST_Q }, + { X86::VPADDQZrr, X86::VPADDQZrmb, TB_BCAST_Q }, + { X86::VPANDDZ128rr, X86::VPANDDZ128rmb, TB_BCAST_D }, + { X86::VPANDDZ256rr, X86::VPANDDZ256rmb, TB_BCAST_D }, + { X86::VPANDDZrr, X86::VPANDDZrmb, TB_BCAST_D }, + { X86::VPANDNDZ128rr, X86::VPANDNDZ128rmb, TB_BCAST_D }, + { 
X86::VPANDNDZ256rr, X86::VPANDNDZ256rmb, TB_BCAST_D }, + { X86::VPANDNDZrr, X86::VPANDNDZrmb, TB_BCAST_D }, + { X86::VPANDNQZ128rr, X86::VPANDNQZ128rmb, TB_BCAST_Q }, + { X86::VPANDNQZ256rr, X86::VPANDNQZ256rmb, TB_BCAST_Q }, + { X86::VPANDNQZrr, X86::VPANDNQZrmb, TB_BCAST_Q }, + { X86::VPANDQZ128rr, X86::VPANDQZ128rmb, TB_BCAST_Q }, + { X86::VPANDQZ256rr, X86::VPANDQZ256rmb, TB_BCAST_Q }, + { X86::VPANDQZrr, X86::VPANDQZrmb, TB_BCAST_Q }, + { X86::VPMULLDZ128rr, X86::VPMULLDZ128rmb, TB_BCAST_D }, + { X86::VPMULLDZ256rr, X86::VPMULLDZ256rmb, TB_BCAST_D }, + { X86::VPMULLDZrr, X86::VPMULLDZrmb, TB_BCAST_D }, + { X86::VPMULLQZ128rr, X86::VPMULLQZ128rmb, TB_BCAST_Q }, + { X86::VPMULLQZ256rr, X86::VPMULLQZ256rmb, TB_BCAST_Q }, + { X86::VPMULLQZrr, X86::VPMULLQZrmb, TB_BCAST_Q }, + { X86::VPORDZ128rr, X86::VPORDZ128rmb, TB_BCAST_D }, + { X86::VPORDZ256rr, X86::VPORDZ256rmb, TB_BCAST_D }, + { X86::VPORDZrr, X86::VPORDZrmb, TB_BCAST_D }, + { X86::VPORQZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q }, + { X86::VPORQZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q }, + { X86::VPORQZrr, X86::VPORQZrmb, TB_BCAST_Q }, + { X86::VPXORDZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D }, + { X86::VPXORDZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D }, + { X86::VPXORDZrr, X86::VPXORDZrmb, TB_BCAST_D }, + { X86::VPXORQZ128rr, X86::VPXORQZ128rmb, TB_BCAST_Q }, + { X86::VPXORQZ256rr, X86::VPXORQZ256rmb, TB_BCAST_Q }, + { X86::VPXORQZrr, X86::VPXORQZrmb, TB_BCAST_Q }, + { X86::VSUBPDZ128rr, X86::VSUBPDZ128rmb, TB_BCAST_SD }, + { X86::VSUBPDZ256rr, X86::VSUBPDZ256rmb, TB_BCAST_SD }, + { X86::VSUBPDZrr, X86::VSUBPDZrmb, TB_BCAST_SD }, + { X86::VSUBPSZ128rr, X86::VSUBPSZ128rmb, TB_BCAST_SS }, + { X86::VSUBPSZ256rr, X86::VSUBPSZ256rmb, TB_BCAST_SS }, + { X86::VSUBPSZrr, X86::VSUBPSZrmb, TB_BCAST_SS }, +}; + static const X86MemoryFoldTableEntry * lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) { #ifndef NDEBUG @@ -5287,6 +5350,12 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, 
unsigned RegOp) { std::end(MemoryFoldTable4)) == std::end(MemoryFoldTable4) && "MemoryFoldTable4 is not sorted and unique!"); + assert(std::is_sorted(std::begin(BroadcastFoldTable2), + std::end(BroadcastFoldTable2)) && + std::adjacent_find(std::begin(BroadcastFoldTable2), + std::end(BroadcastFoldTable2)) == + std::end(BroadcastFoldTable2) && + "BroadcastFoldTable2 is not sorted and unique!"); FoldTablesChecked.store(true, std::memory_order_relaxed); } #endif @@ -5355,6 +5424,11 @@ struct X86MemUnfoldTable { // Index 4, folded load addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD); + // Broadcast tables. + for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable2) + // Index 2, folded broadcast + addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); + // Sort the memory->reg unfold table. array_pod_sort(Table.begin(), Table.end()); diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.h b/llvm/lib/Target/X86/X86InstrFoldTables.h index 4efbeb9f0be..7dc236a0d7e 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.h +++ b/llvm/lib/Target/X86/X86InstrFoldTables.h @@ -38,7 +38,7 @@ enum { TB_FOLDED_LOAD = 1 << 5, TB_FOLDED_STORE = 1 << 6, - // Unused bit 7 + TB_FOLDED_BCAST = 1 << 7, // Minimum alignment required for load/store. // Used for RegOp->MemOp conversion. Encoded as Log2(Align) + 1 to allow 0 @@ -51,7 +51,16 @@ enum { TB_ALIGN_64 = 7 << TB_ALIGN_SHIFT, TB_ALIGN_MASK = 0xf << TB_ALIGN_SHIFT, - // Unused bits 12-15 + // Broadcast type. + // (stored in bits 12 - 13) + TB_BCAST_TYPE_SHIFT = 12, + TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_MASK = 0x3 << TB_BCAST_TYPE_SHIFT, + + // Unused bits 14-15 }; // This struct is used for both the folding and unfold tables. 
They KeyOp diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 88c5ae2d416..d597ec90ea6 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -5311,6 +5311,51 @@ extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) { return StoreMMOs; } +static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I, + const TargetRegisterClass *RC, + const X86Subtarget &STI) { + assert(STI.hasAVX512() && "Expected at least AVX512!"); + unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC); + assert((SpillSize == 64 || STI.hasVLX()) && + "Can't broadcast less than 64 bytes without AVX512VL!"); + + switch (I->Flags & TB_BCAST_MASK) { + default: llvm_unreachable("Unexpected broadcast type!"); + case TB_BCAST_D: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VPBROADCASTDZ128m; + case 32: return X86::VPBROADCASTDZ256m; + case 64: return X86::VPBROADCASTDZm; + } + break; + case TB_BCAST_Q: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VPBROADCASTQZ128m; + case 32: return X86::VPBROADCASTQZ256m; + case 64: return X86::VPBROADCASTQZm; + } + break; + case TB_BCAST_SS: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VBROADCASTSSZ128m; + case 32: return X86::VBROADCASTSSZ256m; + case 64: return X86::VBROADCASTSSZm; + } + break; + case TB_BCAST_SD: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VMOVDDUPZ128rm; + case 32: return X86::VBROADCASTSDZ256m; + case 64: return X86::VBROADCASTSDZm; + } + break; + } +} + bool X86InstrInfo::unfoldMemoryOperand( MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const { @@ -5321,6 +5366,7 @@ bool X86InstrInfo::unfoldMemoryOperand( unsigned Index = I->Flags & 
TB_INDEX_MASK; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; + bool FoldedBCast = I->Flags & TB_FOLDED_BCAST; if (UnfoldLoad && !FoldedLoad) return false; UnfoldLoad &= FoldedLoad; @@ -5329,6 +5375,7 @@ bool X86InstrInfo::unfoldMemoryOperand( UnfoldStore &= FoldedStore; const MCInstrDesc &MCID = get(Opc); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // TODO: Check if 32-byte or greater accesses are slow too? @@ -5354,12 +5401,19 @@ bool X86InstrInfo::unfoldMemoryOperand( AfterOps.push_back(Op); } - // Emit the load instruction. + // Emit the load or broadcast instruction. if (UnfoldLoad) { auto MMOs = extractLoadMMOs(MI.memoperands(), MF); - unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - unsigned Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget); + + unsigned Opc; + if (FoldedBCast) { + Opc = getBroadcastOpcode(I, RC, Subtarget); + } else { + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget); + } + DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg); for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) @@ -5460,6 +5514,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, unsigned Index = I->Flags & TB_INDEX_MASK; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; + bool FoldedBCast = I->Flags & TB_FOLDED_BCAST; const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); @@ -5493,10 +5548,17 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 
32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, - VT, MVT::Other, AddrOps); + + unsigned Opc; + if (FoldedBCast) { + Opc = getBroadcastOpcode(I, RC, Subtarget); + } else { + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget); + } + + Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps); NewNodes.push_back(Load); // Preserve memory reference information. |