| | | |
|---|---|---|
| author | Amara Emerson <aemerson@apple.com> | 2019-03-04 19:16:00 +0000 |
| committer | Amara Emerson <aemerson@apple.com> | 2019-03-04 19:16:00 +0000 |
| commit | 8acb0d9c82ed49512ba2df6e4a0e3ee65a220fdf | |
| tree | 282b6336419bda3f1510960aabf1d156f641f8e3 /llvm/lib/Target | |
| parent | 05e233507697ab81007a6b48728add2ee4627e5b | |
Re-commit r355104: "[AArch64][GlobalISel] Add support for 64 bit vector shuffle using TBL1."
The code to materialize a mask from a constant pool load tried to use a 128-bit
LDR to load a 64-bit constant pool entry, which was only 8-byte aligned. This
caused a link failure in the NEON tests in the test suite, since the 128-bit LDR
address was unaligned. This change instead emits a 64-bit LDR when the entry is
64 bits wide, then converts the result back to a 128-bit register for the TBL.
llvm-svn: 355326
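
Below is a minimal standalone sketch of the size dispatch described above. It is illustrative only, not the selector code from the patch: the helper name `pickConstantPoolLoadOpc`, its string return value, and the driver in `main` are made up for this example. The idea it demonstrates is that an 8-byte (64-bit) constant pool entry gets the 64-bit LDRDui while a 16-byte entry gets the 128-bit LDRQui, so the load width always matches the entry's alignment; the 64-bit load result is then widened to a 128-bit register before feeding TBL.

```cpp
// Hypothetical sketch of the opcode choice made by the patch; the function
// name and its string return are illustrative, not part of LLVM's API.
#include <cstdint>
#include <iostream>
#include <string>

std::string pickConstantPoolLoadOpc(uint64_t EntryStoreSizeInBytes) {
  switch (EntryStoreSizeInBytes) {
  case 8:
    // 64-bit entry, only 8-byte aligned: use a 64-bit load into an FPR64,
    // then widen the result to an FPR128 before the TBL.
    return "LDRDui";
  case 16:
    // 128-bit entry: a 128-bit load is correctly aligned as-is.
    return "LDRQui";
  default:
    // Unsupported entry size: the selector bails out in this case.
    return "";
  }
}

int main() {
  std::cout << pickConstantPoolLoadOpc(8) << "\n";  // LDRDui
  std::cout << pickConstantPoolLoadOpc(16) << "\n"; // LDRQui
}
```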
Diffstat (limited to 'llvm/lib/Target')
| | | |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp | 189 |
1 file changed, 153 insertions, 36 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 2a5599f665d..41f4eb9563a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -67,7 +67,7 @@ private:
   // Helper to generate an equivalent of scalar_to_vector into a new register,
   // returned via 'Dst'.
-  MachineInstr *emitScalarToVector(const LLT DstTy,
+  MachineInstr *emitScalarToVector(unsigned EltSize,
                                    const TargetRegisterClass *DstRC,
                                    unsigned Scalar,
                                    MachineIRBuilder &MIRBuilder) const;
@@ -82,6 +82,8 @@ private:
   unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
   MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
                                          MachineIRBuilder &MIRBuilder) const;
+  MachineInstr *emitVectorConcat(unsigned Op1, unsigned Op2,
+                                 MachineIRBuilder &MIRBuilder) const;
 
   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
 
@@ -1713,7 +1715,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
 }
 
 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
-    const LLT DstTy, const TargetRegisterClass *DstRC, unsigned Scalar,
+    unsigned EltSize, const TargetRegisterClass *DstRC, unsigned Scalar,
     MachineIRBuilder &MIRBuilder) const {
   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
 
@@ -1727,7 +1729,7 @@ MachineInstr *AArch64InstructionSelector::emitScalarToVector(
     return &*Ins;
   };
 
-  switch (DstTy.getElementType().getSizeInBits()) {
+  switch (EltSize) {
   case 16:
     return BuildFn(AArch64::hsub);
   case 32:
@@ -1957,13 +1959,123 @@ MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
   auto Adrp =
       MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
           .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
-  auto Load =
-      MIRBuilder.buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
-          .addConstantPoolIndex(CPIdx, 0,
-                                AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+  MachineInstr *LoadMI = nullptr;
+  switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) {
+  case 16:
+    LoadMI =
+        &*MIRBuilder
+              .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
+              .addConstantPoolIndex(CPIdx, 0,
+                                    AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    break;
+  case 8:
+    LoadMI = &*MIRBuilder
+                 .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
+                 .addConstantPoolIndex(
+                     CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    break;
+  default:
+    LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
+                      << *CPVal->getType());
+    return nullptr;
+  }
   constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
-  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
-  return &*Load;
+  constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
+  return LoadMI;
+}
+
+/// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
+/// size and RB.
+static std::pair<unsigned, unsigned>
+getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
+  unsigned Opc, SubregIdx;
+  if (RB.getID() == AArch64::GPRRegBankID) {
+    if (EltSize == 32) {
+      Opc = AArch64::INSvi32gpr;
+      SubregIdx = AArch64::ssub;
+    } else if (EltSize == 64) {
+      Opc = AArch64::INSvi64gpr;
+      SubregIdx = AArch64::dsub;
+    } else {
+      llvm_unreachable("invalid elt size!");
+    }
+  } else {
+    if (EltSize == 8) {
+      Opc = AArch64::INSvi8lane;
+      SubregIdx = AArch64::bsub;
+    } else if (EltSize == 16) {
+      Opc = AArch64::INSvi16lane;
+      SubregIdx = AArch64::hsub;
+    } else if (EltSize == 32) {
+      Opc = AArch64::INSvi32lane;
+      SubregIdx = AArch64::ssub;
+    } else if (EltSize == 64) {
+      Opc = AArch64::INSvi64lane;
+      SubregIdx = AArch64::dsub;
+    } else {
+      llvm_unreachable("invalid elt size!");
+    }
+  }
+  return std::make_pair(Opc, SubregIdx);
+}
+
+MachineInstr *AArch64InstructionSelector::emitVectorConcat(
+    unsigned Op1, unsigned Op2, MachineIRBuilder &MIRBuilder) const {
+  // We implement a vector concat by:
+  // 1. Use scalar_to_vector to insert the lower vector into the larger dest
+  // 2. Insert the upper vector into the destination's upper element
+  // TODO: some of this code is common with G_BUILD_VECTOR handling.
+  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+
+  const LLT Op1Ty = MRI.getType(Op1);
+  const LLT Op2Ty = MRI.getType(Op2);
+
+  if (Op1Ty != Op2Ty) {
+    LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
+    return nullptr;
+  }
+  assert(Op1Ty.isVector() && "Expected a vector for vector concat");
+
+  if (Op1Ty.getSizeInBits() >= 128) {
+    LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
+    return nullptr;
+  }
+
+  // At the moment we just support 64 bit vector concats.
+  if (Op1Ty.getSizeInBits() != 64) {
+    LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
+    return nullptr;
+  }
+
+  const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
+  const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
+  const TargetRegisterClass *DstRC =
+      getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
+
+  MachineInstr *WidenedOp1 =
+      emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
+  MachineInstr *WidenedOp2 =
+      emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
+  if (!WidenedOp1 || !WidenedOp2) {
+    LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
+    return nullptr;
+  }
+
+  // Now do the insert of the upper element.
+  unsigned InsertOpc, InsSubRegIdx;
+  std::tie(InsertOpc, InsSubRegIdx) =
+      getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
+
+  auto InsElt =
+      MIRBuilder
+          .buildInstr(InsertOpc, {DstRC}, {WidenedOp1->getOperand(0).getReg()})
+          .addImm(1) /* Lane index */
+          .addUse(WidenedOp2->getOperand(0).getReg())
+          .addImm(0);
+
+  constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
+  return &*InsElt;
 }
 
 bool AArch64InstructionSelector::selectShuffleVector(
@@ -2003,21 +2115,43 @@ bool AArch64InstructionSelector::selectShuffleVector(
     }
   }
 
-  if (DstTy.getSizeInBits() != 128) {
-    assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
-    // This case can be done with TBL1.
-    return false;
-  }
+  MachineIRBuilder MIRBuilder(I);
 
   // Use a constant pool to load the index vector for TBL.
   Constant *CPVal = ConstantVector::get(CstIdxs);
-  MachineIRBuilder MIRBuilder(I);
   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
   if (!IndexLoad) {
     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
     return false;
   }
 
+  if (DstTy.getSizeInBits() != 128) {
+    assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
+    // This case can be done with TBL1.
+    MachineInstr *Concat = emitVectorConcat(Src1Reg, Src2Reg, MIRBuilder);
+    if (!Concat) {
+      LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
+      return false;
+    }
+
+    // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
+    IndexLoad =
+        emitScalarToVector(64, &AArch64::FPR128RegClass,
+                           IndexLoad->getOperand(0).getReg(), MIRBuilder);
+
+    auto TBL1 = MIRBuilder.buildInstr(
+        AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
+        {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
+    constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
+
+    auto Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                        TII.get(TargetOpcode::COPY), I.getOperand(0).getReg())
+                    .addUse(TBL1->getOperand(0).getReg(), 0, AArch64::dsub);
+    RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
+    I.eraseFromParent();
+    return true;
+  }
+
   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
   // Q registers for regalloc.
   auto RegSeq = MIRBuilder
@@ -2049,32 +2183,15 @@ bool AArch64InstructionSelector::selectBuildVector(
   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
   unsigned Opc;
   unsigned SubregIdx;
-  if (RB.getID() == AArch64::GPRRegBankID) {
-    if (EltSize == 32) {
-      Opc = AArch64::INSvi32gpr;
-      SubregIdx = AArch64::ssub;
-    } else {
-      Opc = AArch64::INSvi64gpr;
-      SubregIdx = AArch64::dsub;
-    }
-  } else {
-    if (EltSize == 16) {
-      Opc = AArch64::INSvi16lane;
-      SubregIdx = AArch64::hsub;
-    } else if (EltSize == 32) {
-      Opc = AArch64::INSvi32lane;
-      SubregIdx = AArch64::ssub;
-    } else {
-      Opc = AArch64::INSvi64lane;
-      SubregIdx = AArch64::dsub;
-    }
-  }
+
+  std::tie(Opc, SubregIdx) = getInsertVecEltOpInfo(RB, EltSize);
 
   MachineIRBuilder MIRBuilder(I);
 
   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
   MachineInstr *ScalarToVec =
-      emitScalarToVector(DstTy, DstRC, I.getOperand(1).getReg(), MIRBuilder);
+      emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
+                         I.getOperand(1).getReg(), MIRBuilder);
   if (!ScalarToVec)
     return false;

