diff options
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 83 | ||||
| -rw-r--r-- | llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp | 35 | ||||
| -rw-r--r-- | llvm/test/CodeGen/SystemZ/memcmp-01.ll | 92 | ||||
| -rw-r--r-- | llvm/test/CodeGen/SystemZ/memcmp-02.ll | 8 | 
4 files changed, 188 insertions, 30 deletions
| diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 216ca3450c5..dd230c62a5a 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1954,6 +1954,18 @@ static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {    return NewMBB;  } +// Split MBB after MI and return the new block (the one that contains +// instructions after MI). +static MachineBasicBlock *splitBlockAfter(MachineInstr *MI, +                                          MachineBasicBlock *MBB) { +  MachineBasicBlock *NewMBB = emitBlockAfter(MBB); +  NewMBB->splice(NewMBB->begin(), MBB, +                 llvm::next(MachineBasicBlock::iterator(MI)), +                 MBB->end()); +  NewMBB->transferSuccessorsAndUpdatePHIs(MBB); +  return NewMBB; +} +  // Split MBB before MI and return the new block (the one that contains MI).  static MachineBasicBlock *splitBlockBefore(MachineInstr *MI,                                             MachineBasicBlock *MBB) { @@ -2490,6 +2502,11 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,    uint64_t       SrcDisp  = MI->getOperand(3).getImm();    uint64_t       Length   = MI->getOperand(4).getImm(); +  // When generating more than one CLC, all but the last will need to +  // branch to the end when a difference is found. +  MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ? +                               splitBlockAfter(MI, MBB) : 0); +    // Check for the loop form, in which operand 5 is the trip count.    if (MI->getNumExplicitOperands() > 5) {      bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); @@ -2514,6 +2531,7 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,      MachineBasicBlock *StartMBB = MBB;      MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);      MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); +    MachineBasicBlock *NextMBB = (EndMBB ? 
emitBlockAfter(LoopMBB) : LoopMBB);      //  StartMBB:      //   # fall through to LoopMMB @@ -2521,39 +2539,54 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,      //  LoopMBB:      //   %ThisDestReg = phi [ %StartDestReg, StartMBB ], -    //                      [ %NextDestReg, LoopMBB ] +    //                      [ %NextDestReg, NextMBB ]      //   %ThisSrcReg = phi [ %StartSrcReg, StartMBB ], -    //                     [ %NextSrcReg, LoopMBB ] +    //                     [ %NextSrcReg, NextMBB ]      //   %ThisCountReg = phi [ %StartCountReg, StartMBB ], -    //                       [ %NextCountReg, LoopMBB ] -    //   PFD 2, 768+DestDisp(%ThisDestReg) +    //                       [ %NextCountReg, NextMBB ] +    //   ( PFD 2, 768+DestDisp(%ThisDestReg) )      //   Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg) -    //   %NextDestReg = LA 256(%ThisDestReg) -    //   %NextSrcReg = LA 256(%ThisSrcReg) -    //   %NextCountReg = AGHI %ThisCountReg, -1 -    //   CGHI %NextCountReg, 0 -    //   JLH LoopMBB -    //   # fall through to DoneMMB +    //   ( JLH EndMBB )      // -    // The AGHI, CGHI and JLH should be converted to BRCTG by later passes. +    // The prefetch is used only for MVC.  The JLH is used only for CLC.      
MBB = LoopMBB;      BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)        .addReg(StartDestReg).addMBB(StartMBB) -      .addReg(NextDestReg).addMBB(LoopMBB); +      .addReg(NextDestReg).addMBB(NextMBB);      if (!HaveSingleBase)        BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)          .addReg(StartSrcReg).addMBB(StartMBB) -        .addReg(NextSrcReg).addMBB(LoopMBB); +        .addReg(NextSrcReg).addMBB(NextMBB);      BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)        .addReg(StartCountReg).addMBB(StartMBB) -      .addReg(NextCountReg).addMBB(LoopMBB); -    BuildMI(MBB, DL, TII->get(SystemZ::PFD)) -      .addImm(SystemZ::PFD_WRITE) -      .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0); +      .addReg(NextCountReg).addMBB(NextMBB); +    if (Opcode == SystemZ::MVC) +      BuildMI(MBB, DL, TII->get(SystemZ::PFD)) +        .addImm(SystemZ::PFD_WRITE) +        .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);      BuildMI(MBB, DL, TII->get(Opcode))        .addReg(ThisDestReg).addImm(DestDisp).addImm(256)        .addReg(ThisSrcReg).addImm(SrcDisp); +    if (EndMBB) { +      BuildMI(MBB, DL, TII->get(SystemZ::BRC)) +        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) +        .addMBB(EndMBB); +      MBB->addSuccessor(EndMBB); +      MBB->addSuccessor(NextMBB); +    } + +    // NextMBB: +    //   %NextDestReg = LA 256(%ThisDestReg) +    //   %NextSrcReg = LA 256(%ThisSrcReg) +    //   %NextCountReg = AGHI %ThisCountReg, -1 +    //   CGHI %NextCountReg, 0 +    //   JLH LoopMBB +    //   # fall through to DoneMMB +    // +    // The AGHI, CGHI and JLH should be converted to BRCTG by later passes. 
+    MBB = NextMBB; +      BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)        .addReg(ThisDestReg).addImm(256).addReg(0);      if (!HaveSingleBase) @@ -2599,6 +2632,22 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,      DestDisp += ThisLength;      SrcDisp += ThisLength;      Length -= ThisLength; +    // If there's another CLC to go, branch to the end if a difference +    // was found. +    if (EndMBB && Length > 0) { +      MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB); +      BuildMI(MBB, DL, TII->get(SystemZ::BRC)) +        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) +        .addMBB(EndMBB); +      MBB->addSuccessor(EndMBB); +      MBB->addSuccessor(NextMBB); +      MBB = NextMBB; +    } +  } +  if (EndMBB) { +    MBB->addSuccessor(EndMBB); +    MBB = EndMBB; +    MBB->addLiveIn(SystemZ::CC);    }    MI->eraseFromParent(); diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 6026b1f6f0e..dc2f225acb2 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -141,6 +141,28 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,    return SDValue();  } +// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size), +// deciding whether to use a loop or straight-line code. +static SDValue emitCLC(SelectionDAG &DAG, SDLoc DL, SDValue Chain, +                       SDValue Src1, SDValue Src2, uint64_t Size) { +  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); +  EVT PtrVT = Src1.getValueType(); +  // A two-CLC sequence is a clear win over a loop, not least because it +  // needs only one branch.  A three-CLC sequence needs the same number +  // of branches as a loop (i.e. 2), but is shorter.  That brings us to +  // lengths greater than 768 bytes.  
It seems relatively likely that +  // a difference will be found within the first 768 bytes, so we just +  // optimize for the smallest number of branch instructions, in order +  // to avoid polluting the prediction buffer too much.  A loop only ever +  // needs 2 branches, whereas a straight-line sequence would need 3 or more. +  if (Size > 3 * 256) +    return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2, +                       DAG.getConstant(Size, PtrVT), +                       DAG.getConstant(Size / 256, PtrVT)); +  return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2, +                     DAG.getConstant(Size, PtrVT)); +} +  // Convert the current CC value into an integer that is 0 if CC == 0,  // less than zero if CC == 1 and greater than zero if CC >= 2.  // The sequence starts with IPM, which puts CC into bits 29 and 28 @@ -159,17 +181,12 @@ EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,                          SDValue Src1, SDValue Src2, SDValue Size,                          MachinePointerInfo Op1PtrInfo,                          MachinePointerInfo Op2PtrInfo) const { -  EVT PtrVT = Src1.getValueType();    if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {      uint64_t Bytes = CSize->getZExtValue(); -    if (Bytes >= 1 && Bytes <= 0x100) { -      // A single CLC. 
-      SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); -      Chain = DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, -                          Src1, Src2, Size, DAG.getConstant(0, PtrVT)); -      SDValue Glue = Chain.getValue(1); -      return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain); -    } +    assert(Bytes > 0 && "Caller should have handled 0-size case"); +    Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes); +    SDValue Glue = Chain.getValue(1); +    return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);    }    return std::make_pair(SDValue(), SDValue());  } diff --git a/llvm/test/CodeGen/SystemZ/memcmp-01.ll b/llvm/test/CodeGen/SystemZ/memcmp-01.ll index 5f5752b336d..a0144194693 100644 --- a/llvm/test/CodeGen/SystemZ/memcmp-01.ll +++ b/llvm/test/CodeGen/SystemZ/memcmp-01.ll @@ -123,11 +123,99 @@ exit:    ret i32 %res  } -; 257 bytes is too big for a single CLC.  For now expect a call instead. +; 257 bytes needs two CLCs.  define i32 @f8(i8 *%src1, i8 *%src2) {  ; CHECK-LABEL: f8: -; CHECK: brasl %r14, memcmp@PLT +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(1,%r2), 256(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]]  ; CHECK: br %r14    %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)    ret i32 %res  } + +; Test a comparison of 257 bytes in which the CC result can be used directly. +define void @f9(i8 *%src1, i8 *%src2, i32 *%dest) { +; CHECK-LABEL: f9: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(1,%r2), 256(%r3) +; CHECK: [[LABEL]]: +; CHECK-NEXT: jl .L +; CHECK: br %r14 +entry: +  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257) +  %cmp = icmp slt i32 %res, 0 +  br i1 %cmp, label %exit, label %store + +store: +  store i32 0, i32 *%dest +  br label %exit + +exit: +  ret void +} + +; Test the largest size that can use two CLCs. 
+define i32 @f10(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f10: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(256,%r2), 256(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: br %r14 +  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 512) +  ret i32 %res +} + +; Test the smallest size that needs 3 CLCs. +define i32 @f11(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f11: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(256,%r2), 256(%r3) +; CHECK: jlh [[LABEL]] +; CHECK: clc 512(1,%r2), 512(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: br %r14 +  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 513) +  ret i32 %res +} + +; Test the largest size that can use 3 CLCs. +define i32 @f12(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f12: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(256,%r2), 256(%r3) +; CHECK: jlh [[LABEL]] +; CHECK: clc 512(256,%r2), 512(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: br %r14 +  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 768) +  ret i32 %res +} + +; The next size up uses a loop instead.  We leave the more complicated +; loop tests to memcpy-01.ll, which shares the same form. 
+define i32 @f13(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f13: +; CHECK: lghi [[COUNT:%r[0-5]]], 3 +; CHECK: [[LOOP:.L[^:]*]]: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK-DAG: la %r2, 256(%r2) +; CHECK-DAG: la %r3, 256(%r3) +; CHECK: brctg [[COUNT]], [[LOOP]] +; CHECK: clc 0(1,%r2), 0(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: br %r14 +  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 769) +  ret i32 %res +} diff --git a/llvm/test/CodeGen/SystemZ/memcmp-02.ll b/llvm/test/CodeGen/SystemZ/memcmp-02.ll index cae3d3d4943..74b090dcdd8 100644 --- a/llvm/test/CodeGen/SystemZ/memcmp-02.ll +++ b/llvm/test/CodeGen/SystemZ/memcmp-02.ll @@ -125,10 +125,14 @@ exit:    ret i64 %res  } -; 257 bytes is too big for a single CLC.  For now expect a call instead. +; 257 bytes needs two CLCs.  define i64 @f8(i8 *%src1, i8 *%src2) {  ; CHECK-LABEL: f8: -; CHECK: brasl %r14, memcmp@PLT +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(1,%r2), 256(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]]  ; CHECK: br %r14    %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 257)    ret i64 %res | 

