| | | |
|---|---|---|
| author | Evan Cheng <evan.cheng@apple.com> | 2009-02-21 02:06:47 +0000 |
| committer | Evan Cheng <evan.cheng@apple.com> | 2009-02-21 02:06:47 +0000 |
| commit | 107b06c4b9740530fee9dbcf5fd50a5d537d2ab7 | |
| tree | 61caa718724038c60fc447de98dce6cc19e2acc9 | |
| parent | 82aa14fae81259da4d9bedf4d0b25c0c08fe7a54 | |
Teach LSR to sink the immediate portion of the common expression back into uses if it fits in the addressing modes of all the uses.
llvm-svn: 65215
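The idea, in brief: when every use of a strength-reduced IV is an address, LSR now checks whether the constant offset (or global base) sitting in the common start expression could instead be folded into each use's target addressing mode. If every use can absorb it, the immediate is moved out of the shared base and re-added to each per-use immediate, so the common base computation needs one less add/lea and often one less register. Below is a minimal C++ sketch of that decision, using hypothetical simplified types (`Use`, `ExtAddrMode`, `canSinkImmediate`, and `sinkImmediate` are illustrative stand-ins; the real pass works on SCEV expressions and queries the target via `AddressingModeMatcher::Match`):

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-ins for the pass's real data structures.
struct ExtAddrMode {
  const void *BaseGV;  // global variable folded into the address, if any
  int64_t BaseOffs;    // constant offset folded into the address, if any
};

struct Use {
  bool isUseOfPostIncrementedValue; // such uses are skipped by the check
  ExtAddrMode AddrMode;             // result of matching this use's address
  int64_t Imm;                      // per-use immediate
};

// Mirrors the patch's IsImmFoldedIntoAddrMode: sinking is allowed only if
// every (non-post-increment) use can fold the global/offset into its
// target addressing mode.
bool canSinkImmediate(const void *GV, int64_t Offset,
                      const std::vector<Use> &Uses) {
  for (const Use &U : Uses) {
    if (U.isUseOfPostIncrementedValue)
      continue;
    if (GV && GV != U.AddrMode.BaseGV)
      return false;
    if (Offset && !U.AddrMode.BaseOffs)
      return false;
  }
  return true;
}

// If the check succeeds, the immediate leaves the common expression and is
// re-added to each use, where it will be folded into that use's addressing
// mode; the shared base computation becomes simpler.
void sinkImmediate(int64_t Imm, std::vector<Use> &Uses) {
  for (Use &U : Uses)
    U.Imm += Imm;
}
```

Note how conservative the real patch is: post-incremented uses are skipped, and a nonzero offset is only accepted when the matcher actually folded some `BaseOffs` (the FIXME in the patch acknowledges this check is approximate).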
| | | |
|---|---|---|
| -rw-r--r-- | llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 99 |
| -rw-r--r-- | llvm/test/CodeGen/X86/2007-10-05-3AddrConvert.ll | 52 |
| -rw-r--r-- | llvm/test/CodeGen/X86/loop-strength-reduce-2.ll | 2 |
| -rw-r--r-- | llvm/test/CodeGen/X86/loop-strength-reduce8.ll | 78 |
| -rw-r--r-- | llvm/test/CodeGen/X86/stride-nine-with-base-reg.ll | 8 |
5 files changed, 199 insertions, 40 deletions
```diff
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index d18a008fef0..2099ceace51 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -26,19 +26,19 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Transforms/Utils/AddrModeMatcher.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Target/TargetLowering.h"
 #include <algorithm>
-#include <set>
 using namespace llvm;
 
 STATISTIC(NumReduced ,    "Number of GEPs strength reduced");
@@ -46,6 +46,7 @@ STATISTIC(NumInserted,    "Number of PHIs inserted");
 STATISTIC(NumVariable,    "Number of PHIs with variable strides");
 STATISTIC(NumEliminated,  "Number of strides eliminated");
 STATISTIC(NumShadow,      "Number of Shadow IVs optimized");
+STATISTIC(NumImmSunk,     "Number of common expr immediates sunk into uses");
 
 static cl::opt<bool> EnableFullLSRMode("enable-full-lsr",
                                        cl::init(false),
@@ -954,21 +955,17 @@ static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
 /// that can fit into the immediate field of instructions in the target.
 /// Accumulate these immediate values into the Imm value.
 static void MoveImmediateValues(const TargetLowering *TLI,
-                                Instruction *User,
+                                const Type *UseTy,
                                 SCEVHandle &Val, SCEVHandle &Imm,
                                 bool isAddress, Loop *L,
                                 ScalarEvolution *SE) {
-  const Type *UseTy = User->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(User))
-    UseTy = SI->getOperand(0)->getType();
-
   if (SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
     std::vector<SCEVHandle> NewOps;
     NewOps.reserve(SAE->getNumOperands());
 
     for (unsigned i = 0; i != SAE->getNumOperands(); ++i) {
       SCEVHandle NewOp = SAE->getOperand(i);
-      MoveImmediateValues(TLI, User, NewOp, Imm, isAddress, L, SE);
+      MoveImmediateValues(TLI, UseTy, NewOp, Imm, isAddress, L, SE);
 
       if (!NewOp->isLoopInvariant(L)) {
         // If this is a loop-variant expression, it must stay in the immediate
@@ -987,7 +984,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
   } else if (SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
     // Try to pull immediates out of the start value of nested addrec's.
     SCEVHandle Start = SARE->getStart();
-    MoveImmediateValues(TLI, User, Start, Imm, isAddress, L, SE);
+    MoveImmediateValues(TLI, UseTy, Start, Imm, isAddress, L, SE);
 
     if (Start != SARE->getStart()) {
       std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
@@ -1002,7 +999,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
 
       SCEVHandle SubImm = SE->getIntegerSCEV(0, Val->getType());
       SCEVHandle NewOp = SME->getOperand(1);
-      MoveImmediateValues(TLI, User, NewOp, SubImm, isAddress, L, SE);
+      MoveImmediateValues(TLI, UseTy, NewOp, SubImm, isAddress, L, SE);
 
       // If we extracted something out of the subexpressions, see if we can
       // simplify this!
@@ -1034,6 +1031,16 @@ static void MoveImmediateValues(const TargetLowering *TLI,
   // Otherwise, no immediates to move.
 }
 
+static void MoveImmediateValues(const TargetLowering *TLI,
+                                Instruction *User,
+                                SCEVHandle &Val, SCEVHandle &Imm,
+                                bool isAddress, Loop *L,
+                                ScalarEvolution *SE) {
+  const Type *UseTy = User->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(User))
+    UseTy = SI->getOperand(0)->getType();
+  MoveImmediateValues(TLI, UseTy, Val, Imm, isAddress, L, SE);
+}
 
 /// SeparateSubExprs - Decompose Expr into all of the subexpressions that are
 /// added together.  This is used to reassociate common addition subexprs
@@ -1450,6 +1457,9 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
       UsersToProcess[i].Base =
         SE->getIntegerSCEV(0, UsersToProcess[i].Base->getType());
     } else {
+      // Not all uses are outside the loop.
+      AllUsesAreOutsideLoop = false;
+
       // Addressing modes can be folded into loads and stores.  Be careful that
       // the store is through the expression, not of the expression though.
       bool isPHI = false;
@@ -1460,9 +1470,6 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
         ++NumPHI;
       }
 
-      // Not all uses are outside the loop.
-      AllUsesAreOutsideLoop = false;
-
       if (isAddress)
         HasAddress = true;
 
@@ -1475,12 +1482,12 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
     }
   }
 
-  // If one of the use if a PHI node and all other uses are addresses, still
+  // If one of the use is a PHI node and all other uses are addresses, still
   // allow iv reuse. Essentially we are trading one constant multiplication
   // for one fewer iv.
   if (NumPHI > 1)
     AllUsesAreAddresses = false;
-
+
   // There are no in-loop address uses.
   if (AllUsesAreAddresses && (!HasAddress && !AllUsesAreOutsideLoop))
     AllUsesAreAddresses = false;
@@ -1754,6 +1761,28 @@ LoopStrengthReduce::PrepareToStrengthReduceFromSmallerStride(
                                   "commonbase", PreInsertPt);
 }
 
+static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset,
+                                    const Type *ReplacedTy,
+                                   std::vector<BasedUser> &UsersToProcess,
+                                   const TargetLowering *TLI) {
+  SmallVector<Instruction*, 16> AddrModeInsts;
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+    if (UsersToProcess[i].isUseOfPostIncrementedValue)
+      continue;
+    ExtAddrMode AddrMode =
+      AddressingModeMatcher::Match(UsersToProcess[i].OperandValToReplace,
+                                   ReplacedTy, UsersToProcess[i].Inst,
+                                   AddrModeInsts, *TLI);
+    if (GV && GV != AddrMode.BaseGV)
+      return false;
+    if (Offset && !AddrMode.BaseOffs)
+      // FIXME: How to accurate check it's immediate offset is folded.
+      return false;
+    AddrModeInsts.clear();
+  }
+  return true;
+}
+
 /// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single
 /// stride of IV.  All of the users may have different starting values, and this
 /// may not be the only stride (we know it is if isOnlyStride is true).
@@ -1797,6 +1826,41 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
 
   const Type *ReplacedTy = CommonExprs->getType();
 
+  // If all uses are addresses, consider sinking the immediate part of the
+  // common expression back into uses if they can fit in the immediate fields.
+  if (HaveCommonExprs && AllUsesAreAddresses) {
+    SCEVHandle NewCommon = CommonExprs;
+    SCEVHandle Imm = SE->getIntegerSCEV(0, ReplacedTy);
+    MoveImmediateValues(TLI, ReplacedTy, NewCommon, Imm, true, L, SE);
+    if (!Imm->isZero()) {
+      bool DoSink = true;
+
+      // If the immediate part of the common expression is a GV, check if it's
+      // possible to fold it into the target addressing mode.
+      GlobalValue *GV = 0;
+      if (SCEVUnknown *SU = dyn_cast<SCEVUnknown>(Imm)) {
+        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(SU->getValue()))
+          if (CE->getOpcode() == Instruction::PtrToInt)
+            GV = dyn_cast<GlobalValue>(CE->getOperand(0));
+      }
+      int64_t Offset = 0;
+      if (SCEVConstant *SC = dyn_cast<SCEVConstant>(Imm))
+        Offset = SC->getValue()->getSExtValue();
+      if (GV || Offset)
+        DoSink = IsImmFoldedIntoAddrMode(GV, Offset, ReplacedTy,
+                                         UsersToProcess, TLI);
+
+      if (DoSink) {
+        DOUT << "  Sinking " << *Imm << " back down into uses\n";
+        for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+          UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm, Imm);
+        CommonExprs = NewCommon;
+        HaveCommonExprs = !CommonExprs->isZero();
+        ++NumImmSunk;
+      }
+    }
+  }
+
   // Now that we know what we need to do, insert the PHI node itself.
   //
   DOUT << "LSR: Examining IVs of TYPE " << *ReplacedTy << " of STRIDE "
@@ -2556,7 +2620,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
     bool HasOneStride = IVUsesByStride.size() == 1;
 
 #ifndef NDEBUG
-    DOUT << "\nLSR on ";
+    DOUT << "\nLSR on \"" << L->getHeader()->getParent()->getNameStart()
+         << "\" ";
     DEBUG(L->dump());
 #endif
diff --git a/llvm/test/CodeGen/X86/2007-10-05-3AddrConvert.ll b/llvm/test/CodeGen/X86/2007-10-05-3AddrConvert.ll
index 2cc9124727f..e9fbe797f5b 100644
--- a/llvm/test/CodeGen/X86/2007-10-05-3AddrConvert.ll
+++ b/llvm/test/CodeGen/X86/2007-10-05-3AddrConvert.ll
@@ -4,29 +4,43 @@
 	%struct.bnode = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode*, %struct.bnode* }
 	%struct.node = type { i16, double, [3 x double], i32, i32 }
 
-define fastcc void @old_main() {
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
-	%tmp44 = malloc %struct.anon		; <%struct.anon*> [#uses=2]
-	store double 4.000000e+00, double* null, align 4
-	br label %bb41
+	%0 = malloc %struct.anon		; <%struct.anon*> [#uses=2]
+	%1 = getelementptr %struct.anon* %0, i32 0, i32 2		; <%struct.node**> [#uses=1]
+	br label %bb14.i
 
-bb41:		; preds = %uniform_testdata.exit, %entry
-	%i.0110 = phi i32 [ 0, %entry ], [ %tmp48, %uniform_testdata.exit ]		; <i32> [#uses=2]
-	%tmp48 = add i32 %i.0110, 1		; <i32> [#uses=1]
-	br i1 false, label %uniform_testdata.exit, label %bb33.preheader.i
+bb14.i:		; preds = %bb14.i, %entry
+	%i8.0.reg2mem.0.i = phi i32 [ 0, %entry ], [ %2, %bb14.i ]		; <i32> [#uses=1]
+	%2 = add i32 %i8.0.reg2mem.0.i, 1		; <i32> [#uses=2]
+	%exitcond74.i = icmp eq i32 %2, 32		; <i1> [#uses=1]
+	br i1 %exitcond74.i, label %bb32.i, label %bb14.i
 
-bb33.preheader.i:		; preds = %bb41
-	ret void
+bb32.i:		; preds = %bb32.i, %bb14.i
+	%tmp.0.reg2mem.0.i = phi i32 [ %indvar.next63.i, %bb32.i ], [ 0, %bb14.i ]		; <i32> [#uses=1]
+	%indvar.next63.i = add i32 %tmp.0.reg2mem.0.i, 1		; <i32> [#uses=2]
+	%exitcond64.i = icmp eq i32 %indvar.next63.i, 64		; <i1> [#uses=1]
+	br i1 %exitcond64.i, label %bb47.loopexit.i, label %bb32.i
 
-uniform_testdata.exit:		; preds = %bb41
-	%tmp57 = getelementptr %struct.anon* %tmp44, i32 0, i32 3, i32 %i.0110		; <%struct.bnode**> [#uses=1]
-	store %struct.bnode* null, %struct.bnode** %tmp57, align 4
-	br i1 false, label %bb154, label %bb41
+bb.i.i:		; preds = %bb47.loopexit.i
+	unreachable
 
-bb154:		; preds = %bb154, %uniform_testdata.exit
-	br i1 false, label %bb166, label %bb154
+stepsystem.exit.i:		; preds = %bb47.loopexit.i
+	store %struct.node* null, %struct.node** %1, align 4
+	br label %bb.i6.i
 
-bb166:		; preds = %bb154
-	%tmp169 = getelementptr %struct.anon* %tmp44, i32 0, i32 3, i32 0		; <%struct.bnode**> [#uses=0]
-	ret void
+bb.i6.i:		; preds = %bb.i6.i, %stepsystem.exit.i
+	%tmp.0.i.i = add i32 0, -1		; <i32> [#uses=1]
+	%3 = icmp slt i32 %tmp.0.i.i, 0		; <i1> [#uses=1]
+	br i1 %3, label %bb107.i.i, label %bb.i6.i
+
+bb107.i.i:		; preds = %bb107.i.i, %bb.i6.i
+	%q_addr.0.i.i.in = phi %struct.bnode** [ null, %bb107.i.i ], [ %4, %bb.i6.i ]		; <%struct.bnode**> [#uses=1]
+	%q_addr.0.i.i = load %struct.bnode** %q_addr.0.i.i.in		; <%struct.bnode*> [#uses=0]
+	br label %bb107.i.i
+
+bb47.loopexit.i:		; preds = %bb32.i
+	%4 = getelementptr %struct.anon* %0, i32 0, i32 4, i32 0		; <%struct.bnode**> [#uses=1]
+	%5 = icmp eq %struct.node* null, null		; <i1> [#uses=1]
+	br i1 %5, label %stepsystem.exit.i, label %bb.i.i
 }
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll b/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll
index b67e618ac8c..8ea5bdb208e 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll
@@ -1,8 +1,10 @@
 ; RUN: llvm-as < %s | llc -march=x86 -relocation-model=pic | \
 ; RUN:   grep {, 4} | count 1
+; RUN: llvm-as < %s | llc -march=x86 | not grep lea
 ;
 ; Make sure the common loop invariant A is hoisted up to preheader,
 ; since too many registers are needed to subsume it into the addressing modes.
+; It's safe to sink A in when it's not pic.
 
 @A = global [16 x [16 x i32]] zeroinitializer, align 32		; <[16 x [16 x i32]]*> [#uses=2]
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce8.ll b/llvm/test/CodeGen/X86/loop-strength-reduce8.ll
new file mode 100644
index 00000000000..1846c7d4467
--- /dev/null
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce8.ll
@@ -0,0 +1,78 @@
+; RUN: llvm-as < %s | llc -mtriple=i386-apple-darwin | grep leal | not grep 16
+
+	%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32 }
+	%struct.bitmap_element = type { %struct.bitmap_element*, %struct.bitmap_element*, i32, [2 x i64] }
+	%struct.bitmap_head_def = type { %struct.bitmap_element*, %struct.bitmap_element*, i32 }
+	%struct.branch_path = type { %struct.rtx_def*, i32 }
+	%struct.c_lang_decl = type <{ i8, [3 x i8] }>
+	%struct.constant_descriptor = type { %struct.constant_descriptor*, i8*, %struct.rtx_def*, { x86_fp80 } }
+	%struct.eh_region = type { %struct.eh_region*, %struct.eh_region*, %struct.eh_region*, i32, %struct.bitmap_head_def*, i32, { { %struct.eh_region*, %struct.eh_region*, %struct.eh_region*, %struct.rtx_def* } }, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
+	%struct.eh_status = type { %struct.eh_region*, %struct.eh_region**, %struct.eh_region*, %struct.eh_region*, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, i32, i32, %struct.varray_head_tag*, %struct.varray_head_tag*, %struct.varray_head_tag*, %struct.branch_path*, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
+	%struct.emit_status = type { i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.tree_node*, %struct.sequence_stack*, i32, i32, i8*, i32, i8*, %struct.tree_node**, %struct.rtx_def** }
+	%struct.equiv_table = type { %struct.rtx_def*, %struct.rtx_def* }
+	%struct.expr_status = type { i32, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
+	%struct.function = type { %struct.eh_status*, %struct.stmt_status*, %struct.expr_status*, %struct.emit_status*, %struct.varasm_status*, i8*, %struct.tree_node*, %struct.function*, i32, i32, i32, i32, %struct.rtx_def*, %struct.CUMULATIVE_ARGS, %struct.rtx_def*, %struct.rtx_def*, i8*, %struct.initial_value_struct*, i32, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, i64, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, i32, %struct.rtx_def**, %struct.temp_slot*, i32, i32, i32, %struct.var_refs_queue*, i32, i32, i8*, %struct.tree_node*, %struct.rtx_def*, i32, i32, %struct.machine_function*, i32, i32, %struct.language_function*, %struct.rtx_def*, i8, i8, i8 }
+	%struct.goto_fixup = type { %struct.goto_fixup*, %struct.rtx_def*, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, i32, %struct.rtx_def*, %struct.tree_node* }
+	%struct.initial_value_struct = type { i32, i32, %struct.equiv_table* }
+	%struct.label_chain = type { %struct.label_chain*, %struct.tree_node* }
+	%struct.lang_decl = type { %struct.c_lang_decl, %struct.tree_node* }
+	%struct.language_function = type { %struct.stmt_tree_s, %struct.tree_node* }
+	%struct.machine_function = type { [59 x [3 x %struct.rtx_def*]], i32, i32 }
+	%struct.nesting = type { %struct.nesting*, %struct.nesting*, i32, %struct.rtx_def*, { { i32, %struct.rtx_def*, %struct.rtx_def*, %struct.nesting*, %struct.tree_node*, %struct.tree_node*, %struct.label_chain*, i32, i32, i32, i32, %struct.rtx_def*, %struct.tree_node** } } }
+	%struct.pool_constant = type { %struct.constant_descriptor*, %struct.pool_constant*, %struct.pool_constant*, %struct.rtx_def*, i32, i32, i32, i64, i32 }
+	%struct.rtunion = type { i64 }
+	%struct.rtx_def = type { i16, i8, i8, [1 x %struct.rtunion] }
+	%struct.sequence_stack = type { %struct.rtx_def*, %struct.rtx_def*, %struct.tree_node*, %struct.sequence_stack* }
+	%struct.stmt_status = type { %struct.nesting*, %struct.nesting*, %struct.nesting*, %struct.nesting*, %struct.nesting*, %struct.nesting*, i32, i32, %struct.tree_node*, %struct.rtx_def*, i32, i8*, i32, %struct.goto_fixup* }
+	%struct.stmt_tree_s = type { %struct.tree_node*, %struct.tree_node*, i8*, i32 }
+	%struct.temp_slot = type { %struct.temp_slot*, %struct.rtx_def*, %struct.rtx_def*, i32, i64, %struct.tree_node*, %struct.tree_node*, i8, i8, i32, i32, i64, i64 }
+	%struct.tree_common = type { %struct.tree_node*, %struct.tree_node*, i8, i8, i8, i8 }
+	%struct.tree_decl = type { %struct.tree_common, i8*, i32, i32, %struct.tree_node*, i8, i8, i8, i8, i8, i8, %struct.rtunion, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, { %struct.function* }, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, i64, %struct.lang_decl* }
+	%struct.tree_exp = type { %struct.tree_common, i32, [1 x %struct.tree_node*] }
+	%struct.tree_node = type { %struct.tree_decl }
+	%struct.var_refs_queue = type { %struct.rtx_def*, i32, i32, %struct.var_refs_queue* }
+	%struct.varasm_status = type { %struct.constant_descriptor**, %struct.pool_constant**, %struct.pool_constant*, %struct.pool_constant*, i64, %struct.rtx_def* }
+	%struct.varray_data = type { [1 x i64] }
+	%struct.varray_head_tag = type { i32, i32, i32, i8*, %struct.varray_data }
+@lineno = internal global i32 0		; <i32*> [#uses=1]
+@tree_code_length = internal global [256 x i32] zeroinitializer
+@llvm.used = appending global [1 x i8*] [ i8* bitcast (%struct.tree_node* (i32, ...)* @build_stmt to i8*) ], section "llvm.metadata"		; <[1 x i8*]*> [#uses=0]
+
+define %struct.tree_node* @build_stmt(i32 %code, ...) nounwind {
+entry:
+	%p = alloca i8*		; <i8**> [#uses=3]
+	%p1 = bitcast i8** %p to i8*		; <i8*> [#uses=2]
+	call void @llvm.va_start(i8* %p1)
+	%0 = call fastcc %struct.tree_node* @make_node(i32 %code) nounwind		; <%struct.tree_node*> [#uses=2]
+	%1 = getelementptr [256 x i32]* @tree_code_length, i32 0, i32 %code		; <i32*> [#uses=1]
+	%2 = load i32* %1, align 4		; <i32> [#uses=2]
+	%3 = load i32* @lineno, align 4		; <i32> [#uses=1]
+	%4 = bitcast %struct.tree_node* %0 to %struct.tree_exp*		; <%struct.tree_exp*> [#uses=2]
+	%5 = getelementptr %struct.tree_exp* %4, i32 0, i32 1		; <i32*> [#uses=1]
+	store i32 %3, i32* %5, align 4
+	%6 = icmp sgt i32 %2, 0		; <i1> [#uses=1]
+	br i1 %6, label %bb, label %bb3
+
+bb:		; preds = %bb, %entry
+	%i.01 = phi i32 [ %indvar.next, %bb ], [ 0, %entry ]		; <i32> [#uses=2]
+	%7 = load i8** %p, align 4		; <i8*> [#uses=2]
+	%8 = getelementptr i8* %7, i32 4		; <i8*> [#uses=1]
+	store i8* %8, i8** %p, align 4
+	%9 = bitcast i8* %7 to %struct.tree_node**		; <%struct.tree_node**> [#uses=1]
+	%10 = load %struct.tree_node** %9, align 4		; <%struct.tree_node*> [#uses=1]
+	%11 = getelementptr %struct.tree_exp* %4, i32 0, i32 2, i32 %i.01		; <%struct.tree_node**> [#uses=1]
+	store %struct.tree_node* %10, %struct.tree_node** %11, align 4
+	%indvar.next = add i32 %i.01, 1		; <i32> [#uses=2]
+	%exitcond = icmp eq i32 %indvar.next, %2		; <i1> [#uses=1]
+	br i1 %exitcond, label %bb3, label %bb
+
+bb3:		; preds = %bb, %entry
+	call void @llvm.va_end(i8* %p1)
+	ret %struct.tree_node* %0
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+declare void @llvm.va_end(i8*) nounwind
+
+declare fastcc %struct.tree_node* @make_node(i32) nounwind
diff --git a/llvm/test/CodeGen/X86/stride-nine-with-base-reg.ll b/llvm/test/CodeGen/X86/stride-nine-with-base-reg.ll
index 4bd9924f265..c0cfb852bd3 100644
--- a/llvm/test/CodeGen/X86/stride-nine-with-base-reg.ll
+++ b/llvm/test/CodeGen/X86/stride-nine-with-base-reg.ll
@@ -1,14 +1,14 @@
-; RUN: llvm-as < %s | llc -march=x86 -relocation-model=static | grep lea | count 1
+; RUN: llvm-as < %s | llc -march=x86 -relocation-model=static | not grep lea
 ; RUN: llvm-as < %s | llc -march=x86-64 | not grep lea
 
-; For x86 there's an lea above the loop. In both cases, there shouldn't
-; be any lea instructions inside the loop.
+; _P should be sunk into the loop and folded into the address mode. There
+; shouldn't be any lea instructions inside the loop.
 
 @B = external global [1000 x i8], align 32
 @A = external global [1000 x i8], align 32
 @P = external global [1000 x i8], align 32
 
-define void @foo(i32 %m, i32 %p) {
+define void @foo(i32 %m, i32 %p) nounwind {
 entry:
 	%tmp1 = icmp sgt i32 %m, 0
 	br i1 %tmp1, label %bb, label %return
```

