|  | Commit message (Collapse) | Author | Age | Files | Lines | 
|---|
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | particular, it should realize that phis use their values in the pred block,
not the phi block itself.  This change turns our em3d loop from this:
_test:
        cmpwi cr0, r4, 0
        bgt cr0, LBB_test_2     ; entry.no_exit_crit_edge
LBB_test_1:     ; entry.loopexit_crit_edge
        li r2, 0
        b LBB_test_6    ; loopexit
LBB_test_2:     ; entry.no_exit_crit_edge
        li r6, 0
LBB_test_3:     ; no_exit
        or r2, r6, r6
        lwz r6, 0(r3)
        cmpw cr0, r6, r5
        beq cr0, LBB_test_6     ; loopexit
LBB_test_4:     ; endif
        addi r3, r3, 4
        addi r6, r2, 1
        cmpw cr0, r6, r4
        blt cr0, LBB_test_3     ; no_exit
LBB_test_5:     ; endif.loopexit.loopexit_crit_edge
        addi r3, r2, 1
        blr
LBB_test_6:     ; loopexit
        or r3, r2, r2
        blr
into:
_test:
        cmpwi cr0, r4, 0
        bgt cr0, LBB_test_2     ; entry.no_exit_crit_edge
LBB_test_1:     ; entry.loopexit_crit_edge
        li r2, 0
        b LBB_test_5    ; loopexit
LBB_test_2:     ; entry.no_exit_crit_edge
        li r6, 0
LBB_test_3:     ; no_exit
        lwz r2, 0(r3)
        cmpw cr0, r2, r5
        or r2, r6, r6
        beq cr0, LBB_test_5     ; loopexit
LBB_test_4:     ; endif
        addi r3, r3, 4
        addi r6, r6, 1
        cmpw cr0, r6, r4
        or r2, r6, r6
        blt cr0, LBB_test_3     ; no_exit
LBB_test_5:     ; loopexit
        or r3, r2, r2
        blr
Unfortunately, this is actually worse code, because the register coalescer
is getting confused somehow.  If it were doing its job right, it could turn the
code into this:
_test:
        cmpwi cr0, r4, 0
        bgt cr0, LBB_test_2     ; entry.no_exit_crit_edge
LBB_test_1:     ; entry.loopexit_crit_edge
        li r6, 0
        b LBB_test_5    ; loopexit
LBB_test_2:     ; entry.no_exit_crit_edge
        li r6, 0
LBB_test_3:     ; no_exit
        lwz r2, 0(r3)
        cmpw cr0, r2, r5
        beq cr0, LBB_test_5     ; loopexit
LBB_test_4:     ; endif
        addi r3, r3, 4
        addi r6, r6, 1
        cmpw cr0, r6, r4
        blt cr0, LBB_test_3     ; no_exit
LBB_test_5:     ; loopexit
        or r3, r6, r6
        blr
... which I'll work on next. :)
llvm-svn: 23604 | 
| | 
| 
| 
| | llvm-svn: 23603 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | memoizing code when IVs are used by phinodes outside of loops.  In a simple
example, we were getting this code before (note that r6 and r7 are isomorphic
IVs):
        li r6, 0
        or r7, r6, r6
LBB_test_3:     ; no_exit
        lwz r2, 0(r3)
        cmpw cr0, r2, r5
        or r2, r7, r7
        beq cr0, LBB_test_5     ; loopexit
LBB_test_4:     ; endif
        addi r2, r7, 1
        addi r7, r7, 1
        addi r3, r3, 4
        addi r6, r6, 1
        cmpw cr0, r6, r4
        blt cr0, LBB_test_3     ; no_exit
Now we get:
        li r6, 0
LBB_test_3:     ; no_exit
        or r2, r6, r6
        lwz r6, 0(r3)
        cmpw cr0, r6, r5
        beq cr0, LBB_test_6     ; loopexit
LBB_test_4:     ; endif
        addi r3, r3, 4
        addi r6, r2, 1
        cmpw cr0, r6, r4
        blt cr0, LBB_test_3     ; no_exit
This was noticed in em3d.
llvm-svn: 23602 | 
| | 
| 
| 
| 
| 
| 
| 
| | check the presplit pred, not the post-split pred.  This was causing us
to make the wrong decision in some cases, leaving the critical edge block
in the loop.
llvm-svn: 23601 | 
| | 
| 
| 
| | llvm-svn: 23579 | 
| | 
| 
| 
| 
| 
| | LowerInvoke/2005-08-03-InvokeWithPHI.ll
llvm-svn: 23525 | 
| | 
| 
| 
| 
| 
| | bringing the LLC time down to the CBE time.
llvm-svn: 23521 | 
| | 
| 
| 
| | llvm-svn: 23519 | 
| | 
| 
| 
| | llvm-svn: 23517 | 
| | 
| 
| 
| | llvm-svn: 23487 | 
| | 
| 
| 
| 
| 
| | to right now.
llvm-svn: 23485 | 
| | 
| 
| 
| 
| 
| | and PR632.
llvm-svn: 23484 | 
| | 
| 
| 
| | llvm-svn: 23478 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | is performed so it is only at most once per function that contains an invoke
instead of once per invoke in the function.  This patch has the following perks:
1. It fixes PR631, which complains about slowness.
2. It fixes PR240, which complains about non-volatile vars being live across
   setjmp/longjmps.
3. It improves (but does not fix) the jmpbuf alignment issue on Itanium by not
   forcing the jmpbufs to always be 8-bytes off the alignment of the structure.
4. It speeds up 253.perlbmk from 338s to 13.70s (a 25x improvement!), making us
   now about 4% faster than GCC.
Further improvements are also possible.
llvm-svn: 23477 | 
| | 
| 
| 
| | llvm-svn: 23476 | 
| | 
| 
| 
| | llvm-svn: 23473 | 
| | 
| 
| 
| 
| 
| 
| 
| | implements
ctor-list-opt.ll:CTOR8
llvm-svn: 23465 | 
| | 
| 
| 
| 
| 
| | potentially replaced at link-time.
llvm-svn: 23463 | 
| | 
| 
| 
| 
| 
| 
| 
| | because gccas runs globalopt before inlining.
This implements ctor-list-opt.ll:CTOR7
llvm-svn: 23462 | 
| | 
| 
| 
| | llvm-svn: 23460 | 
| | 
| 
| 
| | llvm-svn: 23453 | 
| | 
| 
| 
| | llvm-svn: 23452 | 
| | 
| 
| 
| | llvm-svn: 23450 | 
| | 
| 
| 
| 
| 
| | ctor-list-opt.ll:CTOR5.
llvm-svn: 23449 | 
| | 
| 
| 
| | llvm-svn: 23447 | 
| | 
| 
| 
| 
| 
| | ConstantFoldLoadThroughGEPConstantExpr function in the utils lib.
llvm-svn: 23446 | 
| | 
| 
| 
| 
| 
| | as ConstantFoldLoadThroughGEPConstantExpr.
llvm-svn: 23445 | 
| | 
| 
| 
| 
| 
| | pass.
llvm-svn: 23444 | 
| | 
| 
| 
| | llvm-svn: 23442 | 
| | 
| 
| 
| | llvm-svn: 23441 | 
| | 
| 
| 
| | llvm-svn: 23439 | 
| | 
| 
| 
| 
| 
| | global ctors that are simple enough.  This implements ctor-list-opt.ll:CTOR2.
llvm-svn: 23437 | 
| | 
| 
| 
| 
| 
| | functionality change.
llvm-svn: 23435 | 
| | 
| 
| 
| 
| 
| | accepting the null even with a non-65535 init prio
llvm-svn: 23434 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | Implement the start of global ctor optimization.  It is currently smart
enough to remove the global ctor for cases like this:
struct foo {
  foo() {}
} x;
... saving a bit of startup time for the program.
llvm-svn: 23433 | 
| | 
| 
| 
| 
| 
| | SimplifyLibCalls/2005-05-20-sprintf-crash.ll
llvm-svn: 23430 | 
| | 
| 
| 
| 
| 
| | Match a bunch of idioms for sign extensions, implementing InstCombine/signext.ll
llvm-svn: 23428 | 
| | 
| 
| 
| 
| 
| 
| 
| | sprintf("%s", P)'s that have uses.
s/hasNUses(0)/use_empty()/
llvm-svn: 23425 | 
| | 
| 
| 
| | llvm-svn: 23411 | 
| | 
| 
| 
| 
| 
| 
| | This implements SimplifyCFG/branch-fold.ll, and is useful on ?:/min/max heavy
code
llvm-svn: 23410 | 
| | 
| 
| 
| | llvm-svn: 23408 | 
| | 
| 
| 
| | llvm-svn: 23407 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| | not define a value that is used outside of its block.  This catches many
more simplifications, e.g. 854 in 176.gcc, 137 in vpr, etc.
This implements branch-phi-thread.ll:test3.ll
llvm-svn: 23397 | 
| | 
| 
| 
| 
| 
| | predecessors.  This implements branch-phi-thread.ll::test1
llvm-svn: 23395 | 
| | 
| 
| 
| | llvm-svn: 23393 | 
| | 
| 
| 
| | llvm-svn: 23392 | 
| | 
| 
| 
| 
| 
| 
| | control across branches with determined outcomes.  More generality to follow.
This triggers a couple thousand times in specint.
llvm-svn: 23391 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus2 (unsigned int x) { b.j += x; }
To:
_plus2:
        lis r2, ha16(L_b$non_lazy_ptr)
        lwz r2, lo16(L_b$non_lazy_ptr)(r2)
        lwz r4, 0(r2)
        slwi r3, r3, 6
        add r3, r4, r3
        rlwimi r3, r4, 0, 26, 14
        stw r3, 0(r2)
        blr
instead of:
_plus2:
        lis r2, ha16(L_b$non_lazy_ptr)
        lwz r2, lo16(L_b$non_lazy_ptr)(r2)
        lwz r4, 0(r2)
        rlwinm r5, r4, 26, 21, 31
        add r3, r5, r3
        rlwimi r4, r3, 6, 15, 25
        stw r4, 0(r2)
        blr
by eliminating an 'and'.
I'm pretty sure this is as small as we can go :)
llvm-svn: 23386 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus2 (unsigned int x) {
  b.j += x;
}
to:
plus2:
        mov %EAX, DWORD PTR [b]
        mov %ECX, %EAX
        and %ECX, 131008
        mov %EDX, DWORD PTR [%ESP + 4]
        shl %EDX, 6
        add %EDX, %ECX
        and %EDX, 131008
        and %EAX, -131009
        or %EDX, %EAX
        mov DWORD PTR [b], %EDX
        ret
instead of:
plus2:
        mov %EAX, DWORD PTR [b]
        mov %ECX, %EAX
        shr %ECX, 6
        and %ECX, 2047
        add %ECX, DWORD PTR [%ESP + 4]
        shl %ECX, 6
        and %ECX, 131008
        and %EAX, -131009
        or %ECX, %EAX
        mov DWORD PTR [b], %ECX
        ret
llvm-svn: 23385 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus3 (unsigned int x) { b.k += x; }
To:
plus3:
        mov %EAX, DWORD PTR [%ESP + 4]
        shl %EAX, 17
        add DWORD PTR [b], %EAX
        ret
instead of:
plus3:
        mov %EAX, DWORD PTR [%ESP + 4]
        shl %EAX, 17
        mov %ECX, DWORD PTR [b]
        add %EAX, %ECX
        and %EAX, -131072
        and %ECX, 131071
        or %ECX, %EAX
        mov DWORD PTR [b], %ECX
        ret
llvm-svn: 23384 |