|  | Commit message (Collapse) | Author | Age | Files | Lines | 
|---|
| | 
| 
| 
| 
| 
| | global ctors that are simple enough.  This implements ctor-list-opt.ll:CTOR2.
llvm-svn: 23437 | 
| | 
| 
| 
| 
| 
| | functionality change.
llvm-svn: 23435 | 
| | 
| 
| 
| 
| 
| | accepting the null even with a non-65535 init prio
llvm-svn: 23434 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | Implement the start of global ctor optimization.  It is currently smart
enough to remove the global ctor for cases like this:
struct foo {
  foo() {}
} x;
... saving a bit of startup time for the program.
llvm-svn: 23433 | 
| | 
| 
| 
| 
| 
| | SimplifyLibCalls/2005-05-20-sprintf-crash.ll
llvm-svn: 23430 | 
| | 
| 
| 
| 
| 
| | Match a bunch of idioms for sign extensions, implementing InstCombine/signext.ll
llvm-svn: 23428 | 
| | 
| 
| 
| 
| 
| 
| 
| | sprintf("%s", P)'s that have uses.
s/hasNUses(0)/use_empty()/
llvm-svn: 23425 | 
| | 
| 
| 
| | llvm-svn: 23411 | 
| | 
| 
| 
| 
| 
| 
| | This implements SimplifyCFG/branch-fold.ll, and is useful on ?:/min/max heavy
code
llvm-svn: 23410 | 
| | 
| 
| 
| | llvm-svn: 23408 | 
| | 
| 
| 
| | llvm-svn: 23407 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| | not define a value that is used outside of it's block.  This catches many
more simplifications, e.g. 854 in 176.gcc, 137 in vpr, etc.
This implements branch-phi-thread.ll:test3.ll
llvm-svn: 23397 | 
| | 
| 
| 
| 
| 
| | predecessors.  This implements branch-phi-thread.ll::test1
llvm-svn: 23395 | 
| | 
| 
| 
| | llvm-svn: 23393 | 
| | 
| 
| 
| | llvm-svn: 23392 | 
| | 
| 
| 
| 
| 
| 
| | control across branches with determined outcomes.  More generality to follow.
This triggers a couple thousand times in specint.
llvm-svn: 23391 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus2 (unsigned int x) { b.j += x; }
To:
_plus2:
        lis r2, ha16(L_b$non_lazy_ptr)
        lwz r2, lo16(L_b$non_lazy_ptr)(r2)
        lwz r4, 0(r2)
        slwi r3, r3, 6
        add r3, r4, r3
        rlwimi r3, r4, 0, 26, 14
        stw r3, 0(r2)
        blr
instead of:
_plus2:
        lis r2, ha16(L_b$non_lazy_ptr)
        lwz r2, lo16(L_b$non_lazy_ptr)(r2)
        lwz r4, 0(r2)
        rlwinm r5, r4, 26, 21, 31
        add r3, r5, r3
        rlwimi r4, r3, 6, 15, 25
        stw r4, 0(r2)
        blr
by eliminating an 'and'.
I'm pretty sure this is as small as we can go :)
llvm-svn: 23386 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus2 (unsigned int x) {
  b.j += x;
}
to:
plus2:
        mov %EAX, DWORD PTR [b]
        mov %ECX, %EAX
        and %ECX, 131008
        mov %EDX, DWORD PTR [%ESP + 4]
        shl %EDX, 6
        add %EDX, %ECX
        and %EDX, 131008
        and %EAX, -131009
        or %EDX, %EAX
        mov DWORD PTR [b], %EDX
        ret
instead of:
plus2:
        mov %EAX, DWORD PTR [b]
        mov %ECX, %EAX
        shr %ECX, 6
        and %ECX, 2047
        add %ECX, DWORD PTR [%ESP + 4]
        shl %ECX, 6
        and %ECX, 131008
        and %EAX, -131009
        or %ECX, %EAX
        mov DWORD PTR [b], %ECX
        ret
llvm-svn: 23385 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus3 (unsigned int x) { b.k += x; }
To:
plus3:
        mov %EAX, DWORD PTR [%ESP + 4]
        shl %EAX, 17
        add DWORD PTR [b], %EAX
        ret
instead of:
plus3:
        mov %EAX, DWORD PTR [%ESP + 4]
        shl %EAX, 17
        mov %ECX, DWORD PTR [b]
        add %EAX, %ECX
        and %EAX, -131072
        and %ECX, 131071
        or %ECX, %EAX
        mov DWORD PTR [b], %ECX
        ret
llvm-svn: 23384 | 
| | 
| 
| 
| | llvm-svn: 23383 | 
| | 
| 
| 
| | llvm-svn: 23382 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus3 (unsigned int x) {
  b.k += x;
}
to:
_plus3:
        lis r2, ha16(L_b$non_lazy_ptr)
        lwz r2, lo16(L_b$non_lazy_ptr)(r2)
        lwz r3, 0(r2)
        rlwinm r4, r3, 0, 0, 14
        add r4, r4, r3
        rlwimi r4, r3, 0, 15, 31
        stw r4, 0(r2)
        blr
instead of:
_plus3:
        lis r2, ha16(L_b$non_lazy_ptr)
        lwz r2, lo16(L_b$non_lazy_ptr)(r2)
        lwz r4, 0(r2)
        srwi r5, r4, 17
        add r3, r5, r3
        slwi r3, r3, 17
        rlwimi r3, r4, 0, 15, 31
        stw r3, 0(r2)
        blr
llvm-svn: 23381 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus1 (unsigned int x) {
  b.i += x;
}
as:
_plus1:
        lis r2, ha16(L_b$non_lazy_ptr)
        lwz r2, lo16(L_b$non_lazy_ptr)(r2)
        lwz r4, 0(r2)
        add r3, r4, r3
        rlwimi r3, r4, 0, 0, 25
        stw r3, 0(r2)
        blr
instead of:
_plus1:
        lis r2, ha16(L_b$non_lazy_ptr)
        lwz r2, lo16(L_b$non_lazy_ptr)(r2)
        lwz r4, 0(r2)
        rlwinm r5, r4, 0, 26, 31
        add r3, r5, r3
        rlwimi r3, r4, 0, 0, 25
        stw r3, 0(r2)
        blr
llvm-svn: 23379 | 
| | 
| 
| 
| | llvm-svn: 23377 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | struct {
   unsigned int bit0:1;
   unsigned int ubyte:31;
} sdata;
void foo() {
  sdata.ubyte++;
}
into this:
foo:
        add DWORD PTR [sdata], 2
        ret
instead of this:
foo:
        mov %EAX, DWORD PTR [sdata]
        mov %ECX, %EAX
        add %ECX, 2
        and %ECX, -2
        and %EAX, 1
        or %EAX, %ECX
        mov DWORD PTR [sdata], %EAX
        ret
llvm-svn: 23376 | 
| | 
| 
| 
| | llvm-svn: 23348 | 
| | 
| 
| 
| 
| 
| 
| | This is useful for 178.galgel where resolution of dope vectors (by the
optimizer) causes the scales to become apparent.
llvm-svn: 23328 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| | PHI node that is not the original PHI.
This fixes up a dot-product loop in galgel, speeding it up from 18.47s to
16.13s.
llvm-svn: 23327 | 
| | 
| 
| 
| 
| 
| | indentation, no functionality change
llvm-svn: 23325 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| | if () { store A -> P; } else { store B -> P; }
into a PHI node with one store, in the most trival case.  This implements
load.ll:test10.
llvm-svn: 23324 | 
| | 
| 
| 
| 
| 
| | each other.  This implements InstCombine/load.ll:test9
llvm-svn: 23322 | 
| | 
| 
| 
| 
| 
| 
| 
| | load are exactly consequtive.  This is picked up by other passes, but this
triggers thousands of times in fortran programs that use static locals
(and is thus a compile-time speedup).
llvm-svn: 23320 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | code for IV uses outside of loops that are not dominated by the latch block.
We should only convert these uses to use the post-inc value if they ARE
dominated by the latch block.
Also use a new LoopInfo method to simplify some code.
This fixes Transforms/LoopStrengthReduce/2005-09-12-UsesOutOutsideOfLoop.ll
llvm-svn: 23318 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | li r2, 0
LBB_test_1:     ; no_exit.2
        li r5, 0
        stw r5, 0(r3)
        addi r2, r2, 1
        addi r3, r3, 4
        cmpwi cr0, r2, 701
        blt cr0, LBB_test_1     ; no_exit.2
LBB_test_2:     ; loopexit.2.loopexit
        addi r2, r2, 1
        stw r2, 0(r4)
        blr
[zion ~/llvm]$ cat > ~/xx
Uses of IV's outside of the loop should use hte post-incremented version
of the IV, not the preincremented version.  This helps many loops (e.g. in sixtrack)
which used to generate code like this (this is the code from the
dont-hoist-simple-loop-constants.ll testcase):
_test:
        li r2, 0                 **** IV starts at 0
LBB_test_1:     ; no_exit.2
        or r5, r2, r2            **** Copy for loop exit
        li r2, 0
        stw r2, 0(r3)
        addi r3, r3, 4
        addi r2, r5, 1
        addi r6, r5, 2           **** IV+2
        cmpwi cr0, r6, 701
        blt cr0, LBB_test_1     ; no_exit.2
LBB_test_2:     ; loopexit.2.loopexit
        addi r2, r5, 2       ****  IV+2
        stw r2, 0(r4)
        blr
And now generated code like this:
_test:
        li r2, 1               *** IV starts at 1
LBB_test_1:     ; no_exit.2
        li r5, 0
        stw r5, 0(r3)
        addi r2, r2, 1
        addi r3, r3, 4
        cmpwi cr0, r2, 701     *** IV.postinc + 0
        blt cr0, LBB_test_1
LBB_test_2:     ; loopexit.2.loopexit
        stw r2, 0(r4)          *** IV.postinc + 0
        blr
llvm-svn: 23313 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | We used to emit this code for it:
_test:
        li r2, 1     ;; Value tying up a register for the whole loop
        li r5, 0
LBB_test_1:     ; no_exit.2
        or r6, r5, r5
        li r5, 0
        stw r5, 0(r3)
        addi r5, r6, 1
        addi r3, r3, 4
        add r7, r2, r5  ;; should be addi r7, r5, 1
        cmpwi cr0, r7, 701
        blt cr0, LBB_test_1     ; no_exit.2
LBB_test_2:     ; loopexit.2.loopexit
        addi r2, r6, 2
        stw r2, 0(r4)
        blr
now we emit this:
_test:
        li r2, 0
LBB_test_1:     ; no_exit.2
        or r5, r2, r2
        li r2, 0
        stw r2, 0(r3)
        addi r3, r3, 4
        addi r2, r5, 1
        addi r6, r5, 2   ;; whoa, fold those adds!
        cmpwi cr0, r6, 701
        blt cr0, LBB_test_1     ; no_exit.2
LBB_test_2:     ; loopexit.2.loopexit
        addi r2, r5, 2
        stw r2, 0(r4)
        blr
more improvement coming.
llvm-svn: 23306 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | in building maximal expressions before simplifying them.  In particular, i
cases like this:
X-(A+B+X)
the code would consider A+B+X to be a maximal expression (not understanding
that the single use '-' would be turned into a + later), simplify it (a noop)
then later get simplified again.
Each of these simplify steps is where the cost of reassociation comes from,
so this patch should speed up the already fast pass a bit.
Thanks to Dan for noticing this!
llvm-svn: 23214 | 
| | 
| 
| 
| 
| 
| | to where we need it when converting -(A+B+C) -> -A + -B + -C.
llvm-svn: 23213 | 
| | 
| 
| 
| 
| 
| | Ops vector out of range
llvm-svn: 23211 | 
| | 
| 
| 
| | llvm-svn: 23019 | 
| | 
| 
| 
| 
| 
| 
| | Regression/Transforms/SimplifyLibCalls/floor.ll.  This triggers 19 times in
177.mesa.
llvm-svn: 23017 | 
| | 
| 
| 
| 
| 
| | on 177.mesa
llvm-svn: 22843 | 
| | 
| 
| 
| 
| 
| 
| 
| | Do not claim to not change the CFG.  We do change the cfg to split critical
edges.  This isn't causing us a problem now, but could likely do so in the
future.
llvm-svn: 22824 | 
| | 
| 
| 
| 
| 
| 
| | loop, because a IV-dependent value was used outside of the loop and didn't
have immediate-folding capability
llvm-svn: 22798 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | .LBB_foo_3:     ; no_exit.1
        lfd f2, 0(r9)
        lfd f3, 8(r9)
        fmul f4, f1, f2
        fmadd f4, f0, f3, f4
        stfd f4, 8(r9)
        fmul f3, f1, f3
        fmsub f2, f0, f2, f3
        stfd f2, 0(r9)
        addi r9, r9, 16
        addi r8, r8, 1
        cmpw cr0, r8, r4
        ble .LBB_foo_3  ; no_exit.1
llvm-svn: 22782 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 +  %Tmp11 +  %Tmp12)) +  %c_),+,( 16 *  %Tmp 12)}<loopentry.1>
{(( 16 * ( 1 +  %Tmp11 +  %Tmp12)) +  %c_),+,( 16 *  %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 +  %Tmp11 +  %Tmp12)) +  %c_),+,( 16 *  %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4:     ; no_exit.1
        lfd f2, 16(r9)
        fmul f3, f0, f2
        fmul f2, f1, f2
        fadd f4, f3, f2
        stfd f4, 8(r9)
        fsub f2, f3, f2
        stfd f2, 16(r9)
        addi r8, r8, 1
        addi r9, r9, 16
        cmpw cr0, r8, r4
        ble .LBB_foo_4  ; no_exit.1
instead of:
.LBB_foo_3:     ; no_exit.1
        lfdx f2, r6, r9
        add r10, r6, r9
        lfd f3, 8(r10)
        fmul f4, f1, f2
        fmadd f4, f0, f3, f4
        stfd f4, 8(r10)
        fmul f3, f1, f3
        fmsub f2, f0, f2, f3
        stfdx f2, r6, r9
        addi r9, r9, 16
        addi r8, r8, 1
        cmpw cr0, r8, r4
        ble .LBB_foo_3  ; no_exit.1
llvm-svn: 22781 | 
| | 
| 
| 
| 
| 
| 
| | a problem in LoopStrengthReduction, where it would split critical edges
then confused itself with outdated loop information.
llvm-svn: 22776 | 
| | 
| 
| 
| 
| 
| | need to be updated.  This code is a relic from when it did.
llvm-svn: 22775 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | middle of the loop.  This turns a critical loop in gzip into this:
.LBB_test_1:    ; loopentry
        or r27, r28, r28
        add r28, r3, r27
        lhz r28, 3(r28)
        add r26, r4, r27
        lhz r26, 3(r26)
        cmpw cr0, r28, r26
        bne .LBB_test_8 ; loopentry.loopexit_crit_edge
.LBB_test_2:    ; shortcirc_next.0
        add r28, r3, r27
        lhz r28, 5(r28)
        add r26, r4, r27
        lhz r26, 5(r26)
        cmpw cr0, r28, r26
        bne .LBB_test_7 ; shortcirc_next.0.loopexit_crit_edge
.LBB_test_3:    ; shortcirc_next.1
        add r28, r3, r27
        lhz r28, 7(r28)
        add r26, r4, r27
        lhz r26, 7(r26)
        cmpw cr0, r28, r26
        bne .LBB_test_6 ; shortcirc_next.1.loopexit_crit_edge
.LBB_test_4:    ; shortcirc_next.2
        add r28, r3, r27
        lhz r26, 9(r28)
        add r28, r4, r27
        lhz r25, 9(r28)
        addi r28, r27, 8
        cmpw cr7, r26, r25
        mfcr r26, 1
        rlwinm r26, r26, 31, 31, 31
        add r25, r8, r27
        cmpw cr7, r25, r7
        mfcr r25, 1
        rlwinm r25, r25, 29, 31, 31
        and. r26, r26, r25
        bne .LBB_test_1 ; loopentry
instead of this:
.LBB_test_1:    ; loopentry
        or r27, r28, r28
        add r28, r3, r27
        lhz r28, 3(r28)
        add r26, r4, r27
        lhz r26, 3(r26)
        cmpw cr0, r28, r26
        beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2:    ; loopentry.loopexit_crit_edge
        add r2, r30, r27
        add r8, r29, r27
        b .LBB_test_9   ; loopexit
.LBB_test_3:    ; shortcirc_next.0
        add r28, r3, r27
        lhz r28, 5(r28)
        add r26, r4, r27
        lhz r26, 5(r26)
        cmpw cr0, r28, r26
        beq .LBB_test_5 ; shortcirc_next.1
.LBB_test_4:    ; shortcirc_next.0.loopexit_crit_edge
        add r2, r11, r27
        add r8, r12, r27
        b .LBB_test_9   ; loopexit
.LBB_test_5:    ; shortcirc_next.1
        add r28, r3, r27
        lhz r28, 7(r28)
        add r26, r4, r27
        lhz r26, 7(r26)
        cmpw cr0, r28, r26
        beq .LBB_test_7 ; shortcirc_next.2
.LBB_test_6:    ; shortcirc_next.1.loopexit_crit_edge
        add r2, r9, r27
        add r8, r10, r27
        b .LBB_test_9   ; loopexit
.LBB_test_7:    ; shortcirc_next.2
        add r28, r3, r27
        lhz r26, 9(r28)
        add r28, r4, r27
        lhz r25, 9(r28)
        addi r28, r27, 8
        cmpw cr7, r26, r25
        mfcr r26, 1
        rlwinm r26, r26, 31, 31, 31
        add r25, r8, r27
        cmpw cr7, r25, r7
        mfcr r25, 1
        rlwinm r25, r25, 29, 31, 31
        and. r26, r26, r25
        bne .LBB_test_1 ; loopentry
Next up, improve the code for the loop.
llvm-svn: 22769 | 
| | 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| 
| | edge so that the code is not always executed for both operands.  This
prevents LSR from inserting code into loops whose exit blocks contain
PHI uses of IV expressions (which are outside of loops).  On gzip, for
example, we turn this ugly code:
.LBB_test_1:    ; loopentry
        add r27, r3, r28
        lhz r27, 3(r27)
        add r26, r4, r28
        lhz r26, 3(r26)
        add r25, r30, r28    ;; Only live if exiting the loop
        add r24, r29, r28    ;; Only live if exiting the loop
        cmpw cr0, r27, r26
        bne .LBB_test_5 ; loopexit
into this:
.LBB_test_1:    ; loopentry
        or r27, r28, r28
        add r28, r3, r27
        lhz r28, 3(r28)
        add r26, r4, r27
        lhz r26, 3(r26)
        cmpw cr0, r28, r26
        beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2:    ; loopentry.loopexit_crit_edge
        add r2, r30, r27
        add r8, r29, r27
        b .LBB_test_9   ; loopexit
.LBB_test_2:    ; shortcirc_next.0
        ...
        blt .LBB_test_1
into this:
.LBB_test_1:    ; loopentry
        or r27, r28, r28
        add r28, r3, r27
        lhz r28, 3(r28)
        add r26, r4, r27
        lhz r26, 3(r26)
        cmpw cr0, r28, r26
        beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2:    ; loopentry.loopexit_crit_edge
        add r2, r30, r27
        add r8, r29, r27
        b .LBB_t_3:    ; shortcirc_next.0
.LBB_test_3:    ; shortcirc_next.0
        ...
        blt .LBB_test_1
Next step: get the block out of the loop so that the loop is all
fall-throughs again.
llvm-svn: 22766 | 
| | 
| 
| 
| 
| 
| 
| | Instead, just update the BB in-place.  This is both faster, and it prevents
split-critical-edges from shuffling the PHI argument list unneccesarily.
llvm-svn: 22765 |