| Commit message (Collapse) | Author | Age | Files | Lines |
| |
|
|
|
|
| |
Match a bunch of idioms for sign extensions, implementing InstCombine/signext.ll
llvm-svn: 23428
|
| |
|
|
|
|
|
|
| |
sprintf("%s", P)'s that have uses.
s/hasNUses(0)/use_empty()/
llvm-svn: 23425
|
| |
|
|
| |
llvm-svn: 23411
|
| |
|
|
|
|
|
| |
This implements SimplifyCFG/branch-fold.ll, and is useful on ?:/min/max heavy
code
llvm-svn: 23410
|
| |
|
|
| |
llvm-svn: 23408
|
| |
|
|
| |
llvm-svn: 23407
|
| |
|
|
|
|
|
|
|
| |
not define a value that is used outside of it's block. This catches many
more simplifications, e.g. 854 in 176.gcc, 137 in vpr, etc.
This implements branch-phi-thread.ll:test3.ll
llvm-svn: 23397
|
| |
|
|
|
|
| |
predecessors. This implements branch-phi-thread.ll::test1
llvm-svn: 23395
|
| |
|
|
| |
llvm-svn: 23393
|
| |
|
|
| |
llvm-svn: 23392
|
| |
|
|
|
|
|
| |
control across branches with determined outcomes. More generality to follow.
This triggers a couple thousand times in specint.
llvm-svn: 23391
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus2 (unsigned int x) { b.j += x; }
To:
_plus2:
lis r2, ha16(L_b$non_lazy_ptr)
lwz r2, lo16(L_b$non_lazy_ptr)(r2)
lwz r4, 0(r2)
slwi r3, r3, 6
add r3, r4, r3
rlwimi r3, r4, 0, 26, 14
stw r3, 0(r2)
blr
instead of:
_plus2:
lis r2, ha16(L_b$non_lazy_ptr)
lwz r2, lo16(L_b$non_lazy_ptr)(r2)
lwz r4, 0(r2)
rlwinm r5, r4, 26, 21, 31
add r3, r5, r3
rlwimi r4, r3, 6, 15, 25
stw r4, 0(r2)
blr
by eliminating an 'and'.
I'm pretty sure this is as small as we can go :)
llvm-svn: 23386
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus2 (unsigned int x) {
b.j += x;
}
to:
plus2:
mov %EAX, DWORD PTR [b]
mov %ECX, %EAX
and %ECX, 131008
mov %EDX, DWORD PTR [%ESP + 4]
shl %EDX, 6
add %EDX, %ECX
and %EDX, 131008
and %EAX, -131009
or %EDX, %EAX
mov DWORD PTR [b], %EDX
ret
instead of:
plus2:
mov %EAX, DWORD PTR [b]
mov %ECX, %EAX
shr %ECX, 6
and %ECX, 2047
add %ECX, DWORD PTR [%ESP + 4]
shl %ECX, 6
and %ECX, 131008
and %EAX, -131009
or %ECX, %EAX
mov DWORD PTR [b], %ECX
ret
llvm-svn: 23385
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus3 (unsigned int x) { b.k += x; }
To:
plus3:
mov %EAX, DWORD PTR [%ESP + 4]
shl %EAX, 17
add DWORD PTR [b], %EAX
ret
instead of:
plus3:
mov %EAX, DWORD PTR [%ESP + 4]
shl %EAX, 17
mov %ECX, DWORD PTR [b]
add %EAX, %ECX
and %EAX, -131072
and %ECX, 131071
or %ECX, %EAX
mov DWORD PTR [b], %ECX
ret
llvm-svn: 23384
|
| |
|
|
| |
llvm-svn: 23383
|
| |
|
|
| |
llvm-svn: 23382
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus3 (unsigned int x) {
b.k += x;
}
to:
_plus3:
lis r2, ha16(L_b$non_lazy_ptr)
lwz r2, lo16(L_b$non_lazy_ptr)(r2)
lwz r3, 0(r2)
rlwinm r4, r3, 0, 0, 14
add r4, r4, r3
rlwimi r4, r3, 0, 15, 31
stw r4, 0(r2)
blr
instead of:
_plus3:
lis r2, ha16(L_b$non_lazy_ptr)
lwz r2, lo16(L_b$non_lazy_ptr)(r2)
lwz r4, 0(r2)
srwi r5, r4, 17
add r3, r5, r3
slwi r3, r3, 17
rlwimi r3, r4, 0, 15, 31
stw r3, 0(r2)
blr
llvm-svn: 23381
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
struct S { unsigned int i : 6, j : 11, k : 15; } b;
void plus1 (unsigned int x) {
b.i += x;
}
as:
_plus1:
lis r2, ha16(L_b$non_lazy_ptr)
lwz r2, lo16(L_b$non_lazy_ptr)(r2)
lwz r4, 0(r2)
add r3, r4, r3
rlwimi r3, r4, 0, 0, 25
stw r3, 0(r2)
blr
instead of:
_plus1:
lis r2, ha16(L_b$non_lazy_ptr)
lwz r2, lo16(L_b$non_lazy_ptr)(r2)
lwz r4, 0(r2)
rlwinm r5, r4, 0, 26, 31
add r3, r5, r3
rlwimi r3, r4, 0, 0, 25
stw r3, 0(r2)
blr
llvm-svn: 23379
|
| |
|
|
| |
llvm-svn: 23377
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
struct {
unsigned int bit0:1;
unsigned int ubyte:31;
} sdata;
void foo() {
sdata.ubyte++;
}
into this:
foo:
add DWORD PTR [sdata], 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [sdata]
mov %ECX, %EAX
add %ECX, 2
and %ECX, -2
and %EAX, 1
or %EAX, %ECX
mov DWORD PTR [sdata], %EAX
ret
llvm-svn: 23376
|
| |
|
|
| |
llvm-svn: 23348
|
| |
|
|
|
|
|
| |
This is useful for 178.galgel where resolution of dope vectors (by the
optimizer) causes the scales to become apparent.
llvm-svn: 23328
|
| |
|
|
|
|
|
|
|
| |
PHI node that is not the original PHI.
This fixes up a dot-product loop in galgel, speeding it up from 18.47s to
16.13s.
llvm-svn: 23327
|
| |
|
|
|
|
| |
indentation, no functionality change
llvm-svn: 23325
|
| |
|
|
|
|
|
|
|
| |
if () { store A -> P; } else { store B -> P; }
into a PHI node with one store, in the most trival case. This implements
load.ll:test10.
llvm-svn: 23324
|
| |
|
|
|
|
| |
each other. This implements InstCombine/load.ll:test9
llvm-svn: 23322
|
| |
|
|
|
|
|
|
| |
load are exactly consequtive. This is picked up by other passes, but this
triggers thousands of times in fortran programs that use static locals
(and is thus a compile-time speedup).
llvm-svn: 23320
|
| |
|
|
|
|
|
|
|
|
|
|
| |
code for IV uses outside of loops that are not dominated by the latch block.
We should only convert these uses to use the post-inc value if they ARE
dominated by the latch block.
Also use a new LoopInfo method to simplify some code.
This fixes Transforms/LoopStrengthReduce/2005-09-12-UsesOutOutsideOfLoop.ll
llvm-svn: 23318
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
li r2, 0
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r2, 1
stw r2, 0(r4)
blr
[zion ~/llvm]$ cat > ~/xx
Uses of IV's outside of the loop should use hte post-incremented version
of the IV, not the preincremented version. This helps many loops (e.g. in sixtrack)
which used to generate code like this (this is the code from the
dont-hoist-simple-loop-constants.ll testcase):
_test:
li r2, 0 **** IV starts at 0
LBB_test_1: ; no_exit.2
or r5, r2, r2 **** Copy for loop exit
li r2, 0
stw r2, 0(r3)
addi r3, r3, 4
addi r2, r5, 1
addi r6, r5, 2 **** IV+2
cmpwi cr0, r6, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r5, 2 **** IV+2
stw r2, 0(r4)
blr
And now generated code like this:
_test:
li r2, 1 *** IV starts at 1
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701 *** IV.postinc + 0
blt cr0, LBB_test_1
LBB_test_2: ; loopexit.2.loopexit
stw r2, 0(r4) *** IV.postinc + 0
blr
llvm-svn: 23313
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
We used to emit this code for it:
_test:
li r2, 1 ;; Value tying up a register for the whole loop
li r5, 0
LBB_test_1: ; no_exit.2
or r6, r5, r5
li r5, 0
stw r5, 0(r3)
addi r5, r6, 1
addi r3, r3, 4
add r7, r2, r5 ;; should be addi r7, r5, 1
cmpwi cr0, r7, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r6, 2
stw r2, 0(r4)
blr
now we emit this:
_test:
li r2, 0
LBB_test_1: ; no_exit.2
or r5, r2, r2
li r2, 0
stw r2, 0(r3)
addi r3, r3, 4
addi r2, r5, 1
addi r6, r5, 2 ;; whoa, fold those adds!
cmpwi cr0, r6, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r5, 2
stw r2, 0(r4)
blr
more improvement coming.
llvm-svn: 23306
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
in building maximal expressions before simplifying them. In particular, i
cases like this:
X-(A+B+X)
the code would consider A+B+X to be a maximal expression (not understanding
that the single use '-' would be turned into a + later), simplify it (a noop)
then later get simplified again.
Each of these simplify steps is where the cost of reassociation comes from,
so this patch should speed up the already fast pass a bit.
Thanks to Dan for noticing this!
llvm-svn: 23214
|
| |
|
|
|
|
| |
to where we need it when converting -(A+B+C) -> -A + -B + -C.
llvm-svn: 23213
|
| |
|
|
|
|
| |
Ops vector out of range
llvm-svn: 23211
|
| |
|
|
| |
llvm-svn: 23019
|
| |
|
|
|
|
|
| |
Regression/Transforms/SimplifyLibCalls/floor.ll. This triggers 19 times in
177.mesa.
llvm-svn: 23017
|
| |
|
|
|
|
| |
on 177.mesa
llvm-svn: 22843
|
| |
|
|
|
|
|
|
| |
Do not claim to not change the CFG. We do change the cfg to split critical
edges. This isn't causing us a problem now, but could likely do so in the
future.
llvm-svn: 22824
|
| |
|
|
|
|
|
| |
loop, because a IV-dependent value was used outside of the loop and didn't
have immediate-folding capability
llvm-svn: 22798
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
.LBB_foo_3: ; no_exit.1
lfd f2, 0(r9)
lfd f3, 8(r9)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r9)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfd f2, 0(r9)
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22782
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp 12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
|
| |
|
|
|
|
|
| |
a problem in LoopStrengthReduction, where it would split critical edges
then confused itself with outdated loop information.
llvm-svn: 22776
|
| |
|
|
|
|
| |
need to be updated. This code is a relic from when it did.
llvm-svn: 22775
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
middle of the loop. This turns a critical loop in gzip into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
bne .LBB_test_8 ; loopentry.loopexit_crit_edge
.LBB_test_2: ; shortcirc_next.0
add r28, r3, r27
lhz r28, 5(r28)
add r26, r4, r27
lhz r26, 5(r26)
cmpw cr0, r28, r26
bne .LBB_test_7 ; shortcirc_next.0.loopexit_crit_edge
.LBB_test_3: ; shortcirc_next.1
add r28, r3, r27
lhz r28, 7(r28)
add r26, r4, r27
lhz r26, 7(r26)
cmpw cr0, r28, r26
bne .LBB_test_6 ; shortcirc_next.1.loopexit_crit_edge
.LBB_test_4: ; shortcirc_next.2
add r28, r3, r27
lhz r26, 9(r28)
add r28, r4, r27
lhz r25, 9(r28)
addi r28, r27, 8
cmpw cr7, r26, r25
mfcr r26, 1
rlwinm r26, r26, 31, 31, 31
add r25, r8, r27
cmpw cr7, r25, r7
mfcr r25, 1
rlwinm r25, r25, 29, 31, 31
and. r26, r26, r25
bne .LBB_test_1 ; loopentry
instead of this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_test_9 ; loopexit
.LBB_test_3: ; shortcirc_next.0
add r28, r3, r27
lhz r28, 5(r28)
add r26, r4, r27
lhz r26, 5(r26)
cmpw cr0, r28, r26
beq .LBB_test_5 ; shortcirc_next.1
.LBB_test_4: ; shortcirc_next.0.loopexit_crit_edge
add r2, r11, r27
add r8, r12, r27
b .LBB_test_9 ; loopexit
.LBB_test_5: ; shortcirc_next.1
add r28, r3, r27
lhz r28, 7(r28)
add r26, r4, r27
lhz r26, 7(r26)
cmpw cr0, r28, r26
beq .LBB_test_7 ; shortcirc_next.2
.LBB_test_6: ; shortcirc_next.1.loopexit_crit_edge
add r2, r9, r27
add r8, r10, r27
b .LBB_test_9 ; loopexit
.LBB_test_7: ; shortcirc_next.2
add r28, r3, r27
lhz r26, 9(r28)
add r28, r4, r27
lhz r25, 9(r28)
addi r28, r27, 8
cmpw cr7, r26, r25
mfcr r26, 1
rlwinm r26, r26, 31, 31, 31
add r25, r8, r27
cmpw cr7, r25, r7
mfcr r25, 1
rlwinm r25, r25, 29, 31, 31
and. r26, r26, r25
bne .LBB_test_1 ; loopentry
Next up, improve the code for the loop.
llvm-svn: 22769
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
edge so that the code is not always executed for both operands. This
prevents LSR from inserting code into loops whose exit blocks contain
PHI uses of IV expressions (which are outside of loops). On gzip, for
example, we turn this ugly code:
.LBB_test_1: ; loopentry
add r27, r3, r28
lhz r27, 3(r27)
add r26, r4, r28
lhz r26, 3(r26)
add r25, r30, r28 ;; Only live if exiting the loop
add r24, r29, r28 ;; Only live if exiting the loop
cmpw cr0, r27, r26
bne .LBB_test_5 ; loopexit
into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_test_9 ; loopexit
.LBB_test_2: ; shortcirc_next.0
...
blt .LBB_test_1
into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_t_3: ; shortcirc_next.0
.LBB_test_3: ; shortcirc_next.0
...
blt .LBB_test_1
Next step: get the block out of the loop so that the loop is all
fall-throughs again.
llvm-svn: 22766
|
| |
|
|
|
|
|
| |
Instead, just update the BB in-place. This is both faster, and it prevents
split-critical-edges from shuffling the PHI argument list unneccesarily.
llvm-svn: 22765
|
| |
|
|
| |
llvm-svn: 22751
|
| |
|
|
|
|
|
| |
into just Y. This often occurs when it seperates loops that have collapsed loop
headers. This implements LoopSimplify/phi-node-simplify.ll
llvm-svn: 22746
|
| |
|
|
|
|
| |
constant stride. This implements Transforms/IndVarsSimplify/variable-stride-ivs.ll
llvm-svn: 22744
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
stride.
For code like this:
void foo(float *a, float *b, int n, int stride_a, int stride_b) {
int i;
for (i=0; i<n; i++)
a[i*stride_a] = b[i*stride_b];
}
we now emit:
.LBB_foo2_2: ; no_exit
lfs f0, 0(r4)
stfs f0, 0(r3)
addi r7, r7, 1
add r4, r2, r4
add r3, r6, r3
cmpw cr0, r7, r5
blt .LBB_foo2_2 ; no_exit
instead of:
.LBB_foo_2: ; no_exit
mullw r8, r2, r7 ;; multiply!
slwi r8, r8, 2
lfsx f0, r4, r8
mullw r8, r2, r6 ;; multiply!
slwi r8, r8, 2
stfsx f0, r3, r8
addi r2, r2, 1
cmpw cr0, r2, r5
blt .LBB_foo_2 ; no_exit
loops with variable strides occur pretty often. For example, in SPECFP2K
there are 317 variable strides in 177.mesa, 3 in 179.art, 14 in 188.ammp,
56 in 168.wupwise, 36 in 172.mgrid.
Now we can allow indvars to turn functions written like this:
void foo2(float *a, float *b, int n, int stride_a, int stride_b) {
int i, ai = 0, bi = 0;
for (i=0; i<n; i++)
{
a[ai] = b[bi];
ai += stride_a;
bi += stride_b;
}
}
into code like the above for better analysis. With this patch, they generate
identical code.
llvm-svn: 22740
|
| |
|
|
|
|
| |
by being more careful about updating PHI nodes
llvm-svn: 22739
|