| author | Eli Friedman <efriedma@codeaurora.org> | 2018-08-16 18:39:39 +0000 |
|---|---|---|
| committer | Eli Friedman <efriedma@codeaurora.org> | 2018-08-16 18:39:39 +0000 |
| commit | 73e8a784e62f945a51363c8b5ec4eaedcf9f87e8 (patch) | |
| tree | 15afc4a44bf8511135900d38ebc78216dcb23b1b /llvm/test/CodeGen/RISCV | |
| parent | d1767dc56f5be75bdff23f3fe33e54428fed704f (diff) | |
[SelectionDAG] Improve the legalisation lowering of UMULO.
There is no way in the universe that doing a full-width division in
software will be faster than doing the overflowing multiplication in
software in the first place, especially given that the same full-width
multiplication needs to be done anyway.
This patch replaces the previous implementation with a direct lowering
into an overflowing multiplication algorithm based on half-width
operations.
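To make the decomposition concrete, here is a rough C sketch of the same
idea (an illustration only, not LLVM's actual SelectionDAG code): a
64-bit overflowing multiply assembled from 32-bit half-width operations.
The function name and shape are made up for the example.

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative sketch only: umulo on a 64-bit value built from
       32-bit half-width multiplies, mirroring the decomposition the
       new lowering performs on split (e.g. 128-bit) types. */
    static bool umulo64(uint64_t x, uint64_t y, uint64_t *prod) {
        uint32_t xl = (uint32_t)x, xh = (uint32_t)(x >> 32);
        uint32_t yl = (uint32_t)y, yh = (uint32_t)(y >> 32);

        /* Half-width partial products; each fits in 64 bits. */
        uint64_t ll = (uint64_t)xl * yl;
        uint64_t lh = (uint64_t)xl * yh;
        uint64_t hl = (uint64_t)xh * yl;

        /* xh*yh only contributes at bit 64 and above, so any nonzero
           value there overflows, as do the high halves of the two
           cross products. */
        bool overflow = (xh != 0 && yh != 0)
                     || (lh >> 32) != 0
                     || (hl >> 32) != 0;

        /* Fold the cross products into the high word of the result,
           catching any carry out of bit 63. */
        uint64_t mid = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;
        overflow |= (mid >> 32) != 0;

        *prod = (mid << 32) | (uint32_t)ll;
        return overflow;
    }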
Correctness of the algorithm was verified by exhaustively checking the
output of this algorithm for overflowing multiplication of 16-bit
integers against an obviously correct widening multiplication. Barring
any oversights introduced by porting the algorithm to the DAG,
confidence in the correctness of this algorithm is extremely high.
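A sketch of such an exhaustive check (an assumed harness, not the one
actually used): scale the routine above down to a 16-bit multiply built
from 8-bit halves, and compare every input pair against a 32-bit
widening multiply.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* 16-bit analogue of the half-width algorithm, small enough that
       all 2^32 input pairs can be checked exhaustively. */
    static bool umulo16(uint16_t x, uint16_t y, uint16_t *prod) {
        uint8_t xl = (uint8_t)x, xh = (uint8_t)(x >> 8);
        uint8_t yl = (uint8_t)y, yh = (uint8_t)(y >> 8);
        uint16_t ll = (uint16_t)(xl * yl);
        uint16_t lh = (uint16_t)(xl * yh);
        uint16_t hl = (uint16_t)(xh * yl);
        bool overflow = (xh != 0 && yh != 0)
                     || (lh >> 8) != 0
                     || (hl >> 8) != 0;
        uint16_t mid = (uint16_t)((ll >> 8) + (uint8_t)lh + (uint8_t)hl);
        overflow |= (mid >> 8) != 0;
        *prod = (uint16_t)((mid << 8) | (uint8_t)ll);
        return overflow;
    }

    int main(void) {
        for (uint32_t x = 0; x <= UINT16_MAX; x++) {
            for (uint32_t y = 0; y <= UINT16_MAX; y++) {
                uint32_t wide = x * y; /* obviously correct reference */
                uint16_t prod;
                bool of = umulo16((uint16_t)x, (uint16_t)y, &prod);
                assert(prod == (uint16_t)wide);
                assert(of == (wide > UINT16_MAX));
            }
        }
        return 0;
    }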
The following table shows the change in both runtime (t) and code size
(s). Each change is expressed as a multiplier of the original, so
anything under 1 is “better” and anything above 1 is worse.
+-------+-----------+-----------+-------------+-------------+
| Arch | u64*u64 t | u64*u64 s | u128*u128 t | u128*u128 s |
+-------+-----------+-----------+-------------+-------------+
| X64 | - | - | ~0.5 | ~0.64 |
| i686 | ~0.5 | ~0.6666 | ~0.05 | ~0.9 |
| armv7 | - | ~0.75 | - | ~1.4 |
+-------+-----------+-----------+-------------+-------------+
Performance numbers were collected by running the overflowing
multiplication in a loop under `perf` on two x86_64 machines (one Intel
Haswell, the other AMD Ryzen). Size numbers were collected by looking
at the size of a function containing an overflowing multiply in a loop.
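The exact harness is not part of the patch; a loop of roughly the
following shape, timed with `perf stat`, matches the description (the
iteration count and the feedback into the accumulator are assumptions
made for this sketch).

    #include <stdbool.h>
    #include <stdint.h>

    /* Assumed benchmark shape: an overflowing multiply in a loop, with
       the result fed back so the multiply cannot be hoisted or
       dead-code-eliminated. Build, then run under `perf stat ./a.out`. */
    int main(void) {
        uint64_t acc = 0x9e3779b97f4a7c15ULL;
        bool any = false;
        for (uint64_t i = 1; i <= 100000000ULL; i++) {
            uint64_t p;
            any |= __builtin_mul_overflow(acc, i, &p);
            acc = p ^ i; /* dependency chain across iterations */
        }
        return (int)(acc & 1) ^ (int)any;
    }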
All in all, both performance and size have improved, except on armv7,
where code size regressed for the 128-bit multiply. u128*u128
overflowing multiplication on 32-bit platforms benefits from this
change the most, taking only about 5% of the time the original
algorithm needed to compute the same thing.
The final benefit of this change is that LLVM is now capable of lowering
overflowing unsigned multiplication for integers of any bit width, as
long as the target is capable of lowering regular multiplication for the
same bit width. Previously, a 128-bit overflowing multiply was the
widest possible.
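For reference, one way to reach this path from C with clang is
__builtin_mul_overflow on unsigned __int128, which is lowered to the
llvm.umul.with.overflow.i128 call seen in the test below (the function
name here is made up for the example).

    #include <stdbool.h>

    /* clang turns this builtin on unsigned __int128 into
       llvm.umul.with.overflow.i128, the node whose legalisation this
       patch replaces. */
    bool mulo_u128(unsigned __int128 l, unsigned __int128 r,
                   unsigned __int128 *res) {
        return __builtin_mul_overflow(l, r, res);
    }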
Patch by Simonas Kazlauskas!
Differential Revision: https://reviews.llvm.org/D50310
llvm-svn: 339922
Diffstat (limited to 'llvm/test/CodeGen/RISCV')
| -rw-r--r-- | llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll | 175 |
1 file changed, 175 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
new file mode 100644
index 00000000000..51c9a35c9a9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m | FileCheck %s --check-prefixes=RISCV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefixes=RISCV64
+
+define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
+; RISCV32-LABEL: muloti_test:
+; RISCV32: # %bb.0: # %start
+; RISCV32-NEXT: addi sp, sp, -80
+; RISCV32-NEXT: sw ra, 76(sp)
+; RISCV32-NEXT: sw s1, 72(sp)
+; RISCV32-NEXT: sw s2, 68(sp)
+; RISCV32-NEXT: sw s3, 64(sp)
+; RISCV32-NEXT: sw s4, 60(sp)
+; RISCV32-NEXT: sw s5, 56(sp)
+; RISCV32-NEXT: sw s6, 52(sp)
+; RISCV32-NEXT: sw s7, 48(sp)
+; RISCV32-NEXT: mv s3, a2
+; RISCV32-NEXT: mv s1, a1
+; RISCV32-NEXT: mv s2, a0
+; RISCV32-NEXT: sw zero, 12(sp)
+; RISCV32-NEXT: sw zero, 8(sp)
+; RISCV32-NEXT: sw zero, 28(sp)
+; RISCV32-NEXT: sw zero, 24(sp)
+; RISCV32-NEXT: lw s5, 4(a2)
+; RISCV32-NEXT: sw s5, 4(sp)
+; RISCV32-NEXT: lw s6, 0(a2)
+; RISCV32-NEXT: sw s6, 0(sp)
+; RISCV32-NEXT: lw s4, 4(a1)
+; RISCV32-NEXT: sw s4, 20(sp)
+; RISCV32-NEXT: lw s7, 0(a1)
+; RISCV32-NEXT: sw s7, 16(sp)
+; RISCV32-NEXT: addi a0, sp, 32
+; RISCV32-NEXT: addi a1, sp, 16
+; RISCV32-NEXT: mv a2, sp
+; RISCV32-NEXT: call __multi3
+; RISCV32-NEXT: lw t1, 12(s1)
+; RISCV32-NEXT: lw a1, 8(s1)
+; RISCV32-NEXT: mul a0, s5, a1
+; RISCV32-NEXT: mul a2, t1, s6
+; RISCV32-NEXT: add a0, a2, a0
+; RISCV32-NEXT: lw t5, 12(s3)
+; RISCV32-NEXT: lw a3, 8(s3)
+; RISCV32-NEXT: mul a2, s4, a3
+; RISCV32-NEXT: mul a4, t5, s7
+; RISCV32-NEXT: add a2, a4, a2
+; RISCV32-NEXT: mul a4, a3, s7
+; RISCV32-NEXT: mul a5, a1, s6
+; RISCV32-NEXT: add s1, a5, a4
+; RISCV32-NEXT: sltu a4, s1, a5
+; RISCV32-NEXT: mulhu a6, a3, s7
+; RISCV32-NEXT: add a7, a6, a2
+; RISCV32-NEXT: mulhu t2, a1, s6
+; RISCV32-NEXT: add t4, t2, a0
+; RISCV32-NEXT: add a0, t4, a7
+; RISCV32-NEXT: add a0, a0, a4
+; RISCV32-NEXT: xor a2, s5, zero
+; RISCV32-NEXT: snez a2, a2
+; RISCV32-NEXT: xor a4, t1, zero
+; RISCV32-NEXT: snez a4, a4
+; RISCV32-NEXT: and a2, a4, a2
+; RISCV32-NEXT: xor a4, s4, zero
+; RISCV32-NEXT: snez a4, a4
+; RISCV32-NEXT: xor a5, t5, zero
+; RISCV32-NEXT: snez a5, a5
+; RISCV32-NEXT: and a4, a5, a4
+; RISCV32-NEXT: mulhu a5, t5, s7
+; RISCV32-NEXT: xor a5, a5, zero
+; RISCV32-NEXT: snez a5, a5
+; RISCV32-NEXT: or t0, a4, a5
+; RISCV32-NEXT: mulhu a4, t1, s6
+; RISCV32-NEXT: xor a4, a4, zero
+; RISCV32-NEXT: snez a4, a4
+; RISCV32-NEXT: or t3, a2, a4
+; RISCV32-NEXT: lw a4, 44(sp)
+; RISCV32-NEXT: add a5, a4, a0
+; RISCV32-NEXT: lw a2, 40(sp)
+; RISCV32-NEXT: add a0, a2, s1
+; RISCV32-NEXT: sltu t6, a0, a2
+; RISCV32-NEXT: add s1, a5, t6
+; RISCV32-NEXT: beq s1, a4, .LBB0_2
+; RISCV32-NEXT: # %bb.1: # %start
+; RISCV32-NEXT: sltu t6, s1, a4
+; RISCV32-NEXT: .LBB0_2: # %start
+; RISCV32-NEXT: xor a4, s1, a4
+; RISCV32-NEXT: xor a2, a0, a2
+; RISCV32-NEXT: or a2, a2, a4
+; RISCV32-NEXT: sltu t2, t4, t2
+; RISCV32-NEXT: mulhu a4, s5, a1
+; RISCV32-NEXT: xor a4, a4, zero
+; RISCV32-NEXT: snez a4, a4
+; RISCV32-NEXT: or t3, t3, a4
+; RISCV32-NEXT: sltu a6, a7, a6
+; RISCV32-NEXT: mulhu a4, s4, a3
+; RISCV32-NEXT: xor a4, a4, zero
+; RISCV32-NEXT: snez a4, a4
+; RISCV32-NEXT: or a4, t0, a4
+; RISCV32-NEXT: lw a5, 36(sp)
+; RISCV32-NEXT: sw a5, 4(s2)
+; RISCV32-NEXT: lw a5, 32(sp)
+; RISCV32-NEXT: sw a5, 0(s2)
+; RISCV32-NEXT: sw a0, 8(s2)
+; RISCV32-NEXT: sw s1, 12(s2)
+; RISCV32-NEXT: mv a0, zero
+; RISCV32-NEXT: beqz a2, .LBB0_4
+; RISCV32-NEXT: # %bb.3: # %start
+; RISCV32-NEXT: mv a0, t6
+; RISCV32-NEXT: .LBB0_4: # %start
+; RISCV32-NEXT: or a2, a4, a6
+; RISCV32-NEXT: or a4, t3, t2
+; RISCV32-NEXT: or a3, a3, t5
+; RISCV32-NEXT: or a1, a1, t1
+; RISCV32-NEXT: xor a1, a1, zero
+; RISCV32-NEXT: xor a3, a3, zero
+; RISCV32-NEXT: snez a3, a3
+; RISCV32-NEXT: snez a1, a1
+; RISCV32-NEXT: and a1, a1, a3
+; RISCV32-NEXT: or a1, a1, a4
+; RISCV32-NEXT: or a1, a1, a2
+; RISCV32-NEXT: or a0, a1, a0
+; RISCV32-NEXT: andi a0, a0, 1
+; RISCV32-NEXT: sb a0, 16(s2)
+; RISCV32-NEXT: lw s7, 48(sp)
+; RISCV32-NEXT: lw s6, 52(sp)
+; RISCV32-NEXT: lw s5, 56(sp)
+; RISCV32-NEXT: lw s4, 60(sp)
+; RISCV32-NEXT: lw s3, 64(sp)
+; RISCV32-NEXT: lw s2, 68(sp)
+; RISCV32-NEXT: lw s1, 72(sp)
+; RISCV32-NEXT: lw ra, 76(sp)
+; RISCV32-NEXT: addi sp, sp, 80
+; RISCV32-NEXT: ret
+;
+; RISCV64-LABEL: muloti_test:
+; RISCV64: # %bb.0: # %start
+; RISCV64-NEXT: mul a6, a4, a1
+; RISCV64-NEXT: mul a5, a2, a3
+; RISCV64-NEXT: add a6, a5, a6
+; RISCV64-NEXT: mul a5, a1, a3
+; RISCV64-NEXT: sw a5, 0(a0)
+; RISCV64-NEXT: mulhu a7, a1, a3
+; RISCV64-NEXT: add a5, a7, a6
+; RISCV64-NEXT: sw a5, 8(a0)
+; RISCV64-NEXT: sltu a6, a5, a7
+; RISCV64-NEXT: xor a5, a4, zero
+; RISCV64-NEXT: snez a7, a5
+; RISCV64-NEXT: xor a5, a2, zero
+; RISCV64-NEXT: snez a5, a5
+; RISCV64-NEXT: and a5, a5, a7
+; RISCV64-NEXT: mulhu a2, a2, a3
+; RISCV64-NEXT: xor a2, a2, zero
+; RISCV64-NEXT: snez a2, a2
+; RISCV64-NEXT: or a2, a5, a2
+; RISCV64-NEXT: mulhu a1, a4, a1
+; RISCV64-NEXT: xor a1, a1, zero
+; RISCV64-NEXT: snez a1, a1
+; RISCV64-NEXT: or a1, a2, a1
+; RISCV64-NEXT: or a1, a1, a6
+; RISCV64-NEXT: sb a1, 16(a0)
+; RISCV64-NEXT: ret
+start:
+  %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
+  %1 = extractvalue { i128, i1 } %0, 0
+  %2 = extractvalue { i128, i1 } %0, 1
+  %3 = zext i1 %2 to i8
+  %4 = insertvalue { i128, i8 } undef, i128 %1, 0
+  %5 = insertvalue { i128, i8 } %4, i8 %3, 1
+  ret { i128, i8 } %5
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare { i128, i1 } @llvm.umul.with.overflow.i128(i128, i128) #1
+
+attributes #0 = { nounwind readnone uwtable }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind }

