[Intrinsics] define funnel shift IR intrinsics + DAG builder support

As discussed here:
http://lists.llvm.org/pipermail/llvm-dev/2018-May/123292.html
http://lists.llvm.org/pipermail/llvm-dev/2018-July/124400.html

We want to add rotate intrinsics because the IR expansion of that pattern is 4+ instructions, and we can lose pieces of the pattern before it gets to the backend. Generalizing the operation by allowing 2 different input values (plus the 3rd shift/rotate amount) gives us a "funnel shift" operation which may also be a single hardware instruction.

Initially, I thought we needed to define new DAG nodes for these ops, and I spent time working on that (much larger patch), but then I concluded that we don't need it. At least as a first step, we have all of the backend support necessary to match these ops...because it was required. And shepherding these through the IR optimizer is the primary concern, so the IR intrinsics are likely all that we'll ever need.

There was also a question about converting the intrinsics to the existing ROTL/ROTR DAG nodes (along with improving the oversized shift documentation). Again, I don't think that's strictly necessary (as the test results here prove). That can be an efficiency improvement as a small follow-up patch.

So all we're left with is documentation, definition of the IR intrinsics, and DAG builder support.

Differential Revision: https://reviews.llvm.org/D49242
llvm-svn: 337221
author: Sanjay Patel <spatel@rotateright.com> 2018-07-16 22:59:31 +0000
committer: Sanjay Patel <spatel@rotateright.com> 2018-07-16 22:59:31 +0000
commit: c71adc8040b1e382b195a0096015cb5c39628b23 (patch)
tree: 8711ea739eab9d1354abf5fed2412f7d00f75293 /llvm/test/CodeGen/PowerPC/funnel-shift.ll
parent: c4846a551e0c1499e67f4aa287abe89be20ffe5f (diff)
download: bcm5719-llvm-c71adc8040b1e382b195a0096015cb5c39628b23.tar.gz
bcm5719-llvm-c71adc8040b1e382b195a0096015cb5c39628b23.zip
1 files changed, 271 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
new file mode 100644
index 00000000000..9acc1ac5221
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=powerpc64le-- | FileCheck %s
+
+declare i8 @llvm.fshl.i8(i8, i8, i8)
+declare i16 @llvm.fshl.i16(i16, i16, i16)
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+
+declare i8 @llvm.fshr.i8(i8, i8, i8)
+declare i16 @llvm.fshr.i16(i16, i16, i16)
+declare i32 @llvm.fshr.i32(i32, i32, i32)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+
+; General case - all operands can be variables.
+
+define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: fshl_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subfic 6, 5, 32
+; CHECK-NEXT:    andi. 5, 5, 31
+; CHECK-NEXT:    clrlwi 6, 6, 27
+; CHECK-NEXT:    slw 5, 3, 5
+; CHECK-NEXT:    srw 4, 4, 6
+; CHECK-NEXT:    or 4, 5, 4
+; CHECK-NEXT:    isel 3, 3, 4, 2
+; CHECK-NEXT:    blr
+  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) ; variable amount: masked with 31 (andi.), and the isel keeps %x when the masked amount is zero
+  ret i32 %f
+}
+
+; Verify that weird types are minimally supported.
+declare i37 @llvm.fshl.i37(i37, i37, i37)
+define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
+; CHECK-LABEL: fshl_i37:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lis 6, -8857
+; CHECK-NEXT:    subfic 7, 5, 37
+; CHECK-NEXT:    clrldi 5, 5, 27
+; CHECK-NEXT:    clrldi 4, 4, 27
+; CHECK-NEXT:    ori 6, 6, 51366
+; CHECK-NEXT:    clrldi 7, 7, 27
+; CHECK-NEXT:    sldi 6, 6, 32
+; CHECK-NEXT:    oris 6, 6, 3542
+; CHECK-NEXT:    ori 6, 6, 31883
+; CHECK-NEXT:    mulhdu 8, 7, 6
+; CHECK-NEXT:    mulhdu 6, 5, 6
+; CHECK-NEXT:    rldicl 8, 8, 59, 5
+; CHECK-NEXT:    rldicl 6, 6, 59, 5
+; CHECK-NEXT:    mulli 8, 8, 37
+; CHECK-NEXT:    mulli 6, 6, 37
+; CHECK-NEXT:    sub 7, 7, 8
+; CHECK-NEXT:    subf. 5, 6, 5
+; CHECK-NEXT:    srd 4, 4, 7
+; CHECK-NEXT:    sld 5, 3, 5
+; CHECK-NEXT:    or 4, 5, 4
+; CHECK-NEXT:    isel 3, 3, 4, 2
+; CHECK-NEXT:    blr
+  %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z) ; 37 is not a power of two: both shift amounts are reduced modulo 37 via multiply-high by a constant (mulhdu/rldicl/mulli/sub) instead of a mask
+  ret i37 %f
+}
+
+; extract(concat(0b1110000, 0b1111111) << 2) = 0b1000011
+
+declare i7 @llvm.fshl.i7(i7, i7, i7)
+define i7 @fshl_i7_const_fold() {
+; CHECK-LABEL: fshl_i7_const_fold:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li 3, 67
+; CHECK-NEXT:    blr
+  %f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2) ; all-constant operands: expected to fold to 67 (0b1000011)
+  ret i7 %f
+}
+
+; With constant shift amount, this is rotate + insert (missing extended mnemonics).
+
+define i32 @fshl_i32_const_shift(i32 %x, i32 %y) {
+; CHECK-LABEL: fshl_i32_const_shift:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rlwinm 4, 4, 9, 0, 31
+; CHECK-NEXT:    rlwimi 4, 3, 9, 0, 22
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9) ; the constant amount 9 is used directly as the rotate count of both rlwinm and rlwimi
+  ret i32 %f
+}
+
+; Check modulo math on shift amount. 41-32=9.
+
+define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) {
+; CHECK-LABEL: fshl_i32_const_overshift:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rlwinm 4, 4, 9, 0, 31
+; CHECK-NEXT:    rlwimi 4, 3, 9, 0, 22
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41) ; amount exceeds the bit width; expected code is identical to fshl_i32_const_shift
+  ret i32 %f
+}
+
+; 64-bit should also work. 105-64 = 41.
+
+define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) {
+; CHECK-LABEL: fshl_i64_const_overshift:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rotldi 4, 4, 41
+; CHECK-NEXT:    rldimi 4, 3, 41, 0
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105) ; i64 variant: same rotate + insert shape, using the doubleword forms rotldi/rldimi
+  ret i64 %f
+}
+
+; Constant folding should work without any node-specific logic.
+
+define i8 @fshl_i8_const_fold() {
+; CHECK-LABEL: fshl_i8_const_fold:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li 3, 128
+; CHECK-NEXT:    blr
+  %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7) ; concat(0xFF, 0x00) << 7, high byte = 0x80 = 128
+  ret i8 %f
+}
+
+; Repeat everything for funnel shift right.
+
+; General case - all operands can be variables.
+
+define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: fshr_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subfic 6, 5, 32
+; CHECK-NEXT:    andi. 5, 5, 31
+; CHECK-NEXT:    clrlwi 6, 6, 27
+; CHECK-NEXT:    srw 5, 4, 5
+; CHECK-NEXT:    slw 3, 3, 6
+; CHECK-NEXT:    or 3, 3, 5
+; CHECK-NEXT:    isel 3, 4, 3, 2
+; CHECK-NEXT:    blr
+  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) ; variable amount: masked with 31 (andi.), and the isel picks %y when the masked amount is zero
+  ret i32 %f
+}
+
+; Verify that weird types are minimally supported.
+declare i37 @llvm.fshr.i37(i37, i37, i37)
+define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
+; CHECK-LABEL: fshr_i37:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lis 6, -8857
+; CHECK-NEXT:    subfic 7, 5, 37
+; CHECK-NEXT:    clrldi 5, 5, 27
+; CHECK-NEXT:    clrldi 9, 4, 27
+; CHECK-NEXT:    ori 6, 6, 51366
+; CHECK-NEXT:    clrldi 7, 7, 27
+; CHECK-NEXT:    sldi 6, 6, 32
+; CHECK-NEXT:    oris 6, 6, 3542
+; CHECK-NEXT:    ori 6, 6, 31883
+; CHECK-NEXT:    mulhdu 8, 5, 6
+; CHECK-NEXT:    mulhdu 6, 7, 6
+; CHECK-NEXT:    rldicl 8, 8, 59, 5
+; CHECK-NEXT:    rldicl 6, 6, 59, 5
+; CHECK-NEXT:    mulli 8, 8, 37
+; CHECK-NEXT:    mulli 6, 6, 37
+; CHECK-NEXT:    subf. 5, 8, 5
+; CHECK-NEXT:    sub 6, 7, 6
+; CHECK-NEXT:    srd 5, 9, 5
+; CHECK-NEXT:    sld 3, 3, 6
+; CHECK-NEXT:    or 3, 3, 5
+; CHECK-NEXT:    isel 3, 4, 3, 2
+; CHECK-NEXT:    blr
+  %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z) ; 37 is not a power of two: both shift amounts are reduced modulo 37 via multiply-high by a constant (mulhdu/rldicl/mulli/sub) instead of a mask
+  ret i37 %f
+}
+
+; extract(concat(0b1110000, 0b1111111) >> 2) = 0b0011111
+
+declare i7 @llvm.fshr.i7(i7, i7, i7)
+define i7 @fshr_i7_const_fold() {
+; CHECK-LABEL: fshr_i7_const_fold:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li 3, 31
+; CHECK-NEXT:    blr
+  %f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2) ; all-constant operands: expected to fold to 31 (0b0011111)
+  ret i7 %f
+}
+
+; With constant shift amount, this is rotate + insert (missing extended mnemonics).
+
+define i32 @fshr_i32_const_shift(i32 %x, i32 %y) {
+; CHECK-LABEL: fshr_i32_const_shift:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rlwinm 4, 4, 23, 0, 31
+; CHECK-NEXT:    rlwimi 4, 3, 23, 0, 8
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9) ; the right shift by 9 is expected as left rotates by 32-9=23
+  ret i32 %f
+}
+
+; Check modulo math on shift amount. 41-32=9.
+
+define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) {
+; CHECK-LABEL: fshr_i32_const_overshift:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rlwinm 4, 4, 23, 0, 31
+; CHECK-NEXT:    rlwimi 4, 3, 23, 0, 8
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41) ; amount exceeds the bit width; expected code is identical to fshr_i32_const_shift
+  ret i32 %f
+}
+
+; 64-bit should also work. 105-64 = 41.
+
+define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) {
+; CHECK-LABEL: fshr_i64_const_overshift:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rotldi 4, 4, 23
+; CHECK-NEXT:    rldimi 4, 3, 23, 0
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105) ; the right shift by 41 (after modulo 64) is expected as left rotates by 64-41=23
+  ret i64 %f
+}
+
+; Constant folding should work without any node-specific logic.
+
+define i8 @fshr_i8_const_fold() {
+; CHECK-LABEL: fshr_i8_const_fold:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li 3, 254
+; CHECK-NEXT:    blr
+  %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7) ; concat(0xFF, 0x00) >> 7, low byte = 0xFE = 254
+  ret i8 %f
+}
+
+define i32 @fshl_i32_shift_by_bitwidth(i32 %x, i32 %y) {
+; CHECK-LABEL: fshl_i32_shift_by_bitwidth:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    blr
+  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 32) ; amount equals the bit width (32 mod 32 = 0): expected to return %x unchanged (bare blr)
+  ret i32 %f
+}
+
+define i32 @fshr_i32_shift_by_bitwidth(i32 %x, i32 %y) {
+; CHECK-LABEL: fshr_i32_shift_by_bitwidth:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 32) ; amount equals the bit width (32 mod 32 = 0): expected to return %y (mr 3, 4)
+  ret i32 %f
+}
+
+define <4 x i32> @fshl_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: fshl_v4i32_shift_by_bitwidth:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    blr
+  %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>) ; splat amount equals the element width: expected to return %x unchanged (bare blr)
+  ret <4 x i32> %f
+}
+
+define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: fshr_v4i32_shift_by_bitwidth:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmr 2, 3
+; CHECK-NEXT:    blr
+  %f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>) ; splat amount equals the element width: expected to return %y (vmr 2, 3)
+  ret <4 x i32> %f
+}
+
author	Sanjay Patel <spatel@rotateright.com>	2018-07-16 22:59:31 +0000
committer	Sanjay Patel <spatel@rotateright.com>	2018-07-16 22:59:31 +0000
commit	c71adc8040b1e382b195a0096015cb5c39628b23 (patch)
tree	8711ea739eab9d1354abf5fed2412f7d00f75293 /llvm/test/CodeGen/PowerPC/funnel-shift.ll
parent	c4846a551e0c1499e67f4aa287abe89be20ffe5f (diff)
download	bcm5719-llvm-c71adc8040b1e382b195a0096015cb5c39628b23.tar.gz bcm5719-llvm-c71adc8040b1e382b195a0096015cb5c39628b23.zip