[SDAG] Add a fallback multiplication expansion

LegalizeIntegerTypes does not have a way to expand multiplications for large integer types (i.e. larger than twice the native bit width). There's no standard runtime call to use in that case, and so we'd just assert. Unfortunately, as it turns out, it is possible to hit this case from standard-ish C code in rare cases. A particular case a user ran into yesterday involved an __int128 induction variable and a loop with a quadratic (not linear) recurrence which triggered some backend logic using SCEVExpander. In this case, the BinomialCoefficient code in SCEV generates some i129 variables, which get widened to i256. At a high level, this is not actually good (i.e. the underlying optimization, PPCLoopPreIncPrep, should not be transforming the loop in question for performance reasons), but regardless, the backend shouldn't crash because of cost-modeling issues in the optimizer. This is a straightforward implementation of the multiplication expansion, based on the algorithm in Hacker's Delight. I validated it against the code for the mul256b function from http://locklessinc.com/articles/256bit_arithmetic/ using random inputs. There should be no functional change for previously-working code (the new expansion code only replaces an assert). Fixes PR19797. llvm-svn: 270720
author: Hal Finkel <hfinkel@anl.gov> 2016-05-25 16:50:22 +0000
committer: Hal Finkel <hfinkel@anl.gov> 2016-05-25 16:50:22 +0000
commit: 6f3387f43468926d7299464f080d117aa758cade (patch)
tree: 7457b4db59b01ec9282c769b264bcc83ec685e55
parent: bef0eb001be6ec33caae8984c7a7fe9b7a841b13 (diff)
download: bcm5719-llvm-6f3387f43468926d7299464f080d117aa758cade.tar.gz
bcm5719-llvm-6f3387f43468926d7299464f080d117aa758cade.zip
2 files changed, 70 insertions, 1 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 22ab8497f4c..b69f1c99935 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2133,7 +2133,49 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
     LC = RTLIB::MUL_I64;
   else if (VT == MVT::i128)
     LC = RTLIB::MUL_I128;
-  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!");
+
+  if (LC == RTLIB::UNKNOWN_LIBCALL) {
+    // We'll expand the multiplication by brute force because we have no other
+    // options. This is a trivially-generalized version of the code from
+    // Hacker's Delight (itself derived from Knuth's Algorithm M from section
+    // 4.3.1).
+    SDValue Mask =
+      DAG.getConstant(APInt::getLowBitsSet(NVT.getSizeInBits(),
+                                           NVT.getSizeInBits() >> 1), dl, NVT);
+    SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask);
+    SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask);
+
+    SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL);
+    SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
+
+    SDValue Shift =
+      DAG.getConstant(NVT.getSizeInBits() >> 1, dl,
+                      TLI.getShiftAmountTy(NVT, DAG.getDataLayout()));
+    SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
+    SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
+    SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift);
+
+    SDValue U = DAG.getNode(ISD::ADD, dl, NVT,
+                            DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TL);
+    SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask);
+    SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift);
+
+    SDValue V = DAG.getNode(ISD::ADD, dl, NVT,
+                            DAG.getNode(ISD::MUL, dl, NVT, LLL, RLH), UL);
+    SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift);
+
+    SDValue W = DAG.getNode(ISD::ADD, dl, NVT,
+                            DAG.getNode(ISD::MUL, dl, NVT, LL, RL),
+                            DAG.getNode(ISD::ADD, dl, NVT, UH, VH));
+    Lo = DAG.getNode(ISD::ADD, dl, NVT, TH,
+                     DAG.getNode(ISD::SHL, dl, NVT, V, Shift));
+
+    Hi = DAG.getNode(ISD::ADD, dl, NVT, W,
+                     DAG.getNode(ISD::ADD, dl, NVT,
+                                 DAG.getNode(ISD::MUL, dl, NVT, RH, LL), 
+                                 DAG.getNode(ISD::MUL, dl, NVT, RL, LH)));
+    return;
+  }
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
   SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true/*irrelevant*/, dl).first,
diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
new file mode 100644
index 00000000000..8b8b10aa179
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test(i256* %a, i256* %b, i256* %out) #0 {
+entry:
+  %av = load i256, i256* %a
+  %bv = load i256, i256* %b
+  %r = mul i256 %av, %bv
+  store i256 %r, i256* %out
+  ret void
+}
+
+; CHECK-LABEL: @test
+; There is a lot of inter-register motion, and so matching the instruction
+; sequence will be fragile. There should be 6 underlying multiplications.
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK-NOT: imulq
+; CHECK: retq
+
+attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
+
author	Hal Finkel <hfinkel@anl.gov>	2016-05-25 16:50:22 +0000
committer	Hal Finkel <hfinkel@anl.gov>	2016-05-25 16:50:22 +0000
commit	6f3387f43468926d7299464f080d117aa758cade (patch)
tree	7457b4db59b01ec9282c769b264bcc83ec685e55
parent	bef0eb001be6ec33caae8984c7a7fe9b7a841b13 (diff)
download	bcm5719-llvm-6f3387f43468926d7299464f080d117aa758cade.tar.gz bcm5719-llvm-6f3387f43468926d7299464f080d117aa758cade.zip