summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
authorDavid Green <david.green@arm.com>2019-11-27 11:01:27 +0000
committerDavid Green <david.green@arm.com>2019-11-27 13:32:29 +0000
commit9f15fcc2718f95f1dac9e6e57aa93d84e9709930 (patch)
treeb68530b63955d00f6a60613b59397b9f3f7114ea /llvm
parent3c1912a733bae09585d88315a7eec39cd3318fde (diff)
downloadbcm5719-llvm-9f15fcc2718f95f1dac9e6e57aa93d84e9709930.tar.gz
bcm5719-llvm-9f15fcc2718f95f1dac9e6e57aa93d84e9709930.zip
[ARM] Replace arm_neon_vqadds with sadd_sat
This replaces the A32 NEON vqadds, vqaddu, vqsubs and vqsubu intrinsics with the target independent sadd_sat, uadd_sat, ssub_sat and usub_sat. This helps generate vqadds from standard IR nodes, which might be produced from the vectoriser. The old variants are removed in the process. Differential Revision: https://reviews.llvm.org/D69350
Diffstat (limited to 'llvm')
-rw-r--r--llvm/include/llvm/IR/IntrinsicsARM.td4
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp20
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp3
-rw-r--r--llvm/lib/Target/ARM/ARMInstrNEON.td56
-rw-r--r--llvm/test/CodeGen/ARM/addsubo-legalization.ll110
-rw-r--r--llvm/test/CodeGen/ARM/neon-v8.1a.ll48
-rw-r--r--llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll330
-rw-r--r--llvm/test/CodeGen/ARM/vmul.ll4
-rw-r--r--llvm/test/CodeGen/ARM/vqadd.ll64
-rw-r--r--llvm/test/CodeGen/ARM/vqdmul.ll24
-rw-r--r--llvm/test/CodeGen/ARM/vqsub.ll64
11 files changed, 501 insertions, 226 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 10417411edc..31069666b1e 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -426,8 +426,6 @@ let IntrProperties = [IntrNoMem, Commutative] in {
def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic;
def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic;
- def int_arm_neon_vqadds : Neon_2Arg_Intrinsic;
- def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic;
// Vector Multiply.
@@ -459,8 +457,6 @@ let IntrProperties = [IntrNoMem, Commutative] in {
// Vector Subtract.
def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic;
def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic;
-def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic;
-def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic;
def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
// Vector Absolute Compare.
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index d2dd2a69bea..5aaf90df6f6 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -559,6 +559,26 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer);
return true;
}
+ if (Name.startswith("arm.neon.vqadds.")) {
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::sadd_sat,
+ F->arg_begin()->getType());
+ return true;
+ }
+ if (Name.startswith("arm.neon.vqaddu.")) {
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::uadd_sat,
+ F->arg_begin()->getType());
+ return true;
+ }
+ if (Name.startswith("arm.neon.vqsubs.")) {
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ssub_sat,
+ F->arg_begin()->getType());
+ return true;
+ }
+ if (Name.startswith("arm.neon.vqsubu.")) {
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::usub_sat,
+ F->arg_begin()->getType());
+ return true;
+ }
if (Name.startswith("aarch64.neon.addp")) {
if (F->arg_size() != 2)
break; // Invalid IR.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index c153e786e2d..83a06767a57 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -209,6 +209,9 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
VT != MVT::v2i64 && VT != MVT::v1i64)
for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT, Legal);
+ if (!VT.isFloatingPoint())
+ for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
+ setOperationAction(Opcode, VT, Legal);
}
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index 94bb45bde57..1653ce1275c 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -4287,10 +4287,10 @@ defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm,
// VQADD : Vector Saturating Add
defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm,
IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
- "vqadd", "s", int_arm_neon_vqadds, 1>;
+ "vqadd", "s", saddsat, 1>;
defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm,
IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
- "vqadd", "u", int_arm_neon_vqaddu, 1>;
+ "vqadd", "u", uaddsat, 1>;
// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q)
defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>;
// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
@@ -4527,22 +4527,22 @@ let Predicates = [HasNEON, HasV8_1a] in {
defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
null_frag>;
- def : Pat<(v4i16 (int_arm_neon_vqadds
+ def : Pat<(v4i16 (saddsat
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
(v4i16 DPR:$Vm))))),
(v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v2i32 (int_arm_neon_vqadds
+ def : Pat<(v2i32 (saddsat
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
(v2i32 DPR:$Vm))))),
(v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v8i16 (int_arm_neon_vqadds
+ def : Pat<(v8i16 (saddsat
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
(v8i16 QPR:$Vm))))),
(v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
- def : Pat<(v4i32 (int_arm_neon_vqadds
+ def : Pat<(v4i32 (saddsat
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
(v4i32 QPR:$Vm))))),
@@ -4551,7 +4551,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
null_frag>;
- def : Pat<(v4i16 (int_arm_neon_vqadds
+ def : Pat<(v4i16 (saddsat
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh
(v4i16 DPR:$Vn),
@@ -4559,7 +4559,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
imm:$lane)))))),
(v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm,
imm:$lane))>;
- def : Pat<(v2i32 (int_arm_neon_vqadds
+ def : Pat<(v2i32 (saddsat
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh
(v2i32 DPR:$Vn),
@@ -4567,7 +4567,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
imm:$lane)))))),
(v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
- def : Pat<(v8i16 (int_arm_neon_vqadds
+ def : Pat<(v8i16 (saddsat
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh
(v8i16 QPR:$src2),
@@ -4579,7 +4579,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
- def : Pat<(v4i32 (int_arm_neon_vqadds
+ def : Pat<(v4i32 (saddsat
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh
(v4i32 QPR:$src2),
@@ -4597,22 +4597,22 @@ let Predicates = [HasNEON, HasV8_1a] in {
defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
null_frag>;
- def : Pat<(v4i16 (int_arm_neon_vqsubs
+ def : Pat<(v4i16 (ssubsat
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
(v4i16 DPR:$Vm))))),
(v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v2i32 (int_arm_neon_vqsubs
+ def : Pat<(v2i32 (ssubsat
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
(v2i32 DPR:$Vm))))),
(v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v8i16 (int_arm_neon_vqsubs
+ def : Pat<(v8i16 (ssubsat
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
(v8i16 QPR:$Vm))))),
(v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
- def : Pat<(v4i32 (int_arm_neon_vqsubs
+ def : Pat<(v4i32 (ssubsat
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
(v4i32 QPR:$Vm))))),
@@ -4621,14 +4621,14 @@ let Predicates = [HasNEON, HasV8_1a] in {
defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
null_frag>;
- def : Pat<(v4i16 (int_arm_neon_vqsubs
+ def : Pat<(v4i16 (ssubsat
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh
(v4i16 DPR:$Vn),
(v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>;
- def : Pat<(v2i32 (int_arm_neon_vqsubs
+ def : Pat<(v2i32 (ssubsat
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh
(v2i32 DPR:$Vn),
@@ -4636,7 +4636,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
imm:$lane)))))),
(v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
- def : Pat<(v8i16 (int_arm_neon_vqsubs
+ def : Pat<(v8i16 (ssubsat
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh
(v8i16 QPR:$src2),
@@ -4648,7 +4648,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
- def : Pat<(v4i32 (int_arm_neon_vqsubs
+ def : Pat<(v4i32 (ssubsat
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh
(v4i32 QPR:$src2),
@@ -4667,20 +4667,20 @@ defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>;
let Predicates = [HasNEON] in {
-def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
(v4i16 DPR:$Vm))))),
(VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+def : Pat<(v2i64 (saddsat (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
(v2i32 DPR:$Vm))))),
(VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
(v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
-def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+def : Pat<(v2i64 (saddsat (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
(v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
@@ -4759,20 +4759,20 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>;
let Predicates = [HasNEON] in {
-def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
(v4i16 DPR:$Vm))))),
(VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+def : Pat<(v2i64 (ssubsat (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
(v2i32 DPR:$Vm))))),
(VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
(v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
-def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+def : Pat<(v2i64 (ssubsat (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
(v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
@@ -5045,10 +5045,10 @@ defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm,
// VQSUB : Vector Saturing Subtract
defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vqsub", "s", int_arm_neon_vqsubs, 0>;
+ "vqsub", "s", ssubsat, 0>;
defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vqsub", "u", int_arm_neon_vqsubu, 0>;
+ "vqsub", "u", usubsat, 0>;
// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>;
// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
diff --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
index e9143d814d3..e3a48ed0c14 100644
--- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll
+++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
@@ -95,48 +95,19 @@ define <2 x i1> @usubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
; CHECK-LABEL: saddo:
; CHECK: @ %bb.0:
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
-; CHECK-NEXT: movs r3, #0
-; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT: vadd.i64 q8, q10, q9
-; CHECK-NEXT: vmov.32 r2, d20[0]
-; CHECK-NEXT: vmov.32 r1, d20[1]
-; CHECK-NEXT: vmov.32 r12, d16[0]
-; CHECK-NEXT: vmov.32 r8, d16[1]
-; CHECK-NEXT: vmov.32 lr, d17[0]
-; CHECK-NEXT: vmov.32 r4, d21[0]
-; CHECK-NEXT: vmov.32 r5, d17[1]
-; CHECK-NEXT: vmov.32 r6, d18[1]
-; CHECK-NEXT: vmov.32 r7, d21[1]
-; CHECK-NEXT: subs.w r2, r12, r2
-; CHECK-NEXT: vmov.32 r2, d19[1]
-; CHECK-NEXT: sbcs.w r1, r8, r1
-; CHECK-NEXT: mov.w r1, #0
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #1
-; CHECK-NEXT: subs.w r4, lr, r4
-; CHECK-NEXT: sbcs.w r7, r5, r7
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r3, #1
-; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r3, #-1
-; CHECK-NEXT: asrs r7, r6, #31
-; CHECK-NEXT: vdup.32 d21, r3
-; CHECK-NEXT: cmp r1, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r1, #-1
-; CHECK-NEXT: vdup.32 d20, r1
+; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vqadd.s64 q10, q9, q8
+; CHECK-NEXT: vadd.i64 q8, q9, q8
+; CHECK-NEXT: vceq.i32 q9, q8, q10
; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vdup.32 d19, r2
-; CHECK-NEXT: vdup.32 d18, r7
-; CHECK-NEXT: veor q9, q9, q10
+; CHECK-NEXT: vrev64.32 q10, q9
+; CHECK-NEXT: vand q9, q9, q10
+; CHECK-NEXT: vmvn q9, q9
; CHECK-NEXT: vmovn.i64 d18, q9
; CHECK-NEXT: vmov r2, r1, d18
; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: bx lr
%x = load <2 x i64>, <2 x i64>* %ptr, align 8
%y = load <2 x i64>, <2 x i64>* %ptr2, align 8
%s = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y)
@@ -149,64 +120,19 @@ define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
define <2 x i1> @ssubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
; CHECK-LABEL: ssubo:
; CHECK: @ %bb.0:
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
-; CHECK-NEXT: vsub.i64 q8, q10, q9
-; CHECK-NEXT: vmov.32 r1, d20[0]
-; CHECK-NEXT: vmov.32 r12, d20[1]
-; CHECK-NEXT: vmov.32 r3, d16[0]
-; CHECK-NEXT: vmov.32 lr, d16[1]
-; CHECK-NEXT: vmov.32 r4, d21[0]
-; CHECK-NEXT: vmov.32 r5, d17[0]
-; CHECK-NEXT: vmov.32 r6, d21[1]
-; CHECK-NEXT: vmov.32 r7, d17[1]
-; CHECK-NEXT: vmov.32 r8, d18[1]
-; CHECK-NEXT: subs r1, r3, r1
-; CHECK-NEXT: vmov.32 r3, d18[0]
-; CHECK-NEXT: sbcs.w r1, lr, r12
-; CHECK-NEXT: vmov.32 r12, d19[0]
-; CHECK-NEXT: mov.w r1, #0
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #1
-; CHECK-NEXT: subs r5, r5, r4
-; CHECK-NEXT: vmov.32 r5, d19[1]
-; CHECK-NEXT: sbcs r7, r6
-; CHECK-NEXT: mov.w r7, #0
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r7, #1
-; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r7, #-1
-; CHECK-NEXT: vdup.32 d21, r7
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: sbcs.w r3, r2, r8
-; CHECK-NEXT: mov.w r3, #0
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r3, #1
-; CHECK-NEXT: rsbs.w r6, r12, #0
-; CHECK-NEXT: sbcs.w r6, r2, r5
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r2, #-1
-; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: vdup.32 d19, r2
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r3, #-1
-; CHECK-NEXT: cmp r1, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r1, #-1
-; CHECK-NEXT: vdup.32 d18, r3
-; CHECK-NEXT: vdup.32 d20, r1
-; CHECK-NEXT: veor q9, q9, q10
+; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vqsub.s64 q10, q9, q8
+; CHECK-NEXT: vsub.i64 q8, q9, q8
+; CHECK-NEXT: vceq.i32 q9, q8, q10
; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vrev64.32 q10, q9
+; CHECK-NEXT: vand q9, q9, q10
+; CHECK-NEXT: vmvn q9, q9
; CHECK-NEXT: vmovn.i64 d18, q9
; CHECK-NEXT: vmov r2, r1, d18
; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: bx lr
%x = load <2 x i64>, <2 x i64>* %ptr, align 8
%y = load <2 x i64>, <2 x i64>* %ptr2, align 8
%s = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y)
diff --git a/llvm/test/CodeGen/ARM/neon-v8.1a.ll b/llvm/test/CodeGen/ARM/neon-v8.1a.ll
index 91259139d44..95d20858008 100644
--- a/llvm/test/CodeGen/ARM/neon-v8.1a.ll
+++ b/llvm/test/CodeGen/ARM/neon-v8.1a.ll
@@ -8,20 +8,20 @@ declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>)
-declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>)
-declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
define <4 x i16> @test_vqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_vqrdmlah_v4i16:
%prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
- %retval = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+ %retval = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK: vqrdmlah.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <4 x i16> %retval
}
@@ -29,7 +29,7 @@ define <4 x i16> @test_vqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16>
define <8 x i16> @test_vqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_vqrdmlah_v8i16:
%prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
- %retval = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+ %retval = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK: vqrdmlah.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
ret <8 x i16> %retval
}
@@ -37,7 +37,7 @@ define <8 x i16> @test_vqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16>
define <2 x i32> @test_vqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_vqrdmlah_v2i32:
%prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
- %retval = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+ %retval = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK: vqrdmlah.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <2 x i32> %retval
}
@@ -45,7 +45,7 @@ define <2 x i32> @test_vqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32>
define <4 x i32> @test_vqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_vqrdmlah_v4i32:
%prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
- %retval = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+ %retval = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK: vqrdmlah.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
ret <4 x i32> %retval
}
@@ -53,7 +53,7 @@ define <4 x i32> @test_vqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32>
define <4 x i16> @test_vqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_vqrdmlsh_v4i16:
%prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
- %retval = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+ %retval = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK: vqrdmlsh.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <4 x i16> %retval
}
@@ -61,7 +61,7 @@ define <4 x i16> @test_vqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16>
define <8 x i16> @test_vqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_vqrdmlsh_v8i16:
%prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
- %retval = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+ %retval = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK: vqrdmlsh.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
ret <8 x i16> %retval
}
@@ -69,7 +69,7 @@ define <8 x i16> @test_vqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16>
define <2 x i32> @test_vqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_vqrdmlsh_v2i32:
%prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
- %retval = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+ %retval = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK: vqrdmlsh.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <2 x i32> %retval
}
@@ -77,7 +77,7 @@ define <2 x i32> @test_vqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32>
define <4 x i32> @test_vqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_vqrdmlsh_v4i32:
%prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
- %retval = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+ %retval = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK: vqrdmlsh.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
ret <4 x i32> %retval
}
@@ -90,7 +90,7 @@ define <4 x i16> @test_vqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16>
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
- %retval = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+ %retval = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK: vqrdmlah.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[3]
ret <4 x i16> %retval
}
@@ -100,7 +100,7 @@ define <8 x i16> @test_vqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
- %retval = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+ %retval = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK: vqrdmlah.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[2]
ret <8 x i16> %retval
}
@@ -110,7 +110,7 @@ define <2 x i32> @test_vqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32>
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%prod = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
- %retval = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+ %retval = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK: vqrdmlah.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[1]
ret <2 x i32> %retval
}
@@ -120,7 +120,7 @@ define <4 x i32> @test_vqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32>
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%prod = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
- %retval = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+ %retval = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK: vqrdmlah.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[0]
ret <4 x i32> %retval
}
@@ -130,7 +130,7 @@ define <4 x i16> @test_vqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16>
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
- %retval = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+ %retval = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK: vqrdmlsh.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[3]
ret <4 x i16> %retval
}
@@ -140,7 +140,7 @@ define <8 x i16> @test_vqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
- %retval = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+ %retval = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK: vqrdmlsh.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[2]
ret <8 x i16> %retval
}
@@ -150,7 +150,7 @@ define <2 x i32> @test_vqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32>
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%prod = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
- %retval = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+ %retval = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK: vqrdmlsh.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[1]
ret <2 x i32> %retval
}
@@ -160,7 +160,7 @@ define <4 x i32> @test_vqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32>
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%prod = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
- %retval = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+ %retval = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK: vqrdmlsh.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[0]
ret <4 x i32> %retval
}
diff --git a/llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll b/llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll
new file mode 100644
index 00000000000..a1323810151
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll
@@ -0,0 +1,330 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
+
+define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vqadds8:
+;CHECK: vqadd.s8
+ %tmp1 = load <8 x i8>, <8 x i8>* %A
+ %tmp2 = load <8 x i8>, <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vqadds16:
+;CHECK: vqadd.s16
+ %tmp1 = load <4 x i16>, <4 x i16>* %A
+ %tmp2 = load <4 x i16>, <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vqadds32:
+;CHECK: vqadd.s32
+ %tmp1 = load <2 x i32>, <2 x i32>* %A
+ %tmp2 = load <2 x i32>, <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: vqadds64:
+;CHECK: vqadd.s64
+ %tmp1 = load <1 x i64>, <1 x i64>* %A
+ %tmp2 = load <1 x i64>, <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vqaddu8:
+;CHECK: vqadd.u8
+ %tmp1 = load <8 x i8>, <8 x i8>* %A
+ %tmp2 = load <8 x i8>, <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vqaddu16:
+;CHECK: vqadd.u16
+ %tmp1 = load <4 x i16>, <4 x i16>* %A
+ %tmp2 = load <4 x i16>, <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vqaddu32:
+;CHECK: vqadd.u32
+ %tmp1 = load <2 x i32>, <2 x i32>* %A
+ %tmp2 = load <2 x i32>, <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: vqaddu64:
+;CHECK: vqadd.u64
+ %tmp1 = load <1 x i64>, <1 x i64>* %A
+ %tmp2 = load <1 x i64>, <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vqaddQs8:
+;CHECK: vqadd.s8
+ %tmp1 = load <16 x i8>, <16 x i8>* %A
+ %tmp2 = load <16 x i8>, <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vqaddQs16:
+;CHECK: vqadd.s16
+ %tmp1 = load <8 x i16>, <8 x i16>* %A
+ %tmp2 = load <8 x i16>, <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vqaddQs32:
+;CHECK: vqadd.s32
+ %tmp1 = load <4 x i32>, <4 x i32>* %A
+ %tmp2 = load <4 x i32>, <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: vqaddQs64:
+;CHECK: vqadd.s64
+ %tmp1 = load <2 x i64>, <2 x i64>* %A
+ %tmp2 = load <2 x i64>, <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vqaddQu8:
+;CHECK: vqadd.u8
+ %tmp1 = load <16 x i8>, <16 x i8>* %A
+ %tmp2 = load <16 x i8>, <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vqaddQu16:
+;CHECK: vqadd.u16
+ %tmp1 = load <8 x i16>, <8 x i16>* %A
+ %tmp2 = load <8 x i16>, <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vqaddQu32:
+;CHECK: vqadd.u32
+ %tmp1 = load <4 x i32>, <4 x i32>* %A
+ %tmp2 = load <4 x i32>, <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: vqaddQu64:
+;CHECK: vqadd.u64
+ %tmp1 = load <2 x i64>, <2 x i64>* %A
+ %tmp2 = load <2 x i64>, <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+
+define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vqsubs8:
+;CHECK: vqsub.s8
+ %tmp1 = load <8 x i8>, <8 x i8>* %A
+ %tmp2 = load <8 x i8>, <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vqsubs16:
+;CHECK: vqsub.s16
+ %tmp1 = load <4 x i16>, <4 x i16>* %A
+ %tmp2 = load <4 x i16>, <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vqsubs32:
+;CHECK: vqsub.s32
+ %tmp1 = load <2 x i32>, <2 x i32>* %A
+ %tmp2 = load <2 x i32>, <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: vqsubs64:
+;CHECK: vqsub.s64
+ %tmp1 = load <1 x i64>, <1 x i64>* %A
+ %tmp2 = load <1 x i64>, <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vqsubu8:
+;CHECK: vqsub.u8
+ %tmp1 = load <8 x i8>, <8 x i8>* %A
+ %tmp2 = load <8 x i8>, <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vqsubu16:
+;CHECK: vqsub.u16
+ %tmp1 = load <4 x i16>, <4 x i16>* %A
+ %tmp2 = load <4 x i16>, <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vqsubu32:
+;CHECK: vqsub.u32
+ %tmp1 = load <2 x i32>, <2 x i32>* %A
+ %tmp2 = load <2 x i32>, <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: vqsubu64:
+;CHECK: vqsub.u64
+ %tmp1 = load <1 x i64>, <1 x i64>* %A
+ %tmp2 = load <1 x i64>, <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vqsubQs8:
+;CHECK: vqsub.s8
+ %tmp1 = load <16 x i8>, <16 x i8>* %A
+ %tmp2 = load <16 x i8>, <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vqsubQs16:
+;CHECK: vqsub.s16
+ %tmp1 = load <8 x i16>, <8 x i16>* %A
+ %tmp2 = load <8 x i16>, <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vqsubQs32:
+;CHECK: vqsub.s32
+ %tmp1 = load <4 x i32>, <4 x i32>* %A
+ %tmp2 = load <4 x i32>, <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: vqsubQs64:
+;CHECK: vqsub.s64
+ %tmp1 = load <2 x i64>, <2 x i64>* %A
+ %tmp2 = load <2 x i64>, <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vqsubQu8:
+;CHECK: vqsub.u8
+ %tmp1 = load <16 x i8>, <16 x i8>* %A
+ %tmp2 = load <16 x i8>, <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vqsubQu16:
+;CHECK: vqsub.u16
+ %tmp1 = load <8 x i16>, <8 x i16>* %A
+ %tmp2 = load <8 x i16>, <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vqsubQu32:
+;CHECK: vqsub.u32
+ %tmp1 = load <4 x i32>, <4 x i32>* %A
+ %tmp2 = load <4 x i32>, <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: vqsubQu64:
+;CHECK: vqsub.u64
+ %tmp1 = load <2 x i64>, <2 x i64>* %A
+ %tmp2 = load <2 x i64>, <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/vmul.ll b/llvm/test/CodeGen/ARM/vmul.ll
index fcffe175e2b..e8cf8d9b27b 100644
--- a/llvm/test/CodeGen/ARM/vmul.ll
+++ b/llvm/test/CodeGen/ARM/vmul.ll
@@ -574,7 +574,7 @@ for.body33: ; preds = %for.body33, %for.bo
%vmovl.i225 = zext <8 x i8> undef to <8 x i16>
%mul.i223 = mul <8 x i16> %vmovl.i249, %vmovl.i249
%vshl_n = shl <8 x i16> %mul.i223, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
- %vqsub2.i216 = tail call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, <8 x i16> %vshl_n) nounwind
+ %vqsub2.i216 = tail call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, <8 x i16> %vshl_n) nounwind
%mul.i209 = mul <8 x i16> undef, <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>
%vshr_n130 = lshr <8 x i16> undef, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%vshr_n134 = lshr <8 x i16> %mul.i209, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
@@ -608,7 +608,7 @@ for.end179: ; preds = %for.cond.loopexit,
}
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone
; vmull lowering would create a zext(v4i8 load()) instead of a zextload(v4i8),
diff --git a/llvm/test/CodeGen/ARM/vqadd.ll b/llvm/test/CodeGen/ARM/vqadd.ll
index d1e90cb2094..47432c7b732 100644
--- a/llvm/test/CodeGen/ARM/vqadd.ll
+++ b/llvm/test/CodeGen/ARM/vqadd.ll
@@ -5,7 +5,7 @@ define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vqadd.s8
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -14,7 +14,7 @@ define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqadd.s16
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -23,7 +23,7 @@ define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqadd.s32
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -32,7 +32,7 @@ define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vqadd.s64
%tmp1 = load <1 x i64>, <1 x i64>* %A
%tmp2 = load <1 x i64>, <1 x i64>* %B
- %tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ %tmp3 = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
@@ -41,7 +41,7 @@ define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vqadd.u8
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -50,7 +50,7 @@ define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqadd.u16
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -59,7 +59,7 @@ define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqadd.u32
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -68,7 +68,7 @@ define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vqadd.u64
%tmp1 = load <1 x i64>, <1 x i64>* %A
%tmp2 = load <1 x i64>, <1 x i64>* %B
- %tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ %tmp3 = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
@@ -77,7 +77,7 @@ define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vqadd.s8
%tmp1 = load <16 x i8>, <16 x i8>* %A
%tmp2 = load <16 x i8>, <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -86,7 +86,7 @@ define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqadd.s16
%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -95,7 +95,7 @@ define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqadd.s32
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i32>, <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -104,7 +104,7 @@ define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vqadd.s64
%tmp1 = load <2 x i64>, <2 x i64>* %A
%tmp2 = load <2 x i64>, <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
@@ -113,7 +113,7 @@ define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vqadd.u8
%tmp1 = load <16 x i8>, <16 x i8>* %A
%tmp2 = load <16 x i8>, <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -122,7 +122,7 @@ define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqadd.u16
%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -131,7 +131,7 @@ define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqadd.u32
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i32>, <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -140,26 +140,26 @@ define <2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vqadd.u64
%tmp1 = load <2 x i64>, <2 x i64>* %A
%tmp2 = load <2 x i64>, <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
-declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/vqdmul.ll b/llvm/test/CodeGen/ARM/vqdmul.ll
index 6da080012a1..fa938d45bec 100644
--- a/llvm/test/CodeGen/ARM/vqdmul.ll
+++ b/llvm/test/CodeGen/ARM/vqdmul.ll
@@ -204,7 +204,7 @@ define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = load <4 x i16>, <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
- %tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
+ %tmp5 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
ret <4 x i32> %tmp5
}
@@ -215,7 +215,7 @@ define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C
%tmp2 = load <2 x i32>, <2 x i32>* %B
%tmp3 = load <2 x i32>, <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
- %tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
+ %tmp5 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
ret <2 x i64> %tmp5
}
@@ -225,7 +225,7 @@ entry:
; CHECK: vqdmlal.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
- %2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
+ %2 = tail call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
ret <4 x i32> %2
}
@@ -235,12 +235,12 @@ entry:
; CHECK: vqdmlal.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
- %2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
+ %2 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
ret <2 x i64> %2
}
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: vqdmlsls16_natural:
@@ -249,7 +249,7 @@ define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = load <4 x i16>, <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
- %tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
+ %tmp5 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
ret <4 x i32> %tmp5
}
@@ -260,7 +260,7 @@ define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C
%tmp2 = load <2 x i32>, <2 x i32>* %B
%tmp3 = load <2 x i32>, <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
- %tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
+ %tmp5 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
ret <2 x i64> %tmp5
}
@@ -270,7 +270,7 @@ entry:
; CHECK: vqdmlsl.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
- %2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
+ %2 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
ret <4 x i32> %2
}
@@ -280,9 +280,9 @@ entry:
; CHECK: vqdmlsl.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
- %2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
+ %2 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
ret <2 x i64> %2
}
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/vqsub.ll b/llvm/test/CodeGen/ARM/vqsub.ll
index 40963ce8248..9864f6421cb 100644
--- a/llvm/test/CodeGen/ARM/vqsub.ll
+++ b/llvm/test/CodeGen/ARM/vqsub.ll
@@ -5,7 +5,7 @@ define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vqsub.s8
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -14,7 +14,7 @@ define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqsub.s16
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -23,7 +23,7 @@ define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqsub.s32
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -32,7 +32,7 @@ define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vqsub.s64
%tmp1 = load <1 x i64>, <1 x i64>* %A
%tmp2 = load <1 x i64>, <1 x i64>* %B
- %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ %tmp3 = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
@@ -41,7 +41,7 @@ define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vqsub.u8
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -50,7 +50,7 @@ define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqsub.u16
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -59,7 +59,7 @@ define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqsub.u32
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -68,7 +68,7 @@ define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vqsub.u64
%tmp1 = load <1 x i64>, <1 x i64>* %A
%tmp2 = load <1 x i64>, <1 x i64>* %B
- %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ %tmp3 = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
@@ -77,7 +77,7 @@ define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vqsub.s8
%tmp1 = load <16 x i8>, <16 x i8>* %A
%tmp2 = load <16 x i8>, <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -86,7 +86,7 @@ define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqsub.s16
%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -95,7 +95,7 @@ define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqsub.s32
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i32>, <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -104,7 +104,7 @@ define <2 x i64> @vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vqsub.s64
%tmp1 = load <2 x i64>, <2 x i64>* %A
%tmp2 = load <2 x i64>, <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
@@ -113,7 +113,7 @@ define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vqsub.u8
%tmp1 = load <16 x i8>, <16 x i8>* %A
%tmp2 = load <16 x i8>, <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -122,7 +122,7 @@ define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqsub.u16
%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -131,7 +131,7 @@ define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqsub.u32
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i32>, <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -140,26 +140,26 @@ define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vqsub.u64
%tmp1 = load <2 x i64>, <2 x i64>* %A
%tmp2 = load <2 x i64>, <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
-declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.usub.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.usub.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
OpenPOWER on IntegriCloud