diff options
| author | David Green <david.green@arm.com> | 2019-11-27 11:01:27 +0000 |
|---|---|---|
| committer | David Green <david.green@arm.com> | 2019-11-27 13:32:29 +0000 |
| commit | 9f15fcc2718f95f1dac9e6e57aa93d84e9709930 (patch) | |
| tree | b68530b63955d00f6a60613b59397b9f3f7114ea /llvm | |
| parent | 3c1912a733bae09585d88315a7eec39cd3318fde (diff) | |
| download | bcm5719-llvm-9f15fcc2718f95f1dac9e6e57aa93d84e9709930.tar.gz bcm5719-llvm-9f15fcc2718f95f1dac9e6e57aa93d84e9709930.zip | |
[ARM] Replace arm_neon_vqadds with sadd_sat
This replaces the A32 NEON vqadds, vqaddu, vqsubs and vqsubu intrinsics
with the target-independent sadd_sat, uadd_sat, ssub_sat and usub_sat.
This helps generate vqadds from standard IR nodes, which might be
produced by the vectoriser. The old variants are removed in the
process.
Differential Revision: https://reviews.llvm.org/D69350
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/include/llvm/IR/IntrinsicsARM.td | 4 | ||||
| -rw-r--r-- | llvm/lib/IR/AutoUpgrade.cpp | 20 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMInstrNEON.td | 56 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/addsubo-legalization.ll | 110 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/neon-v8.1a.ll | 48 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll | 330 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/vmul.ll | 4 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/vqadd.ll | 64 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/vqdmul.ll | 24 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/vqsub.ll | 64 |
11 files changed, 501 insertions, 226 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index 10417411edc..31069666b1e 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -426,8 +426,6 @@ let IntrProperties = [IntrNoMem, Commutative] in { def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic; def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic; def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic; - def int_arm_neon_vqadds : Neon_2Arg_Intrinsic; - def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic; def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic; // Vector Multiply. @@ -459,8 +457,6 @@ let IntrProperties = [IntrNoMem, Commutative] in { // Vector Subtract. def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic; def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic; -def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic; -def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic; def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic; // Vector Absolute Compare. diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index d2dd2a69bea..5aaf90df6f6 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -559,6 +559,26 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer); return true; } + if (Name.startswith("arm.neon.vqadds.")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::sadd_sat, + F->arg_begin()->getType()); + return true; + } + if (Name.startswith("arm.neon.vqaddu.")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::uadd_sat, + F->arg_begin()->getType()); + return true; + } + if (Name.startswith("arm.neon.vqsubs.")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ssub_sat, + F->arg_begin()->getType()); + return true; + } + if (Name.startswith("arm.neon.vqsubu.")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::usub_sat, + F->arg_begin()->getType()); + return true; + } 
if (Name.startswith("aarch64.neon.addp")) { if (F->arg_size() != 2) break; // Invalid IR. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index c153e786e2d..83a06767a57 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -209,6 +209,9 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, VT != MVT::v2i64 && VT != MVT::v1i64) for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); + if (!VT.isFloatingPoint()) + for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}) + setOperationAction(Opcode, VT, Legal); } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index 94bb45bde57..1653ce1275c 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -4287,10 +4287,10 @@ defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm, // VQADD : Vector Saturating Add defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, - "vqadd", "s", int_arm_neon_vqadds, 1>; + "vqadd", "s", saddsat, 1>; defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, - "vqadd", "u", int_arm_neon_vqaddu, 1>; + "vqadd", "u", uaddsat, 1>; // VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>; // VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q) @@ -4527,22 +4527,22 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqadds + def : Pat<(v4i16 (saddsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (v4i16 
(VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v2i32 (int_arm_neon_vqadds + def : Pat<(v2i32 (saddsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v8i16 (int_arm_neon_vqadds + def : Pat<(v8i16 (saddsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))))), (v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; - def : Pat<(v4i32 (int_arm_neon_vqadds + def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))))), @@ -4551,7 +4551,7 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqadds + def : Pat<(v4i16 (saddsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), @@ -4559,7 +4559,7 @@ let Predicates = [HasNEON, HasV8_1a] in { imm:$lane)))))), (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; - def : Pat<(v2i32 (int_arm_neon_vqadds + def : Pat<(v2i32 (saddsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), @@ -4567,7 +4567,7 @@ let Predicates = [HasNEON, HasV8_1a] in { imm:$lane)))))), (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; - def : Pat<(v8i16 (int_arm_neon_vqadds + def : Pat<(v8i16 (saddsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), @@ -4579,7 +4579,7 @@ let Predicates = [HasNEON, HasV8_1a] in { QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; - def : Pat<(v4i32 (int_arm_neon_vqadds + def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), @@ -4597,22 +4597,22 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", null_frag>; - def : 
Pat<(v4i16 (int_arm_neon_vqsubs + def : Pat<(v4i16 (ssubsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v2i32 (int_arm_neon_vqsubs + def : Pat<(v2i32 (ssubsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v8i16 (int_arm_neon_vqsubs + def : Pat<(v8i16 (ssubsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))))), (v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; - def : Pat<(v4i32 (int_arm_neon_vqsubs + def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))))), @@ -4621,14 +4621,14 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqsubs + def : Pat<(v4i16 (ssubsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; - def : Pat<(v2i32 (int_arm_neon_vqsubs + def : Pat<(v2i32 (ssubsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), @@ -4636,7 +4636,7 @@ let Predicates = [HasNEON, HasV8_1a] in { imm:$lane)))))), (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; - def : Pat<(v8i16 (int_arm_neon_vqsubs + def : Pat<(v8i16 (ssubsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), @@ -4648,7 +4648,7 @@ let Predicates = [HasNEON, HasV8_1a] in { QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; - def : Pat<(v4i32 (int_arm_neon_vqsubs + def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), @@ -4667,20 +4667,20 @@ defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, 
IIC_VMACi16D, IIC_VMACi32D, defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>; let Predicates = [HasNEON] in { -def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), +def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), +def : Pat<(v2i64 (saddsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), +def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; -def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), +def : Pat<(v2i64 (saddsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), @@ -4759,20 +4759,20 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>; let Predicates = [HasNEON] in { -def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), +def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), +def : Pat<(v2i64 (ssubsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), +def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; -def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), +def : 
Pat<(v2i64 (ssubsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), @@ -5045,10 +5045,10 @@ defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm, // VQSUB : Vector Saturing Subtract defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vqsub", "s", int_arm_neon_vqsubs, 0>; + "vqsub", "s", ssubsat, 0>; defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vqsub", "u", int_arm_neon_vqsubu, 0>; + "vqsub", "u", usubsat, 0>; // VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q) defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>; // VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q) diff --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll index e9143d814d3..e3a48ed0c14 100644 --- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll +++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll @@ -95,48 +95,19 @@ define <2 x i1> @usubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) { define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) { ; CHECK-LABEL: saddo: ; CHECK: @ %bb.0: -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vadd.i64 q8, q10, q9 -; CHECK-NEXT: vmov.32 r2, d20[0] -; CHECK-NEXT: vmov.32 r1, d20[1] -; CHECK-NEXT: vmov.32 r12, d16[0] -; CHECK-NEXT: vmov.32 r8, d16[1] -; CHECK-NEXT: vmov.32 lr, d17[0] -; CHECK-NEXT: vmov.32 r4, d21[0] -; CHECK-NEXT: vmov.32 r5, d17[1] -; CHECK-NEXT: vmov.32 r6, d18[1] -; CHECK-NEXT: vmov.32 r7, d21[1] -; CHECK-NEXT: subs.w r2, r12, r2 -; CHECK-NEXT: vmov.32 r2, d19[1] -; CHECK-NEXT: sbcs.w r1, r8, r1 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: subs.w r4, lr, r4 -; CHECK-NEXT: sbcs.w r7, r5, r7 -; 
CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r3, #-1 -; CHECK-NEXT: asrs r7, r6, #31 -; CHECK-NEXT: vdup.32 d21, r3 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: vdup.32 d20, r1 +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vqadd.s64 q10, q9, q8 +; CHECK-NEXT: vadd.i64 q8, q9, q8 +; CHECK-NEXT: vceq.i32 q9, q8, q10 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0] -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vdup.32 d19, r2 -; CHECK-NEXT: vdup.32 d18, r7 -; CHECK-NEXT: veor q9, q9, q10 +; CHECK-NEXT: vrev64.32 q10, q9 +; CHECK-NEXT: vand q9, q9, q10 +; CHECK-NEXT: vmvn q9, q9 ; CHECK-NEXT: vmovn.i64 d18, q9 ; CHECK-NEXT: vmov r2, r1, d18 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: bx lr %x = load <2 x i64>, <2 x i64>* %ptr, align 8 %y = load <2 x i64>, <2 x i64>* %ptr2, align 8 %s = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y) @@ -149,64 +120,19 @@ define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) { define <2 x i1> @ssubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) { ; CHECK-LABEL: ssubo: ; CHECK: @ %bb.0: -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vsub.i64 q8, q10, q9 -; CHECK-NEXT: vmov.32 r1, d20[0] -; CHECK-NEXT: vmov.32 r12, d20[1] -; CHECK-NEXT: vmov.32 r3, d16[0] -; CHECK-NEXT: vmov.32 lr, d16[1] -; CHECK-NEXT: vmov.32 r4, d21[0] -; CHECK-NEXT: vmov.32 r5, d17[0] -; CHECK-NEXT: vmov.32 r6, d21[1] -; CHECK-NEXT: vmov.32 r7, d17[1] -; CHECK-NEXT: vmov.32 r8, d18[1] -; CHECK-NEXT: subs r1, r3, r1 -; CHECK-NEXT: vmov.32 r3, d18[0] -; CHECK-NEXT: sbcs.w r1, lr, r12 -; CHECK-NEXT: vmov.32 r12, d19[0] -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: subs r5, r5, 
r4 -; CHECK-NEXT: vmov.32 r5, d19[1] -; CHECK-NEXT: sbcs r7, r6 -; CHECK-NEXT: mov.w r7, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r7, #-1 -; CHECK-NEXT: vdup.32 d21, r7 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: sbcs.w r3, r2, r8 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: rsbs.w r6, r12, #0 -; CHECK-NEXT: sbcs.w r6, r2, r5 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r2, #-1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vdup.32 d19, r2 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r3, #-1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: vdup.32 d18, r3 -; CHECK-NEXT: vdup.32 d20, r1 -; CHECK-NEXT: veor q9, q9, q10 +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vqsub.s64 q10, q9, q8 +; CHECK-NEXT: vsub.i64 q8, q9, q8 +; CHECK-NEXT: vceq.i32 q9, q8, q10 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: vrev64.32 q10, q9 +; CHECK-NEXT: vand q9, q9, q10 +; CHECK-NEXT: vmvn q9, q9 ; CHECK-NEXT: vmovn.i64 d18, q9 ; CHECK-NEXT: vmov r2, r1, d18 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: bx lr %x = load <2 x i64>, <2 x i64>* %ptr, align 8 %y = load <2 x i64>, <2 x i64>* %ptr2, align 8 %s = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y) diff --git a/llvm/test/CodeGen/ARM/neon-v8.1a.ll b/llvm/test/CodeGen/ARM/neon-v8.1a.ll index 91259139d44..95d20858008 100644 --- a/llvm/test/CodeGen/ARM/neon-v8.1a.ll +++ b/llvm/test/CodeGen/ARM/neon-v8.1a.ll @@ -8,20 +8,20 @@ declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i16> 
@llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) -declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) -declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) -declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) -declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) define <4 x i16> @test_vqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmlah_v4i16: %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) - %retval = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %acc, <4 x i16> %prod) + %retval = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod) ; CHECK: vqrdmlah.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <4 x i16> %retval } @@ -29,7 +29,7 @@ define <4 x i16> @test_vqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> define <8 x i16> @test_vqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmlah_v8i16: %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) - %retval = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %acc, <8 x i16> %prod) + %retval = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod) ; CHECK: vqrdmlah.s16 
{{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} ret <8 x i16> %retval } @@ -37,7 +37,7 @@ define <8 x i16> @test_vqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> define <2 x i32> @test_vqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmlah_v2i32: %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) - %retval = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %acc, <2 x i32> %prod) + %retval = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod) ; CHECK: vqrdmlah.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <2 x i32> %retval } @@ -45,7 +45,7 @@ define <2 x i32> @test_vqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> define <4 x i32> @test_vqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmlah_v4i32: %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) - %retval = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %acc, <4 x i32> %prod) + %retval = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) ; CHECK: vqrdmlah.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} ret <4 x i32> %retval } @@ -53,7 +53,7 @@ define <4 x i32> @test_vqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> define <4 x i16> @test_vqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmlsh_v4i16: %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) - %retval = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %acc, <4 x i16> %prod) + %retval = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod) ; CHECK: vqrdmlsh.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <4 x i16> %retval } @@ -61,7 +61,7 @@ define <4 x i16> @test_vqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> define <8 x i16> @test_vqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmlsh_v8i16: %prod = call <8 x i16> 
@llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) - %retval = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %acc, <8 x i16> %prod) + %retval = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod) ; CHECK: vqrdmlsh.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} ret <8 x i16> %retval } @@ -69,7 +69,7 @@ define <8 x i16> @test_vqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> define <2 x i32> @test_vqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmlsh_v2i32: %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) - %retval = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %acc, <2 x i32> %prod) + %retval = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod) ; CHECK: vqrdmlsh.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <2 x i32> %retval } @@ -77,7 +77,7 @@ define <2 x i32> @test_vqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> define <4 x i32> @test_vqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmlsh_v4i32: %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) - %retval = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %acc, <4 x i32> %prod) + %retval = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) ; CHECK: vqrdmlsh.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} ret <4 x i32> %retval } @@ -90,7 +90,7 @@ define <4 x i16> @test_vqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) - %retval = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %acc, <4 x i16> %prod) + %retval = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod) ; CHECK: vqrdmlah.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[3] ret <4 x i16> %retval } @@ -100,7 +100,7 @@ 
define <8 x i16> @test_vqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16 entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) - %retval = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %acc, <8 x i16> %prod) + %retval = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod) ; CHECK: vqrdmlah.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[2] ret <8 x i16> %retval } @@ -110,7 +110,7 @@ define <2 x i32> @test_vqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %prod = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) - %retval = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %acc, <2 x i32> %prod) + %retval = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod) ; CHECK: vqrdmlah.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[1] ret <2 x i32> %retval } @@ -120,7 +120,7 @@ define <4 x i32> @test_vqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32> entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %prod = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) - %retval = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %acc, <4 x i32> %prod) + %retval = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) ; CHECK: vqrdmlah.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[0] ret <4 x i32> %retval } @@ -130,7 +130,7 @@ define <4 x i16> @test_vqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) - %retval = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %acc, <4 x i16> 
%prod) + %retval = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod) ; CHECK: vqrdmlsh.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[3] ret <4 x i16> %retval } @@ -140,7 +140,7 @@ define <8 x i16> @test_vqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16 entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) - %retval = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %acc, <8 x i16> %prod) + %retval = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod) ; CHECK: vqrdmlsh.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[2] ret <8 x i16> %retval } @@ -150,7 +150,7 @@ define <2 x i32> @test_vqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %prod = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) - %retval = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %acc, <2 x i32> %prod) + %retval = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod) ; CHECK: vqrdmlsh.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[1] ret <2 x i32> %retval } @@ -160,7 +160,7 @@ define <4 x i32> @test_vqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32> entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %prod = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) - %retval = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %acc, <4 x i32> %prod) + %retval = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) ; CHECK: vqrdmlsh.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[0] ret <4 x i32> %retval } diff --git a/llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll b/llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll new file mode 100644 index 00000000000..a1323810151 --- /dev/null 
+++ b/llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll @@ -0,0 +1,330 @@ +; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s + +define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: vqadds8: +;CHECK: vqadd.s8 + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: vqadds16: +;CHECK: vqadd.s16 + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK-LABEL: vqadds32: +;CHECK: vqadd.s32 + %tmp1 = load <2 x i32>, <2 x i32>* %A + %tmp2 = load <2 x i32>, <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK-LABEL: vqadds64: +;CHECK: vqadd.s64 + %tmp1 = load <1 x i64>, <1 x i64>* %A + %tmp2 = load <1 x i64>, <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: vqaddu8: +;CHECK: vqadd.u8 + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: vqaddu16: +;CHECK: vqadd.u16 + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x 
i32>* %B) nounwind { +;CHECK-LABEL: vqaddu32: +;CHECK: vqadd.u32 + %tmp1 = load <2 x i32>, <2 x i32>* %A + %tmp2 = load <2 x i32>, <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK-LABEL: vqaddu64: +;CHECK: vqadd.u64 + %tmp1 = load <1 x i64>, <1 x i64>* %A + %tmp2 = load <1 x i64>, <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: vqaddQs8: +;CHECK: vqadd.s8 + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK-LABEL: vqaddQs16: +;CHECK: vqadd.s16 + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: vqaddQs32: +;CHECK: vqadd.s32 + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK-LABEL: vqaddQs64: +;CHECK: vqadd.s64 + %tmp1 = load <2 x i64>, <2 x i64>* %A + %tmp2 = load <2 x i64>, <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: vqaddQu8: +;CHECK: vqadd.u8 + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 
= call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK-LABEL: vqaddQu16: +;CHECK: vqadd.u16 + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: vqaddQu32: +;CHECK: vqadd.u32 + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK-LABEL: vqaddQu64: +;CHECK: vqadd.u64 + %tmp1 = load <2 x i64>, <2 x i64>* %A + %tmp2 = load <2 x i64>, <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + + +define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: vqsubs8: +;CHECK: vqsub.s8 + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: vqsubs16: +;CHECK: vqsub.s16 + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK-LABEL: vqsubs32: +;CHECK: vqsub.s32 + %tmp1 = load <2 x i32>, <2 x i32>* %A + %tmp2 = load <2 x i32>, <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) 
nounwind { +;CHECK-LABEL: vqsubs64: +;CHECK: vqsub.s64 + %tmp1 = load <1 x i64>, <1 x i64>* %A + %tmp2 = load <1 x i64>, <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: vqsubu8: +;CHECK: vqsub.u8 + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: vqsubu16: +;CHECK: vqsub.u16 + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK-LABEL: vqsubu32: +;CHECK: vqsub.u32 + %tmp1 = load <2 x i32>, <2 x i32>* %A + %tmp2 = load <2 x i32>, <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK-LABEL: vqsubu64: +;CHECK: vqsub.u64 + %tmp1 = load <1 x i64>, <1 x i64>* %A + %tmp2 = load <1 x i64>, <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: vqsubQs8: +;CHECK: vqsub.s8 + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK-LABEL: vqsubQs16: +;CHECK: vqsub.s16 + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = call <8 x i16> 
@llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: vqsubQs32: +;CHECK: vqsub.s32 + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK-LABEL: vqsubQs64: +;CHECK: vqsub.s64 + %tmp1 = load <2 x i64>, <2 x i64>* %A + %tmp2 = load <2 x i64>, <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: vqsubQu8: +;CHECK: vqsub.u8 + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK-LABEL: vqsubQu16: +;CHECK: vqsub.u16 + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: vqsubQu32: +;CHECK: vqsub.u32 + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK-LABEL: vqsubQu64: +;CHECK: vqsub.u64 + %tmp1 = load <2 x i64>, <2 x i64>* %A + %tmp2 = load <2 x i64>, <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x 
i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x 
i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/llvm/test/CodeGen/ARM/vmul.ll b/llvm/test/CodeGen/ARM/vmul.ll index fcffe175e2b..e8cf8d9b27b 100644 --- a/llvm/test/CodeGen/ARM/vmul.ll +++ b/llvm/test/CodeGen/ARM/vmul.ll @@ -574,7 +574,7 @@ for.body33: ; preds = %for.body33, %for.bo %vmovl.i225 = zext <8 x i8> undef to <8 x i16> %mul.i223 = mul <8 x i16> %vmovl.i249, %vmovl.i249 %vshl_n = shl <8 x i16> %mul.i223, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> - %vqsub2.i216 = tail call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, <8 x i16> %vshl_n) nounwind + %vqsub2.i216 = tail call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, <8 x i16> %vshl_n) nounwind %mul.i209 = mul <8 x i16> undef, <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80> %vshr_n130 = lshr <8 x i16> undef, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> %vshr_n134 = lshr <8 x i16> %mul.i209, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -608,7 +608,7 @@ for.end179: ; preds = %for.cond.loopexit, } declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x 
i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone ; vmull lowering would create a zext(v4i8 load()) instead of a zextload(v4i8), diff --git a/llvm/test/CodeGen/ARM/vqadd.ll b/llvm/test/CodeGen/ARM/vqadd.ll index d1e90cb2094..47432c7b732 100644 --- a/llvm/test/CodeGen/ARM/vqadd.ll +++ b/llvm/test/CodeGen/ARM/vqadd.ll @@ -5,7 +5,7 @@ define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vqadd.s8 %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vqadd.s16 %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -23,7 +23,7 @@ define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vqadd.s32 %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -32,7 +32,7 @@ define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ;CHECK: vqadd.s64 %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + %tmp3 = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) ret <1 x i64> %tmp3 } @@ -41,7 +41,7 @@ define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { 
;CHECK: vqadd.u8 %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -50,7 +50,7 @@ define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vqadd.u16 %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -59,7 +59,7 @@ define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vqadd.u32 %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -68,7 +68,7 @@ define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ;CHECK: vqadd.u64 %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + %tmp3 = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) ret <1 x i64> %tmp3 } @@ -77,7 +77,7 @@ define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vqadd.s8 %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -86,7 +86,7 @@ define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vqadd.s16 %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x 
i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -95,7 +95,7 @@ define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vqadd.s32 %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -104,7 +104,7 @@ define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: vqadd.s64 %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } @@ -113,7 +113,7 @@ define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vqadd.u8 %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -122,7 +122,7 @@ define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vqadd.u16 %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -131,7 +131,7 @@ define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vqadd.u32 %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -140,26 +140,26 @@ define 
<2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: vqadd.u64 %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) 
nounwind readnone +declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/llvm/test/CodeGen/ARM/vqdmul.ll b/llvm/test/CodeGen/ARM/vqdmul.ll index 6da080012a1..fa938d45bec 100644 --- a/llvm/test/CodeGen/ARM/vqdmul.ll +++ b/llvm/test/CodeGen/ARM/vqdmul.ll @@ -204,7 +204,7 @@ define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i16>, <4 x i16>* %C %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3) - %tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) + %tmp5 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) ret <4 x i32> %tmp5 } @@ -215,7 +215,7 @@ define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i32>, <2 x i32>* %C %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3) - %tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> 
%tmp4) + %tmp5 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) ret <2 x i64> %tmp5 } @@ -225,7 +225,7 @@ entry: ; CHECK: vqdmlal.s16 q0, d2, d3[1] %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0) - %2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) + %2 = tail call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) ret <4 x i32> %2 } @@ -235,12 +235,12 @@ entry: ; CHECK: vqdmlal.s32 q0, d2, d3[1] %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0) - %2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) + %2 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) ret <2 x i64> %2 } -declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { ;CHECK-LABEL: vqdmlsls16_natural: @@ -249,7 +249,7 @@ define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i16>, <4 x i16>* %C %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3) - %tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) + %tmp5 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) ret <4 x i32> %tmp5 } @@ -260,7 +260,7 @@ 
define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i32>, <2 x i32>* %C %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3) - %tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) + %tmp5 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) ret <2 x i64> %tmp5 } @@ -270,7 +270,7 @@ entry: ; CHECK: vqdmlsl.s16 q0, d2, d3[1] %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0) - %2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) + %2 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) ret <4 x i32> %2 } @@ -280,9 +280,9 @@ entry: ; CHECK: vqdmlsl.s32 q0, d2, d3[1] %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0) - %2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) + %2 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) ret <2 x i64> %2 } -declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/llvm/test/CodeGen/ARM/vqsub.ll b/llvm/test/CodeGen/ARM/vqsub.ll index 40963ce8248..9864f6421cb 100644 --- a/llvm/test/CodeGen/ARM/vqsub.ll +++ b/llvm/test/CodeGen/ARM/vqsub.ll @@ -5,7 +5,7 @@ define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: 
vqsub.s8 %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vqsub.s16 %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -23,7 +23,7 @@ define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vqsub.s32 %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -32,7 +32,7 @@ define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ;CHECK: vqsub.s64 %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + %tmp3 = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) ret <1 x i64> %tmp3 } @@ -41,7 +41,7 @@ define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vqsub.u8 %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -50,7 +50,7 @@ define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vqsub.u16 %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = 
call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -59,7 +59,7 @@ define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vqsub.u32 %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -68,7 +68,7 @@ define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ;CHECK: vqsub.u64 %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + %tmp3 = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) ret <1 x i64> %tmp3 } @@ -77,7 +77,7 @@ define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vqsub.s8 %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -86,7 +86,7 @@ define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vqsub.s16 %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -95,7 +95,7 @@ define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vqsub.s32 %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -104,7 +104,7 @@ define <2 x i64> @vqsubQs64(<2 x i64>* 
%A, <2 x i64>* %B) nounwind { ;CHECK: vqsub.s64 %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } @@ -113,7 +113,7 @@ define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vqsub.u8 %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -122,7 +122,7 @@ define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vqsub.u16 %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -131,7 +131,7 @@ define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vqsub.u32 %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -140,26 +140,26 @@ define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: vqsub.u64 %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> 
@llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.usub.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> 
@llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.usub.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone |

