diff options
| author | Hao Liu <Hao.Liu@arm.com> | 2014-05-08 07:38:13 +0000 | 
|---|---|---|
| committer | Hao Liu <Hao.Liu@arm.com> | 2014-05-08 07:38:13 +0000 | 
| commit | 1187a3d8db35afec80423dca192c9826a403771e (patch) | |
| tree | 58281bac8ffb6785364ee84cca121d236baf911c | |
| parent | ecfe9d06ebf5474fb40de27ed883d2b4277e38f4 (diff) | |
| download | bcm5719-llvm-1187a3d8db35afec80423dca192c9826a403771e.tar.gz bcm5719-llvm-1187a3d8db35afec80423dca192c9826a403771e.zip  | |
AArch64/ARM64: Port NEON post-increment load/store with 2/3/4 vectors to ARM64 backend.
llvm-svn: 208284
| -rw-r--r-- | llvm/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp | 584 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM64/ARM64ISelLowering.cpp | 190 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM64/ARM64ISelLowering.h | 25 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM64/indexed-vector-ldst.ll | 5077 | 
4 files changed, 5820 insertions, 56 deletions
diff --git a/llvm/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/llvm/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp index d9c945ccd92..f216f79255e 100644 --- a/llvm/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp @@ -150,10 +150,15 @@ public:    SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,                       unsigned SubRegIdx); +  SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, +                         unsigned SubRegIdx);    SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); +  SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);    SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); +  SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);    SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); +  SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);    SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node);    SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node); @@ -952,33 +957,43 @@ SDNode *ARM64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,    SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);    SDValue SuperReg = SDValue(Ld, 0); - -  // MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); -  // MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); -  // cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1); - -  switch (NumVecs) { -  case 4: -    ReplaceUses(SDValue(N, 3), CurDAG->getTargetExtractSubreg(SubRegIdx + 3, dl, -                                                              VT, SuperReg)); -  // FALLTHROUGH -  case 3: -    ReplaceUses(SDValue(N, 2), CurDAG->getTargetExtractSubreg(SubRegIdx + 2, dl, -                                                              VT, SuperReg)); -  // FALLTHROUGH -  case 2: -    ReplaceUses(SDValue(N, 1), CurDAG->getTargetExtractSubreg(SubRegIdx + 1, dl, -                                                              VT, SuperReg)); -    ReplaceUses(SDValue(N, 0), -                CurDAG->getTargetExtractSubreg(SubRegIdx, dl, VT, SuperReg)); -    break; -  case 1: -    ReplaceUses(SDValue(N, 0), SuperReg); -    break; -  } +  for (unsigned i = 0; i < NumVecs; ++i) +    ReplaceUses(SDValue(N, i), +        CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));    ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); +  return nullptr; +} + +SDNode *ARM64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, +                                          unsigned Opc, unsigned SubRegIdx) { +  SDLoc dl(N); +  EVT VT = N->getValueType(0); +  SDValue Chain = N->getOperand(0); + +  SmallVector<SDValue, 6> Ops; +  Ops.push_back(N->getOperand(1)); // Mem operand +  Ops.push_back(N->getOperand(2)); // Incremental +  Ops.push_back(Chain); + +  std::vector<EVT> ResTys; +  ResTys.push_back(MVT::i64); // Type of the write back register +  ResTys.push_back(MVT::Untyped); +  ResTys.push_back(MVT::Other); + +  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + +  // Update uses of write back register +  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + +  // Update uses of vector list +  SDValue SuperReg = SDValue(Ld, 1); +  for (unsigned i = 0; i < NumVecs; ++i) +    ReplaceUses(SDValue(N, i), +        CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); +  // Update the chain +  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));    return nullptr;  } @@ -1001,6 +1016,29 @@ SDNode *ARM64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,    return St;  } +SDNode *ARM64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, +                                               unsigned Opc) { +  SDLoc dl(N); +  EVT VT = N->getOperand(2)->getValueType(0); +  SmallVector<EVT, 2> ResTys; +  ResTys.push_back(MVT::i64);   // Type of the write back register +  ResTys.push_back(MVT::Other); // Type for the Chain + +  // Form a REG_SEQUENCE to force register allocation. +  bool Is128Bit = VT.getSizeInBits() == 128; +  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); +  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + +  SmallVector<SDValue, 6> Ops; +  Ops.push_back(RegSeq); +  Ops.push_back(N->getOperand(NumVecs + 1)); // base register +  Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental +  Ops.push_back(N->getOperand(0)); // Chain +  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + +  return St; +} +  /// WidenVector - Given a value in the V64 register class, produce the  /// equivalent value in the V128 register class.  class WidenVector { @@ -1065,42 +1103,68 @@ SDNode *ARM64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,    SDValue SuperReg = SDValue(Ld, 0);    EVT WideVT = RegSeq.getOperand(1)->getValueType(0); -  switch (NumVecs) { -  case 4: { -    SDValue NV3 = -        CurDAG->getTargetExtractSubreg(ARM64::qsub3, dl, WideVT, SuperReg); +  static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2, +                              ARM64::qsub3 }; +  for (unsigned i = 0; i < NumVecs; ++i) { +    SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);      if (Narrow) -      ReplaceUses(SDValue(N, 3), NarrowVector(NV3, *CurDAG)); -    else -      ReplaceUses(SDValue(N, 3), NV3); +      NV = NarrowVector(NV, *CurDAG); +    ReplaceUses(SDValue(N, i), NV);    } -  // FALLTHROUGH -  case 3: { -    SDValue NV2 = -        CurDAG->getTargetExtractSubreg(ARM64::qsub2, dl, WideVT, SuperReg); + +  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + +  return Ld; +} + +SDNode *ARM64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, +                                              unsigned Opc) { +  SDLoc dl(N); +  EVT VT = N->getValueType(0); +  bool Narrow = VT.getSizeInBits() == 64; + +  // Form a REG_SEQUENCE to force register allocation. +  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + +  if (Narrow) +    std::transform(Regs.begin(), Regs.end(), Regs.begin(), +                   WidenVector(*CurDAG)); + +  SDValue RegSeq = createQTuple(Regs); + +  std::vector<EVT> ResTys; +  ResTys.push_back(MVT::i64); // Type of the write back register +  ResTys.push_back(MVT::Untyped); +  ResTys.push_back(MVT::Other); + +  unsigned LaneNo = +      cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue(); + +  SmallVector<SDValue, 6> Ops; +  Ops.push_back(RegSeq); +  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number +  Ops.push_back(N->getOperand(NumVecs + 2)); // Base register +  Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental +  Ops.push_back(N->getOperand(0)); +  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + +  // Update uses of the write back register +  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + +  // Update uses of the vector list +  SDValue SuperReg = SDValue(Ld, 1); +  EVT WideVT = RegSeq.getOperand(1)->getValueType(0); +  static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2, +                              ARM64::qsub3 }; +  for (unsigned i = 0; i < NumVecs; ++i) { +    SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);      if (Narrow) -      ReplaceUses(SDValue(N, 2), NarrowVector(NV2, *CurDAG)); -    else -      ReplaceUses(SDValue(N, 2), NV2); -  } -  // FALLTHROUGH -  case 2: { -    SDValue NV1 = -        CurDAG->getTargetExtractSubreg(ARM64::qsub1, dl, WideVT, SuperReg); -    SDValue NV0 = -        CurDAG->getTargetExtractSubreg(ARM64::qsub0, dl, WideVT, SuperReg); -    if (Narrow) { -      ReplaceUses(SDValue(N, 1), NarrowVector(NV1, *CurDAG)); -      ReplaceUses(SDValue(N, 0), NarrowVector(NV0, *CurDAG)); -    } else { -      ReplaceUses(SDValue(N, 1), NV1); -      ReplaceUses(SDValue(N, 0), NV0); -    } -    break; -  } +      NV = NarrowVector(NV, *CurDAG); +    ReplaceUses(SDValue(N, i), NV);    } -  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); +  // Update the Chain +  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));    return Ld;  } @@ -1138,6 +1202,44 @@ SDNode *ARM64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,    return St;  } +SDNode *ARM64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, +                                               unsigned Opc) { +  SDLoc dl(N); +  EVT VT = N->getOperand(2)->getValueType(0); +  bool Narrow = VT.getSizeInBits() == 64; + +  // Form a REG_SEQUENCE to force register allocation. +  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + +  if (Narrow) +    std::transform(Regs.begin(), Regs.end(), Regs.begin(), +                   WidenVector(*CurDAG)); + +  SDValue RegSeq = createQTuple(Regs); + +  SmallVector<EVT, 2> ResTys; +  ResTys.push_back(MVT::i64);   // Type of the write back register +  ResTys.push_back(MVT::Other); + +  unsigned LaneNo = +      cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue(); + +  SmallVector<SDValue, 6> Ops; +  Ops.push_back(RegSeq); +  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); +  Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register +  Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental +  Ops.push_back(N->getOperand(0)); +  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + +  // Transfer memoperands. +  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); +  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); +  cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); + +  return St; +} +  static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,                                         unsigned &Opc, SDValue &Opd0,                                         unsigned &LSB, unsigned &MSB, @@ -2441,6 +2543,378 @@ SDNode *ARM64DAGToDAGISel::Select(SDNode *Node) {      }      }    } +  case ARM64ISD::LD2post: { +    if (VT == MVT::v8i8) +      return SelectPostLoad(Node, 2, ARM64::LD2Twov8b_POST, ARM64::dsub0); +    else if (VT == MVT::v16i8) +      return SelectPostLoad(Node, 2, ARM64::LD2Twov16b_POST, ARM64::qsub0); +    else if (VT == MVT::v4i16) +      return SelectPostLoad(Node, 2, ARM64::LD2Twov4h_POST, ARM64::dsub0); +    else if (VT == MVT::v8i16) +      return SelectPostLoad(Node, 2, ARM64::LD2Twov8h_POST, ARM64::qsub0); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostLoad(Node, 2, ARM64::LD2Twov2s_POST, ARM64::dsub0); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostLoad(Node, 2, ARM64::LD2Twov4s_POST, ARM64::qsub0); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostLoad(Node, 2, ARM64::LD1Twov1d_POST, ARM64::dsub0); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostLoad(Node, 2, ARM64::LD2Twov2d_POST, ARM64::qsub0); +    break; +  } +  case ARM64ISD::LD3post: { +    if (VT == MVT::v8i8) +      return SelectPostLoad(Node, 3, ARM64::LD3Threev8b_POST, ARM64::dsub0); +    else if (VT == MVT::v16i8) +      return SelectPostLoad(Node, 3, ARM64::LD3Threev16b_POST, ARM64::qsub0); +    else if (VT == MVT::v4i16) +      return SelectPostLoad(Node, 3, ARM64::LD3Threev4h_POST, ARM64::dsub0); +    else if (VT == MVT::v8i16) +      return SelectPostLoad(Node, 3, ARM64::LD3Threev8h_POST, ARM64::qsub0); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostLoad(Node, 3, ARM64::LD3Threev2s_POST, ARM64::dsub0); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostLoad(Node, 3, ARM64::LD3Threev4s_POST, ARM64::qsub0); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostLoad(Node, 3, ARM64::LD1Threev1d_POST, ARM64::dsub0); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostLoad(Node, 3, ARM64::LD3Threev2d_POST, ARM64::qsub0); +    break; +  } +  case ARM64ISD::LD4post: { +    if (VT == MVT::v8i8) +      return SelectPostLoad(Node, 4, ARM64::LD4Fourv8b_POST, ARM64::dsub0); +    else if (VT == MVT::v16i8) +      return SelectPostLoad(Node, 4, ARM64::LD4Fourv16b_POST, ARM64::qsub0); +    else if (VT == MVT::v4i16) +      return SelectPostLoad(Node, 4, ARM64::LD4Fourv4h_POST, ARM64::dsub0); +    else if (VT == MVT::v8i16) +      return SelectPostLoad(Node, 4, ARM64::LD4Fourv8h_POST, ARM64::qsub0); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostLoad(Node, 4, ARM64::LD4Fourv2s_POST, ARM64::dsub0); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostLoad(Node, 4, ARM64::LD4Fourv4s_POST, ARM64::qsub0); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostLoad(Node, 4, ARM64::LD1Fourv1d_POST, ARM64::dsub0); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostLoad(Node, 4, ARM64::LD4Fourv2d_POST, ARM64::qsub0); +    break; +  } +  case ARM64ISD::LD1x2post: { +    if (VT == MVT::v8i8) +      return SelectPostLoad(Node, 2, ARM64::LD1Twov8b_POST, ARM64::dsub0); +    else if (VT == MVT::v16i8) +      return SelectPostLoad(Node, 2, ARM64::LD1Twov16b_POST, ARM64::qsub0); +    else if (VT == MVT::v4i16) +      return SelectPostLoad(Node, 2, ARM64::LD1Twov4h_POST, ARM64::dsub0); +    else if (VT == MVT::v8i16) +      return SelectPostLoad(Node, 2, ARM64::LD1Twov8h_POST, ARM64::qsub0); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostLoad(Node, 2, ARM64::LD1Twov2s_POST, ARM64::dsub0); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostLoad(Node, 2, ARM64::LD1Twov4s_POST, ARM64::qsub0); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostLoad(Node, 2, ARM64::LD1Twov1d_POST, ARM64::dsub0); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostLoad(Node, 2, ARM64::LD1Twov2d_POST, ARM64::qsub0); +    break; +  } +  case ARM64ISD::LD1x3post: { +    if (VT == MVT::v8i8) +      return SelectPostLoad(Node, 3, ARM64::LD1Threev8b_POST, ARM64::dsub0); +    else if (VT == MVT::v16i8) +      return SelectPostLoad(Node, 3, ARM64::LD1Threev16b_POST, ARM64::qsub0); +    else if (VT == MVT::v4i16) +      return SelectPostLoad(Node, 3, ARM64::LD1Threev4h_POST, ARM64::dsub0); +    else if (VT == MVT::v8i16) +      return SelectPostLoad(Node, 3, ARM64::LD1Threev8h_POST, ARM64::qsub0); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostLoad(Node, 3, ARM64::LD1Threev2s_POST, ARM64::dsub0); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostLoad(Node, 3, ARM64::LD1Threev4s_POST, ARM64::qsub0); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostLoad(Node, 3, ARM64::LD1Threev1d_POST, ARM64::dsub0); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostLoad(Node, 3, ARM64::LD1Threev2d_POST, ARM64::qsub0); +    break; +  } +  case ARM64ISD::LD1x4post: { +    if (VT == MVT::v8i8) +      return SelectPostLoad(Node, 4, ARM64::LD1Fourv8b_POST, ARM64::dsub0); +    else if (VT == MVT::v16i8) +      return SelectPostLoad(Node, 4, ARM64::LD1Fourv16b_POST, ARM64::qsub0); +    else if (VT == MVT::v4i16) +      return SelectPostLoad(Node, 4, ARM64::LD1Fourv4h_POST, ARM64::dsub0); +    else if (VT == MVT::v8i16) +      return SelectPostLoad(Node, 4, ARM64::LD1Fourv8h_POST, ARM64::qsub0); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostLoad(Node, 4, ARM64::LD1Fourv2s_POST, ARM64::dsub0); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostLoad(Node, 4, ARM64::LD1Fourv4s_POST, ARM64::qsub0); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostLoad(Node, 4, ARM64::LD1Fourv1d_POST, ARM64::dsub0); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostLoad(Node, 4, ARM64::LD1Fourv2d_POST, ARM64::qsub0); +    break; +  } +  case ARM64ISD::LD2DUPpost: { +    if (VT == MVT::v8i8) +      return SelectPostLoad(Node, 2, ARM64::LD2Rv8b_POST, ARM64::dsub0); +    else if (VT == MVT::v16i8) +      return SelectPostLoad(Node, 2, ARM64::LD2Rv16b_POST, ARM64::qsub0); +    else if (VT == MVT::v4i16) +      return SelectPostLoad(Node, 2, ARM64::LD2Rv4h_POST, ARM64::dsub0); +    else if (VT == MVT::v8i16) +      return SelectPostLoad(Node, 2, ARM64::LD2Rv8h_POST, ARM64::qsub0); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostLoad(Node, 2, ARM64::LD2Rv2s_POST, ARM64::dsub0); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostLoad(Node, 2, ARM64::LD2Rv4s_POST, ARM64::qsub0); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostLoad(Node, 2, ARM64::LD2Rv1d_POST, ARM64::dsub0); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostLoad(Node, 2, ARM64::LD2Rv2d_POST, ARM64::qsub0); +    break; +  } +  case ARM64ISD::LD3DUPpost: { +    if (VT == MVT::v8i8) +      return SelectPostLoad(Node, 3, ARM64::LD3Rv8b_POST, ARM64::dsub0); +    else if (VT == MVT::v16i8) +      return SelectPostLoad(Node, 3, ARM64::LD3Rv16b_POST, ARM64::qsub0); +    else if (VT == MVT::v4i16) +      return SelectPostLoad(Node, 3, ARM64::LD3Rv4h_POST, ARM64::dsub0); +    else if (VT == MVT::v8i16) +      return SelectPostLoad(Node, 3, ARM64::LD3Rv8h_POST, ARM64::qsub0); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostLoad(Node, 3, ARM64::LD3Rv2s_POST, ARM64::dsub0); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostLoad(Node, 3, ARM64::LD3Rv4s_POST, ARM64::qsub0); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostLoad(Node, 3, ARM64::LD3Rv1d_POST, ARM64::dsub0); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostLoad(Node, 3, ARM64::LD3Rv2d_POST, ARM64::qsub0); +    break; +  } +  case ARM64ISD::LD4DUPpost: { +    if (VT == MVT::v8i8) +      return SelectPostLoad(Node, 4, ARM64::LD4Rv8b_POST, ARM64::dsub0); +    else if (VT == MVT::v16i8) +      return SelectPostLoad(Node, 4, ARM64::LD4Rv16b_POST, ARM64::qsub0); +    else if (VT == MVT::v4i16) +      return SelectPostLoad(Node, 4, ARM64::LD4Rv4h_POST, ARM64::dsub0); +    else if (VT == MVT::v8i16) +      return SelectPostLoad(Node, 4, ARM64::LD4Rv8h_POST, ARM64::qsub0); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostLoad(Node, 4, ARM64::LD4Rv2s_POST, ARM64::dsub0); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostLoad(Node, 4, ARM64::LD4Rv4s_POST, ARM64::qsub0); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostLoad(Node, 4, ARM64::LD4Rv1d_POST, ARM64::dsub0); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostLoad(Node, 4, ARM64::LD4Rv2d_POST, ARM64::qsub0); +    break; +  } +  case ARM64ISD::LD2LANEpost: { +    if (VT == MVT::v16i8 || VT == MVT::v8i8) +      return SelectPostLoadLane(Node, 2, ARM64::LD2i8_POST); +    else if (VT == MVT::v8i16 || VT == MVT::v4i16) +      return SelectPostLoadLane(Node, 2, ARM64::LD2i16_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || +             VT == MVT::v2f32) +      return SelectPostLoadLane(Node, 2, ARM64::LD2i32_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || +             VT == MVT::v1f64) +      return SelectPostLoadLane(Node, 2, ARM64::LD2i64_POST); +    break; +  } +  case ARM64ISD::LD3LANEpost: { +    if (VT == MVT::v16i8 || VT == MVT::v8i8) +      return SelectPostLoadLane(Node, 3, ARM64::LD3i8_POST); +    else if (VT == MVT::v8i16 || VT == MVT::v4i16) +      return SelectPostLoadLane(Node, 3, ARM64::LD3i16_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || +             VT == MVT::v2f32) +      return SelectPostLoadLane(Node, 3, ARM64::LD3i32_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || +             VT == MVT::v1f64) +      return SelectPostLoadLane(Node, 3, ARM64::LD3i64_POST); +    break; +  } +  case ARM64ISD::LD4LANEpost: { +    if (VT == MVT::v16i8 || VT == MVT::v8i8) +      return SelectPostLoadLane(Node, 4, ARM64::LD4i8_POST); +    else if (VT == MVT::v8i16 || VT == MVT::v4i16) +      return SelectPostLoadLane(Node, 4, ARM64::LD4i16_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || +             VT == MVT::v2f32) +      return SelectPostLoadLane(Node, 4, ARM64::LD4i32_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || +             VT == MVT::v1f64) +      return SelectPostLoadLane(Node, 4, ARM64::LD4i64_POST); +    break; +  } +  case ARM64ISD::ST2post: { +    VT = Node->getOperand(1).getValueType(); +    if (VT == MVT::v8i8) +      return SelectPostStore(Node, 2, ARM64::ST2Twov8b_POST); +    else if (VT == MVT::v16i8) +      return SelectPostStore(Node, 2, ARM64::ST2Twov16b_POST); +    else if (VT == MVT::v4i16) +      return SelectPostStore(Node, 2, ARM64::ST2Twov4h_POST); +    else if (VT == MVT::v8i16) +      return SelectPostStore(Node, 2, ARM64::ST2Twov8h_POST); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostStore(Node, 2, ARM64::ST2Twov2s_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostStore(Node, 2, ARM64::ST2Twov4s_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostStore(Node, 2, ARM64::ST2Twov2d_POST); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostStore(Node, 2, ARM64::ST1Twov1d_POST); +    break; +  } +  case ARM64ISD::ST3post: { +    VT = Node->getOperand(1).getValueType(); +    if (VT == MVT::v8i8) +      return SelectPostStore(Node, 3, ARM64::ST3Threev8b_POST); +    else if (VT == MVT::v16i8) +      return SelectPostStore(Node, 3, ARM64::ST3Threev16b_POST); +    else if (VT == MVT::v4i16) +      return SelectPostStore(Node, 3, ARM64::ST3Threev4h_POST); +    else if (VT == MVT::v8i16) +      return SelectPostStore(Node, 3, ARM64::ST3Threev8h_POST); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostStore(Node, 3, ARM64::ST3Threev2s_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostStore(Node, 3, ARM64::ST3Threev4s_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostStore(Node, 3, ARM64::ST3Threev2d_POST); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostStore(Node, 3, ARM64::ST1Threev1d_POST); +    break; +  } +  case ARM64ISD::ST4post: { +    VT = Node->getOperand(1).getValueType(); +    if (VT == MVT::v8i8) +      return SelectPostStore(Node, 4, ARM64::ST4Fourv8b_POST); +    else if (VT == MVT::v16i8) +      return SelectPostStore(Node, 4, ARM64::ST4Fourv16b_POST); +    else if (VT == MVT::v4i16) +      return SelectPostStore(Node, 4, ARM64::ST4Fourv4h_POST); +    else if (VT == MVT::v8i16) +      return SelectPostStore(Node, 4, ARM64::ST4Fourv8h_POST); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostStore(Node, 4, ARM64::ST4Fourv2s_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostStore(Node, 4, ARM64::ST4Fourv4s_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostStore(Node, 4, ARM64::ST4Fourv2d_POST); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostStore(Node, 4, ARM64::ST1Fourv1d_POST); +    break; +  } +  case ARM64ISD::ST1x2post: { +    VT = Node->getOperand(1).getValueType(); +    if (VT == MVT::v8i8) +      return SelectPostStore(Node, 2, ARM64::ST1Twov8b_POST); +    else if (VT == MVT::v16i8) +      return SelectPostStore(Node, 2, ARM64::ST1Twov16b_POST); +    else if (VT == MVT::v4i16) +      return SelectPostStore(Node, 2, ARM64::ST1Twov4h_POST); +    else if (VT == MVT::v8i16) +      return SelectPostStore(Node, 2, ARM64::ST1Twov8h_POST); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostStore(Node, 2, ARM64::ST1Twov2s_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostStore(Node, 2, ARM64::ST1Twov4s_POST); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostStore(Node, 2, ARM64::ST1Twov1d_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostStore(Node, 2, ARM64::ST1Twov2d_POST); +    break; +  } +  case ARM64ISD::ST1x3post: { +    VT = Node->getOperand(1).getValueType(); +    if (VT == MVT::v8i8) +      return SelectPostStore(Node, 3, ARM64::ST1Threev8b_POST); +    else if (VT == MVT::v16i8) +      return SelectPostStore(Node, 3, ARM64::ST1Threev16b_POST); +    else if (VT == MVT::v4i16) +      return SelectPostStore(Node, 3, ARM64::ST1Threev4h_POST); +    else if (VT == MVT::v8i16) +      return SelectPostStore(Node, 3, ARM64::ST1Threev8h_POST); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostStore(Node, 3, ARM64::ST1Threev2s_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostStore(Node, 3, ARM64::ST1Threev4s_POST); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostStore(Node, 3, ARM64::ST1Threev1d_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostStore(Node, 3, ARM64::ST1Threev2d_POST); +    break; +  } +  case ARM64ISD::ST1x4post: { +    VT = Node->getOperand(1).getValueType(); +    if (VT == MVT::v8i8) +      return SelectPostStore(Node, 4, ARM64::ST1Fourv8b_POST); +    else if (VT == MVT::v16i8) +      return SelectPostStore(Node, 4, ARM64::ST1Fourv16b_POST); +    else if (VT == MVT::v4i16) +      return SelectPostStore(Node, 4, ARM64::ST1Fourv4h_POST); +    else if (VT == MVT::v8i16) +      return SelectPostStore(Node, 4, ARM64::ST1Fourv8h_POST); +    else if (VT == MVT::v2i32 || VT == MVT::v2f32) +      return SelectPostStore(Node, 4, ARM64::ST1Fourv2s_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v4f32) +      return SelectPostStore(Node, 4, ARM64::ST1Fourv4s_POST); +    else if (VT == MVT::v1i64 || VT == MVT::v1f64) +      return SelectPostStore(Node, 4, ARM64::ST1Fourv1d_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v2f64) +      return SelectPostStore(Node, 4, ARM64::ST1Fourv2d_POST); +    break; +  } +  case ARM64ISD::ST2LANEpost: { +    VT = Node->getOperand(1).getValueType(); +    if (VT == MVT::v16i8 || VT == MVT::v8i8) +      return SelectPostStoreLane(Node, 2, ARM64::ST2i8_POST); +    else if (VT == MVT::v8i16 || VT == MVT::v4i16) +      return SelectPostStoreLane(Node, 2, ARM64::ST2i16_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || +             VT == MVT::v2f32) +      return SelectPostStoreLane(Node, 2, ARM64::ST2i32_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || +             VT == MVT::v1f64) +      return SelectPostStoreLane(Node, 2, ARM64::ST2i64_POST); +    break; +  } +  case ARM64ISD::ST3LANEpost: { +    VT = Node->getOperand(1).getValueType(); +    if (VT == MVT::v16i8 || VT == MVT::v8i8) +      return SelectPostStoreLane(Node, 3, ARM64::ST3i8_POST); +    else if (VT == MVT::v8i16 || VT == MVT::v4i16) +      return SelectPostStoreLane(Node, 3, ARM64::ST3i16_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || +             VT == MVT::v2f32) +      return SelectPostStoreLane(Node, 3, ARM64::ST3i32_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || +             VT == MVT::v1f64) +      return SelectPostStoreLane(Node, 3, ARM64::ST3i64_POST); +    break; +  } +  case ARM64ISD::ST4LANEpost: { +    VT = Node->getOperand(1).getValueType(); +    if (VT == MVT::v16i8 || VT == MVT::v8i8) +      return SelectPostStoreLane(Node, 4, ARM64::ST4i8_POST); +    else if (VT == MVT::v8i16 || VT == MVT::v4i16) +      return SelectPostStoreLane(Node, 4, ARM64::ST4i16_POST); +    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || +             VT == MVT::v2f32) +      return SelectPostStoreLane(Node, 4, ARM64::ST4i32_POST); +    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || +             VT == MVT::v1f64) +      return SelectPostStoreLane(Node, 4, ARM64::ST4i64_POST); +    break; +  }    case ISD::FCEIL:    case ISD::FFLOOR: diff --git a/llvm/lib/Target/ARM64/ARM64ISelLowering.cpp b/llvm/lib/Target/ARM64/ARM64ISelLowering.cpp index 4c6d7648d57..bff6ba060fb 100644 --- a/llvm/lib/Target/ARM64/ARM64ISelLowering.cpp +++ b/llvm/lib/Target/ARM64/ARM64ISelLowering.cpp @@ -369,6 +369,9 @@ ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM)    setTargetDAGCombine(ISD::SELECT);    setTargetDAGCombine(ISD::VSELECT); +  setTargetDAGCombine(ISD::INTRINSIC_VOID); +  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); +    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; @@ -729,6 +732,27 @@ const char *ARM64TargetLowering::getTargetNodeName(unsigned Opcode) const {    case ARM64ISD::URSHR_I:           return "ARM64ISD::URSHR_I";    case ARM64ISD::SQSHLU_I:          return "ARM64ISD::SQSHLU_I";    case ARM64ISD::WrapperLarge:      return "ARM64ISD::WrapperLarge"; +  case ARM64ISD::LD2post:           return "ARM64ISD::LD2post"; +  case ARM64ISD::LD3post:           return "ARM64ISD::LD3post"; +  case ARM64ISD::LD4post:           return "ARM64ISD::LD4post"; +  case ARM64ISD::ST2post:           return "ARM64ISD::ST2post"; +  case ARM64ISD::ST3post:           return "ARM64ISD::ST3post"; +  case ARM64ISD::ST4post:           return "ARM64ISD::ST4post"; +  case ARM64ISD::LD1x2post:         return "ARM64ISD::LD1x2post"; +  case ARM64ISD::LD1x3post:         return "ARM64ISD::LD1x3post"; +  case ARM64ISD::LD1x4post:         return "ARM64ISD::LD1x4post"; +  case ARM64ISD::ST1x2post:         return "ARM64ISD::ST1x2post"; +  case ARM64ISD::ST1x3post:         return "ARM64ISD::ST1x3post"; +  case ARM64ISD::ST1x4post:         return "ARM64ISD::ST1x4post"; +  case ARM64ISD::LD2DUPpost:        return "ARM64ISD::LD2DUPpost"; +  case ARM64ISD::LD3DUPpost:        return "ARM64ISD::LD3DUPpost"; +  case ARM64ISD::LD4DUPpost:        return "ARM64ISD::LD4DUPpost"; +  case ARM64ISD::LD2LANEpost:       return "ARM64ISD::LD2LANEpost"; +  case ARM64ISD::LD3LANEpost:       return "ARM64ISD::LD3LANEpost"; +  case ARM64ISD::LD4LANEpost:       return "ARM64ISD::LD4LANEpost"; +  case ARM64ISD::ST2LANEpost:       return "ARM64ISD::ST2LANEpost"; +  case ARM64ISD::ST3LANEpost:       return "ARM64ISD::ST3LANEpost"; +  case ARM64ISD::ST4LANEpost:       return "ARM64ISD::ST4LANEpost";    }  } @@ -5683,6 +5707,9 @@ bool ARM64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,    case Intrinsic::arm64_neon_ld2:    case Intrinsic::arm64_neon_ld3:    case Intrinsic::arm64_neon_ld4: +  case Intrinsic::arm64_neon_ld1x2: +  case Intrinsic::arm64_neon_ld1x3: +  case Intrinsic::arm64_neon_ld1x4:    case Intrinsic::arm64_neon_ld2lane:    case Intrinsic::arm64_neon_ld3lane:    case Intrinsic::arm64_neon_ld4lane: @@ -5704,6 +5731,9 @@ bool ARM64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,    case Intrinsic::arm64_neon_st2:    case Intrinsic::arm64_neon_st3:    case Intrinsic::arm64_neon_st4: +  case Intrinsic::arm64_neon_st1x2: +  case Intrinsic::arm64_neon_st1x3: +  case Intrinsic::arm64_neon_st1x4:    case Intrinsic::arm64_neon_st2lane:    case Intrinsic::arm64_neon_st3lane:    case Intrinsic::arm64_neon_st4lane: { @@ -7038,6 +7068,138 @@ static SDValue performSTORECombine(SDNode *N,                        S->getAlignment());  } +/// Target-specific DAG combine function for NEON load/store intrinsics +/// to merge base address updates. +static SDValue performNEONPostLDSTCombine(SDNode *N, +                                          TargetLowering::DAGCombinerInfo &DCI, +                                          SelectionDAG &DAG) { +  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) +    return SDValue(); + +  unsigned AddrOpIdx = N->getNumOperands() - 1; +  SDValue Addr = N->getOperand(AddrOpIdx); + +  // Search for a use of the address operand that is an increment. +  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), +       UE = Addr.getNode()->use_end(); UI != UE; ++UI) { +    SDNode *User = *UI; +    if (User->getOpcode() != ISD::ADD || +        UI.getUse().getResNo() != Addr.getResNo()) +      continue; + +    // Check that the add is independent of the load/store.  Otherwise, folding +    // it would create a cycle. +    if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) +      continue; + +    // Find the new opcode for the updating load/store. +    bool IsStore = false; +    bool IsLaneOp = false; +    bool IsDupOp = false; +    unsigned NewOpc = 0; +    unsigned NumVecs = 0; +    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); +    switch (IntNo) { +    default: llvm_unreachable("unexpected intrinsic for Neon base update"); +    case Intrinsic::arm64_neon_ld2:       NewOpc = ARM64ISD::LD2post; +      NumVecs = 2; break; +    case Intrinsic::arm64_neon_ld3:       NewOpc = ARM64ISD::LD3post; +      NumVecs = 3; break; +    case Intrinsic::arm64_neon_ld4:       NewOpc = ARM64ISD::LD4post; +      NumVecs = 4; break; +    case Intrinsic::arm64_neon_st2:       NewOpc = ARM64ISD::ST2post; +      NumVecs = 2; IsStore = true; break; +    case Intrinsic::arm64_neon_st3:       NewOpc = ARM64ISD::ST3post; +      NumVecs = 3; IsStore = true; break; +    case Intrinsic::arm64_neon_st4:       NewOpc = ARM64ISD::ST4post; +      NumVecs = 4; IsStore = true; break; +    case Intrinsic::arm64_neon_ld1x2:     NewOpc = ARM64ISD::LD1x2post; +      NumVecs = 2; break; +    case Intrinsic::arm64_neon_ld1x3:     NewOpc = ARM64ISD::LD1x3post; +      NumVecs = 3; break; +    case Intrinsic::arm64_neon_ld1x4:     NewOpc = ARM64ISD::LD1x4post; +      NumVecs = 4; break; +    case Intrinsic::arm64_neon_st1x2:     NewOpc = ARM64ISD::ST1x2post; +      NumVecs = 2; IsStore = true; break; +    case Intrinsic::arm64_neon_st1x3:     NewOpc = ARM64ISD::ST1x3post; +      NumVecs = 3; IsStore = true; break; +    case Intrinsic::arm64_neon_st1x4:     NewOpc = ARM64ISD::ST1x4post; +      NumVecs = 4; IsStore = true; break; +    case Intrinsic::arm64_neon_ld2r:      NewOpc = ARM64ISD::LD2DUPpost; +      NumVecs = 2; IsDupOp = true; break; +    case Intrinsic::arm64_neon_ld3r:      NewOpc = ARM64ISD::LD3DUPpost; +      NumVecs = 3; IsDupOp = true; break; +    case Intrinsic::arm64_neon_ld4r:      NewOpc = ARM64ISD::LD4DUPpost; +      NumVecs = 4; IsDupOp = true; break; +    case Intrinsic::arm64_neon_ld2lane:   NewOpc = ARM64ISD::LD2LANEpost; +      NumVecs = 2; IsLaneOp = true; break; +    case Intrinsic::arm64_neon_ld3lane:   NewOpc = ARM64ISD::LD3LANEpost; +      NumVecs = 3; IsLaneOp = true; break; +    case Intrinsic::arm64_neon_ld4lane:   NewOpc = ARM64ISD::LD4LANEpost; +      NumVecs = 4; IsLaneOp = true; break; +    case Intrinsic::arm64_neon_st2lane:   NewOpc = ARM64ISD::ST2LANEpost; +      NumVecs = 2; IsStore = true; IsLaneOp = true; break; +    case Intrinsic::arm64_neon_st3lane:   NewOpc = ARM64ISD::ST3LANEpost; +      NumVecs = 3; IsStore = true; IsLaneOp = true; break; +    case Intrinsic::arm64_neon_st4lane:   NewOpc = ARM64ISD::ST4LANEpost; +      NumVecs = 4; IsStore = true; IsLaneOp = true; break; +    } + +    EVT VecTy; +    if (IsStore) +      VecTy = N->getOperand(2).getValueType(); +    else +      VecTy = N->getValueType(0); + +    // If the increment is a constant, it must match the memory ref size. +    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); +    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { +      uint32_t IncVal = CInc->getZExtValue(); +      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; +      if (IsLaneOp || IsDupOp) +        NumBytes /= VecTy.getVectorNumElements(); +      if (IncVal != NumBytes) +        continue; +      Inc = DAG.getRegister(ARM64::XZR, MVT::i64); +    } +    SmallVector<SDValue, 8> Ops; +    Ops.push_back(N->getOperand(0)); // Incoming chain +    // Load lane and store have vector list as input. +    if (IsLaneOp || IsStore) +      for (unsigned i = 2; i < AddrOpIdx; ++i) +        Ops.push_back(N->getOperand(i)); +    Ops.push_back(N->getOperand(AddrOpIdx)); // Base register +    Ops.push_back(Inc); + +    // Return Types. +    EVT Tys[6]; +    unsigned NumResultVecs = (IsStore ? 0 : NumVecs); +    unsigned n; +    for (n = 0; n < NumResultVecs; ++n) +      Tys[n] = VecTy; +    Tys[n++] = MVT::i64;  // Type of write back register +    Tys[n] = MVT::Other;  // Type of the chain +    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2)); + +    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); +    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, +                                           MemInt->getMemoryVT(), +                                           MemInt->getMemOperand()); + +    // Update the uses. +    std::vector<SDValue> NewResults; +    for (unsigned i = 0; i < NumResultVecs; ++i) { +      NewResults.push_back(SDValue(UpdN.getNode(), i)); +    } +    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); +    DCI.CombineTo(N, NewResults); +    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + +    break; +  } +  return SDValue(); +} +  // Optimize compare with zero and branch.  static SDValue performBRCONDCombine(SDNode *N,                                      TargetLowering::DAGCombinerInfo &DCI, @@ -7196,6 +7358,34 @@ SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N,      return performSTORECombine(N, DCI, DAG, Subtarget);    case ARM64ISD::BRCOND:      return performBRCONDCombine(N, DCI, DAG); +  case ISD::INTRINSIC_VOID: +  case ISD::INTRINSIC_W_CHAIN: +    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { +    case Intrinsic::arm64_neon_ld2: +    case Intrinsic::arm64_neon_ld3: +    case Intrinsic::arm64_neon_ld4: +    case Intrinsic::arm64_neon_ld1x2: +    case Intrinsic::arm64_neon_ld1x3: +    case Intrinsic::arm64_neon_ld1x4: +    case Intrinsic::arm64_neon_ld2lane: +    case Intrinsic::arm64_neon_ld3lane: +    case Intrinsic::arm64_neon_ld4lane: +    case Intrinsic::arm64_neon_ld2r: +    case Intrinsic::arm64_neon_ld3r: +    case Intrinsic::arm64_neon_ld4r: +    case Intrinsic::arm64_neon_st2: +    case Intrinsic::arm64_neon_st3: +    case Intrinsic::arm64_neon_st4: +    case Intrinsic::arm64_neon_st1x2: +    case Intrinsic::arm64_neon_st1x3: +    case Intrinsic::arm64_neon_st1x4: +    case Intrinsic::arm64_neon_st2lane: +    case Intrinsic::arm64_neon_st3lane: +    case Intrinsic::arm64_neon_st4lane: +      return performNEONPostLDSTCombine(N, DCI, DAG); +    default: +      break; +    }    }    return SDValue();  } diff --git a/llvm/lib/Target/ARM64/ARM64ISelLowering.h b/llvm/lib/Target/ARM64/ARM64ISelLowering.h index aa27b2d43d2..8b321ee9d03 100644 --- a/llvm/lib/Target/ARM64/ARM64ISelLowering.h +++ b/llvm/lib/Target/ARM64/ARM64ISelLowering.h @@ -160,7 +160,30 @@ enum {    // {s|u}int to FP within a FP register.    SITOF, -  UITOF +  UITOF, + +  // NEON Load/Store with post-increment base updates +  LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, +  LD3post, +  LD4post, +  ST2post, +  ST3post, +  ST4post, +  LD1x2post, +  LD1x3post, +  LD1x4post, +  ST1x2post, +  ST1x3post, +  ST1x4post, +  LD2DUPpost, +  LD3DUPpost, +  LD4DUPpost, +  LD2LANEpost, +  LD3LANEpost, +  LD4LANEpost, +  ST2LANEpost, +  ST3LANEpost, +  ST4LANEpost  };  } // end namespace ARM64ISD diff --git a/llvm/test/CodeGen/ARM64/indexed-vector-ldst.ll b/llvm/test/CodeGen/ARM64/indexed-vector-ldst.ll index c909a447e1e..4e951f9d2f7 100644 --- a/llvm/test/CodeGen/ARM64/indexed-vector-ldst.ll +++ b/llvm/test/CodeGen/ARM64/indexed-vector-ldst.ll @@ -611,3 +611,5080 @@ define float* @test_v2f32_post_reg_st1_lane(<2 x float> %in, float* %addr) {    %newaddr = getelementptr float* %addr, i32 2    ret float* %newaddr  } + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld2: +;CHECK: ld2.16b { v0, v1 }, [x0], #32 +  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 32 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8> } %ld2 +} + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld2: +;CHECK: ld2.16b { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8> } %ld2 +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld2: +;CHECK: ld2.8b { v0, v1 }, [x0], #16 +  %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 16 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8> } %ld2 +} + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld2: +;CHECK: ld2.8b { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8> } %ld2 +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld2: +;CHECK: ld2.8h { v0, v1 }, [x0], #32 +  %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 16 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16> } %ld2 +} + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld2: +;CHECK: ld2.8h { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16> } %ld2 +} + +declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld2: +;CHECK: ld2.4h { v0, v1 }, [x0], #16 +  %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 8 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16> } %ld2 +} + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld2: +;CHECK: ld2.4h { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16> } %ld2 +} + +declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld2: +;CHECK: ld2.4s { v0, v1 }, [x0], #32 +  %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 8 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32> } %ld2 +} + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld2: +;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32> } %ld2 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld2: +;CHECK: ld2.2s { v0, v1 }, [x0], #16 +  %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 4 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32> } %ld2 +} + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld2: +;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32> } %ld2 +} + +declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld2: +;CHECK: ld2.2d { v0, v1 }, [x0], #32 +  %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 4 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64> } %ld2 +} + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld2: +;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64> } %ld2 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld2: +;CHECK: ld1.1d { v0, v1 }, [x0], #16 +  %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 2 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64> } %ld2 +} + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld2: +;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64> } %ld2 +} + +declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld2: +;CHECK: ld2.4s { v0, v1 }, [x0], #32 +  %ld2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 8 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float> } %ld2 +} + +define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld2: +;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float> } %ld2 +} + +declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld2: +;CHECK: ld2.2s { v0, v1 }, [x0], #16 +  %ld2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 4 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float> } %ld2 +} + +define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld2: +;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float> } %ld2 +} + +declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld2: +;CHECK: ld2.2d { v0, v1 }, [x0], #32 +  %ld2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 4 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double> } %ld2 +} + +define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld2: +;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double> } %ld2 +} + +declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld2: +;CHECK: ld1.1d { v0, v1 }, [x0], #16 +  %ld2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 2 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double> } %ld2 +} + +define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld2: +;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double> } %ld2 +} + +declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double*) + + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld3: +;CHECK: ld3.16b { v0, v1, v2 }, [x0], #48 +  %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 48 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld3: +;CHECK: ld3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld3: +;CHECK: ld3.8b { v0, v1, v2 }, [x0], #24 +  %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 24 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 +} + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld3: +;CHECK: ld3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld3: +;CHECK: ld3.8h { v0, v1, v2 }, [x0], #48 +  %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 24 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld3: +;CHECK: ld3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld3: +;CHECK: ld3.4h { v0, v1, v2 }, [x0], #24 +  %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 12 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld3: +;CHECK: ld3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld3: +;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48 +  %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 12 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld3: +;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld3: +;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24 +  %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 6 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld3: +;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld3: +;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48 +  %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 6 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld3: +;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 +  %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 3 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld3: +;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48 +  %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 12 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld3: +;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld3: +;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24 +  %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 6 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld3: +;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld3: +;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48 +  %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 6 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld3: +;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 +  %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 3 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double*) + + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld4: +;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], #64 +  %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 64 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld4: +;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld4: +;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], #32 +  %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 32 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld4: +;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld4: +;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], #64 +  %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 32 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld4: +;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld4: +;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], #32 +  %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 16 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld4: +;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld4: +;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64 +  %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 16 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld4: +;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld4: +;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32 +  %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 8 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld4: +;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld4: +;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64 +  %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 8 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld4: +;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 +  %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 4 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld4: +;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64 +  %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 16 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld4: +;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld4: +;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32 +  %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 8 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld4: +;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld4: +;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64 +  %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 8 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld4: +;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 +  %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 4 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double*) + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x2(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld1x2: +;CHECK: ld1.16b { v0, v1 }, [x0], #32 +  %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 32 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8> } %ld1x2 +} + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld1x2: +;CHECK: ld1.16b { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8> } %ld1x2 +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x2(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld1x2: +;CHECK: ld1.8b { v0, v1 }, [x0], #16 +  %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 16 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8> } %ld1x2 +} + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld1x2: +;CHECK: ld1.8b { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8> } %ld1x2 +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x2(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld1x2: +;CHECK: ld1.8h { v0, v1 }, [x0], #32 +  %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 16 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16> } %ld1x2 +} + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld1x2: +;CHECK: ld1.8h { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16> } %ld1x2 +} + +declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x2(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld1x2: +;CHECK: ld1.4h { v0, v1 }, [x0], #16 +  %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 8 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16> } %ld1x2 +} + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld1x2: +;CHECK: ld1.4h { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16> } %ld1x2 +} + +declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x2(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld1x2: +;CHECK: ld1.4s { v0, v1 }, [x0], #32 +  %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 8 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32> } %ld1x2 +} + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld1x2: +;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32> } %ld1x2 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x2(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld1x2: +;CHECK: ld1.2s { v0, v1 }, [x0], #16 +  %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 4 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32> } %ld1x2 +} + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld1x2: +;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32> } %ld1x2 +} + +declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x2(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld1x2: +;CHECK: ld1.2d { v0, v1 }, [x0], #32 +  %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 4 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64> } %ld1x2 +} + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld1x2: +;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64> } %ld1x2 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x2(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld1x2: +;CHECK: ld1.1d { v0, v1 }, [x0], #16 +  %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 2 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64> } %ld1x2 +} + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld1x2: +;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64> } %ld1x2 +} + +declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x2(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld1x2: +;CHECK: ld1.4s { v0, v1 }, [x0], #32 +  %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 8 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float> } %ld1x2 +} + +define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld1x2: +;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float> } %ld1x2 +} + +declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x2(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld1x2: +;CHECK: ld1.2s { v0, v1 }, [x0], #16 +  %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 4 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float> } %ld1x2 +} + +define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld1x2: +;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float> } %ld1x2 +} + +declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x2(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld1x2: +;CHECK: ld1.2d { v0, v1 }, [x0], #32 +  %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 4 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double> } %ld1x2 +} + +define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld1x2: +;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double> } %ld1x2 +} + +declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x2(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld1x2: +;CHECK: ld1.1d { v0, v1 }, [x0], #16 +  %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 2 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double> } %ld1x2 +} + +define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld1x2: +;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double> } %ld1x2 +} + +declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double*) + + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x3(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld1x3: +;CHECK: ld1.16b { v0, v1, v2 }, [x0], #48 +  %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 48 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3 +} + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld1x3: +;CHECK: ld1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x3(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld1x3: +;CHECK: ld1.8b { v0, v1, v2 }, [x0], #24 +  %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 24 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3 +} + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld1x3: +;CHECK: ld1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x3(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld1x3: +;CHECK: ld1.8h { v0, v1, v2 }, [x0], #48 +  %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 24 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3 +} + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld1x3: +;CHECK: ld1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x3(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld1x3: +;CHECK: ld1.4h { v0, v1, v2 }, [x0], #24 +  %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 12 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3 +} + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld1x3: +;CHECK: ld1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x3(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld1x3: +;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48 +  %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 12 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3 +} + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld1x3: +;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x3(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld1x3: +;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24 +  %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 6 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3 +} + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld1x3: +;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x3(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld1x3: +;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48 +  %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 6 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3 +} + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld1x3: +;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x3(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld1x3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 +  %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 3 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3 +} + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld1x3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x3(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld1x3: +;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48 +  %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 12 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3 +} + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld1x3: +;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3 +} + +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x3(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld1x3: +;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24 +  %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 6 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3 +} + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld1x3: +;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3 +} + +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x3(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld1x3: +;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48 +  %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 6 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3 +} + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld1x3: +;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3 +} + +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x3(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld1x3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 +  %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 3 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3 +} + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld1x3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3 +} + +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double*) + + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x4(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld1x4: +;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], #64 +  %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 64 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4 +} + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld1x4: +;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x4(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld1x4: +;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], #32 +  %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 32 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4 +} + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld1x4: +;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x4(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld1x4: +;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], #64 +  %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 32 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4 +} + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld1x4: +;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x4(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld1x4: +;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], #32 +  %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 16 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4 +} + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld1x4: +;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x4(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld1x4: +;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64 +  %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 16 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4 +} + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld1x4: +;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x4(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld1x4: +;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32 +  %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 8 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4 +} + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld1x4: +;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x4(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld1x4: +;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64 +  %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 8 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4 +} + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld1x4: +;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x4(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld1x4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 +  %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 4 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4 +} + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld1x4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x4(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld1x4: +;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64 +  %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 16 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4 +} + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld1x4: +;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4 +} + +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x4(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld1x4: +;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32 +  %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 8 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4 +} + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld1x4: +;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4 +} + +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x4(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld1x4: +;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64 +  %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 8 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4 +} + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld1x4: +;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4 +} + +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x4(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld1x4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 +  %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 4 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4 +} + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld1x4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4 +} + +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double*) + + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld2r: +;CHECK: ld2r.16b { v0, v1 }, [x0], #2 +  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 2 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8> } %ld2 +} + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld2r: +;CHECK: ld2r.16b { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8> } %ld2 +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld2r: +;CHECK: ld2r.8b { v0, v1 }, [x0], #2 +  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 2 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8> } %ld2 +} + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld2r: +;CHECK: ld2r.8b { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8> } %ld2 +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld2r: +;CHECK: ld2r.8h { v0, v1 }, [x0], #4 +  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 2 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16> } %ld2 +} + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld2r: +;CHECK: ld2r.8h { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16> } %ld2 +} + +declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld2r: +;CHECK: ld2r.4h { v0, v1 }, [x0], #4 +  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 2 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16> } %ld2 +} + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld2r: +;CHECK: ld2r.4h { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16> } %ld2 +} + +declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld2r: +;CHECK: ld2r.4s { v0, v1 }, [x0], #8 +  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 2 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32> } %ld2 +} + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld2r: +;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32> } %ld2 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld2r: +;CHECK: ld2r.2s { v0, v1 }, [x0], #8 +  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 2 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32> } %ld2 +} + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld2r: +;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32> } %ld2 +} + +declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld2r: +;CHECK: ld2r.2d { v0, v1 }, [x0], #16 +  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 2 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64> } %ld2 +} + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld2r: +;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64> } %ld2 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld2r: +;CHECK: ld2r.1d { v0, v1 }, [x0], #16 +  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 2 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64> } %ld2 +} + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld2r: +;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64> } %ld2 +} + +declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly + + +define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld2r: +;CHECK: ld2r.4s { v0, v1 }, [x0], #8 +  %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 2 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float> } %ld2 +} + +define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld2r: +;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float> } %ld2 +} + +declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float*) nounwind readonly + +define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld2r: +;CHECK: ld2r.2s { v0, v1 }, [x0], #8 +  %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 2 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float> } %ld2 +} + +define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld2r: +;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float> } %ld2 +} + +declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float*) nounwind readonly + + +define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld2r: +;CHECK: ld2r.2d { v0, v1 }, [x0], #16 +  %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 2 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double> } %ld2 +} + +define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld2r: +;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double> } %ld2 +} + +declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double*) nounwind readonly + +define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld2r: +;CHECK: ld2r.1d { v0, v1 }, [x0], #16 +  %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 2 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double> } %ld2 +} + +define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld2r: +;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}} +  %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double> } %ld2 +} + +declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double*) nounwind readonly + + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld3r: +;CHECK: ld3r.16b { v0, v1, v2 }, [x0], #3 +  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 3 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld3r: +;CHECK: ld3r.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld3r: +;CHECK: ld3r.8b { v0, v1, v2 }, [x0], #3 +  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 3 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 +} + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld3r: +;CHECK: ld3r.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld3r: +;CHECK: ld3r.8h { v0, v1, v2 }, [x0], #6 +  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 3 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld3r: +;CHECK: ld3r.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld3r: +;CHECK: ld3r.4h { v0, v1, v2 }, [x0], #6 +  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 3 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld3r: +;CHECK: ld3r.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld3r: +;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12 +  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 3 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld3r: +;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld3r: +;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12 +  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 3 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld3r: +;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld3r: +;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24 +  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 3 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld3r: +;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld3r: +;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24 +  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 3 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld3r: +;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly + + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld3r: +;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12 +  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 3 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld3r: +;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float*) nounwind readonly + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld3r: +;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12 +  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 3 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld3r: +;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float*) nounwind readonly + + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld3r: +;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24 +  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 3 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld3r: +;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double*) nounwind readonly + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld3r: +;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24 +  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 3 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld3r: +;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double*) nounwind readonly + + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld4r: +;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], #4 +  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 4 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld4r: +;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld4r: +;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], #4 +  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i32 4 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld4r: +;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld4r: +;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], #8 +  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 4 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld4r: +;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld4r: +;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], #8 +  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i32 4 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld4r: +;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld4r: +;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16 +  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 4 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld4r: +;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld4r: +;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16 +  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i32 4 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld4r: +;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld4r: +;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32 +  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 4 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld4r: +;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld4r: +;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32 +  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i32 4 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld4r: +;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly + + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld4r: +;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16 +  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 4 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld4r: +;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float*) nounwind readonly + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld4r: +;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16 +  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i32 4 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld4r: +;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float*) nounwind readonly + + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld4r: +;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32 +  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 4 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld4r: +;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double*) nounwind readonly + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld4r: +;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32 +  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i32 4 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld4r: +;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double*) nounwind readonly + + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld2lane: +;CHECK: ld2.b { v0, v1 }[0], [x0], #2 +  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 2 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8> } %ld2 +} + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld2lane: +;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8> } %ld2 +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld2lane: +;CHECK: ld2.b { v0, v1 }[0], [x0], #2 +  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 2 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8> } %ld2 +} + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld2lane: +;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8> } %ld2 +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld2lane: +;CHECK: ld2.h { v0, v1 }[0], [x0], #4 +  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 2 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16> } %ld2 +} + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld2lane: +;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16> } %ld2 +} + +declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld2lane: +;CHECK: ld2.h { v0, v1 }[0], [x0], #4 +  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 2 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16> } %ld2 +} + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld2lane: +;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16> } %ld2 +} + +declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], #8 +  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 2 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32> } %ld2 +} + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32> } %ld2 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], #8 +  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 2 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32> } %ld2 +} + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32> } %ld2 +} + +declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], #16 +  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i32 2 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64> } %ld2 +} + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64> } %ld2 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly + + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], #16 +  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i32 2 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64> } %ld2 +} + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64> } %ld2 +} + +declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) nounwind readonly + + +define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], #8 +  %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 2 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float> } %ld2 +} + +define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float> } %ld2 +} + +declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) nounwind readonly + + +define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], #8 +  %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 2 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float> } %ld2 +} + +define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float> } %ld2 +} + +declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) nounwind readonly + + +define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], #16 +  %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) +  %tmp = getelementptr double* %A, i32 2 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double> } %ld2 +} + +define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double> } %ld2 +} + +declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) nounwind readonly + + +define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], #16 +  %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) +  %tmp = getelementptr double* %A, i32 2 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double> } %ld2 +} + +define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} +  %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double> } %ld2 +} + +declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) nounwind readonly + + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld3lane: +;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3 +  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 3 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld3lane: +;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld3lane: +;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3 +  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 3 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 +} + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld3lane: +;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld3lane: +;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6 +  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 3 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld3lane: +;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld3lane: +;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6 +  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 3 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld3lane: +;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 +  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 3 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 +  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 3 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 +  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i32 3 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly + + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 +  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i32 3 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly + + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 +  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 3 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly + + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 +  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 3 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly + + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 +  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) +  %tmp = getelementptr double* %A, i32 3 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly + + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 +  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) +  %tmp = getelementptr double* %A, i32 3 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly + + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld4lane: +;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4 +  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 4 +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld4lane: +;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld4lane: +;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4 +  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 4 +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld4lane: +;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  store i8* %tmp, i8** %ptr +  ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld4lane: +;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8 +  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 4 +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld4lane: +;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld4lane: +;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8 +  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 4 +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld4lane: +;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  store i16* %tmp, i16** %ptr +  ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 +  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 4 +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 +  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 4 +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  store i32* %tmp, i32** %ptr +  ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 +  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i32 4 +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly + + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 +  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i32 4 +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  store i64* %tmp, i64** %ptr +  ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly + + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 +  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 4 +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly + + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 +  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 4 +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  store float* %tmp, float** %ptr +  ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly + + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 +  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) +  %tmp = getelementptr double* %A, i32 4 +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly + + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 +  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) +  %tmp = getelementptr double* %A, i32 4 +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  store double* %tmp, double** %ptr +  ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly + + +define i8* @test_v16i8_post_imm_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st2: +;CHECK: st2.16b { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) +  %tmp = getelementptr i8* %A, i32 32 +  ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st2: +;CHECK: st2.16b { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st2: +;CHECK: st2.8b { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) +  %tmp = getelementptr i8* %A, i32 16 +  ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st2: +;CHECK: st2.8b { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) + + +define i16* @test_v8i16_post_imm_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st2: +;CHECK: st2.8h { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) +  %tmp = getelementptr i16* %A, i32 16 +  ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st2: +;CHECK: st2.8h { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st2: +;CHECK: st2.4h { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) +  %tmp = getelementptr i16* %A, i32 8 +  ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st2: +;CHECK: st2.4h { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) + + +define i32* @test_v4i32_post_imm_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st2: +;CHECK: st2.4s { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) +  %tmp = getelementptr i32* %A, i32 8 +  ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st2: +;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) + + +define i32* @test_v2i32_post_imm_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st2: +;CHECK: st2.2s { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) +  %tmp = getelementptr i32* %A, i32 4 +  ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st2: +;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st2: +;CHECK: st2.2d { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) +  %tmp = getelementptr i64* %A, i64 4 +  ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st2: +;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) + + +define i64* @test_v1i64_post_imm_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st2: +;CHECK: st1.1d { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) +  %tmp = getelementptr i64* %A, i64 2 +  ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st2: +;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) + + +define float* @test_v4f32_post_imm_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st2: +;CHECK: st2.4s { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) +  %tmp = getelementptr float* %A, i32 8 +  ret float* %tmp +} + +define float* @test_v4f32_post_reg_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st2: +;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st2: +;CHECK: st2.2s { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) +  %tmp = getelementptr float* %A, i32 4 +  ret float* %tmp +} + +define float* @test_v2f32_post_reg_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st2: +;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st2: +;CHECK: st2.2d { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) +  %tmp = getelementptr double* %A, i64 4 +  ret double* %tmp +} + +define double* @test_v2f64_post_reg_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st2: +;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double>, <2 x double>, double*) + + +define double* @test_v1f64_post_imm_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st2: +;CHECK: st1.1d { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) +  %tmp = getelementptr double* %A, i64 2 +  ret double* %tmp +} + +define double* @test_v1f64_post_reg_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st2: +;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st3: +;CHECK: st3.16b { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) +  %tmp = getelementptr i8* %A, i32 48 +  ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st3: +;CHECK: st3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st3: +;CHECK: st3.8b { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) +  %tmp = getelementptr i8* %A, i32 24 +  ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st3: +;CHECK: st3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) + + +define i16* @test_v8i16_post_imm_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st3: +;CHECK: st3.8h { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) +  %tmp = getelementptr i16* %A, i32 24 +  ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st3: +;CHECK: st3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st3: +;CHECK: st3.4h { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) +  %tmp = getelementptr i16* %A, i32 12 +  ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st3: +;CHECK: st3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) + + +define i32* @test_v4i32_post_imm_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st3: +;CHECK: st3.4s { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) +  %tmp = getelementptr i32* %A, i32 12 +  ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st3: +;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) + + +define i32* @test_v2i32_post_imm_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st3: +;CHECK: st3.2s { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) +  %tmp = getelementptr i32* %A, i32 6 +  ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st3: +;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st3: +;CHECK: st3.2d { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) +  %tmp = getelementptr i64* %A, i64 6 +  ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st3: +;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) + + +define i64* @test_v1i64_post_imm_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) +  %tmp = getelementptr i64* %A, i64 3 +  ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) + + +define float* @test_v4f32_post_imm_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st3: +;CHECK: st3.4s { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) +  %tmp = getelementptr float* %A, i32 12 +  ret float* %tmp +} + +define float* @test_v4f32_post_reg_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st3: +;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st3: +;CHECK: st3.2s { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) +  %tmp = getelementptr float* %A, i32 6 +  ret float* %tmp +} + +define float* @test_v2f32_post_reg_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st3: +;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st3: +;CHECK: st3.2d { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) +  %tmp = getelementptr double* %A, i64 6 +  ret double* %tmp +} + +define double* @test_v2f64_post_reg_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st3: +;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) + + +define double* @test_v1f64_post_imm_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) +  %tmp = getelementptr double* %A, i64 3 +  ret double* %tmp +} + +define double* @test_v1f64_post_reg_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st4: +;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) +  %tmp = getelementptr i8* %A, i32 64 +  ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st4: +;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st4: +;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) +  %tmp = getelementptr i8* %A, i32 32 +  ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st4: +;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) + + +define i16* @test_v8i16_post_imm_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st4: +;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) +  %tmp = getelementptr i16* %A, i32 32 +  ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st4: +;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st4: +;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) +  %tmp = getelementptr i16* %A, i32 16 +  ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st4: +;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>,<4 x i16>,  i16*) + + +define i32* @test_v4i32_post_imm_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st4: +;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) +  %tmp = getelementptr i32* %A, i32 16 +  ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st4: +;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>,<4 x i32>,  i32*) + + +define i32* @test_v2i32_post_imm_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st4: +;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) +  %tmp = getelementptr i32* %A, i32 8 +  ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st4: +;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st4: +;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) +  %tmp = getelementptr i64* %A, i64 8 +  ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st4: +;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>,<2 x i64>,  i64*) + + +define i64* @test_v1i64_post_imm_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) +  %tmp = getelementptr i64* %A, i64 4 +  ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>,<1 x i64>,  i64*) + + +define float* @test_v4f32_post_imm_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st4: +;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) +  %tmp = getelementptr float* %A, i32 16 +  ret float* %tmp +} + +define float* @test_v4f32_post_reg_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st4: +;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st4: +;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) +  %tmp = getelementptr float* %A, i32 8 +  ret float* %tmp +} + +define float* @test_v2f32_post_reg_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st4: +;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st4: +;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) +  %tmp = getelementptr double* %A, i64 8 +  ret double* %tmp +} + +define double* @test_v2f64_post_reg_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st4: +;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>,<2 x double>,  double*) + + +define double* @test_v1f64_post_imm_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) +  %tmp = getelementptr double* %A, i64 4 +  ret double* %tmp +} + +define double* @test_v1f64_post_reg_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st1x2: +;CHECK: st1.16b { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) +  %tmp = getelementptr i8* %A, i32 32 +  ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st1x2: +;CHECK: st1.16b { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st1x2: +;CHECK: st1.8b { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) +  %tmp = getelementptr i8* %A, i32 16 +  ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st1x2: +;CHECK: st1.8b { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) + + +define i16* @test_v8i16_post_imm_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st1x2: +;CHECK: st1.8h { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) +  %tmp = getelementptr i16* %A, i32 16 +  ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st1x2: +;CHECK: st1.8h { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st1x2: +;CHECK: st1.4h { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) +  %tmp = getelementptr i16* %A, i32 8 +  ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st1x2: +;CHECK: st1.4h { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) + + +define i32* @test_v4i32_post_imm_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st1x2: +;CHECK: st1.4s { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) +  %tmp = getelementptr i32* %A, i32 8 +  ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st1x2: +;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) + + +define i32* @test_v2i32_post_imm_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st1x2: +;CHECK: st1.2s { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) +  %tmp = getelementptr i32* %A, i32 4 +  ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st1x2: +;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st1x2: +;CHECK: st1.2d { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) +  %tmp = getelementptr i64* %A, i64 4 +  ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st1x2: +;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) + + +define i64* @test_v1i64_post_imm_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st1x2: +;CHECK: st1.1d { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) +  %tmp = getelementptr i64* %A, i64 2 +  ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st1x2: +;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) + + +define float* @test_v4f32_post_imm_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st1x2: +;CHECK: st1.4s { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) +  %tmp = getelementptr float* %A, i32 8 +  ret float* %tmp +} + +define float* @test_v4f32_post_reg_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st1x2: +;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st1x2: +;CHECK: st1.2s { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) +  %tmp = getelementptr float* %A, i32 4 +  ret float* %tmp +} + +define float* @test_v2f32_post_reg_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st1x2: +;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st1x2: +;CHECK: st1.2d { v0, v1 }, [x0], #32 +  call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) +  %tmp = getelementptr double* %A, i64 4 +  ret double* %tmp +} + +define double* @test_v2f64_post_reg_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st1x2: +;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) + + +define double* @test_v1f64_post_imm_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st1x2: +;CHECK: st1.1d { v0, v1 }, [x0], #16 +  call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) +  %tmp = getelementptr double* %A, i64 2 +  ret double* %tmp +} + +define double* @test_v1f64_post_reg_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st1x2: +;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st1x3: +;CHECK: st1.16b { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) +  %tmp = getelementptr i8* %A, i32 48 +  ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st1x3: +;CHECK: st1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st1x3: +;CHECK: st1.8b { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) +  %tmp = getelementptr i8* %A, i32 24 +  ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st1x3: +;CHECK: st1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) + + +define i16* @test_v8i16_post_imm_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st1x3: +;CHECK: st1.8h { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) +  %tmp = getelementptr i16* %A, i32 24 +  ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st1x3: +;CHECK: st1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st1x3: +;CHECK: st1.4h { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) +  %tmp = getelementptr i16* %A, i32 12 +  ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st1x3: +;CHECK: st1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) + + +define i32* @test_v4i32_post_imm_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st1x3: +;CHECK: st1.4s { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) +  %tmp = getelementptr i32* %A, i32 12 +  ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st1x3: +;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) + + +define i32* @test_v2i32_post_imm_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st1x3: +;CHECK: st1.2s { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) +  %tmp = getelementptr i32* %A, i32 6 +  ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st1x3: +;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st1x3: +;CHECK: st1.2d { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) +  %tmp = getelementptr i64* %A, i64 6 +  ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st1x3: +;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) + + +define i64* @test_v1i64_post_imm_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st1x3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) +  %tmp = getelementptr i64* %A, i64 3 +  ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st1x3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) + + +define float* @test_v4f32_post_imm_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st1x3: +;CHECK: st1.4s { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) +  %tmp = getelementptr float* %A, i32 12 +  ret float* %tmp +} + +define float* @test_v4f32_post_reg_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st1x3: +;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st1x3: +;CHECK: st1.2s { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) +  %tmp = getelementptr float* %A, i32 6 +  ret float* %tmp +} + +define float* @test_v2f32_post_reg_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st1x3: +;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st1x3: +;CHECK: st1.2d { v0, v1, v2 }, [x0], #48 +  call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) +  %tmp = getelementptr double* %A, i64 6 +  ret double* %tmp +} + +define double* @test_v2f64_post_reg_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st1x3: +;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) + + +define double* @test_v1f64_post_imm_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st1x3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 +  call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) +  %tmp = getelementptr double* %A, i64 3 +  ret double* %tmp +} + +define double* @test_v1f64_post_reg_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st1x3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st1x4: +;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) +  %tmp = getelementptr i8* %A, i32 64 +  ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st1x4: +;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st1x4: +;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) +  %tmp = getelementptr i8* %A, i32 32 +  ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st1x4: +;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) + + +define i16* @test_v8i16_post_imm_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st1x4: +;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) +  %tmp = getelementptr i16* %A, i32 32 +  ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st1x4: +;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st1x4: +;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) +  %tmp = getelementptr i16* %A, i32 16 +  ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st1x4: +;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>,<4 x i16>,  i16*) + + +define i32* @test_v4i32_post_imm_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st1x4: +;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) +  %tmp = getelementptr i32* %A, i32 16 +  ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st1x4: +;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>,<4 x i32>,  i32*) + + +define i32* @test_v2i32_post_imm_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st1x4: +;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) +  %tmp = getelementptr i32* %A, i32 8 +  ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st1x4: +;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st1x4: +;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) +  %tmp = getelementptr i64* %A, i64 8 +  ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st1x4: +;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>,<2 x i64>,  i64*) + + +define i64* @test_v1i64_post_imm_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st1x4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) +  %tmp = getelementptr i64* %A, i64 4 +  ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st1x4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>,<1 x i64>,  i64*) + + +define float* @test_v4f32_post_imm_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st1x4: +;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) +  %tmp = getelementptr float* %A, i32 16 +  ret float* %tmp +} + +define float* @test_v4f32_post_reg_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st1x4: +;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st1x4: +;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) +  %tmp = getelementptr float* %A, i32 8 +  ret float* %tmp +} + +define float* @test_v2f32_post_reg_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st1x4: +;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st1x4: +;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64 +  call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) +  %tmp = getelementptr double* %A, i64 8 +  ret double* %tmp +} + +define double* @test_v2f64_post_reg_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st1x4: +;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>,<2 x double>,  double*) + + +define double* @test_v1f64_post_imm_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st1x4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 +  call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) +  %tmp = getelementptr double* %A, i64 4 +  ret double* %tmp +} + +define double* @test_v1f64_post_reg_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st1x4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) { +  call void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A) +  %tmp = getelementptr i8* %A, i32 2 +  ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) { +  call void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i64, i8*) nounwind readnone + + +define i8* @test_v16i8_post_imm_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st2lane: +;CHECK: st2.b { v0, v1 }[0], [x0], #2 +  call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 2 +  ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st2lane: +;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) + + +define i8* @test_v8i8_post_imm_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st2lane: +;CHECK: st2.b { v0, v1 }[0], [x0], #2 +  call void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 2 +  ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st2lane: +;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) + + +define i16* @test_v8i16_post_imm_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st2lane: +;CHECK: st2.h { v0, v1 }[0], [x0], #4 +  call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 2 +  ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st2lane: +;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) + + +define i16* @test_v4i16_post_imm_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st2lane: +;CHECK: st2.h { v0, v1 }[0], [x0], #4 +  call void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 2 +  ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st2lane: +;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) + + +define i32* @test_v4i32_post_imm_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], #8 +  call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 2 +  ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) + + +define i32* @test_v2i32_post_imm_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], #8 +  call void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 2 +  ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) + + +define i64* @test_v2i64_post_imm_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], #16 +  call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 2 +  ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) + + +define i64* @test_v1i64_post_imm_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], #16 +  call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 2 +  ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) + + +define float* @test_v4f32_post_imm_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], #8 +  call void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 2 +  ret float* %tmp +} + +define float* @test_v4f32_post_reg_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) + + +define float* @test_v2f32_post_imm_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], #8 +  call void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 2 +  ret float* %tmp +} + +define float* @test_v2f32_post_reg_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) + + +define double* @test_v2f64_post_imm_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], #16 +  call void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 2 +  ret double* %tmp +} + +define double* @test_v2f64_post_reg_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) + + +define double* @test_v1f64_post_imm_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], #16 +  call void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 2 +  ret double* %tmp +} + +define double* @test_v1f64_post_reg_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) + + +define i8* @test_v16i8_post_imm_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st3lane: +;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3 +  call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 3 +  ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st3lane: +;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) + + +define i8* @test_v8i8_post_imm_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st3lane: +;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3 +  call void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 3 +  ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st3lane: +;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) + + +define i16* @test_v8i16_post_imm_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st3lane: +;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6 +  call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 3 +  ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st3lane: +;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) + + +define i16* @test_v4i16_post_imm_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st3lane: +;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6 +  call void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 3 +  ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st3lane: +;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) + + +define i32* @test_v4i32_post_imm_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 +  call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 3 +  ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) + + +define i32* @test_v2i32_post_imm_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 +  call void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 3 +  ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) + + +define i64* @test_v2i64_post_imm_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 +  call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 3 +  ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) + + +define i64* @test_v1i64_post_imm_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 +  call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 3 +  ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) + + +define float* @test_v4f32_post_imm_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 +  call void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 3 +  ret float* %tmp +} + +define float* @test_v4f32_post_reg_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) + + +define float* @test_v2f32_post_imm_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 +  call void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 3 +  ret float* %tmp +} + +define float* @test_v2f32_post_reg_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) + + +define double* @test_v2f64_post_imm_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 +  call void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 3 +  ret double* %tmp +} + +define double* @test_v2f64_post_reg_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) + + +define double* @test_v1f64_post_imm_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 +  call void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 3 +  ret double* %tmp +} + +define double* @test_v1f64_post_reg_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) + + +define i8* @test_v16i8_post_imm_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st4lane: +;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4 +  call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 4 +  ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st4lane: +;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) + + +define i8* @test_v8i8_post_imm_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st4lane: +;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4 +  call void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i32 4 +  ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st4lane: +;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) +  %tmp = getelementptr i8* %A, i64 %inc +  ret i8* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) + + +define i16* @test_v8i16_post_imm_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st4lane: +;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8 +  call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 4 +  ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st4lane: +;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) + + +define i16* @test_v4i16_post_imm_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st4lane: +;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8 +  call void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i32 4 +  ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st4lane: +;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) +  %tmp = getelementptr i16* %A, i64 %inc +  ret i16* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) + + +define i32* @test_v4i32_post_imm_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16 +  call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 4 +  ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) + + +define i32* @test_v2i32_post_imm_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16 +  call void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i32 4 +  ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) +  %tmp = getelementptr i32* %A, i64 %inc +  ret i32* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) + + +define i64* @test_v2i64_post_imm_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32 +  call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 4 +  ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) + + +define i64* @test_v1i64_post_imm_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32 +  call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 4 +  ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) +  %tmp = getelementptr i64* %A, i64 %inc +  ret i64* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) + + +define float* @test_v4f32_post_imm_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16 +  call void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 4 +  ret float* %tmp +} + +define float* @test_v4f32_post_reg_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*) + + +define float* @test_v2f32_post_imm_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16 +  call void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) +  %tmp = getelementptr float* %A, i32 4 +  ret float* %tmp +} + +define float* @test_v2f32_post_reg_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) +  %tmp = getelementptr float* %A, i64 %inc +  ret float* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) + + +define double* @test_v2f64_post_imm_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32 +  call void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 4 +  ret double* %tmp +} + +define double* @test_v2f64_post_reg_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*) + + +define double* @test_v1f64_post_imm_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32 +  call void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 4 +  ret double* %tmp +} + +define double* @test_v1f64_post_reg_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} +  call void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) +  %tmp = getelementptr double* %A, i64 %inc +  ret double* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*)
\ No newline at end of file  | 

