diff options
| -rw-r--r-- | llvm/include/llvm/IR/IntrinsicsARM.td | 14 |
| -rw-r--r-- | llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 76 |
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 49 |
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 24 |
| -rw-r--r-- | llvm/lib/Target/ARM/ARMInstrNEON.td | 24 |
| -rw-r--r-- | llvm/test/CodeGen/ARM/arm-vst1.ll | 363 |
6 files changed, 542 insertions, 8 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index 29fdb6ab07f..3c7e1de8597 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -671,6 +671,20 @@ def int_arm_neon_vst4 : Intrinsic<[],                                     LLVMMatchType<1>, llvm_i32_ty],                                    [IntrArgMemOnly]>; +def int_arm_neon_vst1x2 : Intrinsic<[], +                                    [llvm_anyptr_ty, llvm_anyvector_ty, +                                     LLVMMatchType<1>], +                                    [IntrArgMemOnly, NoCapture<0>]>; +def int_arm_neon_vst1x3 : Intrinsic<[], +                                    [llvm_anyptr_ty, llvm_anyvector_ty, +                                     LLVMMatchType<1>, LLVMMatchType<1>], +                                    [IntrArgMemOnly, NoCapture<0>]>; +def int_arm_neon_vst1x4 : Intrinsic<[], +                                    [llvm_anyptr_ty, llvm_anyvector_ty, +                                     LLVMMatchType<1>, LLVMMatchType<1>, +                                     LLVMMatchType<1>], +                                    [IntrArgMemOnly, NoCapture<0>]>; +  // Vector store N-element structure from one lane.  // Source operands are: the address, the N vectors, the lane number, and  // the alignment. 
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index e3d1b1d6968..d82bef5b759 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -284,12 +284,34 @@ static const NEONLdStTableEntry NEONLdStTable[] = {  { ARM::VST1LNq8Pseudo,      ARM::VST1LNd8,     false, false, false, EvenDblSpc, 1, 8 ,true},  { ARM::VST1LNq8Pseudo_UPD,  ARM::VST1LNd8_UPD, false, true, true,  EvenDblSpc, 1, 8 ,true}, +{ ARM::VST1d16QPseudo,      ARM::VST1d16Q,     false, false, false, SingleSpc,  4, 4 ,false}, +{ ARM::VST1d16TPseudo,      ARM::VST1d16T,     false, false, false, SingleSpc,  3, 4 ,false}, +{ ARM::VST1d32QPseudo,      ARM::VST1d32Q,     false, false, false, SingleSpc,  4, 2 ,false}, +{ ARM::VST1d32TPseudo,      ARM::VST1d32T,     false, false, false, SingleSpc,  3, 2 ,false},  { ARM::VST1d64QPseudo,      ARM::VST1d64Q,     false, false, false, SingleSpc,  4, 1 ,false},  { ARM::VST1d64QPseudoWB_fixed,  ARM::VST1d64Qwb_fixed, false, true, false,  SingleSpc,  4, 1 ,false},  { ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true,  SingleSpc,  4, 1 ,false},  { ARM::VST1d64TPseudo,      ARM::VST1d64T,     false, false, false, SingleSpc,  3, 1 ,false},  { ARM::VST1d64TPseudoWB_fixed,  ARM::VST1d64Twb_fixed, false, true, false,  SingleSpc,  3, 1 ,false},  { ARM::VST1d64TPseudoWB_register,  ARM::VST1d64Twb_register, false, true, true,  SingleSpc,  3, 1 ,false}, +{ ARM::VST1d8QPseudo,       ARM::VST1d8Q,      false, false, false, SingleSpc,  4, 8 ,false}, +{ ARM::VST1d8TPseudo,       ARM::VST1d8T,      false, false, false, SingleSpc,  3, 8 ,false}, +{ ARM::VST1q16HighQPseudo,  ARM::VST1d16Q,      false, false, false, SingleHighQSpc,   4, 4 ,false}, +{ ARM::VST1q16HighTPseudo,  ARM::VST1d16T,      false, false, false, SingleHighTSpc,   3, 4 ,false}, +{ ARM::VST1q16LowQPseudo_UPD,   ARM::VST1d16Qwb_fixed,  false, true, true, SingleLowSpc,   4, 4 ,false}, +{ 
ARM::VST1q16LowTPseudo_UPD,   ARM::VST1d16Twb_fixed,  false, true, true, SingleLowSpc,   3, 4 ,false}, +{ ARM::VST1q32HighQPseudo,  ARM::VST1d32Q,      false, false, false, SingleHighQSpc,   4, 2 ,false}, +{ ARM::VST1q32HighTPseudo,  ARM::VST1d32T,      false, false, false, SingleHighTSpc,   3, 2 ,false}, +{ ARM::VST1q32LowQPseudo_UPD,   ARM::VST1d32Qwb_fixed,  false, true, true, SingleLowSpc,   4, 2 ,false}, +{ ARM::VST1q32LowTPseudo_UPD,   ARM::VST1d32Twb_fixed,  false, true, true, SingleLowSpc,   3, 2 ,false}, +{ ARM::VST1q64HighQPseudo,  ARM::VST1d64Q,      false, false, false, SingleHighQSpc,   4, 1 ,false}, +{ ARM::VST1q64HighTPseudo,  ARM::VST1d64T,      false, false, false, SingleHighTSpc,   3, 1 ,false}, +{ ARM::VST1q64LowQPseudo_UPD,   ARM::VST1d64Qwb_fixed,  false, true, true, SingleLowSpc,   4, 1 ,false}, +{ ARM::VST1q64LowTPseudo_UPD,   ARM::VST1d64Twb_fixed,  false, true, true, SingleLowSpc,   3, 1 ,false}, +{ ARM::VST1q8HighQPseudo,   ARM::VST1d8Q,      false, false, false, SingleHighQSpc,   4, 8 ,false}, +{ ARM::VST1q8HighTPseudo,   ARM::VST1d8T,      false, false, false, SingleHighTSpc,   3, 8 ,false}, +{ ARM::VST1q8LowQPseudo_UPD,   ARM::VST1d8Qwb_fixed,  false, true, true, SingleLowSpc,   4, 8 ,false}, +{ ARM::VST1q8LowTPseudo_UPD,   ARM::VST1d8Twb_fixed,  false, true, true, SingleLowSpc,   3, 8 ,false},  { ARM::VST2LNd16Pseudo,     ARM::VST2LNd16,     false, false, false, SingleSpc, 2, 4 ,true},  { ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true,  SingleSpc, 2, 4 ,true}, @@ -465,7 +487,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {      // and register forms. Some real instructions, however, do not rely on      // am6offset and have separate definitions for such forms. When this is the      // case, fixed forms do not take any offset nodes, so here we skip them for -    // such intructions. Once all real and pseudo writing-back instructions are +    // such instructions. 
Once all real and pseudo writing-back instructions are      // rewritten without use of am6offset nodes, this code will go away.      const MachineOperand &AM6Offset = MI.getOperand(OpIdx++);      if (TableEntry->RealOpc == ARM::VLD1d8Qwb_fixed || @@ -477,7 +499,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {          TableEntry->RealOpc == ARM::VLD1d32Twb_fixed ||          TableEntry->RealOpc == ARM::VLD1d64Twb_fixed) {        assert(AM6Offset.getReg() == 0 && -             "A fixed writing-back pseudo intruction provides an offset " +             "A fixed writing-back pseudo instruction provides an offset "               "register!");      } else {        MIB.add(AM6Offset); @@ -534,9 +556,31 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {    // Copy the addrmode6 operands.    MIB.add(MI.getOperand(OpIdx++));    MIB.add(MI.getOperand(OpIdx++)); -  // Copy the am6offset operand. -  if (TableEntry->hasWritebackOperand) -    MIB.add(MI.getOperand(OpIdx++)); + +  if (TableEntry->hasWritebackOperand) { +    // TODO: The writing-back pseudo instructions we translate here are all +    // defined to take am6offset nodes that are capable to represent both fixed +    // and register forms. Some real instructions, however, do not rely on +    // am6offset and have separate definitions for such forms. When this is the +    // case, fixed forms do not take any offset nodes, so here we skip them for +    // such instructions. Once all real and pseudo writing-back instructions are +    // rewritten without use of am6offset nodes, this code will go away. 
+    const MachineOperand &AM6Offset = MI.getOperand(OpIdx++); +    if (TableEntry->RealOpc == ARM::VST1d8Qwb_fixed || +        TableEntry->RealOpc == ARM::VST1d16Qwb_fixed || +        TableEntry->RealOpc == ARM::VST1d32Qwb_fixed || +        TableEntry->RealOpc == ARM::VST1d64Qwb_fixed || +        TableEntry->RealOpc == ARM::VST1d8Twb_fixed || +        TableEntry->RealOpc == ARM::VST1d16Twb_fixed || +        TableEntry->RealOpc == ARM::VST1d32Twb_fixed || +        TableEntry->RealOpc == ARM::VST1d64Twb_fixed) { +      assert(AM6Offset.getReg() == 0 && +             "A fixed writing-back pseudo instruction provides an offset " +             "register!"); +    } else { +      MIB.add(AM6Offset); +    } +  }    bool SrcIsKill = MI.getOperand(OpIdx).isKill();    bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); @@ -1645,6 +1689,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,      case ARM::VST3d8Pseudo:      case ARM::VST3d16Pseudo:      case ARM::VST3d32Pseudo: +    case ARM::VST1d8TPseudo: +    case ARM::VST1d16TPseudo: +    case ARM::VST1d32TPseudo:      case ARM::VST1d64TPseudo:      case ARM::VST3d8Pseudo_UPD:      case ARM::VST3d16Pseudo_UPD: @@ -1663,12 +1710,31 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,      case ARM::VST4d8Pseudo:      case ARM::VST4d16Pseudo:      case ARM::VST4d32Pseudo: +    case ARM::VST1d8QPseudo: +    case ARM::VST1d16QPseudo: +    case ARM::VST1d32QPseudo:      case ARM::VST1d64QPseudo:      case ARM::VST4d8Pseudo_UPD:      case ARM::VST4d16Pseudo_UPD:      case ARM::VST4d32Pseudo_UPD:      case ARM::VST1d64QPseudoWB_fixed:      case ARM::VST1d64QPseudoWB_register: +    case ARM::VST1q8HighQPseudo: +    case ARM::VST1q8LowQPseudo_UPD: +    case ARM::VST1q8HighTPseudo: +    case ARM::VST1q8LowTPseudo_UPD: +    case ARM::VST1q16HighQPseudo: +    case ARM::VST1q16LowQPseudo_UPD: +    case ARM::VST1q16HighTPseudo: +    case ARM::VST1q16LowTPseudo_UPD: +    case ARM::VST1q32HighQPseudo: +    case 
ARM::VST1q32LowQPseudo_UPD: +    case ARM::VST1q32HighTPseudo: +    case ARM::VST1q32LowTPseudo_UPD: +    case ARM::VST1q64HighQPseudo: +    case ARM::VST1q64LowQPseudo_UPD: +    case ARM::VST1q64HighTPseudo: +    case ARM::VST1q64LowTPseudo_UPD:      case ARM::VST4q8Pseudo_UPD:      case ARM::VST4q16Pseudo_UPD:      case ARM::VST4q32Pseudo_UPD: diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 1a0ffe4e176..7d6963c3608 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1903,9 +1903,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,    case MVT::v4f32:    case MVT::v4i32: OpcodeIndex = 2; break;    case MVT::v2f64: -  case MVT::v2i64: OpcodeIndex = 3; -    assert(NumVecs == 1 && "v2i64 type only supported for VST1"); -    break; +  case MVT::v2i64: OpcodeIndex = 3; break;    }    std::vector<EVT> ResTys; @@ -3562,6 +3560,51 @@ void ARMDAGToDAGISel::Select(SDNode *N) {        return;      } +    case Intrinsic::arm_neon_vst1x2: { +      static const uint16_t DOpcodes[] = { ARM::VST1q8, ARM::VST1q16, +                                           ARM::VST1q32, ARM::VST1q64 }; +      static const uint16_t QOpcodes[] = { ARM::VST1d8QPseudo, +                                           ARM::VST1d16QPseudo, +                                           ARM::VST1d32QPseudo, +                                           ARM::VST1d64QPseudo }; +      SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr); +      return; +    } + +    case Intrinsic::arm_neon_vst1x3: { +      static const uint16_t DOpcodes[] = { ARM::VST1d8TPseudo, +                                           ARM::VST1d16TPseudo, +                                           ARM::VST1d32TPseudo, +                                           ARM::VST1d64TPseudo }; +      static const uint16_t QOpcodes0[] = { ARM::VST1q8LowTPseudo_UPD, +                                            
ARM::VST1q16LowTPseudo_UPD, +                                            ARM::VST1q32LowTPseudo_UPD, +                                            ARM::VST1q64LowTPseudo_UPD }; +      static const uint16_t QOpcodes1[] = { ARM::VST1q8HighTPseudo, +                                            ARM::VST1q16HighTPseudo, +                                            ARM::VST1q32HighTPseudo, +                                            ARM::VST1q64HighTPseudo }; +      SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); +      return; +    } + +    case Intrinsic::arm_neon_vst1x4: { +      static const uint16_t DOpcodes[] = { ARM::VST1d8QPseudo, +                                           ARM::VST1d16QPseudo, +                                           ARM::VST1d32QPseudo, +                                           ARM::VST1d64QPseudo }; +      static const uint16_t QOpcodes0[] = { ARM::VST1q8LowQPseudo_UPD, +                                            ARM::VST1q16LowQPseudo_UPD, +                                            ARM::VST1q32LowQPseudo_UPD, +                                            ARM::VST1q64LowQPseudo_UPD }; +      static const uint16_t QOpcodes1[] = { ARM::VST1q8HighQPseudo, +                                            ARM::VST1q16HighQPseudo, +                                            ARM::VST1q32HighQPseudo, +                                            ARM::VST1q64HighQPseudo }; +      SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); +      return; +    } +      case Intrinsic::arm_neon_vst2: {        static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,                                             ARM::VST2d32, ARM::VST1q64 }; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 03ccaa33366..673bc8dd47a 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -12773,6 +12773,9 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,      
case Intrinsic::arm_neon_vld3lane:      case Intrinsic::arm_neon_vld4lane:      case Intrinsic::arm_neon_vst1: +    case Intrinsic::arm_neon_vst1x2: +    case Intrinsic::arm_neon_vst1x3: +    case Intrinsic::arm_neon_vst1x4:      case Intrinsic::arm_neon_vst2:      case Intrinsic::arm_neon_vst3:      case Intrinsic::arm_neon_vst4: @@ -14118,6 +14121,27 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,      Info.flags = MachineMemOperand::MOStore;      return true;    } +  case Intrinsic::arm_neon_vst1x2: +  case Intrinsic::arm_neon_vst1x3: +  case Intrinsic::arm_neon_vst1x4: { +    Info.opc = ISD::INTRINSIC_VOID; +    // Conservatively set memVT to the entire set of vectors stored. +    auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); +    unsigned NumElts = 0; +    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { +      Type *ArgTy = I.getArgOperand(ArgI)->getType(); +      if (!ArgTy->isVectorTy()) +        break; +      NumElts += DL.getTypeSizeInBits(ArgTy) / 64; +    } +    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); +    Info.ptrVal = I.getArgOperand(0); +    Info.offset = 0; +    Info.align = 0; +    // volatile stores with NEON intrinsics not supported +    Info.flags = MachineMemOperand::MOStore; +    return true; +  }    case Intrinsic::arm_ldaex:    case Intrinsic::arm_ldrex: {      auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index ade303505d3..8010352d343 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -1801,10 +1801,22 @@ defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>;  defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>;  defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>; +def VST1d8TPseudo             : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1d16TPseudo       
     : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1d32TPseudo            : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;  def VST1d64TPseudo            : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;  def VST1d64TPseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;  def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>; +def VST1q8HighTPseudo     : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q8LowTPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q16HighTPseudo    : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q16LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q32HighTPseudo    : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q32LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q64HighTPseudo    : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q64LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +  // ...with 4 registers  class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode>    : NLdSt<0, 0b00, 0b0010, op7_4, (outs), @@ -1844,10 +1856,22 @@ defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;  defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;  defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; +def VST1d8QPseudo             : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1d16QPseudo            : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1d32QPseudo            : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;  def VST1d64QPseudo            : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;  def VST1d64QPseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;  def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>; +def VST1q8HighQPseudo     : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q8LowQPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; 
+def VST1q16HighQPseudo    : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q16LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q32HighQPseudo    : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q32LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q64HighQPseudo    : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q64LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +  //   VST2     : Vector Store (multiple 2-element structures)  class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,              InstrItinClass itin, Operand AddrMode> diff --git a/llvm/test/CodeGen/ARM/arm-vst1.ll b/llvm/test/CodeGen/ARM/arm-vst1.ll new file mode 100644 index 00000000000..3e8f6d76c31 --- /dev/null +++ b/llvm/test/CodeGen/ARM/arm-vst1.ll @@ -0,0 +1,363 @@ +; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \ +; RUN:     -asm-verbose=false | FileCheck %s + +; %struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> } +; %struct.uint16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> } +; %struct.uint16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } + +; %struct.uint32x2x2_t = type { <2 x i32>, <2 x i32> } +; %struct.uint32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> } +; %struct.uint32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } + +; %struct.uint64x1x2_t = type { <1 x i64>, <1 x i64> } +; %struct.uint64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> } +; %struct.uint64x1x4_t = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } + +; %struct.uint8x8x2_t = type { <8 x i8>, <8 x i8> } +; %struct.uint8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } +; %struct.uint8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } + +; %struct.uint16x8x2_t = type { <8 x i16>, <8 x i16> } +; %struct.uint16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> } +; %struct.uint16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } + +; 
%struct.uint32x4x2_t = type { <4 x i32>, <4 x i32> } +; %struct.uint32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> } +; %struct.uint32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } + +; %struct.uint64x2x2_t = type { <2 x i64>, <2 x i64> } +; %struct.uint64x2x3_t = type { <2 x i64>, <2 x i64>, <2 x i64> } +; %struct.uint64x2x4_t = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } + +; %struct.uint8x16x2_t = type { <16 x i8>, <16 x i8> } +; %struct.uint8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> } +; %struct.uint8x16x4_t = type { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } + +%struct.uint16x4x2_t = type { [2 x <4 x i16>] } +%struct.uint16x4x3_t = type { [3 x <4 x i16>] } +%struct.uint16x4x4_t = type { [4 x <4 x i16>] } +%struct.uint32x2x2_t = type { [2 x <2 x i32>] } +%struct.uint32x2x3_t = type { [3 x <2 x i32>] } +%struct.uint32x2x4_t = type { [4 x <2 x i32>] } +%struct.uint64x1x2_t = type { [2 x <1 x i64>] } +%struct.uint64x1x3_t = type { [3 x <1 x i64>] } +%struct.uint64x1x4_t = type { [4 x <1 x i64>] } +%struct.uint8x8x2_t = type { [2 x <8 x i8>] } +%struct.uint8x8x3_t = type { [3 x <8 x i8>] } +%struct.uint8x8x4_t = type { [4 x <8 x i8>] } +%struct.uint16x8x2_t = type { [2 x <8 x i16>] } +%struct.uint16x8x3_t = type { [3 x <8 x i16>] } +%struct.uint16x8x4_t = type { [4 x <8 x i16>] } +%struct.uint32x4x2_t = type { [2 x <4 x i32>] } +%struct.uint32x4x3_t = type { [3 x <4 x i32>] } +%struct.uint32x4x4_t = type { [4 x <4 x i32>] } +%struct.uint64x2x2_t = type { [2 x <2 x i64>] } +%struct.uint64x2x3_t = type { [3 x <2 x i64>] } +%struct.uint64x2x4_t = type { [4 x <2 x i64>] } +%struct.uint8x16x2_t = type { [2 x <16 x i8>] } +%struct.uint8x16x3_t = type { [3 x <16 x i8>] } +%struct.uint8x16x4_t = type { [4 x <16 x i8>] } + +declare void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* nocapture, <4 x i16>, <4 x i16>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* nocapture, <4 x i16>, <4 x i16>, <4 x i16>) argmemonly 
nounwind +declare void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* nocapture, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i32.v2i32(i32* nocapture, <2 x i32>, <2 x i32>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i32.v2i32(i32* nocapture, <2 x i32>, <2 x i32>, <2 x i32>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i32.v2i32(i32* nocapture, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i64.v1i64(i64* nocapture, <1 x i64>, <1 x i64>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i64.v1i64(i64* nocapture, <1 x i64>, <1 x i64>, <1 x i64>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i64.v1i64(i64* nocapture, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* nocapture, <8 x i8>, <8 x i8>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* nocapture, <8 x i8>, <8 x i8>, <8 x i8>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* nocapture, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* nocapture, <8 x i16>, <8 x i16>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* nocapture, <8 x i16>, <8 x i16>, <8 x i16>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* nocapture, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i32.v4i32(i32* nocapture, <4 x i32>, <4 x i32>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i32.v4i32(i32* nocapture, <4 x i32>, <4 x i32>, <4 x i32>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i32.v4i32(i32* nocapture, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i64.v2i64(i64* nocapture, <2 x i64>, <2 x i64>) argmemonly nounwind 
+declare void @llvm.arm.neon.vst1x3.p0i64.v2i64(i64* nocapture, <2 x i64>, <2 x i64>, <2 x i64>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i64.v2i64(i64* nocapture, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* nocapture, <16 x i8>, <16 x i8>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* nocapture, <16 x i8>, <16 x i8>, <16 x i8>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* nocapture, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) argmemonly nounwind + +; CHECK-LABEL: test_vst1_u16_x2 +; CHECK: vst1.16 {d16, d17}, [r0:64] +define void @test_vst1_u16_x2(i16* %a, %struct.uint16x4x2_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 +  %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 +  tail call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* %a, <4 x i16> %b0, <4 x i16> %b1) +  ret void +} + +; CHECK-LABEL: test_vst1_u16_x3 +; CHECK: vst1.16 {d16, d17, d18}, [r0:64] +define void @test_vst1_u16_x3(i16* %a, %struct.uint16x4x3_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint16x4x3_t %b, 0, 0 +  %b1 = extractvalue %struct.uint16x4x3_t %b, 0, 1 +  %b2 = extractvalue %struct.uint16x4x3_t %b, 0, 2 +  tail call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* %a, <4 x i16> %b0, <4 x i16> %b1, <4 x i16> %b2) +  ret void +} + +; CHECK-LABEL: test_vst1_u16_x4 +; CHECK: vst1.16 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1_u16_x4(i16* %a, %struct.uint16x4x4_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint16x4x4_t %b, 0, 0 +  %b1 = extractvalue %struct.uint16x4x4_t %b, 0, 1 +  %b2 = extractvalue %struct.uint16x4x4_t %b, 0, 2 +  %b3 = extractvalue %struct.uint16x4x4_t %b, 0, 3 +  tail call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* %a, <4 x i16> %b0, <4 x i16> %b1, <4 x i16> %b2, <4 x i16> %b3) +  ret void +} + +; CHECK-LABEL: test_vst1_u32_x2 +; CHECK: vst1.32 {d16, d17}, [r0:64] +define void 
@test_vst1_u32_x2(i32* %a, %struct.uint32x2x2_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint32x2x2_t %b, 0, 0 +  %b1 = extractvalue %struct.uint32x2x2_t %b, 0, 1 +  tail call void @llvm.arm.neon.vst1x2.p0i32.v2i32(i32* %a, <2 x i32> %b0, <2 x i32> %b1) +  ret void +} + +; CHECK-LABEL: test_vst1_u32_x3 +; CHECK: vst1.32 {d16, d17, d18}, [r0:64] +define void @test_vst1_u32_x3(i32* %a, %struct.uint32x2x3_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint32x2x3_t %b, 0, 0 +  %b1 = extractvalue %struct.uint32x2x3_t %b, 0, 1 +  %b2 = extractvalue %struct.uint32x2x3_t %b, 0, 2 +  tail call void @llvm.arm.neon.vst1x3.p0i32.v2i32(i32* %a, <2 x i32> %b0, <2 x i32> %b1, <2 x i32> %b2) +  ret void +} + +; CHECK-LABEL: test_vst1_u32_x4 +; CHECK: vst1.32 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1_u32_x4(i32* %a, %struct.uint32x2x4_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint32x2x4_t %b, 0, 0 +  %b1 = extractvalue %struct.uint32x2x4_t %b, 0, 1 +  %b2 = extractvalue %struct.uint32x2x4_t %b, 0, 2 +  %b3 = extractvalue %struct.uint32x2x4_t %b, 0, 3 +  tail call void @llvm.arm.neon.vst1x4.p0i32.v2i32(i32* %a, <2 x i32> %b0, <2 x i32> %b1, <2 x i32> %b2, <2 x i32> %b3) +  ret void +} + +; CHECK-LABEL: test_vst1_u64_x2 +; CHECK: vst1.64 {d16, d17}, [r0:64] +define void @test_vst1_u64_x2(i64* %a, %struct.uint64x1x2_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint64x1x2_t %b, 0, 0 +  %b1 = extractvalue %struct.uint64x1x2_t %b, 0, 1 +  tail call void @llvm.arm.neon.vst1x2.p0i64.v1i64(i64* %a, <1 x i64> %b0, <1 x i64> %b1) +  ret void +} + +; CHECK-LABEL: test_vst1_u64_x3 +; CHECK: vst1.64 {d16, d17, d18}, [r0:64] +define void @test_vst1_u64_x3(i64* %a, %struct.uint64x1x3_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint64x1x3_t %b, 0, 0 +  %b1 = extractvalue %struct.uint64x1x3_t %b, 0, 1 +  %b2 = extractvalue %struct.uint64x1x3_t %b, 0, 2 +  tail call void @llvm.arm.neon.vst1x3.p0i64.v1i64(i64* %a, <1 x i64> %b0, <1 x 
i64> %b1, <1 x i64> %b2) +  ret void +} + +; CHECK-LABEL: test_vst1_u64_x4 +; CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1_u64_x4(i64* %a, %struct.uint64x1x4_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint64x1x4_t %b, 0, 0 +  %b1 = extractvalue %struct.uint64x1x4_t %b, 0, 1 +  %b2 = extractvalue %struct.uint64x1x4_t %b, 0, 2 +  %b3 = extractvalue %struct.uint64x1x4_t %b, 0, 3 +  tail call void @llvm.arm.neon.vst1x4.p0i64.v1i64(i64* %a, <1 x i64> %b0, <1 x i64> %b1, <1 x i64> %b2, <1 x i64> %b3) +  ret void +} + +; CHECK-LABEL: test_vst1_u8_x2 +; CHECK: vst1.8 {d16, d17}, [r0:64] +define void @test_vst1_u8_x2(i8* %a, %struct.uint8x8x2_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint8x8x2_t %b, 0, 0 +  %b1 = extractvalue %struct.uint8x8x2_t %b, 0, 1 +  tail call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> %b0, <8 x i8> %b1) +  ret void +} + +; CHECK-LABEL: test_vst1_u8_x3 +; CHECK: vst1.8 {d16, d17, d18}, [r0:64] +define void @test_vst1_u8_x3(i8* %a, %struct.uint8x8x3_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint8x8x3_t %b, 0, 0 +  %b1 = extractvalue %struct.uint8x8x3_t %b, 0, 1 +  %b2 = extractvalue %struct.uint8x8x3_t %b, 0, 2 +  tail call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> %b0, <8 x i8> %b1, <8 x i8> %b2) +  ret void +} + +; CHECK-LABEL: test_vst1_u8_x4 +; CHECK: vst1.8 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1_u8_x4(i8* %a, %struct.uint8x8x4_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint8x8x4_t %b, 0, 0 +  %b1 = extractvalue %struct.uint8x8x4_t %b, 0, 1 +  %b2 = extractvalue %struct.uint8x8x4_t %b, 0, 2 +  %b3 = extractvalue %struct.uint8x8x4_t %b, 0, 3 +  tail call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> %b0, <8 x i8> %b1, <8 x i8> %b2, <8 x i8> %b3) +  ret void +} + +; CHECK-LABEL: test_vst1q_u16_x2 +; CHECK: vst1.16 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1q_u16_x2(i16* %a, %struct.uint16x8x2_t %b) nounwind { +entry: + 
 %b0 = extractvalue %struct.uint16x8x2_t %b, 0, 0 +  %b1 = extractvalue %struct.uint16x8x2_t %b, 0, 1 +  tail call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* %a, <8 x i16> %b0, <8 x i16> %b1) +  ret void +} + +; CHECK-LABEL: test_vst1q_u16_x3 +; CHECK: vst1.16 {d16, d17, d18}, [r0:64]! +; CHECK: vst1.16 {d19, d20, d21}, [r0:64] +define void @test_vst1q_u16_x3(i16* %a, %struct.uint16x8x3_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint16x8x3_t %b, 0, 0 +  %b1 = extractvalue %struct.uint16x8x3_t %b, 0, 1 +  %b2 = extractvalue %struct.uint16x8x3_t %b, 0, 2 +  tail call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* %a, <8 x i16> %b0, <8 x i16> %b1, <8 x i16> %b2) +  ret void +} + +; CHECK-LABEL: test_vst1q_u16_x4 +; CHECK: vst1.16 {d16, d17, d18, d19}, [r0:256]! +; CHECK: vst1.16 {d20, d21, d22, d23}, [r0:256] +define void @test_vst1q_u16_x4(i16* %a, %struct.uint16x8x4_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint16x8x4_t %b, 0, 0 +  %b1 = extractvalue %struct.uint16x8x4_t %b, 0, 1 +  %b2 = extractvalue %struct.uint16x8x4_t %b, 0, 2 +  %b3 = extractvalue %struct.uint16x8x4_t %b, 0, 3 +  tail call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* %a, <8 x i16> %b0, <8 x i16> %b1, <8 x i16> %b2, <8 x i16> %b3) +  ret void +} + +; CHECK-LABEL: test_vst1q_u32_x2 +; CHECK: vst1.32 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1q_u32_x2(i32* %a, %struct.uint32x4x2_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint32x4x2_t %b, 0, 0 +  %b1 = extractvalue %struct.uint32x4x2_t %b, 0, 1 +  tail call void @llvm.arm.neon.vst1x2.p0i32.v4i32(i32* %a, <4 x i32> %b0, <4 x i32> %b1) +  ret void +} + +; CHECK-LABEL: test_vst1q_u32_x3 +; CHECK: vst1.32 {d16, d17, d18}, [r0:64]! 
+; CHECK: vst1.32 {d19, d20, d21}, [r0:64] +define void @test_vst1q_u32_x3(i32* %a, %struct.uint32x4x3_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint32x4x3_t %b, 0, 0 +  %b1 = extractvalue %struct.uint32x4x3_t %b, 0, 1 +  %b2 = extractvalue %struct.uint32x4x3_t %b, 0, 2 +  tail call void @llvm.arm.neon.vst1x3.p0i32.v4i32(i32* %a, <4 x i32> %b0, <4 x i32> %b1, <4 x i32> %b2) +  ret void +} + +; CHECK-LABEL: test_vst1q_u32_x4 +; CHECK: vst1.32 {d16, d17, d18, d19}, [r0:256]! +; CHECK: vst1.32 {d20, d21, d22, d23}, [r0:256] +define void @test_vst1q_u32_x4(i32* %a, %struct.uint32x4x4_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint32x4x4_t %b, 0, 0 +  %b1 = extractvalue %struct.uint32x4x4_t %b, 0, 1 +  %b2 = extractvalue %struct.uint32x4x4_t %b, 0, 2 +  %b3 = extractvalue %struct.uint32x4x4_t %b, 0, 3 +  tail call void @llvm.arm.neon.vst1x4.p0i32.v4i32(i32* %a, <4 x i32> %b0, <4 x i32> %b1, <4 x i32> %b2, <4 x i32> %b3) +  ret void +} + +; CHECK-LABEL: test_vst1q_u64_x2 +; CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1q_u64_x2(i64* %a, %struct.uint64x2x2_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint64x2x2_t %b, 0, 0 +  %b1 = extractvalue %struct.uint64x2x2_t %b, 0, 1 +  tail call void @llvm.arm.neon.vst1x2.p0i64.v2i64(i64* %a, <2 x i64> %b0, <2 x i64> %b1) +  ret void +} + +; CHECK-LABEL: test_vst1q_u64_x3 +; CHECK: vst1.64 {d16, d17, d18}, [r0:64]! +; CHECK: vst1.64 {d19, d20, d21}, [r0:64] +define void @test_vst1q_u64_x3(i64* %a, %struct.uint64x2x3_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint64x2x3_t %b, 0, 0 +  %b1 = extractvalue %struct.uint64x2x3_t %b, 0, 1 +  %b2 = extractvalue %struct.uint64x2x3_t %b, 0, 2 +  tail call void @llvm.arm.neon.vst1x3.p0i64.v2i64(i64* %a, <2 x i64> %b0, <2 x i64> %b1, <2 x i64> %b2) +  ret void +} + +; CHECK-LABEL: test_vst1q_u64_x4 +; CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256]! 
+; CHECK: vst1.64 {d20, d21, d22, d23}, [r0:256] +define void @test_vst1q_u64_x4(i64* %a, %struct.uint64x2x4_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint64x2x4_t %b, 0, 0 +  %b1 = extractvalue %struct.uint64x2x4_t %b, 0, 1 +  %b2 = extractvalue %struct.uint64x2x4_t %b, 0, 2 +  %b3 = extractvalue %struct.uint64x2x4_t %b, 0, 3 +  tail call void @llvm.arm.neon.vst1x4.p0i64.v2i64(i64* %a, <2 x i64> %b0, <2 x i64> %b1, <2 x i64> %b2, <2 x i64> %b3) +  ret void +} + +; CHECK-LABEL: test_vst1q_u8_x2 +; CHECK: vst1.8 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1q_u8_x2(i8* %a, %struct.uint8x16x2_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint8x16x2_t %b, 0, 0 +  %b1 = extractvalue %struct.uint8x16x2_t %b, 0, 1 +  tail call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> %b0, <16 x i8> %b1) +  ret void +} + +; CHECK-LABEL: test_vst1q_u8_x3 +; CHECK: vst1.8 {d16, d17, d18}, [r0:64]! +; CHECK: vst1.8 {d19, d20, d21}, [r0:64] +define void @test_vst1q_u8_x3(i8* %a, %struct.uint8x16x3_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint8x16x3_t %b, 0, 0 +  %b1 = extractvalue %struct.uint8x16x3_t %b, 0, 1 +  %b2 = extractvalue %struct.uint8x16x3_t %b, 0, 2 +  tail call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> %b0, <16 x i8> %b1, <16 x i8> %b2) +  ret void +} + +; CHECK-LABEL: test_vst1q_u8_x4 +; CHECK: vst1.8 {d16, d17, d18, d19}, [r0:256]! +; CHECK: vst1.8 {d20, d21, d22, d23}, [r0:256] +define void @test_vst1q_u8_x4(i8* %a, %struct.uint8x16x4_t %b) nounwind { +entry: +  %b0 = extractvalue %struct.uint8x16x4_t %b, 0, 0 +  %b1 = extractvalue %struct.uint8x16x4_t %b, 0, 1 +  %b2 = extractvalue %struct.uint8x16x4_t %b, 0, 2 +  %b3 = extractvalue %struct.uint8x16x4_t %b, 0, 3 +  tail call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> %b0, <16 x i8> %b1, <16 x i8> %b2, <16 x i8> %b3) +  ret void +}  | 

