From ab99b59e8ca28f5366fb95b497e64ae44d67a9ca Mon Sep 17 00:00:00 2001
From: Jeroen Ketema
Date: Wed, 30 Sep 2015 10:56:37 +0000
Subject: [ARM][NEON] Use address space in vld([1234]|[234]lane) and
 vst([1234]|[234]lane) instructions

This commit changes the interface of the vld[1234], vld[234]lane, and
vst[1234], vst[234]lane ARM NEON intrinsics and associates an address
space with the pointer that these intrinsics take. This changes, e.g.,

  <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)

to

  <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8*, i32)

This change ensures that address spaces are fully taken into account in
the ARM target during lowering of interleaved loads and stores.

Differential Revision: http://reviews.llvm.org/D12985

llvm-svn: 248887
---
 llvm/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll |  24 ++--
 llvm/test/CodeGen/ARM/2010-05-21-BuildVector.ll    |   4 +-
 .../test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll |   4 +-
 .../ARM/2010-06-29-PartialRedefFastAlloc.ll        |   4 +-
 .../test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll |   4 +-
 .../CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll |  10 +-
 .../CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll   |   4 +-
 .../CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll     |  14 +--
 llvm/test/CodeGen/ARM/2013-10-11-select-stalls.ll  |   6 +-
 .../ARM/2014-01-09-pseudo_expand_implicit_reg.ll   |   4 +-
 llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll  |  21 ++++
 llvm/test/CodeGen/ARM/coalesce-subregs.ll          |  38 +++---
 llvm/test/CodeGen/ARM/dagcombine-concatvector.ll   |   4 +-
 llvm/test/CodeGen/ARM/neon_spill.ll                |   6 +-
 llvm/test/CodeGen/ARM/out-of-registers.ll          |   8 +-
 llvm/test/CodeGen/ARM/reg_sequence.ll              |  64 +++++-----
 llvm/test/CodeGen/ARM/spill-q.ll                   |  28 ++---
 llvm/test/CodeGen/ARM/vcge.ll                      |   4 +-
 llvm/test/CodeGen/ARM/vector-DAGCombine.ll         |   4 +-
 llvm/test/CodeGen/ARM/vld-vst-upgrade.ll           | 139 +++++++++++++++++++++
 llvm/test/CodeGen/ARM/vld1.ll                      |  52 ++++----
 llvm/test/CodeGen/ARM/vld2.ll                      |  40 +++---
 llvm/test/CodeGen/ARM/vld3.ll                      |  42 +++----
 llvm/test/CodeGen/ARM/vld4.ll                      |  42 +++----
 llvm/test/CodeGen/ARM/vlddup.ll                    |  30 ++---
 llvm/test/CodeGen/ARM/vldlane.ll                   |  92 +++++++-------
 llvm/test/CodeGen/ARM/vmov.ll                      |   4 +-
 llvm/test/CodeGen/ARM/vmul.ll                      |  14 +--
 llvm/test/CodeGen/ARM/vst1.ll                      |  48 +++----
 llvm/test/CodeGen/ARM/vst2.ll                      |  44 +++----
 llvm/test/CodeGen/ARM/vst3.ll                      |  42 +++----
 llvm/test/CodeGen/ARM/vst4.ll                      |  42 +++----
 llvm/test/CodeGen/ARM/vstlane.ll                   |  90 ++++------
 33 files changed, 568 insertions(+), 408 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/vld-vst-upgrade.ll

diff --git a/llvm/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll b/llvm/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
index cfaffd8234b..171b6d2bcc5 100644
--- a/llvm/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
+++ b/llvm/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
@@ -1,36 +1,36 @@
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon -O0 -optimize-regalloc -regalloc=basic %s -o /dev/null

 ; This test would crash the rewriter when trying to handle a spill after one of
-; the @llvm.arm.neon.vld3.v8i8 defined three parts of a register.
+; the @llvm.arm.neon.vld3.v8i8.p0i8 defined three parts of a register.

 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }

-declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly

-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind

 define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
-  %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1]
   %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
   %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
   %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1]
-  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
   %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
-  %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1]
   %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1]
   %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1]
   %tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1]
   %tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1]
   %tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1]
   %tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2]
-  call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
+  call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
   %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1]
   %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1]
   %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1]
@@ -38,8 +38,8 @@ define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A
   %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1]
   %tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1]
   %tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2]
-  call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
+  call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
   %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1]
-  tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
+  tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
   ret <8 x i8> %tmp4
 }
diff --git a/llvm/test/CodeGen/ARM/2010-05-21-BuildVector.ll b/llvm/test/CodeGen/ARM/2010-05-21-BuildVector.ll
index 6a6ccf3d0a0..c6c0e2caee4 100644
--- a/llvm/test/CodeGen/ARM/2010-05-21-BuildVector.ll
+++ b/llvm/test/CodeGen/ARM/2010-05-21-BuildVector.ll
@@ -36,8 +36,8 @@ entry:
   %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3
   %19 = fmul <4 x float> %tmp5, %2
   %20 = bitcast float* %fltp to i8*
-  tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19, i32 1)
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %20, <4 x float> %19, i32 1)
   ret void
 }

-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
diff --git a/llvm/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll b/llvm/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
index f86c3ba9ef6..1deb98631a4 100644
--- a/llvm/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
+++ b/llvm/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
@@ -12,8 +12,8 @@ entry:
   %tmp9 = trunc i128 %tmp8 to i64 ; <i64> [#uses=1]
   %tmp16.i = bitcast i64 %tmp6 to <8 x i8> ; <<8 x i8>> [#uses=1]
   %tmp20.i = bitcast i64 %tmp9 to <8 x i8> ; <<8 x i8>> [#uses=1]
-  tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
+  tail call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
   ret void
 }

-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst2.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
diff --git a/llvm/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll b/llvm/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
index 1aee5088eee..130221d38c2 100644
--- a/llvm/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
+++ b/llvm/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
@@ -16,10 +16,10 @@ target triple = "thumbv7-apple-darwin10"

 define i32 @test(i8* %arg) nounwind {
 entry:
-  %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg, i32 1)
+  %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %arg, i32 1)
   %1 = shufflevector <2 x i64> undef, <2 x i64> %0, <2 x i32> <i32 1, i32 2>
   store <2 x i64> %1, <2 x i64>* undef, align 16
   ret i32 undef
 }

-declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8*, i32) nounwind readonly
diff --git a/llvm/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll b/llvm/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll
index 3cbc4cdcd70..d702af7c0c7 100644
--- a/llvm/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll
+++ b/llvm/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll
@@ -4,9 +4,9 @@
 define void @test_vmovqqqq_pseudo() nounwind ssp {
 entry:
-  %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2)
+  %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2)
   store { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, { <8 x i16>, <8 x i16>, <8 x i16> }* undef
   ret void
 }

-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
diff --git a/llvm/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll b/llvm/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
index b70b7f6f3b2..f622ceb584e 100644
--- a/llvm/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
+++ b/llvm/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
@@ -52,8 +52,8 @@ cond.end295: ; preds = %entry
   %shuffle.i35.i.i = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
   %shuffle.i34.i.i = shufflevector <1 x i64> %shuffle.i36.i.i, <1 x i64> %shuffle.i35.i.i, <2 x i32> <i32 0, i32 1>
   %2 = bitcast <2 x i64> %shuffle.i34.i.i to <4 x float>
-  tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind
-  tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind
   unreachable

 for.end: ; preds = %entry
@@ -63,10 +63,10 @@ for.end: ; preds = %entry
 ; Check that pseudo-expansion preserves flags.
 define void @foo3(i8* %p) nounwind ssp {
 entry:
-  tail call void @llvm.arm.neon.vst2.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
+  tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
   ret void
 }

 declare arm_aapcs_vfpcc void @bar(i8*, float, float, float)
-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
-declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
diff --git a/llvm/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll b/llvm/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
index 7f30ae10e43..606af47a3d8 100644
--- a/llvm/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
+++ b/llvm/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
@@ -7,8 +7,8 @@ entry:
   %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0
   %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1
   %0 = bitcast i32* %p to i8*
-  tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
+  tail call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
   ret void
 }

-declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind
diff --git a/llvm/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll b/llvm/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
index 545bfc73c59..6cff67614c6 100644
--- a/llvm/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
+++ b/llvm/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
@@ -5,9 +5,9 @@
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios5.1.0"

-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8*, i32) nounwind readonly

-declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind

 define void @findEdges(i8*) nounwind ssp {
   %2 = icmp sgt i32 undef, 0
@@ -19,16 +19,16 @@ define void @findEdges(i8*) nounwind ssp {
 ;
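
A note on the naming scheme used throughout the diffs above: the trailing .p0i8 token mangles the pointer parameter into the intrinsic name (i8 pointer in address space 0), which is what makes non-default address spaces expressible at all. A minimal sketch of how a non-default address space would look under this scheme; the .p1i8 variant and the @load_v2i32_as1 wrapper are hypothetical illustrations, not declarations taken from this patch:

  ; Old style: address space 0 is hard-wired into the i8* parameter.
  ;   declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
  ;
  ; New style: .p1i8 spells out "i8 pointer in address space 1".
  declare <2 x i32> @llvm.arm.neon.vld1.v2i32.p1i8(i8 addrspace(1)*, i32)

  ; Hypothetical caller: loads <2 x i32> from address space 1 with
  ; 4-byte alignment (the trailing i32 argument).
  define <2 x i32> @load_v2i32_as1(i8 addrspace(1)* %p) {
    %v = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p1i8(i8 addrspace(1)* %p, i32 4)
    ret <2 x i32> %v
  }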