path: root/clang/test/CodeGen
author	Tim Northover <tnorthover@apple.com>	2014-03-29 15:09:45 +0000
committer	Tim Northover <tnorthover@apple.com>	2014-03-29 15:09:45 +0000
commit	a2ee433c8d99632419d4a13a66cc4d06eada4014 (patch)
tree	e6ab2db8facbc4c5ed2fb11df260db8138572ace /clang/test/CodeGen
parent	af3698066a1ea2e5ab4cc08ae9a59620cf18adb7 (diff)
ARM64: initial clang support commit.
This adds Clang support for the ARM64 backend. There are definitely still some rough edges, so please bring up any issues you see with this patch. As with the LLVM commit though, we think it'll be more useful for merging with AArch64 from within the tree.

llvm-svn: 205100
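The recurring pattern in the test changes below: each file gains a REQUIRES line and a RUN line for the arm64 triple, and its CHECK patterns are relaxed with regex alternation so a single expectation matches the output of either backend. A minimal sketch of such a test, condensed from the aarch64-neon-2velem.c changes in this patch:

// REQUIRES: aarch64-registered-target
// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN:   -S -O3 -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -S -O3 -o - %s | FileCheck %s

#include <arm_neon.h>

float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
  // CHECK-LABEL: test_vmul_lane_f64
  // Either backend's selection is accepted: the by-element form
  // (fmul d, d, v.d[0]) or the plain scalar form (fmul d, d, d).
  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+\.d\[0\]|d[0-9]+}}
  return vmul_lane_f64(a, v, 0);
}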
Diffstat (limited to 'clang/test/CodeGen')
-rw-r--r--  clang/test/CodeGen/2007-06-18-SextAttrAggregate.c | 2
-rw-r--r--  clang/test/CodeGen/aarch64-neon-2velem.c | 42
-rw-r--r--  clang/test/CodeGen/aarch64-neon-3v.c | 2
-rw-r--r--  clang/test/CodeGen/aarch64-neon-across.c | 3
-rw-r--r--  clang/test/CodeGen/aarch64-neon-extract.c | 45
-rw-r--r--  clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c | 43
-rw-r--r--  clang/test/CodeGen/aarch64-neon-fma.c | 6
-rw-r--r--  clang/test/CodeGen/aarch64-neon-misc.c | 93
-rw-r--r--  clang/test/CodeGen/aarch64-neon-perm.c | 112
-rw-r--r--  clang/test/CodeGen/aarch64-neon-scalar-copy.c | 11
-rw-r--r--  clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c | 57
-rw-r--r--  clang/test/CodeGen/aarch64-neon-shifts.c | 3
-rw-r--r--  clang/test/CodeGen/aarch64-neon-tbl.c | 159
-rw-r--r--  clang/test/CodeGen/aarch64-neon-vcombine.c | 2
-rw-r--r--  clang/test/CodeGen/aarch64-neon-vget-hilo.c | 131
-rw-r--r--  clang/test/CodeGen/aarch64-varargs.c | 1
-rw-r--r--  clang/test/CodeGen/arm-aapcs-vfp.c | 17
-rw-r--r--  clang/test/CodeGen/arm-homogenous.c | 13
-rw-r--r--  clang/test/CodeGen/arm64-abi-vector.c | 430
-rw-r--r--  clang/test/CodeGen/arm64-arguments.c | 713
-rw-r--r--  clang/test/CodeGen/arm64-crc32.c | 55
-rw-r--r--  clang/test/CodeGen/arm64-lanes.c | 63
-rw-r--r--  clang/test/CodeGen/arm64-scalar-test.c | 535
-rw-r--r--  clang/test/CodeGen/arm64-vrnd.c | 51
-rw-r--r--  clang/test/CodeGen/arm64-vrsqrt.c | 43
-rw-r--r--  clang/test/CodeGen/arm64_crypto.c | 93
-rw-r--r--  clang/test/CodeGen/arm64_neon_high_half.c | 559
-rw-r--r--  clang/test/CodeGen/arm64_vCMP.c | 108
-rw-r--r--  clang/test/CodeGen/arm64_vLdStNum_lane.c | 141
-rw-r--r--  clang/test/CodeGen/arm64_vMaxMin.c | 207
-rw-r--r--  clang/test/CodeGen/arm64_vadd.c | 102
-rw-r--r--  clang/test/CodeGen/arm64_vca.c | 59
-rw-r--r--  clang/test/CodeGen/arm64_vcopy.c | 69
-rw-r--r--  clang/test/CodeGen/arm64_vcreate.c | 23
-rw-r--r--  clang/test/CodeGen/arm64_vcvtfp.c | 48
-rw-r--r--  clang/test/CodeGen/arm64_vdup.c | 42
-rw-r--r--  clang/test/CodeGen/arm64_vdupq_n_f64.c | 88
-rw-r--r--  clang/test/CodeGen/arm64_vecCmpBr.c | 111
-rw-r--r--  clang/test/CodeGen/arm64_vext.c | 239
-rw-r--r--  clang/test/CodeGen/arm64_vfma.c | 136
-rw-r--r--  clang/test/CodeGen/arm64_vget.c | 13
-rw-r--r--  clang/test/CodeGen/arm64_vneg.c | 18
-rw-r--r--  clang/test/CodeGen/arm64_vqmov.c | 77
-rw-r--r--  clang/test/CodeGen/arm64_vrecps.c | 26
-rw-r--r--  clang/test/CodeGen/arm64_vset_lane.c | 31
-rw-r--r--  clang/test/CodeGen/arm64_vshift.c | 357
-rw-r--r--  clang/test/CodeGen/arm64_vsli.c | 148
-rw-r--r--  clang/test/CodeGen/arm64_vsri.c | 149
-rw-r--r--  clang/test/CodeGen/arm64_vtst.c | 22
-rw-r--r--  clang/test/CodeGen/asm_arm64.c | 45
-rw-r--r--  clang/test/CodeGen/atomic-arm64.c | 73
-rw-r--r--  clang/test/CodeGen/blockstret.c | 9
-rw-r--r--  clang/test/CodeGen/builtins-arm-exclusive.c | 102
-rw-r--r--  clang/test/CodeGen/builtins-arm64.c | 6
54 files changed, 5398 insertions, 335 deletions
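Two idioms recur in the relaxed CHECK lines that follow. Immediates that one assembler prints in hex and the other in decimal get an optional prefix, as in #{{(0x)?4}}; genuinely different but equivalent spellings (instruction mnemonic or register class) get plain alternation, as in {{umov|smov}} or {{[ws][0-9]+}}. A condensed example in the style of the aarch64-neon-extract.c changes below:

int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
  // CHECK-LABEL: test_vext_s32
  // #{{(0x)?4}} matches both "#0x4" (aarch64) and "#4" (arm64).
  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
  return vext_s32(a, b, 1);
}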
diff --git a/clang/test/CodeGen/2007-06-18-SextAttrAggregate.c b/clang/test/CodeGen/2007-06-18-SextAttrAggregate.c
index f5489514354..92171e2bd9b 100644
--- a/clang/test/CodeGen/2007-06-18-SextAttrAggregate.c
+++ b/clang/test/CodeGen/2007-06-18-SextAttrAggregate.c
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 %s -o - -emit-llvm | FileCheck %s
-// XFAIL: aarch64
+// XFAIL: aarch64, arm64
// PR1513
diff --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c
index 706757f95c2..19c9b16adc8 100644
--- a/clang/test/CodeGen/aarch64-neon-2velem.c
+++ b/clang/test/CodeGen/aarch64-neon-2velem.c
@@ -1,8 +1,10 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
@@ -281,7 +283,7 @@ float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
// CHECK-LABEL: test_vfmsd_lane_f64
return vfmsd_lane_f64(a, b, v, 0);
- // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ // CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+\.d\[0\]|fmsub d[0-9]+, d[0-9]+, d[0-9]+}}
}
float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
@@ -738,7 +740,7 @@ float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
// CHECK-LABEL: test_vmul_lane_f64
return vmul_lane_f64(a, v, 0);
- // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+\.d\[0\]|d[0-9]+}}
}
@@ -1574,109 +1576,109 @@ float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: test_vmull_high_n_s16
return vmull_high_n_s16(a, b);
- // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+ // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
}
int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
// CHECK-LABEL: test_vmull_high_n_s32
return vmull_high_n_s32(a, b);
- // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+ // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
}
uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
// CHECK-LABEL: test_vmull_high_n_u16
return vmull_high_n_u16(a, b);
- // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+ // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
}
uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
// CHECK-LABEL: test_vmull_high_n_u32
return vmull_high_n_u32(a, b);
- // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+ // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
}
int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: test_vqdmull_high_n_s16
return vqdmull_high_n_s16(a, b);
- // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+ // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
}
int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
// CHECK-LABEL: test_vqdmull_high_n_s32
return vqdmull_high_n_s32(a, b);
- // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+ // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
}
int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: test_vmlal_high_n_s16
return vmlal_high_n_s16(a, b, c);
- // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+ // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
}
int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: test_vmlal_high_n_s32
return vmlal_high_n_s32(a, b, c);
- // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+ // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
}
uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
// CHECK-LABEL: test_vmlal_high_n_u16
return vmlal_high_n_u16(a, b, c);
- // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+ // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
}
uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
// CHECK-LABEL: test_vmlal_high_n_u32
return vmlal_high_n_u32(a, b, c);
- // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+ // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
}
int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: test_vqdmlal_high_n_s16
return vqdmlal_high_n_s16(a, b, c);
- // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+ // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
}
int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: test_vqdmlal_high_n_s32
return vqdmlal_high_n_s32(a, b, c);
- // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+ // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
}
int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: test_vmlsl_high_n_s16
return vmlsl_high_n_s16(a, b, c);
- // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+ // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
}
int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: test_vmlsl_high_n_s32
return vmlsl_high_n_s32(a, b, c);
- // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+ // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
}
uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
// CHECK-LABEL: test_vmlsl_high_n_u16
return vmlsl_high_n_u16(a, b, c);
- // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+ // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
}
uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
// CHECK-LABEL: test_vmlsl_high_n_u32
return vmlsl_high_n_u32(a, b, c);
- // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+ // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
}
int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: test_vqdmlsl_high_n_s16
return vqdmlsl_high_n_s16(a, b, c);
- // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+ // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
}
int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: test_vqdmlsl_high_n_s32
return vqdmlsl_high_n_s32(a, b, c);
- // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+ // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
}
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
diff --git a/clang/test/CodeGen/aarch64-neon-3v.c b/clang/test/CodeGen/aarch64-neon-3v.c
index fc18d0671ca..5251690c184 100644
--- a/clang/test/CodeGen/aarch64-neon-3v.c
+++ b/clang/test/CodeGen/aarch64-neon-3v.c
@@ -1,6 +1,8 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
diff --git a/clang/test/CodeGen/aarch64-neon-across.c b/clang/test/CodeGen/aarch64-neon-across.c
index 906ac837164..17ac3efb85c 100644
--- a/clang/test/CodeGen/aarch64-neon-across.c
+++ b/clang/test/CodeGen/aarch64-neon-across.c
@@ -1,6 +1,9 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
diff --git a/clang/test/CodeGen/aarch64-neon-extract.c b/clang/test/CodeGen/aarch64-neon-extract.c
index 6e2d9691c9c..77d574cf0e2 100644
--- a/clang/test/CodeGen/aarch64-neon-extract.c
+++ b/clang/test/CodeGen/aarch64-neon-extract.c
@@ -1,6 +1,9 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
@@ -9,19 +12,19 @@
int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: test_vext_s8
return vext_s8(a, b, 2);
- // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+ // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
}
int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: test_vext_s16
return vext_s16(a, b, 3);
- // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+ // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
}
int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vext_s32
return vext_s32(a, b, 1);
- // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+ // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
}
int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
@@ -32,43 +35,43 @@ int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: test_vextq_s8
return vextq_s8(a, b, 2);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
}
int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: test_vextq_s16
return vextq_s16(a, b, 3);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
}
int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: test_vextq_s32
return vextq_s32(a, b, 1);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
}
int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vextq_s64
return vextq_s64(a, b, 1);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
}
uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vext_u8
return vext_u8(a, b, 2);
- // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+ // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
}
uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: test_vext_u16
return vext_u16(a, b, 3);
- // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+ // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
}
uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vext_u32
return vext_u32(a, b, 1);
- // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+ // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
}
uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
@@ -79,31 +82,31 @@ uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: test_vextq_u8
return vextq_u8(a, b, 2);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
}
uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: test_vextq_u16
return vextq_u16(a, b, 3);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
}
uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: test_vextq_u32
return vextq_u32(a, b, 1);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
}
uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vextq_u64
return vextq_u64(a, b, 1);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
}
float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vext_f32
return vext_f32(a, b, 1);
- // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+ // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
}
float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) {
@@ -114,35 +117,35 @@ float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) {
float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: test_vextq_f32
return vextq_f32(a, b, 1);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
}
float64x2_t test_vextq_f64(float64x2_t a, float64x2_t b) {
// CHECK-LABEL: test_vextq_f64
return vextq_f64(a, b, 1);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
}
poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: test_vext_p8
return vext_p8(a, b, 2);
- // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+ // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
}
poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-LABEL: test_vext_p16
return vext_p16(a, b, 3);
- // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+ // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
}
poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: test_vextq_p8
return vextq_p8(a, b, 2);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
}
poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
// CHECK-LABEL: test_vextq_p16
return vextq_p16(a, b, 3);
- // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+ // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
}
diff --git a/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c b/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
index 3dd5e006457..77022f58fe3 100644
--- a/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
@@ -1,6 +1,9 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
@@ -14,120 +17,120 @@ float32_t test_vcvtxd_f32_f64(float64_t a) {
int32_t test_vcvtas_s32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtas_s32_f32
-// CHECK: fcvtas {{s[0-9]+}}, {{s[0-9]+}}
+// CHECK: fcvtas {{[ws][0-9]+}}, {{s[0-9]+}}
return (int32_t)vcvtas_s32_f32(a);
}
int64_t test_test_vcvtad_s64_f64(float64_t a) {
// CHECK-LABEL: test_test_vcvtad_s64_f64
-// CHECK: fcvtas {{d[0-9]+}}, {{d[0-9]+}}
+// CHECK: fcvtas {{[dx][0-9]+}}, {{d[0-9]+}}
return (int64_t)vcvtad_s64_f64(a);
}
uint32_t test_vcvtas_u32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtas_u32_f32
-// CHECK: fcvtau {{s[0-9]+}}, {{s[0-9]+}}
+// CHECK: fcvtau {{[ws][0-9]+}}, {{s[0-9]+}}
return (uint32_t)vcvtas_u32_f32(a);
}
uint64_t test_vcvtad_u64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtad_u64_f64
-// CHECK: fcvtau {{d[0-9]+}}, {{d[0-9]+}}
+// CHECK: fcvtau {{[xd][0-9]+}}, {{d[0-9]+}}
return (uint64_t)vcvtad_u64_f64(a);
}
int32_t test_vcvtms_s32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtms_s32_f32
-// CHECK: fcvtms {{s[0-9]+}}, {{s[0-9]+}}
+// CHECK: fcvtms {{[sw][0-9]+}}, {{s[0-9]+}}
return (int32_t)vcvtms_s32_f32(a);
}
int64_t test_vcvtmd_s64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtmd_s64_f64
-// CHECK: fcvtms {{d[0-9]+}}, {{d[0-9]+}}
+// CHECK: fcvtms {{[dx][0-9]+}}, {{d[0-9]+}}
return (int64_t)vcvtmd_s64_f64(a);
}
uint32_t test_vcvtms_u32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtms_u32_f32
-// CHECK: fcvtmu {{s[0-9]+}}, {{s[0-9]+}}
+// CHECK: fcvtmu {{[ws][0-9]+}}, {{s[0-9]+}}
return (uint32_t)vcvtms_u32_f32(a);
}
uint64_t test_vcvtmd_u64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtmd_u64_f64
-// CHECK: fcvtmu {{d[0-9]+}}, {{d[0-9]+}}
+// CHECK: fcvtmu {{[xd][0-9]+}}, {{d[0-9]+}}
return (uint64_t)vcvtmd_u64_f64(a);
}
int32_t test_vcvtns_s32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtns_s32_f32
-// CHECK: fcvtns {{s[0-9]+}}, {{s[0-9]+}}
+// CHECK: fcvtns {{[sw][0-9]+}}, {{s[0-9]+}}
return (int32_t)vcvtns_s32_f32(a);
}
int64_t test_vcvtnd_s64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtnd_s64_f64
-// CHECK: fcvtns {{d[0-9]+}}, {{d[0-9]+}}
+// CHECK: fcvtns {{[dx][0-9]+}}, {{d[0-9]+}}
return (int64_t)vcvtnd_s64_f64(a);
}
uint32_t test_vcvtns_u32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtns_u32_f32
-// CHECK: fcvtnu {{s[0-9]+}}, {{s[0-9]+}}
+// CHECK: fcvtnu {{[sw][0-9]+}}, {{s[0-9]+}}
return (uint32_t)vcvtns_u32_f32(a);
}
uint64_t test_vcvtnd_u64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtnd_u64_f64
-// CHECK: fcvtnu {{d[0-9]+}}, {{d[0-9]+}}
+// CHECK: fcvtnu {{[dx][0-9]+}}, {{d[0-9]+}}
return (uint64_t)vcvtnd_u64_f64(a);
}
int32_t test_vcvtps_s32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtps_s32_f32
-// CHECK: fcvtps {{s[0-9]+}}, {{s[0-9]+}}
+// CHECK: fcvtps {{[sw][0-9]+}}, {{s[0-9]+}}
return (int32_t)vcvtps_s32_f32(a);
}
int64_t test_vcvtpd_s64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtpd_s64_f64
-// CHECK: fcvtps {{d[0-9]+}}, {{d[0-9]+}}
+// CHECK: fcvtps {{[dx][0-9]+}}, {{d[0-9]+}}
return (int64_t)vcvtpd_s64_f64(a);
}
uint32_t test_vcvtps_u32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtps_u32_f32
-// CHECK: fcvtpu {{s[0-9]+}}, {{s[0-9]+}}
+// CHECK: fcvtpu {{[sw][0-9]+}}, {{s[0-9]+}}
return (uint32_t)vcvtps_u32_f32(a);
}
uint64_t test_vcvtpd_u64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtpd_u64_f64
-// CHECK: fcvtpu {{d[0-9]+}}, {{d[0-9]+}}
+// CHECK: fcvtpu {{[dx][0-9]+}}, {{d[0-9]+}}
return (uint64_t)vcvtpd_u64_f64(a);
}
int32_t test_vcvts_s32_f32(float32_t a) {
// CHECK-LABEL: test_vcvts_s32_f32
-// CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}
+// CHECK: fcvtzs {{[sw][0-9]+}}, {{s[0-9]+}}
return (int32_t)vcvts_s32_f32(a);
}
int64_t test_vcvtd_s64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtd_s64_f64
-// CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+// CHECK: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
return (int64_t)vcvtd_s64_f64(a);
}
uint32_t test_vcvts_u32_f32(float32_t a) {
// CHECK-LABEL: test_vcvts_u32_f32
-// CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}
+// CHECK: fcvtzu {{[sw][0-9]+}}, {{s[0-9]+}}
return (uint32_t)vcvts_u32_f32(a);
}
uint64_t test_vcvtd_u64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtd_u64_f64
-// CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}
+// CHECK: fcvtzu {{[dx][0-9]+}}, {{d[0-9]+}}
return (uint64_t)vcvtd_u64_f64(a);
}
diff --git a/clang/test/CodeGen/aarch64-neon-fma.c b/clang/test/CodeGen/aarch64-neon-fma.c
index 2e549ed44bd..85603c52576 100644
--- a/clang/test/CodeGen/aarch64-neon-fma.c
+++ b/clang/test/CodeGen/aarch64-neon-fma.c
@@ -1,8 +1,10 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck -check-prefix=CHECK-FMA %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
@@ -192,11 +194,11 @@ float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
// CHECK-LABEL: test_vfmaq_n_f64:
return vfmaq_n_f64(a, b, c);
- // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}}
}
float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
// CHECK-LABEL: test_vfmsq_n_f64:
return vfmsq_n_f64(a, b, c);
- // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}}
}
diff --git a/clang/test/CodeGen/aarch64-neon-misc.c b/clang/test/CodeGen/aarch64-neon-misc.c
index f0a8d3a4873..dce00f34543 100644
--- a/clang/test/CodeGen/aarch64-neon-misc.c
+++ b/clang/test/CodeGen/aarch64-neon-misc.c
@@ -1,109 +1,112 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
// CHECK-LABEL: test_vceqz_s8
-// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
uint8x8_t test_vceqz_s8(int8x8_t a) {
return vceqz_s8(a);
}
// CHECK-LABEL: test_vceqz_s16
-// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
uint16x4_t test_vceqz_s16(int16x4_t a) {
return vceqz_s16(a);
}
// CHECK-LABEL: test_vceqz_s32
-// CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+// CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
uint32x2_t test_vceqz_s32(int32x2_t a) {
return vceqz_s32(a);
}
// CHECK-LABEL: test_vceqz_s64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #0x0
+// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
uint64x1_t test_vceqz_s64(int64x1_t a) {
return vceqz_s64(a);
}
// CHECK-LABEL: test_vceqz_u64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #0x0
+// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
uint64x1_t test_vceqz_u64(uint64x1_t a) {
return vceqz_u64(a);
}
// CHECK-LABEL: test_vceqz_p64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #0x0
+// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
uint64x1_t test_vceqz_p64(poly64x1_t a) {
return vceqz_p64(a);
}
// CHECK-LABEL: test_vceqzq_s8
-// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
uint8x16_t test_vceqzq_s8(int8x16_t a) {
return vceqzq_s8(a);
}
// CHECK-LABEL: test_vceqzq_s16
-// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
uint16x8_t test_vceqzq_s16(int16x8_t a) {
return vceqzq_s16(a);
}
// CHECK-LABEL: test_vceqzq_s32
-// CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+// CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
uint32x4_t test_vceqzq_s32(int32x4_t a) {
return vceqzq_s32(a);
}
// CHECK-LABEL: test_vceqzq_s64
-// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
uint64x2_t test_vceqzq_s64(int64x2_t a) {
return vceqzq_s64(a);
}
// CHECK-LABEL: test_vceqz_u8
-// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
uint8x8_t test_vceqz_u8(uint8x8_t a) {
return vceqz_u8(a);
}
// CHECK-LABEL: test_vceqz_u16
-// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
uint16x4_t test_vceqz_u16(uint16x4_t a) {
return vceqz_u16(a);
}
// CHECK-LABEL: test_vceqz_u32
-// CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+// CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
uint32x2_t test_vceqz_u32(uint32x2_t a) {
return vceqz_u32(a);
}
// CHECK-LABEL: test_vceqzq_u8
-// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
uint8x16_t test_vceqzq_u8(uint8x16_t a) {
return vceqzq_u8(a);
}
// CHECK-LABEL: test_vceqzq_u16
-// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
uint16x8_t test_vceqzq_u16(uint16x8_t a) {
return vceqzq_u16(a);
}
// CHECK-LABEL: test_vceqzq_u32
-// CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+// CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
uint32x4_t test_vceqzq_u32(uint32x4_t a) {
return vceqzq_u32(a);
}
// CHECK-LABEL: test_vceqzq_u64
-// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
uint64x2_t test_vceqzq_u64(uint64x2_t a) {
return vceqzq_u64(a);
}
@@ -127,25 +130,25 @@ uint32x4_t test_vceqzq_f32(float32x4_t a) {
}
// CHECK-LABEL: test_vceqz_p8
-// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
uint8x8_t test_vceqz_p8(poly8x8_t a) {
return vceqz_p8(a);
}
// CHECK-LABEL: test_vceqzq_p8
-// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
uint8x16_t test_vceqzq_p8(poly8x16_t a) {
return vceqzq_p8(a);
}
// CHECK-LABEL: test_vceqz_p16
-// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
uint16x4_t test_vceqz_p16(poly16x4_t a) {
return vceqz_p16(a);
}
// CHECK-LABEL: test_vceqzq_p16
-// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
uint16x8_t test_vceqzq_p16(poly16x8_t a) {
return vceqzq_p16(a);
}
@@ -163,49 +166,49 @@ uint64x2_t test_vceqzq_p64(poly64x2_t a) {
}
// CHECK-LABEL: test_vcgez_s8
-// CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+// CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
uint8x8_t test_vcgez_s8(int8x8_t a) {
return vcgez_s8(a);
}
// CHECK-LABEL: test_vcgez_s16
-// CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+// CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
uint16x4_t test_vcgez_s16(int16x4_t a) {
return vcgez_s16(a);
}
// CHECK-LABEL: test_vcgez_s32
-// CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+// CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
uint32x2_t test_vcgez_s32(int32x2_t a) {
return vcgez_s32(a);
}
// CHECK-LABEL: test_vcgez_s64
-// CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, #0x0
+// CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
uint64x1_t test_vcgez_s64(int64x1_t a) {
return vcgez_s64(a);
}
// CHECK-LABEL: test_vcgezq_s8
-// CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+// CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
uint8x16_t test_vcgezq_s8(int8x16_t a) {
return vcgezq_s8(a);
}
// CHECK-LABEL: test_vcgezq_s16
-// CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+// CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
uint16x8_t test_vcgezq_s16(int16x8_t a) {
return vcgezq_s16(a);
}
// CHECK-LABEL: test_vcgezq_s32
-// CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+// CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
uint32x4_t test_vcgezq_s32(int32x4_t a) {
return vcgezq_s32(a);
}
// CHECK-LABEL: test_vcgezq_s64
-// CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+// CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
uint64x2_t test_vcgezq_s64(int64x2_t a) {
return vcgezq_s64(a);
}
@@ -235,49 +238,49 @@ uint64x2_t test_vcgezq_f64(float64x2_t a) {
}
// CHECK-LABEL: test_vclez_s8
-// CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+// CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
uint8x8_t test_vclez_s8(int8x8_t a) {
return vclez_s8(a);
}
// CHECK-LABEL: test_vclez_s16
-// CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+// CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
uint16x4_t test_vclez_s16(int16x4_t a) {
return vclez_s16(a);
}
// CHECK-LABEL: test_vclez_s32
-// CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+// CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
uint32x2_t test_vclez_s32(int32x2_t a) {
return vclez_s32(a);
}
// CHECK-LABEL: test_vclez_s64
-// CHECK: cmle {{d[0-9]+}}, {{d[0-9]+}}, #0x0
+// CHECK: cmle {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
uint64x1_t test_vclez_s64(int64x1_t a) {
return vclez_s64(a);
}
// CHECK-LABEL: test_vclezq_s8
-// CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+// CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
uint8x16_t test_vclezq_s8(int8x16_t a) {
return vclezq_s8(a);
}
// CHECK-LABEL: test_vclezq_s16
-// CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+// CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
uint16x8_t test_vclezq_s16(int16x8_t a) {
return vclezq_s16(a);
}
// CHECK-LABEL: test_vclezq_s32
-// CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+// CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
uint32x4_t test_vclezq_s32(int32x4_t a) {
return vclezq_s32(a);
}
// CHECK-LABEL: test_vclezq_s64
-// CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+// CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
uint64x2_t test_vclezq_s64(int64x2_t a) {
return vclezq_s64(a);
}
@@ -307,49 +310,49 @@ uint64x2_t test_vclezq_f64(float64x2_t a) {
}
// CHECK-LABEL: test_vcgtz_s8
-// CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+// CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
uint8x8_t test_vcgtz_s8(int8x8_t a) {
return vcgtz_s8(a);
}
// CHECK-LABEL: test_vcgtz_s16
-// CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+// CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
uint16x4_t test_vcgtz_s16(int16x4_t a) {
return vcgtz_s16(a);
}
// CHECK-LABEL: test_vcgtz_s32
-// CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+// CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
uint32x2_t test_vcgtz_s32(int32x2_t a) {
return vcgtz_s32(a);
}
// CHECK-LABEL: test_vcgtz_s64
-// CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, #0x0
+// CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
uint64x1_t test_vcgtz_s64(int64x1_t a) {
return vcgtz_s64(a);
}
// CHECK-LABEL: test_vcgtzq_s8
-// CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+// CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
uint8x16_t test_vcgtzq_s8(int8x16_t a) {
return vcgtzq_s8(a);
}
// CHECK-LABEL: test_vcgtzq_s16
-// CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+// CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
uint16x8_t test_vcgtzq_s16(int16x8_t a) {
return vcgtzq_s16(a);
}
// CHECK-LABEL: test_vcgtzq_s32
-// CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+// CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
uint32x4_t test_vcgtzq_s32(int32x4_t a) {
return vcgtzq_s32(a);
}
// CHECK-LABEL: test_vcgtzq_s64
-// CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+// CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
uint64x2_t test_vcgtzq_s64(int64x2_t a) {
return vcgtzq_s64(a);
}
diff --git a/clang/test/CodeGen/aarch64-neon-perm.c b/clang/test/CodeGen/aarch64-neon-perm.c
index 23a7cabdce4..c9a3d600dc5 100644
--- a/clang/test/CodeGen/aarch64-neon-perm.c
+++ b/clang/test/CodeGen/aarch64-neon-perm.c
@@ -1,9 +1,11 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
-
#include <arm_neon.h>
int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) {
@@ -33,7 +35,7 @@ int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) {
int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vuzp1_s32
return vuzp1_s32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
}
int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) {
@@ -45,7 +47,7 @@ int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) {
int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vuzp1q_s64
return vuzp1q_s64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
}
uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) {
@@ -75,7 +77,7 @@ uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) {
uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vuzp1_u32
return vuzp1_u32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
}
uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) {
@@ -87,13 +89,13 @@ uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) {
uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vuzp1q_u64
return vuzp1q_u64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
}
float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vuzp1_f32
return vuzp1_f32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
}
float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) {
@@ -105,7 +107,7 @@ float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) {
float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) {
// CHECK-LABEL: test_vuzp1q_f64
return vuzp1q_f64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
}
poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) {
@@ -159,7 +161,7 @@ int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) {
int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vuzp2_s32
return vuzp2_s32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
}
int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) {
@@ -171,7 +173,7 @@ int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) {
int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vuzp2q_s64
return vuzp2q_s64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
}
uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) {
@@ -201,7 +203,7 @@ uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) {
uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vuzp2_u32
return vuzp2_u32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
}
uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) {
@@ -213,13 +215,13 @@ uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) {
uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vuzp2q_u64
return vuzp2q_u64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
}
float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vuzp2_f32
return vuzp2_f32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
}
float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) {
@@ -231,7 +233,7 @@ float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) {
float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) {
// CHECK-LABEL: test_vuzp2q_f64
return vuzp2q_f64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
}
poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) {
@@ -285,7 +287,7 @@ int16x8_t test_vzip1q_s16(int16x8_t a, int16x8_t b) {
int32x2_t test_vzip1_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vzip1_s32
return vzip1_s32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
}
int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) {
@@ -297,7 +299,7 @@ int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) {
int64x2_t test_vzip1q_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vzip1q_s64
return vzip1q_s64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
}
uint8x8_t test_vzip1_u8(uint8x8_t a, uint8x8_t b) {
@@ -327,7 +329,7 @@ uint16x8_t test_vzip1q_u16(uint16x8_t a, uint16x8_t b) {
uint32x2_t test_vzip1_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vzip1_u32
return vzip1_u32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
}
uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) {
@@ -339,13 +341,13 @@ uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) {
uint64x2_t test_vzip1q_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vzip1q_u64
return vzip1q_u64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
}
float32x2_t test_vzip1_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vzip1_f32
return vzip1_f32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
}
float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) {
@@ -357,7 +359,7 @@ float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) {
float64x2_t test_vzip1q_f64(float64x2_t a, float64x2_t b) {
// CHECK-LABEL: test_vzip1q_f64
return vzip1q_f64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
}
poly8x8_t test_vzip1_p8(poly8x8_t a, poly8x8_t b) {
@@ -411,7 +413,7 @@ int16x8_t test_vzip2q_s16(int16x8_t a, int16x8_t b) {
int32x2_t test_vzip2_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vzip2_s32
return vzip2_s32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
}
int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) {
@@ -423,7 +425,7 @@ int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) {
int64x2_t test_vzip2q_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vzip2q_s64
return vzip2q_s64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
}
uint8x8_t test_vzip2_u8(uint8x8_t a, uint8x8_t b) {
@@ -453,7 +455,7 @@ uint16x8_t test_vzip2q_u16(uint16x8_t a, uint16x8_t b) {
uint32x2_t test_vzip2_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vzip2_u32
return vzip2_u32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
}
uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) {
@@ -465,13 +467,13 @@ uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) {
uint64x2_t test_vzip2q_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vzip2q_u64
return vzip2q_u64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
}
float32x2_t test_vzip2_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vzip2_f32
return vzip2_f32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
}
float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) {
@@ -483,7 +485,7 @@ float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) {
float64x2_t test_vzip2q_f64(float64x2_t a, float64x2_t b) {
// CHECK-LABEL: test_vzip2q_f64
return vzip2q_f64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
}
poly8x8_t test_vzip2_p8(poly8x8_t a, poly8x8_t b) {
@@ -537,7 +539,7 @@ int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) {
int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vtrn1_s32
return vtrn1_s32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
}
int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) {
@@ -549,7 +551,7 @@ int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) {
int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vtrn1q_s64
return vtrn1q_s64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
}
uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) {
@@ -579,7 +581,7 @@ uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) {
uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vtrn1_u32
return vtrn1_u32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
}
uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) {
@@ -591,13 +593,13 @@ uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) {
uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vtrn1q_u64
return vtrn1q_u64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
}
float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vtrn1_f32
return vtrn1_f32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
}
float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) {
@@ -609,7 +611,7 @@ float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) {
float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) {
// CHECK-LABEL: test_vtrn1q_f64
return vtrn1q_f64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
}
poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) {
@@ -663,7 +665,7 @@ int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) {
int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vtrn2_s32
return vtrn2_s32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
}
int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) {
@@ -675,7 +677,7 @@ int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) {
int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vtrn2q_s64
return vtrn2q_s64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
}
uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) {
@@ -705,7 +707,7 @@ uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) {
uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vtrn2_u32
return vtrn2_u32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
}
uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) {
@@ -717,13 +719,13 @@ uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) {
uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vtrn2q_u64
return vtrn2q_u64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
}
float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vtrn2_f32
return vtrn2_f32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
}
float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) {
@@ -735,7 +737,7 @@ float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) {
float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) {
// CHECK-LABEL: test_vtrn2q_f64
return vtrn2q_f64(a, b);
- // CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
}
poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) {
@@ -778,8 +780,8 @@ int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vuzp_s32
return vuzp_s32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vuzp_u8
@@ -796,14 +798,14 @@ uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vuzp_u32
return vuzp_u32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vuzp_f32
return vuzp_f32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: test_vuzp_p8
@@ -888,8 +890,8 @@ int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vzip_s32
return vzip_s32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vzip_u8
@@ -906,14 +908,14 @@ uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vzip_u32
return vzip_u32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vzip_f32
return vzip_f32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: test_vzip_p8
@@ -998,8 +1000,8 @@ int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vtrn_s32
return vtrn_s32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vtrn_u8
@@ -1016,14 +1018,14 @@ uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vtrn_u32
return vtrn_u32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vtrn_f32
return vtrn_f32(a, b);
- // CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
- // CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
+ // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: test_vtrn_p8
diff --git a/clang/test/CodeGen/aarch64-neon-scalar-copy.c b/clang/test/CodeGen/aarch64-neon-scalar-copy.c
index cd8777f5a85..227c6e04e14 100644
--- a/clang/test/CodeGen/aarch64-neon-scalar-copy.c
+++ b/clang/test/CodeGen/aarch64-neon-scalar-copy.c
@@ -1,6 +1,9 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
#include <arm_neon.h>
@@ -40,14 +43,14 @@ float64_t test_vdupd_laneq_f64(float64x2_t a) {
// CHECK-LABEL: test_vdupb_lane_s8
int8_t test_vdupb_lane_s8(int8x8_t a) {
return vdupb_lane_s8(a, 7);
-// CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[7]
+// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
}
// CHECK-LABEL: test_vduph_lane_s16
int16_t test_vduph_lane_s16(int16x4_t a) {
return vduph_lane_s16(a, 3);
-// CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[3]
+// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
}
@@ -95,14 +98,14 @@ uint64_t test_vdupd_lane_u64(uint64x1_t a) {
// CHECK-LABEL: test_vdupb_laneq_s8
int8_t test_vdupb_laneq_s8(int8x16_t a) {
return vdupb_laneq_s8(a, 15);
-// CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[15]
+// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
}
// CHECK-LABEL: test_vduph_laneq_s16
int16_t test_vduph_laneq_s16(int16x8_t a) {
return vduph_laneq_s16(a, 7);
-// CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[7]
+// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
}
diff --git a/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
index de81266d976..0293e983c27 100644
--- a/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
@@ -1,6 +1,9 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
@@ -16,7 +19,7 @@ float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
// CHECK-LABEL: test_vmuld_lane_f64
return vmuld_lane_f64(a, b, 0);
- // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
@@ -34,7 +37,7 @@ float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
// CHECK-LABEL: test_vmul_n_f64
return vmul_n_f64(a, b);
- // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
@@ -52,7 +55,7 @@ float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
// CHECK-LABEL: test_vmulxd_lane_f64
return vmulxd_lane_f64(a, b, 0);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
@@ -64,7 +67,7 @@ float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
// CHECK-LABEL: test_vmulx_lane_f64
float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
return vmulx_lane_f64(a, b, 0);
- // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
@@ -90,7 +93,7 @@ float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
// CHECK-LABEL: test_vfmad_lane_f64
float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
return vfmad_lane_f64(a, b, c, 0);
- // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
// CHECK-LABEL: test_vfmad_laneq_f64
@@ -108,13 +111,13 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
// CHECK-LABEL: test_vfma_lane_f64
float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfma_lane_f64(a, b, v, 0);
- // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
// CHECK-LABEL: test_vfms_lane_f64
float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfms_lane_f64(a, b, v, 0);
- // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ // CHECK: {{fmls|fmsub}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
// CHECK-LABEL: test_vfma_laneq_f64
@@ -132,7 +135,7 @@ float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
// CHECK-LABEL: test_vqdmullh_lane_s16
int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
return vqdmullh_lane_s16(a, b, 3);
- // CHECK: sqdmull {{s[0-9]+}}, {{h[0-9]+}}, {{v[0-9]+}}.h[3]
+ // CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vqdmulls_lane_s32
@@ -144,7 +147,7 @@ int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
// CHECK-LABEL: test_vqdmullh_laneq_s16
int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
return vqdmullh_laneq_s16(a, b, 7);
- // CHECK: sqdmull {{s[0-9]+}}, {{h[0-9]+}}, {{v[0-9]+}}.h[7]
+ // CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
}
// CHECK-LABEL: test_vqdmulls_laneq_s32
@@ -156,7 +159,7 @@ int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
// CHECK-LABEL: test_vqdmulhh_lane_s16
int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
return vqdmulhh_lane_s16(a, b, 3);
-// CHECK: sqdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{v[0-9]+}}.h[3]
+// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vqdmulhs_lane_s32
@@ -169,7 +172,7 @@ int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
// CHECK-LABEL: test_vqdmulhh_laneq_s16
int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
return vqdmulhh_laneq_s16(a, b, 7);
-// CHECK: sqdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{v[0-9]+}}.h[7]
+// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
}
@@ -182,7 +185,7 @@ int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
// CHECK-LABEL: test_vqrdmulhh_lane_s16
int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
return vqrdmulhh_lane_s16(a, b, 3);
-// CHECK: sqrdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{v[0-9]+}}.h[3]
+// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vqrdmulhs_lane_s32
@@ -195,7 +198,7 @@ int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
// CHECK-LABEL: test_vqrdmulhh_laneq_s16
int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
return vqrdmulhh_laneq_s16(a, b, 7);
-// CHECK: sqrdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{v[0-9]+}}.h[7]
+// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
}
@@ -208,7 +211,7 @@ int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
// CHECK-LABEL: test_vqdmlalh_lane_s16
int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
return vqdmlalh_lane_s16(a, b, c, 3);
-// CHECK: sqdmlal {{s[0-9]+}}, {{h[0-9]+}}, {{v[0-9]+}}.h[3]
+// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vqdmlals_lane_s32
@@ -220,7 +223,7 @@ int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
// CHECK-LABEL: test_vqdmlalh_laneq_s16
int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
return vqdmlalh_laneq_s16(a, b, c, 7);
-// CHECK: sqdmlal {{s[0-9]+}}, {{h[0-9]+}}, {{v[0-9]+}}.h[7]
+// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
}
// CHECK-LABEL: test_vqdmlals_laneq_s32
@@ -232,7 +235,7 @@ int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
// CHECK-LABEL: test_vqdmlslh_lane_s16
int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
return vqdmlslh_lane_s16(a, b, c, 3);
-// CHECK: sqdmlsl {{s[0-9]+}}, {{h[0-9]+}}, {{v[0-9]+}}.h[3]
+// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vqdmlsls_lane_s32
@@ -244,7 +247,7 @@ int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
// CHECK-LABEL: test_vqdmlslh_laneq_s16
int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
return vqdmlslh_laneq_s16(a, b, c, 7);
-// CHECK: sqdmlsl {{s[0-9]+}}, {{h[0-9]+}}, {{v[0-9]+}}.h[7]
+// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
}
// CHECK-LABEL: test_vqdmlsls_laneq_s32
@@ -262,11 +265,11 @@ float64x1_t test_vmulx_lane_f64_0() {
arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
result = vmulx_lane_f64(arg1, arg2, 0);
-// CHECK: adrp x0
-// CHECK: ldr d0, [x0,
-// CHECK: adrp x0
-// CHECK: ldr d1, [x0,
-// CHECK: fmulx d0, d1, d0
+// CHECK: adrp x[[ADDRLO:[0-9]+]]
+// CHECK: ldr d0, [x[[ADDRLO]],
+// CHECK: adrp x[[ADDRLO:[0-9]+]]
+// CHECK: ldr d1, [x[[ADDRLO]],
+// CHECK: fmulx d0, d1, d0
return result;
}
@@ -281,10 +284,10 @@ float64x1_t test_vmulx_laneq_f64_2() {
arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
arg3 = vcombine_f64(arg1, arg2);
result = vmulx_laneq_f64(arg1, arg3, 1);
-// CHECK: adrp x0
-// CHECK: ldr d0, [x0,
-// CHECK: adrp x0
-// CHECK: ldr d1, [x0,
-// CHECK: fmulx d0, d1, d0
+// CHECK: adrp x[[ADDRLO:[0-9]+]]
+// CHECK: ldr d0, [x[[ADDRLO]],
+// CHECK: adrp x[[ADDRLO:[0-9]+]]
+// CHECK: ldr d1, [x[[ADDRLO]],
+// CHECK: fmulx d0, d1, d0
return result;
}
diff --git a/clang/test/CodeGen/aarch64-neon-shifts.c b/clang/test/CodeGen/aarch64-neon-shifts.c
index 4777f18aae7..99adff1f04e 100644
--- a/clang/test/CodeGen/aarch64-neon-shifts.c
+++ b/clang/test/CodeGen/aarch64-neon-shifts.c
@@ -1,6 +1,9 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -emit-llvm -O1 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -ffp-contract=fast -S -emit-llvm -O1 -o - %s | FileCheck %s
#include <arm_neon.h>
diff --git a/clang/test/CodeGen/aarch64-neon-tbl.c b/clang/test/CodeGen/aarch64-neon-tbl.c
index 365ce298bb5..93cba1d7e18 100644
--- a/clang/test/CodeGen/aarch64-neon-tbl.c
+++ b/clang/test/CodeGen/aarch64-neon-tbl.c
@@ -1,6 +1,9 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
@@ -9,81 +12,81 @@
int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: test_vtbl1_s8
return vtbl1_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vqtbl1_s8(int8x16_t a, int8x8_t b) {
// CHECK-LABEL: test_vqtbl1_s8
return vqtbl1_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
// CHECK-LABEL: test_vtbl2_s8
return vtbl2_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vqtbl2_s8(int8x16x2_t a, int8x8_t b) {
// CHECK-LABEL: test_vqtbl2_s8
return vqtbl2_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
// CHECK-LABEL: test_vtbl3_s8
return vtbl3_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vqtbl3_s8(int8x16x3_t a, int8x8_t b) {
// CHECK-LABEL: test_vqtbl3_s8
return vqtbl3_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
// CHECK-LABEL: test_vtbl4_s8
return vtbl4_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vqtbl4_s8(int8x16x4_t a, int8x8_t b) {
// CHECK-LABEL: test_vqtbl4_s8
return vqtbl4_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x16_t test_vqtbl1q_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: test_vqtbl1q_s8
return vqtbl1q_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
int8x16_t test_vqtbl2q_s8(int8x16x2_t a, int8x16_t b) {
// CHECK-LABEL: test_vqtbl2q_s8
return vqtbl2q_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
int8x16_t test_vqtbl3q_s8(int8x16x3_t a, int8x16_t b) {
// CHECK-LABEL: test_vqtbl3q_s8
return vqtbl3q_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
int8x16_t test_vqtbl4q_s8(int8x16x4_t a, int8x16_t b) {
// CHECK-LABEL: test_vqtbl4q_s8
return vqtbl4q_s8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: test_vtbx1_s8
return vtbx1_s8(a, b, c);
- // CHECK: movi {{v[0-9]+}}.8b, #0
+ // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
// CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
// CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
@@ -91,15 +94,15 @@ int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
// CHECK-LABEL: test_vtbx2_s8
return vtbx2_s8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
// CHECK-LABEL: test_vtbx3_s8
return vtbx3_s8(a, b, c);
- // CHECK: movi {{v[0-9]+}}.8b, #0
+ // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
// CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
// CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
@@ -107,135 +110,135 @@ int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
// CHECK-LABEL: test_vtbx4_s8
return vtbx4_s8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vqtbx1_s8(int8x8_t a, int8x16_t b, int8x8_t c) {
// CHECK-LABEL: test_vqtbx1_s8
return vqtbx1_s8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vqtbx2_s8(int8x8_t a, int8x16x2_t b, int8x8_t c) {
// CHECK-LABEL: test_vqtbx2_s8
return vqtbx2_s8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vqtbx3_s8(int8x8_t a, int8x16x3_t b, int8x8_t c) {
// CHECK-LABEL: test_vqtbx3_s8
return vqtbx3_s8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x8_t test_vqtbx4_s8(int8x8_t a, int8x16x4_t b, int8x8_t c) {
// CHECK-LABEL: test_vqtbx4_s8
return vqtbx4_s8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
int8x16_t test_vqtbx1q_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
// CHECK-LABEL: test_vqtbx1q_s8
return vqtbx1q_s8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
int8x16_t test_vqtbx2q_s8(int8x16_t a, int8x16x2_t b, int8x16_t c) {
// CHECK-LABEL: test_vqtbx2q_s8
return vqtbx2q_s8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
int8x16_t test_vqtbx3q_s8(int8x16_t a, int8x16x3_t b, int8x16_t c) {
// CHECK-LABEL: test_vqtbx3q_s8
return vqtbx3q_s8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
int8x16_t test_vqtbx4q_s8(int8x16_t a, int8x16x4_t b, int8x16_t c) {
// CHECK-LABEL: test_vqtbx4q_s8
return vqtbx4q_s8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vtbl1_u8
return vtbl1_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vqtbl1_u8(uint8x16_t a, uint8x8_t b) {
// CHECK-LABEL: test_vqtbl1_u8
return vqtbl1_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
// CHECK-LABEL: test_vtbl2_u8
return vtbl2_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vqtbl2_u8(uint8x16x2_t a, uint8x8_t b) {
// CHECK-LABEL: test_vqtbl2_u8
return vqtbl2_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
// CHECK-LABEL: test_vtbl3_u8
return vtbl3_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vqtbl3_u8(uint8x16x3_t a, uint8x8_t b) {
// CHECK-LABEL: test_vqtbl3_u8
return vqtbl3_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
// CHECK-LABEL: test_vtbl4_u8
return vtbl4_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vqtbl4_u8(uint8x16x4_t a, uint8x8_t b) {
// CHECK-LABEL: test_vqtbl4_u8
return vqtbl4_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x16_t test_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: test_vqtbl1q_u8
return vqtbl1q_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
uint8x16_t test_vqtbl2q_u8(uint8x16x2_t a, uint8x16_t b) {
// CHECK-LABEL: test_vqtbl2q_u8
return vqtbl2q_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
uint8x16_t test_vqtbl3q_u8(uint8x16x3_t a, uint8x16_t b) {
// CHECK-LABEL: test_vqtbl3q_u8
return vqtbl3q_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
uint8x16_t test_vqtbl4q_u8(uint8x16x4_t a, uint8x16_t b) {
// CHECK-LABEL: test_vqtbl4q_u8
return vqtbl4q_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: test_vtbx1_u8
return vtbx1_u8(a, b, c);
- // CHECK: movi {{v[0-9]+}}.8b, #0
+ // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
// CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
// CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
@@ -243,15 +246,15 @@ uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
// CHECK-LABEL: test_vtbx2_u8
return vtbx2_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
// CHECK-LABEL: test_vtbx3_u8
return vtbx3_u8(a, b, c);
- // CHECK: movi {{v[0-9]+}}.8b, #0
+ // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
// CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
// CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
@@ -259,135 +262,135 @@ uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
// CHECK-LABEL: test_vtbx4_u8
return vtbx4_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vqtbx1_u8(uint8x8_t a, uint8x16_t b, uint8x8_t c) {
// CHECK-LABEL: test_vqtbx1_u8
return vqtbx1_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vqtbx2_u8(uint8x8_t a, uint8x16x2_t b, uint8x8_t c) {
// CHECK-LABEL: test_vqtbx2_u8
return vqtbx2_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vqtbx3_u8(uint8x8_t a, uint8x16x3_t b, uint8x8_t c) {
// CHECK-LABEL: test_vqtbx3_u8
return vqtbx3_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x8_t test_vqtbx4_u8(uint8x8_t a, uint8x16x4_t b, uint8x8_t c) {
// CHECK-LABEL: test_vqtbx4_u8
return vqtbx4_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
uint8x16_t test_vqtbx1q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-LABEL: test_vqtbx1q_u8
return vqtbx1q_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
uint8x16_t test_vqtbx2q_u8(uint8x16_t a, uint8x16x2_t b, uint8x16_t c) {
// CHECK-LABEL: test_vqtbx2q_u8
return vqtbx2q_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
uint8x16_t test_vqtbx3q_u8(uint8x16_t a, uint8x16x3_t b, uint8x16_t c) {
// CHECK-LABEL: test_vqtbx3q_u8
return vqtbx3q_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
uint8x16_t test_vqtbx4q_u8(uint8x16_t a, uint8x16x4_t b, uint8x16_t c) {
// CHECK-LABEL: test_vqtbx4q_u8
return vqtbx4q_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vtbl1_p8
return vtbl1_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vqtbl1_p8(poly8x16_t a, uint8x8_t b) {
// CHECK-LABEL: test_vqtbl1_p8
return vqtbl1_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
// CHECK-LABEL: test_vtbl2_p8
return vtbl2_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vqtbl2_p8(poly8x16x2_t a, uint8x8_t b) {
// CHECK-LABEL: test_vqtbl2_p8
return vqtbl2_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
// CHECK-LABEL: test_vtbl3_p8
return vtbl3_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vqtbl3_p8(poly8x16x3_t a, uint8x8_t b) {
// CHECK-LABEL: test_vqtbl3_p8
return vqtbl3_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
// CHECK-LABEL: test_vtbl4_p8
return vtbl4_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vqtbl4_p8(poly8x16x4_t a, uint8x8_t b) {
// CHECK-LABEL: test_vqtbl4_p8
return vqtbl4_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x16_t test_vqtbl1q_p8(poly8x16_t a, uint8x16_t b) {
// CHECK-LABEL: test_vqtbl1q_p8
return vqtbl1q_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
poly8x16_t test_vqtbl2q_p8(poly8x16x2_t a, uint8x16_t b) {
// CHECK-LABEL: test_vqtbl2q_p8
return vqtbl2q_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
poly8x16_t test_vqtbl3q_p8(poly8x16x3_t a, uint8x16_t b) {
// CHECK-LABEL: test_vqtbl3q_p8
return vqtbl3q_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
poly8x16_t test_vqtbl4q_p8(poly8x16x4_t a, uint8x16_t b) {
// CHECK-LABEL: test_vqtbl4q_p8
return vqtbl4q_p8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
// CHECK-LABEL: test_vtbx1_p8
return vtbx1_p8(a, b, c);
- // CHECK: movi {{v[0-9]+}}.8b, #0
+ // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
// CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
// CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
@@ -395,15 +398,15 @@ poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
// CHECK-LABEL: test_vtbx2_p8
return vtbx2_p8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
// CHECK-LABEL: test_vtbx3_p8
return vtbx3_p8(a, b, c);
- // CHECK: movi {{v[0-9]+}}.8b, #0
+ // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
// CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
- // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
// CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
@@ -411,53 +414,53 @@ poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
// CHECK-LABEL: test_vtbx4_p8
return vtbx4_p8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vqtbx1_p8(poly8x8_t a, uint8x16_t b, uint8x8_t c) {
// CHECK-LABEL: test_vqtbx1_p8
return vqtbx1_p8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vqtbx2_p8(poly8x8_t a, poly8x16x2_t b, uint8x8_t c) {
// CHECK-LABEL: test_vqtbx2_p8
return vqtbx2_p8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vqtbx3_p8(poly8x8_t a, poly8x16x3_t b, uint8x8_t c) {
// CHECK-LABEL: test_vqtbx3_p8
return vqtbx3_p8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x8_t test_vqtbx4_p8(poly8x8_t a, poly8x16x4_t b, uint8x8_t c) {
// CHECK-LABEL: test_vqtbx4_p8
return vqtbx4_p8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+ // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
poly8x16_t test_vqtbx1q_p8(poly8x16_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-LABEL: test_vqtbx1q_p8
return vqtbx1q_p8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
poly8x16_t test_vqtbx2q_p8(poly8x16_t a, poly8x16x2_t b, uint8x16_t c) {
// CHECK-LABEL: test_vqtbx2q_p8
return vqtbx2q_p8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
poly8x16_t test_vqtbx3q_p8(poly8x16_t a, poly8x16x3_t b, uint8x16_t c) {
// CHECK-LABEL: test_vqtbx3q_p8
return vqtbx3q_p8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
poly8x16_t test_vqtbx4q_p8(poly8x16_t a, poly8x16x4_t b, uint8x16_t c) {
// CHECK-LABEL: test_vqtbx4q_p8
return vqtbx4q_p8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+ // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
diff --git a/clang/test/CodeGen/aarch64-neon-vcombine.c b/clang/test/CodeGen/aarch64-neon-vcombine.c
index 3e170c8c4c9..381b0528492 100644
--- a/clang/test/CodeGen/aarch64-neon-vcombine.c
+++ b/clang/test/CodeGen/aarch64-neon-vcombine.c
@@ -1,6 +1,8 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
// RUN: -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -S -O3 -o - %s | FileCheck %s
// Test new aarch64 intrinsics and types
diff --git a/clang/test/CodeGen/aarch64-neon-vget-hilo.c b/clang/test/CodeGen/aarch64-neon-vget-hilo.c
index 012b0bbb964..4d80d414d38 100644
--- a/clang/test/CodeGen/aarch64-neon-vget-hilo.c
+++ b/clang/test/CodeGen/aarch64-neon-vget-hilo.c
@@ -1,176 +1,193 @@
// REQUIRES: aarch64-registered-target
+// REQUIRES: arm64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
-// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix CHECK-COMMON --check-prefix CHECK-AARCH64
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix CHECK-COMMON --check-prefix CHECK-ARM64
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
int8x8_t test_vget_high_s8(int8x16_t a) {
- // CHECK-LABEL: test_vget_high_s8:
+ // CHECK-COMMON-LABEL: test_vget_high_s8:
return vget_high_s8(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
int16x4_t test_vget_high_s16(int16x8_t a) {
- // CHECK-LABEL: test_vget_high_s16:
+ // CHECK-COMMON-LABEL: test_vget_high_s16:
return vget_high_s16(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
int32x2_t test_vget_high_s32(int32x4_t a) {
- // CHECK-LABEL: test_vget_high_s32:
+ // CHECK-COMMON-LABEL: test_vget_high_s32:
return vget_high_s32(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
int64x1_t test_vget_high_s64(int64x2_t a) {
- // CHECK-LABEL: test_vget_high_s64:
+ // CHECK-COMMON-LABEL: test_vget_high_s64:
return vget_high_s64(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
uint8x8_t test_vget_high_u8(uint8x16_t a) {
- // CHECK-LABEL: test_vget_high_u8:
+ // CHECK-COMMON-LABEL: test_vget_high_u8:
return vget_high_u8(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
uint16x4_t test_vget_high_u16(uint16x8_t a) {
- // CHECK-LABEL: test_vget_high_u16:
+ // CHECK-COMMON-LABEL: test_vget_high_u16:
return vget_high_u16(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
uint32x2_t test_vget_high_u32(uint32x4_t a) {
- // CHECK-LABEL: test_vget_high_u32:
+ // CHECK-COMMON-LABEL: test_vget_high_u32:
return vget_high_u32(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
uint64x1_t test_vget_high_u64(uint64x2_t a) {
- // CHECK-LABEL: test_vget_high_u64:
+ // CHECK-COMMON-LABEL: test_vget_high_u64:
return vget_high_u64(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
poly64x1_t test_vget_high_p64(poly64x2_t a) {
- // CHECK-LABEL: test_vget_high_p64:
+ // CHECK-COMMON-LABEL: test_vget_high_p64:
return vget_high_p64(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
float16x4_t test_vget_high_f16(float16x8_t a) {
- // CHECK-LABEL: test_vget_high_f16:
+ // CHECK-COMMON-LABEL: test_vget_high_f16:
return vget_high_f16(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
float32x2_t test_vget_high_f32(float32x4_t a) {
- // CHECK-LABEL: test_vget_high_f32:
+ // CHECK-COMMON-LABEL: test_vget_high_f32:
return vget_high_f32(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
poly8x8_t test_vget_high_p8(poly8x16_t a) {
- // CHECK-LABEL: test_vget_high_p8:
+ // CHECK-COMMON-LABEL: test_vget_high_p8:
return vget_high_p8(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
poly16x4_t test_vget_high_p16(poly16x8_t a) {
- // CHECK-LABEL: test_vget_high_p16
+ // CHECK-COMMON-LABEL: test_vget_high_p16
return vget_high_p16(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
float64x1_t test_vget_high_f64(float64x2_t a) {
- // CHECK-LABEL: test_vget_high_f64
+ // CHECK-COMMON-LABEL: test_vget_high_f64
return vget_high_f64(a);
- // CHECK: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
+ // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
int8x8_t test_vget_low_s8(int8x16_t a) {
- // CHECK-LABEL: test_vget_low_s8:
+ // CHECK-COMMON-LABEL: test_vget_low_s8:
return vget_low_s8(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
int16x4_t test_vget_low_s16(int16x8_t a) {
- // CHECK-LABEL: test_vget_low_s16:
+ // CHECK-COMMON-LABEL: test_vget_low_s16:
return vget_low_s16(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
int32x2_t test_vget_low_s32(int32x4_t a) {
- // CHECK-LABEL: test_vget_low_s32:
+ // CHECK-COMMON-LABEL: test_vget_low_s32:
return vget_low_s32(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
int64x1_t test_vget_low_s64(int64x2_t a) {
- // CHECK-LABEL: test_vget_low_s64:
+ // CHECK-COMMON-LABEL: test_vget_low_s64:
return vget_low_s64(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
uint8x8_t test_vget_low_u8(uint8x16_t a) {
- // CHECK-LABEL: test_vget_low_u8:
+ // CHECK-COMMON-LABEL: test_vget_low_u8:
return vget_low_u8(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
uint16x4_t test_vget_low_u16(uint16x8_t a) {
- // CHECK-LABEL: test_vget_low_u16:
+ // CHECK-COMMON-LABEL: test_vget_low_u16:
return vget_low_u16(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
uint32x2_t test_vget_low_u32(uint32x4_t a) {
- // CHECK-LABEL: test_vget_low_u32:
+ // CHECK-COMMON-LABEL: test_vget_low_u32:
return vget_low_u32(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
uint64x1_t test_vget_low_u64(uint64x2_t a) {
- // CHECK-LABEL: test_vget_low_u64:
+ // CHECK-COMMON-LABEL: test_vget_low_u64:
return vget_low_u64(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
poly64x1_t test_vget_low_p64(poly64x2_t a) {
- // CHECK-LABEL: test_vget_low_p64:
+ // CHECK-COMMON-LABEL: test_vget_low_p64:
return vget_low_p64(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
float16x4_t test_vget_low_f16(float16x8_t a) {
- // CHECK-LABEL: test_vget_low_f16:
+ // CHECK-COMMON-LABEL: test_vget_low_f16:
return vget_low_f16(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
float32x2_t test_vget_low_f32(float32x4_t a) {
- // CHECK-LABEL: test_vget_low_f32:
+ // CHECK-COMMON-LABEL: test_vget_low_f32:
return vget_low_f32(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
poly8x8_t test_vget_low_p8(poly8x16_t a) {
- // CHECK-LABEL: test_vget_low_p8:
+ // CHECK-COMMON-LABEL: test_vget_low_p8:
return vget_low_p8(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
poly16x4_t test_vget_low_p16(poly16x8_t a) {
- // CHECK-LABEL: test_vget_low_p16:
+ // CHECK-COMMON-LABEL: test_vget_low_p16:
return vget_low_p16(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
float64x1_t test_vget_low_f64(float64x2_t a) {
- // CHECK-LABEL: test_vget_low_f64:
+ // CHECK-COMMON-LABEL: test_vget_low_f64:
return vget_low_f64(a);
- // CHECK-NEXT: ret
+ // CHECK-COMMON-NEXT: ret
}
diff --git a/clang/test/CodeGen/aarch64-varargs.c b/clang/test/CodeGen/aarch64-varargs.c
index 21a244d600f..053982bb25b 100644
--- a/clang/test/CodeGen/aarch64-varargs.c
+++ b/clang/test/CodeGen/aarch64-varargs.c
@@ -1,5 +1,6 @@
// RUN: %clang_cc1 -triple aarch64 -emit-llvm -o - %s | FileCheck -check-prefix=CHECK --check-prefix=CHECK-LE %s
// RUN: %clang_cc1 -triple aarch64_be -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+// RUN: %clang_cc1 -triple arm64-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
#include <stdarg.h>
diff --git a/clang/test/CodeGen/arm-aapcs-vfp.c b/clang/test/CodeGen/arm-aapcs-vfp.c
index 4fa2aac493f..cd9aaa47353 100644
--- a/clang/test/CodeGen/arm-aapcs-vfp.c
+++ b/clang/test/CodeGen/arm-aapcs-vfp.c
@@ -12,7 +12,15 @@
// RUN: -ffreestanding \
// RUN: -emit-llvm -w -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-darwin9 \
+// RUN: -ffreestanding \
+// RUN: -emit-llvm -w -o - %s | FileCheck -check-prefix=CHECK64 %s
+
#include <arm_neon.h>
struct homogeneous_struct {
float f[2];
@@ -20,6 +28,7 @@ struct homogeneous_struct {
float f4;
};
// CHECK: define arm_aapcs_vfpcc %struct.homogeneous_struct @test_struct(float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})
+// CHECK64: define %struct.homogeneous_struct @test_struct(float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})
extern struct homogeneous_struct struct_callee(struct homogeneous_struct);
struct homogeneous_struct test_struct(struct homogeneous_struct arg) {
return struct_callee(arg);
@@ -34,6 +43,7 @@ struct nested_array {
double d[4];
};
// CHECK: define arm_aapcs_vfpcc void @test_array(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}})
+// CHECK64: define void @test_array(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}})
extern void array_callee(struct nested_array);
void test_array(struct nested_array arg) {
array_callee(arg);
@@ -41,6 +51,7 @@ void test_array(struct nested_array arg) {
extern void complex_callee(__complex__ double);
// CHECK: define arm_aapcs_vfpcc void @test_complex(double %{{.*}}, double %{{.*}})
+// CHECK64: define void @test_complex(double %{{.*}}, double %{{.*}})
void test_complex(__complex__ double cd) {
complex_callee(cd);
}
@@ -62,6 +73,9 @@ struct big_struct {
float f4;
};
// CHECK: define arm_aapcs_vfpcc void @test_big([5 x i32] %{{.*}})
+// CHECK64: define void @test_big(%struct.big_struct* %{{.*}})
+// CHECK64: call void @llvm.memcpy
+// CHECK64: call void @big_callee(%struct.big_struct*
extern void big_callee(struct big_struct);
void test_big(struct big_struct arg) {
big_callee(arg);
@@ -75,6 +89,7 @@ struct heterogeneous_struct {
int i2;
};
// CHECK: define arm_aapcs_vfpcc void @test_hetero([2 x i32] %{{.*}})
+// CHECK64: define void @test_hetero(i64 %{{.*}})
extern void hetero_callee(struct heterogeneous_struct);
void test_hetero(struct heterogeneous_struct arg) {
hetero_callee(arg);
@@ -82,6 +97,7 @@ void test_hetero(struct heterogeneous_struct arg) {
// Neon multi-vector types are homogeneous aggregates.
// CHECK: define arm_aapcs_vfpcc <16 x i8> @f0(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+// CHECK64: define <16 x i8> @f0(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
int8x16_t f0(int8x16x4_t v4) {
return vaddq_s8(v4.val[0], v4.val[3]);
}
@@ -95,6 +111,7 @@ struct neon_struct {
int16x4_t v4;
};
// CHECK: define arm_aapcs_vfpcc void @test_neon(<8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <2 x i32> %{{.*}}, <4 x i16> %{{.*}})
+// CHECK64: define void @test_neon(<8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <2 x i32> %{{.*}}, <4 x i16> %{{.*}})
extern void neon_callee(struct neon_struct);
void test_neon(struct neon_struct arg) {
neon_callee(arg);
diff --git a/clang/test/CodeGen/arm-homogenous.c b/clang/test/CodeGen/arm-homogenous.c
index 7737399f930..ad21444aabb 100644
--- a/clang/test/CodeGen/arm-homogenous.c
+++ b/clang/test/CodeGen/arm-homogenous.c
@@ -1,6 +1,11 @@
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -triple armv7---eabi -target-abi aapcs -mfloat-abi hard -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-darwin9 -target-abi darwinpcs \
+// RUN: -ffreestanding -emit-llvm -w -o - %s | FileCheck -check-prefix=CHECK64 %s
+
+// RUN: %clang_cc1 -triple arm64-linux-gnu -ffreestanding -emit-llvm -w -o - %s \
+// RUN: | FileCheck --check-prefix=CHECK64-AAPCS %s
typedef long long int64_t;
typedef unsigned int uint32_t;
@@ -170,6 +175,10 @@ struct_of_four_doubles g_s4d;
void test_struct_of_four_doubles(void) {
// CHECK: test_struct_of_four_doubles
// CHECK: call arm_aapcs_vfpcc void @takes_struct_of_four_doubles(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, [6 x float] undef, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}})
+// CHECK64: test_struct_of_four_doubles
+// CHECK64: call void @takes_struct_of_four_doubles(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, [3 x float] undef, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}})
+// CHECK64-AAPCS: test_struct_of_four_doubles
+// CHECK64-AAPCS: call void @takes_struct_of_four_doubles(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, [3 x float] undef, [4 x double] {{.*}}, double {{.*}})
takes_struct_of_four_doubles(3.0, g_s4d, g_s4d, 4.0);
}
@@ -202,6 +211,10 @@ struct_of_vecs g_vec;
void test_struct_of_vecs(void) {
// CHECK: test_struct_of_vecs
// CHECK: call arm_aapcs_vfpcc void @takes_struct_of_vecs(double {{.*}}, <8 x i8> {{.*}}, <4 x i16> {{.*}}, <8 x i8> {{.*}}, <4 x i16> {{.*}}, [6 x float] undef, <8 x i8> {{.*}}, <4 x i16> {{.*}}, <8 x i8> {{.*}}, <4 x i16> {{.*}}, double {{.*}})
+// CHECK64: test_struct_of_vecs
+// CHECK64: call void @takes_struct_of_vecs(double {{.*}}, <8 x i8> {{.*}}, <4 x i16> {{.*}}, <8 x i8> {{.*}}, <4 x i16> {{.*}}, [3 x float] undef, <8 x i8> {{.*}}, <4 x i16> {{.*}}, <8 x i8> {{.*}}, <4 x i16> {{.*}}, double {{.*}})
+// CHECK64-AAPCS: test_struct_of_vecs
+// CHECK64-AAPCS: call void @takes_struct_of_vecs(double {{.*}}, <8 x i8> {{.*}}, <4 x i16> {{.*}}, <8 x i8> {{.*}}, <4 x i16> {{.*}}, [3 x float] undef, [4 x double] {{.*}})
takes_struct_of_vecs(3.0, g_vec, g_vec, 4.0);
}
diff --git a/clang/test/CodeGen/arm64-abi-vector.c b/clang/test/CodeGen/arm64-abi-vector.c
new file mode 100644
index 00000000000..502fb080d70
--- /dev/null
+++ b/clang/test/CodeGen/arm64-abi-vector.c
@@ -0,0 +1,430 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-abi darwinpcs -emit-llvm -o - %s | FileCheck %s
+
+#include <stdarg.h>
+
+typedef __attribute__(( ext_vector_type(3) )) char __char3;
+typedef __attribute__(( ext_vector_type(4) )) char __char4;
+typedef __attribute__(( ext_vector_type(5) )) char __char5;
+typedef __attribute__(( ext_vector_type(9) )) char __char9;
+typedef __attribute__(( ext_vector_type(19) )) char __char19;
+typedef __attribute__(( ext_vector_type(3) )) short __short3;
+typedef __attribute__(( ext_vector_type(5) )) short __short5;
+typedef __attribute__(( ext_vector_type(3) )) int __int3;
+typedef __attribute__(( ext_vector_type(5) )) int __int5;
+typedef __attribute__(( ext_vector_type(3) )) double __double3;
+
+double varargs_vec_3c(int fixed, ...) {
+// CHECK: varargs_vec_3c
+// CHECK: alloca <3 x i8>, align 4
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: bitcast i8* [[AP_CUR]] to <3 x i8>*
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __char3 c3 = va_arg(ap, __char3);
+ sum = sum + c3.x + c3.y;
+ va_end(ap);
+ return sum;
+}
+
+double test_3c(__char3 *in) {
+// CHECK: test_3c
+// CHECK: call double (i32, ...)* @varargs_vec_3c(i32 3, i32 {{%.*}})
+ return varargs_vec_3c(3, *in);
+}
+
+double varargs_vec_4c(int fixed, ...) {
+// CHECK: varargs_vec_4c
+// CHECK: alloca <4 x i8>, align 4
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: bitcast i8* [[AP_CUR]] to <4 x i8>*
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __char4 c4 = va_arg(ap, __char4);
+ sum = sum + c4.x + c4.y;
+ va_end(ap);
+ return sum;
+}
+
+double test_4c(__char4 *in) {
+// CHECK: test_4c
+// CHECK: call double (i32, ...)* @varargs_vec_4c(i32 4, i32 {{%.*}})
+ return varargs_vec_4c(4, *in);
+}
+
+double varargs_vec_5c(int fixed, ...) {
+// CHECK: varargs_vec_5c
+// CHECK: alloca <5 x i8>, align 8
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: bitcast i8* [[AP_CUR]] to <5 x i8>*
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __char5 c5 = va_arg(ap, __char5);
+ sum = sum + c5.x + c5.y;
+ va_end(ap);
+ return sum;
+}
+
+double test_5c(__char5 *in) {
+// CHECK: test_5c
+// CHECK: call double (i32, ...)* @varargs_vec_5c(i32 5, <2 x i32> {{%.*}})
+ return varargs_vec_5c(5, *in);
+}
+
+double varargs_vec_9c(int fixed, ...) {
+// CHECK: varargs_vec_9c
+// CHECK: alloca <9 x i8>, align 16
+// CHECK: [[ALIGN:%.*]] = and i64 {{%.*}}, -16
+// CHECK: [[AP_ALIGN:%.*]] = inttoptr i64 [[ALIGN]] to i8*
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_ALIGN]], i32 16
+// CHECK: bitcast i8* [[AP_ALIGN]] to <9 x i8>*
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __char9 c9 = va_arg(ap, __char9);
+ sum = sum + c9.x + c9.y;
+ va_end(ap);
+ return sum;
+}
+
+double test_9c(__char9 *in) {
+// CHECK: test_9c
+// CHECK: call double (i32, ...)* @varargs_vec_9c(i32 9, <4 x i32> {{%.*}})
+ return varargs_vec_9c(9, *in);
+}
+
+double varargs_vec_19c(int fixed, ...) {
+// CHECK: varargs_vec_19c
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: [[VAR:%.*]] = bitcast i8* [[AP_CUR]] to i8**
+// CHECK: [[VAR2:%.*]] = load i8** [[VAR]]
+// CHECK: bitcast i8* [[VAR2]] to <19 x i8>*
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __char19 c19 = va_arg(ap, __char19);
+ sum = sum + c19.x + c19.y;
+ va_end(ap);
+ return sum;
+}
+
+double test_19c(__char19 *in) {
+// CHECK: test_19c
+// CHECK: call double (i32, ...)* @varargs_vec_19c(i32 19, <19 x i8>* {{%.*}})
+ return varargs_vec_19c(19, *in);
+}
+
+double varargs_vec_3s(int fixed, ...) {
+// CHECK: varargs_vec_3s
+// CHECK: alloca <3 x i16>, align 8
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: bitcast i8* [[AP_CUR]] to <3 x i16>*
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __short3 c3 = va_arg(ap, __short3);
+ sum = sum + c3.x + c3.y;
+ va_end(ap);
+ return sum;
+}
+
+double test_3s(__short3 *in) {
+// CHECK: test_3s
+// CHECK: call double (i32, ...)* @varargs_vec_3s(i32 3, <2 x i32> {{%.*}})
+ return varargs_vec_3s(3, *in);
+}
+
+double varargs_vec_5s(int fixed, ...) {
+// CHECK: varargs_vec_5s
+// CHECK: alloca <5 x i16>, align 16
+// CHECK: [[ALIGN:%.*]] = and i64 {{%.*}}, -16
+// CHECK: [[AP_ALIGN:%.*]] = inttoptr i64 [[ALIGN]] to i8*
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_ALIGN]], i32 16
+// CHECK: bitcast i8* [[AP_ALIGN]] to <5 x i16>*
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __short5 c5 = va_arg(ap, __short5);
+ sum = sum + c5.x + c5.y;
+ va_end(ap);
+ return sum;
+}
+
+double test_5s(__short5 *in) {
+// CHECK: test_5s
+// CHECK: call double (i32, ...)* @varargs_vec_5s(i32 5, <4 x i32> {{%.*}})
+ return varargs_vec_5s(5, *in);
+}
+
+double varargs_vec_3i(int fixed, ...) {
+// CHECK: varargs_vec_3i
+// CHECK: alloca <3 x i32>, align 16
+// CHECK: [[ALIGN:%.*]] = and i64 {{%.*}}, -16
+// CHECK: [[AP_ALIGN:%.*]] = inttoptr i64 [[ALIGN]] to i8*
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_ALIGN]], i32 16
+// CHECK: bitcast i8* [[AP_ALIGN]] to <3 x i32>*
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __int3 c3 = va_arg(ap, __int3);
+ sum = sum + c3.x + c3.y;
+ va_end(ap);
+ return sum;
+}
+
+double test_3i(__int3 *in) {
+// CHECK: test_3i
+// CHECK: call double (i32, ...)* @varargs_vec_3i(i32 3, <4 x i32> {{%.*}})
+ return varargs_vec_3i(3, *in);
+}
+
+double varargs_vec_5i(int fixed, ...) {
+// CHECK: varargs_vec_5i
+// CHECK: alloca <5 x i32>, align 16
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: [[VAR:%.*]] = bitcast i8* [[AP_CUR]] to i8**
+// CHECK: [[VAR2:%.*]] = load i8** [[VAR]]
+// CHECK: bitcast i8* [[VAR2]] to <5 x i32>*
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __int5 c5 = va_arg(ap, __int5);
+ sum = sum + c5.x + c5.y;
+ va_end(ap);
+ return sum;
+}
+
+double test_5i(__int5 *in) {
+// CHECK: test_5i
+// CHECK: call double (i32, ...)* @varargs_vec_5i(i32 5, <5 x i32>* {{%.*}})
+ return varargs_vec_5i(5, *in);
+}
+
+double varargs_vec_3d(int fixed, ...) {
+// CHECK: varargs_vec_3d
+// CHECK: alloca <3 x double>, align 16
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: [[VAR:%.*]] = bitcast i8* [[AP_CUR]] to i8**
+// CHECK: [[VAR2:%.*]] = load i8** [[VAR]]
+// CHECK: bitcast i8* [[VAR2]] to <3 x double>*
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __double3 c3 = va_arg(ap, __double3);
+ sum = sum + c3.x + c3.y;
+ va_end(ap);
+ return sum;
+}
+
+double test_3d(__double3 *in) {
+// CHECK: test_3d
+// CHECK: call double (i32, ...)* @varargs_vec_3d(i32 3, <3 x double>* {{%.*}})
+ return varargs_vec_3d(3, *in);
+}
+
+double varargs_vec(int fixed, ...) {
+// CHECK: varargs_vec
+ va_list ap;
+ double sum = fixed;
+ va_start(ap, fixed);
+ __char3 c3 = va_arg(ap, __char3);
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: bitcast i8* [[AP_CUR]] to <3 x i8>*
+ sum = sum + c3.x + c3.y;
+ __char5 c5 = va_arg(ap, __char5);
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: bitcast i8* [[AP_CUR]] to <5 x i8>*
+ sum = sum + c5.x + c5.y;
+ __char9 c9 = va_arg(ap, __char9);
+// CHECK: [[ALIGN:%.*]] = and i64 {{%.*}}, -16
+// CHECK: [[AP_ALIGN:%.*]] = inttoptr i64 [[ALIGN]] to i8*
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_ALIGN]], i32 16
+// CHECK: bitcast i8* [[AP_ALIGN]] to <9 x i8>*
+ sum = sum + c9.x + c9.y;
+ __char19 c19 = va_arg(ap, __char19);
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: [[VAR:%.*]] = bitcast i8* [[AP_CUR]] to i8**
+// CHECK: [[VAR2:%.*]] = load i8** [[VAR]]
+// CHECK: bitcast i8* [[VAR2]] to <19 x i8>*
+ sum = sum + c19.x + c19.y;
+ __short3 s3 = va_arg(ap, __short3);
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: bitcast i8* [[AP_CUR]] to <3 x i16>*
+ sum = sum + s3.x + s3.y;
+ __short5 s5 = va_arg(ap, __short5);
+// CHECK: [[ALIGN:%.*]] = and i64 {{%.*}}, -16
+// CHECK: [[AP_ALIGN:%.*]] = inttoptr i64 [[ALIGN]] to i8*
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_ALIGN]], i32 16
+// CHECK: bitcast i8* [[AP_ALIGN]] to <5 x i16>*
+ sum = sum + s5.x + s5.y;
+ __int3 i3 = va_arg(ap, __int3);
+// CHECK: [[ALIGN:%.*]] = and i64 {{%.*}}, -16
+// CHECK: [[AP_ALIGN:%.*]] = inttoptr i64 [[ALIGN]] to i8*
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_ALIGN]], i32 16
+// CHECK: bitcast i8* [[AP_ALIGN]] to <3 x i32>*
+ sum = sum + i3.x + i3.y;
+ __int5 i5 = va_arg(ap, __int5);
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: [[VAR:%.*]] = bitcast i8* [[AP_CUR]] to i8**
+// CHECK: [[VAR2:%.*]] = load i8** [[VAR]]
+// CHECK: bitcast i8* [[VAR2]] to <5 x i32>*
+ sum = sum + i5.x + i5.y;
+ __double3 d3 = va_arg(ap, __double3);
+// CHECK: [[AP_NEXT:%.*]] = getelementptr i8* [[AP_CUR:%.*]], i32 8
+// CHECK: [[VAR:%.*]] = bitcast i8* [[AP_CUR]] to i8**
+// CHECK: [[VAR2:%.*]] = load i8** [[VAR]]
+// CHECK: bitcast i8* [[VAR2]] to <3 x double>*
+ sum = sum + d3.x + d3.y;
+ va_end(ap);
+ return sum;
+}
+
+double test(__char3 *c3, __char5 *c5, __char9 *c9, __char19 *c19,
+ __short3 *s3, __short5 *s5, __int3 *i3, __int5 *i5,
+ __double3 *d3) {
+ double ret = varargs_vec(3, *c3, *c5, *c9, *c19, *s3, *s5, *i3, *i5, *d3);
+// CHECK: call double (i32, ...)* @varargs_vec(i32 3, i32 {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> {{%.*}}, <19 x i8>* {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> {{%.*}}, <4 x i32> {{%.*}}, <5 x i32>* {{%.*}}, <3 x double>* {{%.*}})
+ return ret;
+}
+
+__attribute__((noinline)) double args_vec_3c(int fixed, __char3 c3) {
+// CHECK: args_vec_3c
+// CHECK: [[C3:%.*]] = alloca <3 x i8>, align 4
+// CHECK: [[TMP:%.*]] = bitcast <3 x i8>* [[C3]] to i32*
+// CHECK: store i32 {{%.*}}, i32* [[TMP]]
+ double sum = fixed;
+ sum = sum + c3.x + c3.y;
+ return sum;
+}
+
+double fixed_3c(__char3 *in) {
+// CHECK: fixed_3c
+// CHECK: call double @args_vec_3c(i32 3, i32 {{%.*}})
+ return args_vec_3c(3, *in);
+}
+
+__attribute__((noinline)) double args_vec_5c(int fixed, __char5 c5) {
+// CHECK: args_vec_5c
+// CHECK: [[C5:%.*]] = alloca <5 x i8>, align 8
+// CHECK: [[TMP:%.*]] = bitcast <5 x i8>* [[C5]] to <2 x i32>*
+// CHECK: store <2 x i32> {{%.*}}, <2 x i32>* [[TMP]], align 1
+ double sum = fixed;
+ sum = sum + c5.x + c5.y;
+ return sum;
+}
+
+double fixed_5c(__char5 *in) {
+// CHECK: fixed_5c
+// CHECK: call double @args_vec_5c(i32 5, <2 x i32> {{%.*}})
+ return args_vec_5c(5, *in);
+}
+
+__attribute__((noinline)) double args_vec_9c(int fixed, __char9 c9) {
+// CHECK: args_vec_9c
+// CHECK: [[C9:%.*]] = alloca <9 x i8>, align 16
+// CHECK: [[TMP:%.*]] = bitcast <9 x i8>* [[C9]] to <4 x i32>*
+// CHECK: store <4 x i32> {{%.*}}, <4 x i32>* [[TMP]], align 1
+ double sum = fixed;
+ sum = sum + c9.x + c9.y;
+ return sum;
+}
+
+double fixed_9c(__char9 *in) {
+// CHECK: fixed_9c
+// CHECK: call double @args_vec_9c(i32 9, <4 x i32> {{%.*}})
+ return args_vec_9c(9, *in);
+}
+
+__attribute__((noinline)) double args_vec_19c(int fixed, __char19 c19) {
+// CHECK: args_vec_19c
+// CHECK: [[C19:%.*]] = load <19 x i8>* {{.*}}, align 16
+ double sum = fixed;
+ sum = sum + c19.x + c19.y;
+ return sum;
+}
+
+double fixed_19c(__char19 *in) {
+// CHECK: fixed_19c
+// CHECK: call double @args_vec_19c(i32 19, <19 x i8>* {{%.*}})
+ return args_vec_19c(19, *in);
+}
+
+__attribute__((noinline)) double args_vec_3s(int fixed, __short3 c3) {
+// CHECK: args_vec_3s
+// CHECK: [[C3:%.*]] = alloca <3 x i16>, align 8
+// CHECK: [[TMP:%.*]] = bitcast <3 x i16>* [[C3]] to <2 x i32>*
+// CHECK: store <2 x i32> {{%.*}}, <2 x i32>* [[TMP]], align 1
+ double sum = fixed;
+ sum = sum + c3.x + c3.y;
+ return sum;
+}
+
+double fixed_3s(__short3 *in) {
+// CHECK: fixed_3s
+// CHECK: call double @args_vec_3s(i32 3, <2 x i32> {{%.*}})
+ return args_vec_3s(3, *in);
+}
+
+__attribute__((noinline)) double args_vec_5s(int fixed, __short5 c5) {
+// CHECK: args_vec_5s
+// CHECK: [[C5:%.*]] = alloca <5 x i16>, align 16
+// CHECK: [[TMP:%.*]] = bitcast <5 x i16>* [[C5]] to <4 x i32>*
+// CHECK: store <4 x i32> {{%.*}}, <4 x i32>* [[TMP]], align 1
+ double sum = fixed;
+ sum = sum + c5.x + c5.y;
+ return sum;
+}
+
+double fixed_5s(__short5 *in) {
+// CHECK: fixed_5s
+// CHECK: call double @args_vec_5s(i32 5, <4 x i32> {{%.*}})
+ return args_vec_5s(5, *in);
+}
+
+__attribute__((noinline)) double args_vec_3i(int fixed, __int3 c3) {
+// CHECK: args_vec_3i
+// CHECK: [[C3:%.*]] = alloca <3 x i32>, align 16
+// CHECK: [[TMP:%.*]] = bitcast <3 x i32>* [[C3]] to <4 x i32>*
+// CHECK: store <4 x i32> {{%.*}}, <4 x i32>* [[TMP]], align 1
+ double sum = fixed;
+ sum = sum + c3.x + c3.y;
+ return sum;
+}
+
+double fixed_3i(__int3 *in) {
+// CHECK: fixed_3i
+// CHECK: call double @args_vec_3i(i32 3, <4 x i32> {{%.*}})
+ return args_vec_3i(3, *in);
+}
+
+__attribute__((noinline)) double args_vec_5i(int fixed, __int5 c5) {
+// CHECK: args_vec_5i
+// CHECK: [[C5:%.*]] = load <5 x i32>* {{%.*}}, align 16
+ double sum = fixed;
+ sum = sum + c5.x + c5.y;
+ return sum;
+}
+
+double fixed_5i(__int5 *in) {
+// CHECK: fixed_5i
+// CHECK: call double @args_vec_5i(i32 5, <5 x i32>* {{%.*}})
+ return args_vec_5i(5, *in);
+}
+
+__attribute__((noinline)) double args_vec_3d(int fixed, __double3 c3) {
+// CHECK: args_vec_3d
+// CHECK: [[CAST:%.*]] = bitcast <3 x double>* {{%.*}} to <4 x double>*
+// CHECK: [[LOAD:%.*]] = load <4 x double>* [[CAST]]
+// CHECK: shufflevector <4 x double> [[LOAD]], <4 x double> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ double sum = fixed;
+ sum = sum + c3.x + c3.y;
+ return sum;
+}
+
+double fixed_3d(__double3 *in) {
+// CHECK: fixed_3d
+// CHECK: call double @args_vec_3d(i32 3, <3 x double>* {{%.*}})
+ return args_vec_3d(3, *in);
+}
diff --git a/clang/test/CodeGen/arm64-arguments.c b/clang/test/CodeGen/arm64-arguments.c
new file mode 100644
index 00000000000..edfb62832a4
--- /dev/null
+++ b/clang/test/CodeGen/arm64-arguments.c
@@ -0,0 +1,713 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-abi darwinpcs -ffreestanding -emit-llvm -w -o - %s | FileCheck %s
+
+// CHECK: define signext i8 @f0()
+char f0(void) {
+ return 0;
+}
+
+// Struct as return type. Aggregates <= 16 bytes are returned directly, with
+// the size rounded up to a multiple of 8 bytes.
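+// For example, struct s1 below (a single char) comes back in x0 as an i64;
+// presumably only the low byte is meaningful.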
+// CHECK: define i64 @f1()
+struct s1 { char f0; };
+struct s1 f1(void) {}
+
+// CHECK: define i64 @f2()
+struct s2 { short f0; };
+struct s2 f2(void) {}
+
+// CHECK: define i64 @f3()
+struct s3 { int f0; };
+struct s3 f3(void) {}
+
+// CHECK: define i64 @f4()
+struct s4 { struct s4_0 { int f0; } f0; };
+struct s4 f4(void) {}
+
+// CHECK: define i64 @f5()
+struct s5 { struct { } f0; int f1; };
+struct s5 f5(void) {}
+
+// CHECK: define i64 @f6()
+struct s6 { int f0[1]; };
+struct s6 f6(void) {}
+
+// CHECK: define void @f7()
+struct s7 { struct { int : 0; } f0; };
+struct s7 f7(void) {}
+
+// CHECK: define void @f8()
+struct s8 { struct { int : 0; } f0[1]; };
+struct s8 f8(void) {}
+
+// CHECK: define i64 @f9()
+struct s9 { int f0; int : 0; };
+struct s9 f9(void) {}
+
+// CHECK: define i64 @f10()
+struct s10 { int f0; int : 0; int : 0; };
+struct s10 f10(void) {}
+
+// CHECK: define i64 @f11()
+struct s11 { int : 0; int f0; };
+struct s11 f11(void) {}
+
+// CHECK: define i64 @f12()
+union u12 { char f0; short f1; int f2; };
+union u12 f12(void) {}
+
+// A Homogeneous Aggregate as the return type is returned directly.
+// CHECK: define %struct.s13 @f13()
+struct s13 { float f0; };
+struct s13 f13(void) {}
+// CHECK: define %union.u14 @f14()
+union u14 { float f0; };
+union u14 f14(void) {}
+
+// CHECK: define void @f15()
+void f15(struct s7 a0) {}
+
+// CHECK: define void @f16()
+void f16(struct s8 a0) {}
+
+// CHECK: define i64 @f17()
+struct s17 { short f0 : 13; char f1 : 4; };
+struct s17 f17(void) {}
+
+// CHECK: define i64 @f18()
+struct s18 { short f0; char f1 : 4; };
+struct s18 f18(void) {}
+
+// CHECK: define i64 @f19()
+struct s19 { int f0; struct s8 f1; };
+struct s19 f19(void) {}
+
+// CHECK: define i64 @f20()
+struct s20 { struct s8 f1; int f0; };
+struct s20 f20(void) {}
+
+// CHECK: define i64 @f21()
+struct s21 { struct {} f1; int f0 : 4; };
+struct s21 f21(void) {}
+
+// CHECK: define i64 @f22()
+// CHECK: define i64 @f23()
+// CHECK: define i64 @f24()
+// CHECK: define i128 @f25()
+// CHECK: define { float, float } @f26()
+// CHECK: define { double, double } @f27()
+_Complex char f22(void) {}
+_Complex short f23(void) {}
+_Complex int f24(void) {}
+_Complex long long f25(void) {}
+_Complex float f26(void) {}
+_Complex double f27(void) {}
+
+// CHECK: define i64 @f28()
+struct s28 { _Complex char f0; };
+struct s28 f28() {}
+
+// CHECK: define i64 @f29()
+struct s29 { _Complex short f0; };
+struct s29 f29() {}
+
+// CHECK: define i64 @f30()
+struct s30 { _Complex int f0; };
+struct s30 f30() {}
+
+struct s31 { char x; };
+void f31(struct s31 s) { }
+// CHECK: define void @f31(i64 %s.coerce)
+// CHECK: %s = alloca %struct.s31, align 8
+// CHECK: trunc i64 %s.coerce to i8
+// CHECK: store i8 %{{.*}},
+
+struct s32 { double x; };
+void f32(struct s32 s) { }
+// Expand Homogeneous Aggregate.
+// CHECK: @f32(double %{{.*}})
+
+// A composite type larger than 16 bytes should be passed indirectly.
+struct s33 { char buf[32*32]; };
+void f33(struct s33 s) { }
+// CHECK: define void @f33(%struct.s33* %s)
+
+struct s34 { char c; };
+void f34(struct s34 s);
+void g34(struct s34 *s) { f34(*s); }
+// CHECK: @g34(%struct.s34* %s)
+// CHECK: %[[a:.*]] = load i8* %{{.*}}
+// CHECK: zext i8 %[[a]] to i64
+// CHECK: call void @f34(i64 %{{.*}})
+
+/*
+ * Check that va_arg accesses the stack according to the ABI alignment rules.
+ */
+long long t1(int i, ...) {
+ // CHECK: t1
+ __builtin_va_list ap;
+ __builtin_va_start(ap, i);
+ // CHECK-NOT: add i32 %{{.*}} 7
+ // CHECK-NOT: and i32 %{{.*}} -8
+ long long ll = __builtin_va_arg(ap, long long);
+ __builtin_va_end(ap);
+ return ll;
+}
+double t2(int i, ...) {
+ // CHECK: t2
+ __builtin_va_list ap;
+ __builtin_va_start(ap, i);
+ // CHECK-NOT: add i32 %{{.*}} 7
+ // CHECK-NOT: and i32 %{{.*}} -8
+ double ll = __builtin_va_arg(ap, double);
+ __builtin_va_end(ap);
+ return ll;
+}
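+
+// For a type with 16-byte alignment the va_list pointer would first be
+// rounded up (an "add ..., 15" then "and ..., -16" pair), as test_hva later
+// in this file demonstrates; 8-byte types need no such adjustment.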
+
+#include <arm_neon.h>
+
+// Homogeneous Vector Aggregate as return type and argument type.
+// CHECK: define %struct.int8x16x2_t @f0_0(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+int8x16x2_t f0_0(int8x16_t a0, int8x16_t a1) {
+ return vzipq_s8(a0, a1);
+}
+
+// Test direct vector passing.
+typedef float T_float32x2 __attribute__ ((__vector_size__ (8)));
+typedef float T_float32x4 __attribute__ ((__vector_size__ (16)));
+typedef float T_float32x8 __attribute__ ((__vector_size__ (32)));
+typedef float T_float32x16 __attribute__ ((__vector_size__ (64)));
+
+// CHECK: define <2 x float> @f1_0(<2 x float> %{{.*}})
+T_float32x2 f1_0(T_float32x2 a0) { return a0; }
+// CHECK: define <4 x float> @f1_1(<4 x float> %{{.*}})
+T_float32x4 f1_1(T_float32x4 a0) { return a0; }
+// Vectors bigger than 16 bytes are illegal and are passed indirectly.
+// CHECK: define void @f1_2(<8 x float>* noalias sret %{{.*}}, <8 x float>*)
+T_float32x8 f1_2(T_float32x8 a0) { return a0; }
+// CHECK: define void @f1_3(<16 x float>* noalias sret %{{.*}}, <16 x float>*)
+T_float32x16 f1_3(T_float32x16 a0) { return a0; }
+
+// Testing alignment with aggregates: HFAs, aggregates with size <= 16 bytes,
+// and aggregates with size > 16 bytes.
+struct s35
+{
+  float v[4]; // Testing HFA.
+} __attribute__((aligned(16)));
+typedef struct s35 s35_with_align;
+
+typedef __attribute__((neon_vector_type(4))) float float32x4_t;
+float32x4_t f35(int i, s35_with_align s1, s35_with_align s2) {
+// CHECK: define <4 x float> @f35(i32 %i, float %s1.0, float %s1.1, float %s1.2, float %s1.3, float %s2.0, float %s2.1, float %s2.2, float %s2.3)
+// CHECK: %s1 = alloca %struct.s35, align 16
+// CHECK: %s2 = alloca %struct.s35, align 16
+// CHECK: %[[a:.*]] = bitcast %struct.s35* %s1 to <4 x float>*
+// CHECK: load <4 x float>* %[[a]], align 16
+// CHECK: %[[b:.*]] = bitcast %struct.s35* %s2 to <4 x float>*
+// CHECK: load <4 x float>* %[[b]], align 16
+ float32x4_t v = vaddq_f32(*(float32x4_t *)&s1,
+ *(float32x4_t *)&s2);
+ return v;
+}
+
+struct s36
+{
+  int v[4]; // Testing 16-byte aggregate.
+} __attribute__((aligned(16)));
+typedef struct s36 s36_with_align;
+
+typedef __attribute__((neon_vector_type(4))) int int32x4_t;
+int32x4_t f36(int i, s36_with_align s1, s36_with_align s2) {
+// CHECK: define <4 x i32> @f36(i32 %i, i128 %s1.coerce, i128 %s2.coerce)
+// CHECK: %s1 = alloca %struct.s36, align 16
+// CHECK: %s2 = alloca %struct.s36, align 16
+// CHECK: store i128 %s1.coerce, i128* %{{.*}}, align 1
+// CHECK: store i128 %s2.coerce, i128* %{{.*}}, align 1
+// CHECK: %[[a:.*]] = bitcast %struct.s36* %s1 to <4 x i32>*
+// CHECK: load <4 x i32>* %[[a]], align 16
+// CHECK: %[[b:.*]] = bitcast %struct.s36* %s2 to <4 x i32>*
+// CHECK: load <4 x i32>* %[[b]], align 16
+ int32x4_t v = vaddq_s32(*(int32x4_t *)&s1,
+ *(int32x4_t *)&s2);
+ return v;
+}
+
+struct s37
+{
+  int v[18]; // Testing large aggregate.
+} __attribute__((aligned(16)));
+typedef struct s37 s37_with_align;
+
+int32x4_t f37(int i, s37_with_align s1, s37_with_align s2) {
+// CHECK: define <4 x i32> @f37(i32 %i, %struct.s37* %s1, %struct.s37* %s2)
+// CHECK: %[[a:.*]] = bitcast %struct.s37* %s1 to <4 x i32>*
+// CHECK: load <4 x i32>* %[[a]], align 16
+// CHECK: %[[b:.*]] = bitcast %struct.s37* %s2 to <4 x i32>*
+// CHECK: load <4 x i32>* %[[b]], align 16
+ int32x4_t v = vaddq_s32(*(int32x4_t *)&s1,
+ *(int32x4_t *)&s2);
+ return v;
+}
+s37_with_align g37;
+int32x4_t caller37() {
+// CHECK: caller37
+// CHECK: %[[a:.*]] = alloca %struct.s37, align 16
+// CHECK: %[[b:.*]] = alloca %struct.s37, align 16
+// CHECK: call void @llvm.memcpy
+// CHECK: call void @llvm.memcpy
+// CHECK: call <4 x i32> @f37(i32 3, %struct.s37* %[[a]], %struct.s37* %[[b]])
+ return f37(3, g37, g37);
+}
+
+// rdar://problem/12648441
+// Test passing structs with size <= 8, <= 16 and > 16 bytes,
+// with and without a 16-byte alignment attribute.
+
+// structs with size <= 8 bytes, without alignment attribute:
+// coerced to i64 and passed in registers
+struct s38
+{
+ int i;
+ short s;
+};
+typedef struct s38 s38_no_align;
+// passing structs in registers
+__attribute__ ((noinline))
+int f38(int i, s38_no_align s1, s38_no_align s2) {
+// CHECK: define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce)
+// CHECK: %s1 = alloca %struct.s38, align 8
+// CHECK: %s2 = alloca %struct.s38, align 8
+// CHECK: store i64 %s1.coerce, i64* %{{.*}}, align 1
+// CHECK: store i64 %s2.coerce, i64* %{{.*}}, align 1
+// CHECK: getelementptr inbounds %struct.s38* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s38* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s38* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s38* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + s1.s + s2.s;
+}
+s38_no_align g38;
+s38_no_align g38_2;
+int caller38() {
+// CHECK: define i32 @caller38()
+// CHECK: %[[a:.*]] = load i64* bitcast (%struct.s38* @g38 to i64*), align 1
+// CHECK: %[[b:.*]] = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 1
+// CHECK: call i32 @f38(i32 3, i64 %[[a]], i64 %[[b]])
+ return f38(3, g38, g38_2);
+}
+// passing structs on stack
+__attribute__ ((noinline))
+int f38_stack(int i, int i2, int i3, int i4, int i5, int i6, int i7, int i8,
+ int i9, s38_no_align s1, s38_no_align s2) {
+// CHECK: define i32 @f38_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i64 %s1.coerce, i64 %s2.coerce)
+// CHECK: %s1 = alloca %struct.s38, align 8
+// CHECK: %s2 = alloca %struct.s38, align 8
+// CHECK: store i64 %s1.coerce, i64* %{{.*}}, align 1
+// CHECK: store i64 %s2.coerce, i64* %{{.*}}, align 1
+// CHECK: getelementptr inbounds %struct.s38* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s38* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s38* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s38* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + s1.s + s2.s;
+}
+int caller38_stack() {
+// CHECK: define i32 @caller38_stack()
+// CHECK: %[[a:.*]] = load i64* bitcast (%struct.s38* @g38 to i64*), align 1
+// CHECK: %[[b:.*]] = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 1
+// CHECK: call i32 @f38_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i64 %[[a]], i64 %[[b]])
+ return f38_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, g38, g38_2);
+}
+
+// structs with size <= 8 bytes, with alignment attribute
+struct s39
+{
+ int i;
+ short s;
+} __attribute__((aligned(16)));
+typedef struct s39 s39_with_align;
+// passing aligned structs in registers
+__attribute__ ((noinline))
+int f39(int i, s39_with_align s1, s39_with_align s2) {
+// CHECK: define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce)
+// CHECK: %s1 = alloca %struct.s39, align 16
+// CHECK: %s2 = alloca %struct.s39, align 16
+// CHECK: store i128 %s1.coerce, i128* %{{.*}}, align 1
+// CHECK: store i128 %s2.coerce, i128* %{{.*}}, align 1
+// CHECK: getelementptr inbounds %struct.s39* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s39* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s39* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s39* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + s1.s + s2.s;
+}
+s39_with_align g39;
+s39_with_align g39_2;
+int caller39() {
+// CHECK: define i32 @caller39()
+// CHECK: %[[a:.*]] = load i128* bitcast (%struct.s39* @g39 to i128*), align 1
+// CHECK: %[[b:.*]] = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 1
+// CHECK: call i32 @f39(i32 3, i128 %[[a]], i128 %[[b]])
+ return f39(3, g39, g39_2);
+}
+// passing aligned structs on stack
+__attribute__ ((noinline))
+int f39_stack(int i, int i2, int i3, int i4, int i5, int i6, int i7, int i8,
+ int i9, s39_with_align s1, s39_with_align s2) {
+// CHECK: define i32 @f39_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce)
+// CHECK: %s1 = alloca %struct.s39, align 16
+// CHECK: %s2 = alloca %struct.s39, align 16
+// CHECK: store i128 %s1.coerce, i128* %{{.*}}, align 1
+// CHECK: store i128 %s2.coerce, i128* %{{.*}}, align 1
+// CHECK: getelementptr inbounds %struct.s39* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s39* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s39* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s39* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + s1.s + s2.s;
+}
+int caller39_stack() {
+// CHECK: define i32 @caller39_stack()
+// CHECK: %[[a:.*]] = load i128* bitcast (%struct.s39* @g39 to i128*), align 1
+// CHECK: %[[b:.*]] = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 1
+// CHECK: call i32 @f39_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i128 %[[a]], i128 %[[b]])
+ return f39_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, g39, g39_2);
+}
+
+// structs with size <= 16 bytes, without alignment attribute
+struct s40
+{
+ int i;
+ short s;
+ int i2;
+ short s2;
+};
+typedef struct s40 s40_no_align;
+// passing structs in registers
+__attribute__ ((noinline))
+int f40(int i, s40_no_align s1, s40_no_align s2) {
+// CHECK: define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce)
+// CHECK: %s1 = alloca %struct.s40, align 8
+// CHECK: %s2 = alloca %struct.s40, align 8
+// CHECK: store [2 x i64] %s1.coerce, [2 x i64]* %{{.*}}, align 1
+// CHECK: store [2 x i64] %s2.coerce, [2 x i64]* %{{.*}}, align 1
+// CHECK: getelementptr inbounds %struct.s40* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s40* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s40* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s40* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + s1.s + s2.s;
+}
+s40_no_align g40;
+s40_no_align g40_2;
+int caller40() {
+// CHECK: define i32 @caller40()
+// CHECK: %[[a:.*]] = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 1
+// CHECK: %[[b:.*]] = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 1
+// CHECK: call i32 @f40(i32 3, [2 x i64] %[[a]], [2 x i64] %[[b]])
+ return f40(3, g40, g40_2);
+}
+// passing structs on stack
+__attribute__ ((noinline))
+int f40_stack(int i, int i2, int i3, int i4, int i5, int i6, int i7, int i8,
+ int i9, s40_no_align s1, s40_no_align s2) {
+// CHECK: define i32 @f40_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce)
+// CHECK: %s1 = alloca %struct.s40, align 8
+// CHECK: %s2 = alloca %struct.s40, align 8
+// CHECK: store [2 x i64] %s1.coerce, [2 x i64]* %{{.*}}, align 1
+// CHECK: store [2 x i64] %s2.coerce, [2 x i64]* %{{.*}}, align 1
+// CHECK: getelementptr inbounds %struct.s40* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s40* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s40* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s40* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + s1.s + s2.s;
+}
+int caller40_stack() {
+// CHECK: define i32 @caller40_stack()
+// CHECK: %[[a:.*]] = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 1
+// CHECK: %[[b:.*]] = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 1
+// CHECK: call i32 @f40_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, [2 x i64] %[[a]], [2 x i64] %[[b]])
+ return f40_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, g40, g40_2);
+}
+
+// structs with size <= 16 bytes, with alignment attribute
+struct s41
+{
+ int i;
+ short s;
+ int i2;
+ short s2;
+} __attribute__((aligned(16)));
+typedef struct s41 s41_with_align;
+// passing aligned structs in registers
+__attribute__ ((noinline))
+int f41(int i, s41_with_align s1, s41_with_align s2) {
+// CHECK: define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce)
+// CHECK: %s1 = alloca %struct.s41, align 16
+// CHECK: %s2 = alloca %struct.s41, align 16
+// CHECK: store i128 %s1.coerce, i128* %{{.*}}, align 1
+// CHECK: store i128 %s2.coerce, i128* %{{.*}}, align 1
+// CHECK: getelementptr inbounds %struct.s41* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s41* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s41* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s41* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + s1.s + s2.s;
+}
+s41_with_align g41;
+s41_with_align g41_2;
+int caller41() {
+// CHECK: define i32 @caller41()
+// CHECK: %[[a:.*]] = load i128* bitcast (%struct.s41* @g41 to i128*), align 1
+// CHECK: %[[b:.*]] = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 1
+// CHECK: call i32 @f41(i32 3, i128 %[[a]], i128 %[[b]])
+ return f41(3, g41, g41_2);
+}
+// passing aligned structs on stack
+__attribute__ ((noinline))
+int f41_stack(int i, int i2, int i3, int i4, int i5, int i6, int i7, int i8,
+ int i9, s41_with_align s1, s41_with_align s2) {
+// CHECK: define i32 @f41_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce)
+// CHECK: %s1 = alloca %struct.s41, align 16
+// CHECK: %s2 = alloca %struct.s41, align 16
+// CHECK: store i128 %s1.coerce, i128* %{{.*}}, align 1
+// CHECK: store i128 %s2.coerce, i128* %{{.*}}, align 1
+// CHECK: getelementptr inbounds %struct.s41* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s41* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s41* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s41* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + s1.s + s2.s;
+}
+int caller41_stack() {
+// CHECK: define i32 @caller41_stack()
+// CHECK: %[[a:.*]] = load i128* bitcast (%struct.s41* @g41 to i128*), align 1
+// CHECK: %[[b:.*]] = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 1
+// CHECK: call i32 @f41_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i128 %[[a]], i128 %[[b]])
+ return f41_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, g41, g41_2);
+}
+
+// structs with size > 16 bytes, without alignment attribute
+struct s42
+{
+ int i;
+ short s;
+ int i2;
+ short s2;
+ int i3;
+ short s3;
+};
+typedef struct s42 s42_no_align;
+// passing structs in registers
+__attribute__ ((noinline))
+int f42(int i, s42_no_align s1, s42_no_align s2) {
+// CHECK: define i32 @f42(i32 %i, %struct.s42* %s1, %struct.s42* %s2)
+// CHECK: getelementptr inbounds %struct.s42* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s42* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s42* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s42* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + s1.s + s2.s;
+}
+s42_no_align g42;
+s42_no_align g42_2;
+int caller42() {
+// CHECK: define i32 @caller42()
+// CHECK: %[[a:.*]] = alloca %struct.s42, align 4
+// CHECK: %[[b:.*]] = alloca %struct.s42, align 4
+// CHECK: %[[c:.*]] = bitcast %struct.s42* %[[a]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+// CHECK: %[[d:.*]] = bitcast %struct.s42* %[[b]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+// CHECK: call i32 @f42(i32 3, %struct.s42* %[[a]], %struct.s42* %[[b]])
+ return f42(3, g42, g42_2);
+}
+// passing structs on stack
+__attribute__ ((noinline))
+int f42_stack(int i, int i2, int i3, int i4, int i5, int i6, int i7, int i8,
+ int i9, s42_no_align s1, s42_no_align s2) {
+// CHECK: define i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, %struct.s42* %s1, %struct.s42* %s2)
+// CHECK: getelementptr inbounds %struct.s42* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s42* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s42* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s42* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + s1.s + s2.s;
+}
+int caller42_stack() {
+// CHECK: define i32 @caller42_stack()
+// CHECK: %[[a:.*]] = alloca %struct.s42, align 4
+// CHECK: %[[b:.*]] = alloca %struct.s42, align 4
+// CHECK: %[[c:.*]] = bitcast %struct.s42* %[[a]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+// CHECK: %[[d:.*]] = bitcast %struct.s42* %[[b]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+// CHECK: call i32 @f42_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, %struct.s42* %[[a]], %struct.s42* %[[b]])
+ return f42_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, g42, g42_2);
+}
+
+// structs with size > 16 bytes, with alignment attribute
+struct s43
+{
+ int i;
+ short s;
+ int i2;
+ short s2;
+ int i3;
+ short s3;
+} __attribute__((aligned(16)));
+typedef struct s43 s43_with_align;
+// passing aligned structs in registers
+__attribute__ ((noinline))
+int f43(int i, s43_with_align s1, s43_with_align s2) {
+// CHECK: define i32 @f43(i32 %i, %struct.s43* %s1, %struct.s43* %s2)
+// CHECK: getelementptr inbounds %struct.s43* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s43* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s43* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s43* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + s1.s + s2.s;
+}
+s43_with_align g43;
+s43_with_align g43_2;
+int caller43() {
+// CHECK: define i32 @caller43()
+// CHECK: %[[a:.*]] = alloca %struct.s43, align 16
+// CHECK: %[[b:.*]] = alloca %struct.s43, align 16
+// CHECK: %[[c:.*]] = bitcast %struct.s43* %[[a]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+// CHECK: %[[d:.*]] = bitcast %struct.s43* %[[b]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+// CHECK: call i32 @f43(i32 3, %struct.s43* %[[a]], %struct.s43* %[[b]])
+ return f43(3, g43, g43_2);
+}
+// passing aligned structs on stack
+__attribute__ ((noinline))
+int f43_stack(int i, int i2, int i3, int i4, int i5, int i6, int i7, int i8,
+ int i9, s43_with_align s1, s43_with_align s2) {
+// CHECK: define i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, %struct.s43* %s1, %struct.s43* %s2)
+// CHECK: getelementptr inbounds %struct.s43* %s1, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s43* %s2, i32 0, i32 0
+// CHECK: getelementptr inbounds %struct.s43* %s1, i32 0, i32 1
+// CHECK: getelementptr inbounds %struct.s43* %s2, i32 0, i32 1
+ return s1.i + s2.i + i + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + s1.s + s2.s;
+}
+int caller43_stack() {
+// CHECK: define i32 @caller43_stack()
+// CHECK: %[[a:.*]] = alloca %struct.s43, align 16
+// CHECK: %[[b:.*]] = alloca %struct.s43, align 16
+// CHECK: %[[c:.*]] = bitcast %struct.s43* %[[a]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+// CHECK: %[[d:.*]] = bitcast %struct.s43* %[[b]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+// CHECK: call i32 @f43_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, %struct.s43* %[[a]], %struct.s43* %[[b]])
+ return f43_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, g43, g43_2);
+}
+
+// rdar://13668927
+// We should not split argument s1 between registers and stack.
+__attribute__ ((noinline))
+int f40_split(int i, int i2, int i3, int i4, int i5, int i6, int i7,
+ s40_no_align s1, s40_no_align s2) {
+// CHECK: define i32 @f40_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, [1 x i32], [2 x i64] %s1.coerce, [2 x i64] %s2.coerce)
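+// The [1 x i32] padding argument fills the one GPR left after i..i7, so the
+// [2 x i64] aggregate lands entirely on the stack instead of being split
+// between x7 and memory.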
+ return s1.i + s2.i + i + i2 + i3 + i4 + i5 + i6 + i7 + s1.s + s2.s;
+}
+int caller40_split() {
+// CHECK: define i32 @caller40_split()
+// CHECK: call i32 @f40_split(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, [1 x i32] undef, [2 x i64] %{{.*}}, [2 x i64] %{{.*}})
+ return f40_split(1, 2, 3, 4, 5, 6, 7, g40, g40_2);
+}
+
+__attribute__ ((noinline))
+int f41_split(int i, int i2, int i3, int i4, int i5, int i6, int i7,
+ s41_with_align s1, s41_with_align s2) {
+// CHECK: define i32 @f41_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, [1 x i32], i128 %s1.coerce, i128 %s2.coerce)
+ return s1.i + s2.i + i + i2 + i3 + i4 + i5 + i6 + i7 + s1.s + s2.s;
+}
+int caller41_split() {
+// CHECK: define i32 @caller41_split()
+// CHECK: call i32 @f41_split(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, [1 x i32] undef, i128 %{{.*}}, i128 %{{.*}})
+ return f41_split(1, 2, 3, 4, 5, 6, 7, g41, g41_2);
+}
+
+// Handle homogeneous aggregates properly in variadic functions.
+struct HFA {
+ float a, b, c, d;
+};
+
+float test_hfa(int n, ...) {
+// CHECK-LABEL: define float @test_hfa(i32 %n, ...)
+// CHECK: [[THELIST:%.*]] = alloca i8*
+// CHECK: [[CURLIST:%.*]] = load i8** [[THELIST]]
+
+  // The HFA is not indirect, so it occupies its full 16 bytes on the stack.
+// CHECK: [[NEXTLIST:%.*]] = getelementptr i8* [[CURLIST]], i32 16
+// CHECK: store i8* [[NEXTLIST]], i8** [[THELIST]]
+
+// CHECK: bitcast i8* [[CURLIST]] to %struct.HFA*
+ __builtin_va_list thelist;
+ __builtin_va_start(thelist, n);
+ struct HFA h = __builtin_va_arg(thelist, struct HFA);
+ return h.d;
+}
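+
+// Hypothetical caller, illustrative only (no CHECK lines attached): the HFA
+// travels through the variadic call in its full 16-byte stack slot.
+float call_test_hfa(void) {
+  struct HFA h = {1.0f, 2.0f, 3.0f, 4.0f};
+  return test_hfa(1, h);
+}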
+
+struct TooBigHFA {
+ float a, b, c, d, e;
+};
+
+float test_toobig_hfa(int n, ...) {
+// CHECK-LABEL: define float @test_toobig_hfa(i32 %n, ...)
+// CHECK: [[THELIST:%.*]] = alloca i8*
+// CHECK: [[CURLIST:%.*]] = load i8** [[THELIST]]
+
+  // TooBigHFA is not actually an HFA, so it is passed indirectly; only the
+  // 8-byte pointer is consumed from the stack.
+// CHECK: [[NEXTLIST:%.*]] = getelementptr i8* [[CURLIST]], i32 8
+// CHECK: store i8* [[NEXTLIST]], i8** [[THELIST]]
+
+// CHECK: [[HFAPTRPTR:%.*]] = bitcast i8* [[CURLIST]] to i8**
+// CHECK: [[HFAPTR:%.*]] = load i8** [[HFAPTRPTR]]
+// CHECK: bitcast i8* [[HFAPTR]] to %struct.TooBigHFA*
+ __builtin_va_list thelist;
+ __builtin_va_start(thelist, n);
+ struct TooBigHFA h = __builtin_va_arg(thelist, struct TooBigHFA);
+ return h.d;
+}
+
+struct HVA {
+ int32x4_t a, b;
+};
+
+int32x4_t test_hva(int n, ...) {
+// CHECK-LABEL: define <4 x i32> @test_hva(i32 %n, ...)
+// CHECK: [[THELIST:%.*]] = alloca i8*
+// CHECK: [[CURLIST:%.*]] = load i8** [[THELIST]]
+
+  // The HVA is not indirect, so it occupies its full 32 bytes on the stack,
+  // but it must be 16-byte aligned.
+// CHECK: [[ALIGN0:%.*]] = getelementptr i8* [[CURLIST]], i32 15
+// CHECK: [[ALIGN1:%.*]] = ptrtoint i8* [[ALIGN0]] to i64
+// CHECK: [[ALIGN2:%.*]] = and i64 [[ALIGN1]], -16
+// CHECK: [[ALIGNED_LIST:%.*]] = inttoptr i64 [[ALIGN2]] to i8*
+
+// CHECK: [[NEXTLIST:%.*]] = getelementptr i8* [[ALIGNED_LIST]], i32 32
+// CHECK: store i8* [[NEXTLIST]], i8** [[THELIST]]
+
+// CHECK: bitcast i8* [[ALIGNED_LIST]] to %struct.HVA*
+ __builtin_va_list thelist;
+ __builtin_va_start(thelist, n);
+ struct HVA h = __builtin_va_arg(thelist, struct HVA);
+ return h.b;
+}
+
+struct TooBigHVA {
+ int32x4_t a, b, c, d, e;
+};
+
+int32x4_t test_toobig_hva(int n, ...) {
+// CHECK-LABEL: define <4 x i32> @test_toobig_hva(i32 %n, ...)
+// CHECK: [[THELIST:%.*]] = alloca i8*
+// CHECK: [[CURLIST:%.*]] = load i8** [[THELIST]]
+
+  // TooBigHVA is not actually an HVA, so it is passed indirectly; only the
+  // 8-byte pointer is consumed from the stack.
+// CHECK: [[NEXTLIST:%.*]] = getelementptr i8* [[CURLIST]], i32 8
+// CHECK: store i8* [[NEXTLIST]], i8** [[THELIST]]
+
+// CHECK: [[HVAPTRPTR:%.*]] = bitcast i8* [[CURLIST]] to i8**
+// CHECK: [[HVAPTR:%.*]] = load i8** [[HVAPTRPTR]]
+// CHECK: bitcast i8* [[HVAPTR]] to %struct.TooBigHVA*
+ __builtin_va_list thelist;
+ __builtin_va_start(thelist, n);
+ struct TooBigHVA h = __builtin_va_arg(thelist, struct TooBigHVA);
+ return h.d;
+}
diff --git a/clang/test/CodeGen/arm64-crc32.c b/clang/test/CodeGen/arm64-crc32.c
new file mode 100644
index 00000000000..cb31a782a26
--- /dev/null
+++ b/clang/test/CodeGen/arm64-crc32.c
@@ -0,0 +1,55 @@
+// REQUIRES: arm64-registered-target
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
+// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s
+
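+// The underlying crc32 intrinsics take i32 data operands, so the byte and
+// halfword forms zero-extend their narrow argument first.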
+int crc32b(int a, char b)
+{
+ return __builtin_arm_crc32b(a,b);
+// CHECK: [[T0:%[0-9]+]] = zext i8 %b to i32
+// CHECK: call i32 @llvm.arm64.crc32b(i32 %a, i32 [[T0]])
+}
+
+int crc32cb(int a, char b)
+{
+ return __builtin_arm_crc32cb(a,b);
+// CHECK: [[T0:%[0-9]+]] = zext i8 %b to i32
+// CHECK: call i32 @llvm.arm64.crc32cb(i32 %a, i32 [[T0]])
+}
+
+int crc32h(int a, short b)
+{
+ return __builtin_arm_crc32h(a,b);
+// CHECK: [[T0:%[0-9]+]] = zext i16 %b to i32
+// CHECK: call i32 @llvm.arm64.crc32h(i32 %a, i32 [[T0]])
+}
+
+int crc32ch(int a, short b)
+{
+ return __builtin_arm_crc32ch(a,b);
+// CHECK: [[T0:%[0-9]+]] = zext i16 %b to i32
+// CHECK: call i32 @llvm.arm64.crc32ch(i32 %a, i32 [[T0]])
+}
+
+int crc32w(int a, int b)
+{
+ return __builtin_arm_crc32w(a,b);
+// CHECK: call i32 @llvm.arm64.crc32w(i32 %a, i32 %b)
+}
+
+int crc32cw(int a, int b)
+{
+ return __builtin_arm_crc32cw(a,b);
+// CHECK: call i32 @llvm.arm64.crc32cw(i32 %a, i32 %b)
+}
+
+int crc32d(int a, long b)
+{
+ return __builtin_arm_crc32d(a,b);
+// CHECK: call i32 @llvm.arm64.crc32x(i32 %a, i64 %b)
+}
+
+int crc32cd(int a, long b)
+{
+ return __builtin_arm_crc32cd(a,b);
+// CHECK: call i32 @llvm.arm64.crc32cx(i32 %a, i64 %b)
+}
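+
+// A hedged usage sketch: crc32_buf is a hypothetical helper, not exercised
+// by the CHECK lines above, accumulating a CRC over a buffer byte by byte.
+unsigned crc32_buf(unsigned crc, const char *p, int n)
+{
+  while (n--)
+    crc = __builtin_arm_crc32b(crc, *p++);
+  return crc;
+}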
diff --git a/clang/test/CodeGen/arm64-lanes.c b/clang/test/CodeGen/arm64-lanes.c
new file mode 100644
index 00000000000..63d7d0c8314
--- /dev/null
+++ b/clang/test/CodeGen/arm64-lanes.c
@@ -0,0 +1,63 @@
+// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -ffreestanding -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_neon.h>
+
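+// At -O3 each scalar vdup*_lane intrinsic should reduce to a single
+// extractelement from the source vector.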
+// CHECK-LABEL: @test_vdupb_lane_s8
+int8_t test_vdupb_lane_s8(int8x8_t src) {
+ return vdupb_lane_s8(src, 2);
+ // CHECK: extractelement <8 x i8> %src, i32 2
+}
+
+// CHECK-LABEL: @test_vdupb_lane_u8
+uint8_t test_vdupb_lane_u8(uint8x8_t src) {
+ return vdupb_lane_u8(src, 2);
+ // CHECK: extractelement <8 x i8> %src, i32 2
+}
+
+// CHECK-LABEL: @test_vduph_lane_s16
+int16_t test_vduph_lane_s16(int16x4_t src) {
+ return vduph_lane_s16(src, 2);
+ // CHECK: extractelement <4 x i16> %src, i32 2
+}
+
+// CHECK-LABEL: @test_vduph_lane_u16
+uint16_t test_vduph_lane_u16(uint16x4_t src) {
+ return vduph_lane_u16(src, 2);
+ // CHECK: extractelement <4 x i16> %src, i32 2
+}
+
+// CHECK-LABEL: @test_vdups_lane_s32
+int32_t test_vdups_lane_s32(int32x2_t src) {
+ return vdups_lane_s32(src, 0);
+ // CHECK: extractelement <2 x i32> %src, i32 0
+}
+
+// CHECK-LABEL: @test_vdups_lane_u32
+uint32_t test_vdups_lane_u32(uint32x2_t src) {
+ return vdups_lane_u32(src, 0);
+ // CHECK: extractelement <2 x i32> %src, i32 0
+}
+
+// CHECK-LABEL: @test_vdups_lane_f32
+float32_t test_vdups_lane_f32(float32x2_t src) {
+ return vdups_lane_f32(src, 0);
+ // CHECK: extractelement <2 x float> %src, i32 0
+}
+
+// CHECK-LABEL: @test_vdupd_lane_s64
+int64_t test_vdupd_lane_s64(int64x1_t src) {
+ return vdupd_lane_s64(src, 0);
+ // CHECK: extractelement <1 x i64> %src, i32 0
+}
+
+// CHECK-LABEL: @test_vdupd_lane_u64
+uint64_t test_vdupd_lane_u64(uint64x1_t src) {
+ return vdupd_lane_u64(src, 0);
+ // CHECK: extractelement <1 x i64> %src, i32 0
+}
+
+// CHECK-LABEL: @test_vdupd_lane_f64
+float64_t test_vdupd_lane_f64(float64x1_t src) {
+ return vdupd_lane_f64(src, 0);
+ // CHECK: extractelement <1 x double> %src, i32 0
+}
diff --git a/clang/test/CodeGen/arm64-scalar-test.c b/clang/test/CodeGen/arm64-scalar-test.c
new file mode 100644
index 00000000000..1e1087d40bc
--- /dev/null
+++ b/clang/test/CodeGen/arm64-scalar-test.c
@@ -0,0 +1,535 @@
+// REQUIRES: arm64-registered-target
+// RUN: %clang_cc1 -triple arm64-apple-ios7.0 \
+// RUN: -S -O1 -o - -ffreestanding %s | FileCheck %s
+
+// We're explicitly using arm_neon.h here: some types probably don't match
+// the ACLE definitions, but we want to check current codegen.
+#include <arm_neon.h>
+
+float test_vrsqrtss_f32(float a, float b) {
+// CHECK: test_vrsqrtss_f32
+ return vrsqrtss_f32(a, b);
+// CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+double test_vrsqrtsd_f64(double a, double b) {
+// CHECK: test_vrsqrtsd_f64
+ return vrsqrtsd_f64(a, b);
+// CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
+// CHECK: test_vrshl_s64
+ return vrshl_s64(a, b);
+// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
+// CHECK: test_vrshl_u64
+ return vrshl_u64(a, b);
+// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vrshld_s64
+int64_t test_vrshld_s64(int64_t a, int64_t b) {
+ return vrshld_s64(a, b);
+// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vrshld_u64
+uint64_t test_vrshld_u64(uint64_t a, uint64_t b) {
+ return vrshld_u64(a, b);
+// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
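+// The 8- and 16-bit scalar saturating operations are currently selected as
+// the .8b/.4h vector forms operating on the bottom lane, which is what the
+// checks below expect.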
+// CHECK: test_vqrshlb_s8
+int8_t test_vqrshlb_s8(int8_t a, int8_t b) {
+ return vqrshlb_s8(a, b);
+// CHECK: sqrshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqrshlh_s16
+int16_t test_vqrshlh_s16(int16_t a, int16_t b) {
+ return vqrshlh_s16(a, b);
+// CHECK: sqrshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqrshls_s32
+int32_t test_vqrshls_s32(int32_t a, int32_t b) {
+ return vqrshls_s32(a, b);
+// CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqrshld_s64
+int64_t test_vqrshld_s64(int64_t a, int64_t b) {
+ return vqrshld_s64(a, b);
+// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqrshlb_u8
+uint8_t test_vqrshlb_u8(uint8_t a, uint8_t b) {
+ return vqrshlb_u8(a, b);
+// CHECK: uqrshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqrshlh_u16
+uint16_t test_vqrshlh_u16(uint16_t a, uint16_t b) {
+ return vqrshlh_u16(a, b);
+// CHECK: uqrshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqrshls_u32
+uint32_t test_vqrshls_u32(uint32_t a, uint32_t b) {
+ return vqrshls_u32(a, b);
+// CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqrshld_u64
+uint64_t test_vqrshld_u64(uint64_t a, uint64_t b) {
+ return vqrshld_u64(a, b);
+// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqshlb_s8
+int8_t test_vqshlb_s8(int8_t a, int8_t b) {
+ return vqshlb_s8(a, b);
+// CHECK: sqshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqshlh_s16
+int16_t test_vqshlh_s16(int16_t a, int16_t b) {
+ return vqshlh_s16(a, b);
+// CHECK: sqshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqshls_s32
+int32_t test_vqshls_s32(int32_t a, int32_t b) {
+ return vqshls_s32(a, b);
+// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqshld_s64
+int64_t test_vqshld_s64(int64_t a, int64_t b) {
+ return vqshld_s64(a, b);
+// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqshlb_u8
+uint8_t test_vqshlb_u8(uint8_t a, uint8_t b) {
+ return vqshlb_u8(a, b);
+// CHECK: uqshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqshlh_u16
+uint16_t test_vqshlh_u16(uint16_t a, uint16_t b) {
+ return vqshlh_u16(a, b);
+// CHECK: uqshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqshls_u32
+uint32_t test_vqshls_u32(uint32_t a, uint32_t b) {
+ return vqshls_u32(a, b);
+// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqshld_u64
+uint64_t test_vqshld_u64(uint64_t a, uint64_t b) {
+ return vqshld_u64(a, b);
+// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vshld_u64
+uint64_t test_vshld_u64(uint64_t a, uint64_t b) {
+ return vshld_u64(a, b);
+// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vshld_s64
+int64_t test_vshld_s64(int64_t a, int64_t b) {
+ return vshld_s64(a, b);
+// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqdmullh_s16
+int32_t test_vqdmullh_s16(int16_t a, int16_t b) {
+ return vqdmullh_s16(a, b);
+// CHECK: sqdmull.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqdmulls_s32
+int64_t test_vqdmulls_s32(int32_t a, int32_t b) {
+ return vqdmulls_s32(a, b);
+// CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqaddb_s8
+int8_t test_vqaddb_s8(int8_t a, int8_t b) {
+ return vqaddb_s8(a, b);
+// CHECK: sqadd.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqaddh_s16
+int16_t test_vqaddh_s16(int16_t a, int16_t b) {
+ return vqaddh_s16(a, b);
+// CHECK: sqadd.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqadds_s32
+int32_t test_vqadds_s32(int32_t a, int32_t b) {
+ return vqadds_s32(a, b);
+// CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqaddd_s64
+int64_t test_vqaddd_s64(int64_t a, int64_t b) {
+ return vqaddd_s64(a, b);
+// CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqaddb_u8
+uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) {
+ return vqaddb_u8(a, b);
+// CHECK: uqadd.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqaddh_u16
+uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) {
+ return vqaddh_u16(a, b);
+// CHECK: uqadd.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqadds_u32
+uint32_t test_vqadds_u32(uint32_t a, uint32_t b) {
+ return vqadds_u32(a, b);
+// CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqaddd_u64
+uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) {
+ return vqaddd_u64(a, b);
+// CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqsubb_s8
+int8_t test_vqsubb_s8(int8_t a, int8_t b) {
+ return vqsubb_s8(a, b);
+// CHECK: sqsub.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqsubh_s16
+int16_t test_vqsubh_s16(int16_t a, int16_t b) {
+ return vqsubh_s16(a, b);
+// CHECK: sqsub.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqsubs_s32
+int32_t test_vqsubs_s32(int32_t a, int32_t b) {
+ return vqsubs_s32(a, b);
+// CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqsubd_s64
+int64_t test_vqsubd_s64(int64_t a, int64_t b) {
+ return vqsubd_s64(a, b);
+// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqsubb_u8
+uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) {
+ return vqsubb_u8(a, b);
+// CHECK: uqsub.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqsubh_u16
+uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) {
+ return vqsubh_u16(a, b);
+// CHECK: uqsub.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqsubs_u32
+uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) {
+ return vqsubs_u32(a, b);
+// CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqsubd_u64
+uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) {
+ return vqsubd_u64(a, b);
+// CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqmovnh_s16
+int8_t test_vqmovnh_s16(int16_t a) {
+ return vqmovnh_s16(a);
+// CHECK: sqxtn.8b {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqmovnh_u16
+uint8_t test_vqmovnh_u16(uint16_t a) {
+ return vqmovnh_u16(a);
+// CHECK: uqxtn.8b {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqmovns_s32
+int16_t test_vqmovns_s32(int32_t a) {
+ return vqmovns_s32(a);
+// CHECK: sqxtn.4h {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqmovns_u32
+uint16_t test_vqmovns_u32(uint32_t a) {
+ return vqmovns_u32(a);
+// CHECK: uqxtn.4h {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqmovnd_s64
+int32_t test_vqmovnd_s64(int64_t a) {
+ return vqmovnd_s64(a);
+// CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqmovnd_u64
+uint32_t test_vqmovnd_u64(uint64_t a) {
+ return vqmovnd_u64(a);
+// CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqmovunh_s16
+int8_t test_vqmovunh_s16(int16_t a) {
+ return vqmovunh_s16(a);
+// CHECK: sqxtun.8b {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqmovuns_s32
+int16_t test_vqmovuns_s32(int32_t a) {
+ return vqmovuns_s32(a);
+// CHECK: sqxtun.4h {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqmovund_s64
+int32_t test_vqmovund_s64(int64_t a) {
+ return vqmovund_s64(a);
+// CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqabsb_s8
+int8_t test_vqabsb_s8(int8_t a) {
+ return vqabsb_s8(a);
+// CHECK: sqabs.8b {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqabsh_s16
+int16_t test_vqabsh_s16(int16_t a) {
+ return vqabsh_s16(a);
+// CHECK: sqabs.4h {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqabss_s32
+int32_t test_vqabss_s32(int32_t a) {
+ return vqabss_s32(a);
+// CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqabsd_s64
+int64_t test_vqabsd_s64(int64_t a) {
+ return vqabsd_s64(a);
+// CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vqnegb_s8
+int8_t test_vqnegb_s8(int8_t a) {
+ return vqnegb_s8(a);
+// CHECK: sqneg.8b {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqnegh_s16
+int16_t test_vqnegh_s16(int16_t a) {
+ return vqnegh_s16(a);
+// CHECK: sqneg.4h {{v[0-9]+}}, {{v[0-9]+}}
+}
+
+// CHECK: test_vqnegs_s32
+int32_t test_vqnegs_s32(int32_t a) {
+ return vqnegs_s32(a);
+// CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vqnegd_s64
+int64_t test_vqnegd_s64(int64_t a) {
+ return vqnegd_s64(a);
+// CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vcvts_n_f32_s32
+float32_t test_vcvts_n_f32_s32(int32_t a) {
+ return vcvts_n_f32_s32(a, 3);
+// CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #3
+}
+
+// CHECK: test_vcvts_n_f32_u32
+float32_t test_vcvts_n_f32_u32(uint32_t a) {
+ return vcvts_n_f32_u32(a, 3);
+// CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #3
+}
+
+// CHECK: test_vcvtd_n_f64_s64
+float64_t test_vcvtd_n_f64_s64(int64_t a) {
+ return vcvtd_n_f64_s64(a, 3);
+// CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #3
+}
+
+// CHECK: test_vcvtd_n_f64_u64
+float64_t test_vcvtd_n_f64_u64(uint64_t a) {
+ return vcvtd_n_f64_u64(a, 3);
+// CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #3
+}
+
+// CHECK: test_vcvts_n_s32_f32
+int32_t test_vcvts_n_s32_f32(float32_t a) {
+ return vcvts_n_s32_f32(a, 3);
+// CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #3
+}
+
+// CHECK: test_vcvts_n_u32_f32
+uint32_t test_vcvts_n_u32_f32(float32_t a) {
+ return vcvts_n_u32_f32(a, 3);
+// CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #3
+}
+
+// CHECK: test_vcvtd_n_s64_f64
+int64_t test_vcvtd_n_s64_f64(float64_t a) {
+ return vcvtd_n_s64_f64(a, 3);
+// CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #3
+}
+
+// CHECK: test_vcvtd_n_u64_f64
+uint64_t test_vcvtd_n_u64_f64(float64_t a) {
+ return vcvtd_n_u64_f64(a, 3);
+// CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #3
+}
+
+// CHECK: test_vcvtas_s32_f32
+int32_t test_vcvtas_s32_f32(float32_t a) {
+ return vcvtas_s32_f32(a);
+// CHECK: fcvtas {{w[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vcvtas_u32_f32
+uint32_t test_vcvtas_u32_f32(float32_t a) {
+ return vcvtas_u32_f32(a);
+// CHECK: fcvtau {{w[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vcvtad_s64_f64
+int64_t test_vcvtad_s64_f64(float64_t a) {
+ return vcvtad_s64_f64(a);
+// CHECK: fcvtas {{x[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vcvtad_u64_f64
+uint64_t test_vcvtad_u64_f64(float64_t a) {
+ return vcvtad_u64_f64(a);
+// CHECK: fcvtau {{x[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vcvtms_s32_f32
+int32_t test_vcvtms_s32_f32(float32_t a) {
+ return vcvtms_s32_f32(a);
+// CHECK: fcvtms {{w[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vcvtms_u32_f32
+uint32_t test_vcvtms_u32_f32(float32_t a) {
+ return vcvtms_u32_f32(a);
+// CHECK: fcvtmu {{w[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vcvtmd_s64_f64
+int64_t test_vcvtmd_s64_f64(float64_t a) {
+ return vcvtmd_s64_f64(a);
+// CHECK: fcvtms {{x[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vcvtmd_u64_f64
+uint64_t test_vcvtmd_u64_f64(float64_t a) {
+ return vcvtmd_u64_f64(a);
+// CHECK: fcvtmu {{x[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vcvtns_s32_f32
+int32_t test_vcvtns_s32_f32(float32_t a) {
+ return vcvtns_s32_f32(a);
+// CHECK: fcvtns {{w[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vcvtns_u32_f32
+uint32_t test_vcvtns_u32_f32(float32_t a) {
+ return vcvtns_u32_f32(a);
+// CHECK: fcvtnu {{w[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vcvtnd_s64_f64
+int64_t test_vcvtnd_s64_f64(float64_t a) {
+ return vcvtnd_s64_f64(a);
+// CHECK: fcvtns {{x[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vcvtnd_u64_f64
+uint64_t test_vcvtnd_u64_f64(float64_t a) {
+ return vcvtnd_u64_f64(a);
+// CHECK: fcvtnu {{x[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vcvtps_s32_f32
+int32_t test_vcvtps_s32_f32(float32_t a) {
+ return vcvtps_s32_f32(a);
+// CHECK: fcvtps {{w[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vcvtps_u32_f32
+uint32_t test_vcvtps_u32_f32(float32_t a) {
+ return vcvtps_u32_f32(a);
+// CHECK: fcvtpu {{w[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vcvtpd_s64_f64
+int64_t test_vcvtpd_s64_f64(float64_t a) {
+ return vcvtpd_s64_f64(a);
+// CHECK: fcvtps {{x[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vcvtpd_u64_f64
+uint64_t test_vcvtpd_u64_f64(float64_t a) {
+ return vcvtpd_u64_f64(a);
+// CHECK: fcvtpu {{x[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vcvtxd_f32_f64
+float32_t test_vcvtxd_f32_f64(float64_t a) {
+ return vcvtxd_f32_f64(a);
+// CHECK: fcvtxn {{s[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vabds_f32
+float32_t test_vabds_f32(float32_t a, float32_t b) {
+ return vabds_f32(a, b);
+  // CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vabdd_f64
+float64_t test_vabdd_f64(float64_t a, float64_t b) {
+ return vabdd_f64(a, b);
+  // CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+// CHECK: test_vmulxs_f32
+float32_t test_vmulxs_f32(float32_t a, float32_t b) {
+ return vmulxs_f32(a, b);
+  // CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+}
+
+// CHECK: test_vmulxd_f64
+float64_t test_vmulxd_f64(float64_t a, float64_t b) {
+ return vmulxd_f64(a, b);
+  // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
diff --git a/clang/test/CodeGen/arm64-vrnd.c b/clang/test/CodeGen/arm64-vrnd.c
new file mode 100644
index 00000000000..4de2ec73ac5
--- /dev/null
+++ b/clang/test/CodeGen/arm64-vrnd.c
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -ffreestanding -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_neon.h>
+
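+// vrnd maps to llvm.trunc, vrndn to arm64.neon.frintn, vrndm to llvm.floor,
+// vrndp to llvm.ceil, vrnda to llvm.round and vrndx to llvm.rint.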
+float32x2_t rnd1(float32x2_t a) { return vrnd_f32(a); }
+// CHECK: call <2 x float> @llvm.trunc.v2f32(<2 x float>
+float32x4_t rnd3(float32x4_t a) { return vrndq_f32(a); }
+// CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float>
+float64x2_t rnd5(float64x2_t a) { return vrndq_f64(a); }
+// CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double>
+
+
+float32x2_t rnd7(float32x2_t a) { return vrndn_f32(a); }
+// CHECK: call <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float>
+float32x4_t rnd8(float32x4_t a) { return vrndnq_f32(a); }
+// CHECK: call <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float>
+float64x2_t rnd9(float64x2_t a) { return vrndnq_f64(a); }
+// CHECK: call <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double>
+float64x2_t rnd10(float64x2_t a) { return vrndnq_f64(a); }
+// CHECK: call <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double>
+
+float32x2_t rnd11(float32x2_t a) { return vrndm_f32(a); }
+// CHECK: call <2 x float> @llvm.floor.v2f32(<2 x float>
+float32x4_t rnd12(float32x4_t a) { return vrndmq_f32(a); }
+// CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float>
+float64x2_t rnd13(float64x2_t a) { return vrndmq_f64(a); }
+// CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double>
+float64x2_t rnd14(float64x2_t a) { return vrndmq_f64(a); }
+// CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double>
+
+float32x2_t rnd15(float32x2_t a) { return vrndp_f32(a); }
+// CHECK: call <2 x float> @llvm.ceil.v2f32(<2 x float>
+float32x4_t rnd16(float32x4_t a) { return vrndpq_f32(a); }
+// CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float>
+float64x2_t rnd18(float64x2_t a) { return vrndpq_f64(a); }
+// CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double>
+
+float32x2_t rnd19(float32x2_t a) { return vrnda_f32(a); }
+// CHECK: call <2 x float> @llvm.round.v2f32(<2 x float>
+float32x4_t rnd20(float32x4_t a) { return vrndaq_f32(a); }
+// CHECK: call <4 x float> @llvm.round.v4f32(<4 x float>
+float64x2_t rnd22(float64x2_t a) { return vrndaq_f64(a); }
+// CHECK: call <2 x double> @llvm.round.v2f64(<2 x double>
+
+float32x2_t rnd23(float32x2_t a) { return vrndx_f32(a); }
+// CHECK: call <2 x float> @llvm.rint.v2f32(<2 x float>
+float32x4_t rnd24(float32x4_t a) { return vrndxq_f32(a); }
+// CHECK: call <4 x float> @llvm.rint.v4f32(<4 x float>
+float64x2_t rnd25(float64x2_t a) { return vrndxq_f64(a); }
+// CHECK: call <2 x double> @llvm.rint.v2f64(<2 x double>
+
diff --git a/clang/test/CodeGen/arm64-vrsqrt.c b/clang/test/CodeGen/arm64-vrsqrt.c
new file mode 100644
index 00000000000..45a536ca28e
--- /dev/null
+++ b/clang/test/CodeGen/arm64-vrsqrt.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -ffreestanding -emit-llvm -O1 -o - %s | FileCheck %s
+
+#include <arm_neon.h>
+
+uint32x2_t test_vrsqrte_u32(uint32x2_t in) {
+ // CHECK-LABEL: @test_vrsqrte_u32
+ // CHECK: call <2 x i32> @llvm.arm64.neon.ursqrte.v2i32(<2 x i32> %in)
+ return vrsqrte_u32(in);
+}
+
+float32x2_t test_vrsqrte_f32(float32x2_t in) {
+ // CHECK-LABEL: @test_vrsqrte_f32
+ // CHECK: call <2 x float> @llvm.arm64.neon.frsqrte.v2f32(<2 x float> %in)
+ return vrsqrte_f32(in);
+}
+
+
+uint32x4_t test_vrsqrteq_u32(uint32x4_t in) {
+ // CHECK-LABEL: @test_vrsqrteq_u32
+ // CHECK: call <4 x i32> @llvm.arm64.neon.ursqrte.v4i32(<4 x i32> %in)
+ return vrsqrteq_u32(in);
+}
+
+float32x4_t test_vrsqrteq_f32(float32x4_t in) {
+ // CHECK-LABEL: @test_vrsqrteq_f32
+ // CHECK: call <4 x float> @llvm.arm64.neon.frsqrte.v4f32(<4 x float> %in)
+ return vrsqrteq_f32(in);
+}
+
+
+float32x2_t test_vrsqrts_f32(float32x2_t est, float32x2_t val) {
+ // CHECK-LABEL: @test_vrsqrts_f32
+ // CHECK: call <2 x float> @llvm.arm64.neon.frsqrts.v2f32(<2 x float> %est, <2 x float> %val)
+ return vrsqrts_f32(est, val);
+}
+
+
+float32x4_t test_vrsqrtsq_f32(float32x4_t est, float32x4_t val) {
+ // CHECK-LABEL: @test_vrsqrtsq_f32
+ // CHECK: call <4 x float> @llvm.arm64.neon.frsqrts.v4f32(<4 x float> %est, <4 x float> %val)
+ return vrsqrtsq_f32(est, val);
+}
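+
+// A hedged sketch of the refinement these building blocks enable, assuming
+// FRSQRTS computes (3 - a*b) / 2: one Newton-Raphson step applied to the
+// hardware estimate. rsqrt_refined is hypothetical and has no CHECK lines.
+float32x2_t rsqrt_refined(float32x2_t x) {
+  float32x2_t est = vrsqrte_f32(x);
+  est = vmul_f32(est, vrsqrts_f32(vmul_f32(x, est), est));
+  return est;
+}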
+
diff --git a/clang/test/CodeGen/arm64_crypto.c b/clang/test/CodeGen/arm64_crypto.c
new file mode 100644
index 00000000000..7150c56c8af
--- /dev/null
+++ b/clang/test/CodeGen/arm64_crypto.c
@@ -0,0 +1,93 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -ffreestanding -Os -S -o - %s | FileCheck %s
+// REQUIRES: arm64-registered-target
+
+#include <arm_neon.h>
+
+uint8x16_t test_aese(uint8x16_t data, uint8x16_t key) {
+ // CHECK-LABEL: test_aese:
+ // CHECK: aese.16b v0, v1
+ return vaeseq_u8(data, key);
+}
+
+uint8x16_t test_aesd(uint8x16_t data, uint8x16_t key) {
+ // CHECK-LABEL: test_aesd:
+ // CHECK: aesd.16b v0, v1
+ return vaesdq_u8(data, key);
+}
+
+uint8x16_t test_aesmc(uint8x16_t data) {
+ // CHECK-LABEL: test_aesmc:
+ // CHECK: aesmc.16b v0, v0
+ return vaesmcq_u8(data);
+}
+
+uint8x16_t test_aesimc(uint8x16_t data) {
+ // CHECK-LABEL: test_aesimc:
+ // CHECK: aesimc.16b v0, v0
+ return vaesimcq_u8(data);
+}
+
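+// The scalar hash_e operand arrives in w0, so an fmov into an s-register is
+// expected before each sha1 instruction that consumes it.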
+uint32x4_t test_sha1c(uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) {
+ // CHECK-LABEL: test_sha1c:
+ // CHECK: fmov [[HASH_E:s[0-9]+]], w0
+ // CHECK: sha1c.4s q0, [[HASH_E]], v1
+ return vsha1cq_u32(hash_abcd, hash_e, wk);
+}
+
+uint32x4_t test_sha1p(uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) {
+ // CHECK-LABEL: test_sha1p:
+ // CHECK: fmov [[HASH_E:s[0-9]+]], w0
+ // CHECK: sha1p.4s q0, [[HASH_E]], v1
+ return vsha1pq_u32(hash_abcd, hash_e, wk);
+}
+
+uint32x4_t test_sha1m(uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) {
+ // CHECK-LABEL: test_sha1m:
+ // CHECK: fmov [[HASH_E:s[0-9]+]], w0
+ // CHECK: sha1m.4s q0, [[HASH_E]], v1
+ return vsha1mq_u32(hash_abcd, hash_e, wk);
+}
+
+uint32_t test_sha1h(uint32_t hash_e) {
+ // CHECK-LABEL: test_sha1h:
+ // CHECK: fmov [[HASH_E:s[0-9]+]], w0
+ // CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]]
+ // CHECK: fmov w0, [[RES]]
+ return vsha1h_u32(hash_e);
+}
+
+uint32x4_t test_sha1su0(uint32x4_t wk0_3, uint32x4_t wk4_7, uint32x4_t wk8_11) {
+ // CHECK-LABEL: test_sha1su0:
+ // CHECK: sha1su0.4s v0, v1, v2
+ return vsha1su0q_u32(wk0_3, wk4_7, wk8_11);
+}
+
+uint32x4_t test_sha1su1(uint32x4_t wk0_3, uint32x4_t wk12_15) {
+ // CHECK-LABEL: test_sha1su1:
+ // CHECK: sha1su1.4s v0, v1
+ return vsha1su1q_u32(wk0_3, wk12_15);
+}
+
+uint32x4_t test_sha256h(uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk) {
+ // CHECK-LABEL: test_sha256h:
+ // CHECK: sha256h.4s q0, q1, v2
+ return vsha256hq_u32(hash_abcd, hash_efgh, wk);
+}
+
+uint32x4_t test_sha256h2(uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk) {
+ // CHECK-LABEL: test_sha256h2:
+ // CHECK: sha256h2.4s q0, q1, v2
+ return vsha256h2q_u32(hash_efgh, hash_abcd, wk);
+}
+
+uint32x4_t test_sha256su0(uint32x4_t w0_3, uint32x4_t w4_7) {
+ // CHECK-LABEL: test_sha256su0:
+ // CHECK: sha256su0.4s v0, v1
+ return vsha256su0q_u32(w0_3, w4_7);
+}
+
+uint32x4_t test_sha256su1(uint32x4_t w0_3, uint32x4_t w8_11, uint32x4_t w12_15) {
+ // CHECK-LABEL: test_sha256su1:
+ // CHECK: sha256su1.4s v0, v1, v2
+ return vsha256su1q_u32(w0_3, w8_11, w12_15);
+}
diff --git a/clang/test/CodeGen/arm64_neon_high_half.c b/clang/test/CodeGen/arm64_neon_high_half.c
new file mode 100644
index 00000000000..920dded8d31
--- /dev/null
+++ b/clang/test/CodeGen/arm64_neon_high_half.c
@@ -0,0 +1,559 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -ffreestanding -Os -S -o - %s | FileCheck %s
+// REQUIRES: arm64-registered-target
+
+#include <arm_neon.h>
+
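+// Each *_high_* intrinsic reads the top 64 bits of its 128-bit operands, so
+// codegen should select the "2" (second-half) form of each instruction.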
+int16x8_t test_vaddw_high_s8(int16x8_t lhs, int8x16_t rhs) {
+ // CHECK: saddw2.8h
+ return vaddw_high_s8(lhs, rhs);
+}
+
+int32x4_t test_vaddw_high_s16(int32x4_t lhs, int16x8_t rhs) {
+ // CHECK: saddw2.4s
+ return vaddw_high_s16(lhs, rhs);
+}
+
+int64x2_t test_vaddw_high_s32(int64x2_t lhs, int32x4_t rhs) {
+ // CHECK: saddw2.2d
+ return vaddw_high_s32(lhs, rhs);
+}
+
+uint16x8_t test_vaddw_high_u8(uint16x8_t lhs, uint8x16_t rhs) {
+ // CHECK: uaddw2.8h
+ return vaddw_high_u8(lhs, rhs);
+}
+
+uint32x4_t test_vaddw_high_u16(uint32x4_t lhs, uint16x8_t rhs) {
+ // CHECK: uaddw2.4s
+ return vaddw_high_u16(lhs, rhs);
+}
+
+uint64x2_t test_vaddw_high_u32(uint64x2_t lhs, uint32x4_t rhs) {
+ // CHECK: uaddw2.2d
+ return vaddw_high_u32(lhs, rhs);
+}
+
+int16x8_t test_vsubw_high_s8(int16x8_t lhs, int8x16_t rhs) {
+ // CHECK: ssubw2.8h
+ return vsubw_high_s8(lhs, rhs);
+}
+
+int32x4_t test_vsubw_high_s16(int32x4_t lhs, int16x8_t rhs) {
+ // CHECK: ssubw2.4s
+ return vsubw_high_s16(lhs, rhs);
+}
+
+int64x2_t test_vsubw_high_s32(int64x2_t lhs, int32x4_t rhs) {
+ // CHECK: ssubw2.2d
+ return vsubw_high_s32(lhs, rhs);
+}
+
+uint16x8_t test_vsubw_high_u8(uint16x8_t lhs, uint8x16_t rhs) {
+ // CHECK: usubw2.8h
+ return vsubw_high_u8(lhs, rhs);
+}
+
+uint32x4_t test_vsubw_high_u16(uint32x4_t lhs, uint16x8_t rhs) {
+ // CHECK: usubw2.4s
+ return vsubw_high_u16(lhs, rhs);
+}
+
+uint64x2_t test_vsubw_high_u32(uint64x2_t lhs, uint32x4_t rhs) {
+ // CHECK: usubw2.2d
+ return vsubw_high_u32(lhs, rhs);
+}
+
+int16x8_t test_vabdl_high_s8(int8x16_t lhs, int8x16_t rhs) {
+ // CHECK: sabdl2.8h
+ return vabdl_high_s8(lhs, rhs);
+}
+
+int32x4_t test_vabdl_high_s16(int16x8_t lhs, int16x8_t rhs) {
+ // CHECK: sabdl2.4s
+ return vabdl_high_s16(lhs, rhs);
+}
+
+int64x2_t test_vabdl_high_s32(int32x4_t lhs, int32x4_t rhs) {
+ // CHECK: sabdl2.2d
+ return vabdl_high_s32(lhs, rhs);
+}
+
+uint16x8_t test_vabdl_high_u8(uint8x16_t lhs, uint8x16_t rhs) {
+ // CHECK: uabdl2.8h
+ return vabdl_high_u8(lhs, rhs);
+}
+
+uint32x4_t test_vabdl_high_u16(uint16x8_t lhs, uint16x8_t rhs) {
+ // CHECK: uabdl2.4s
+ return vabdl_high_u16(lhs, rhs);
+}
+
+uint64x2_t test_vabdl_high_u32(uint32x4_t lhs, uint32x4_t rhs) {
+ // CHECK: uabdl2.2d
+ return vabdl_high_u32(lhs, rhs);
+}
+
+int16x8_t test_vabal_high_s8(int16x8_t accum, int8x16_t lhs, int8x16_t rhs) {
+ // CHECK: sabal2.8h
+ return vabal_high_s8(accum, lhs, rhs);
+}
+
+int32x4_t test_vabal_high_s16(int32x4_t accum, int16x8_t lhs, int16x8_t rhs) {
+ // CHECK: sabal2.4s
+ return vabal_high_s16(accum, lhs, rhs);
+}
+
+int64x2_t test_vabal_high_s32(int64x2_t accum, int32x4_t lhs, int32x4_t rhs) {
+ // CHECK: sabal2.2d
+ return vabal_high_s32(accum, lhs, rhs);
+}
+
+uint16x8_t test_vabal_high_u8(uint16x8_t accum, uint8x16_t lhs, uint8x16_t rhs) {
+ // CHECK: uabal2.8h
+ return vabal_high_u8(accum, lhs, rhs);
+}
+
+uint32x4_t test_vabal_high_u16(uint32x4_t accum, uint16x8_t lhs, uint16x8_t rhs) {
+ // CHECK: uabal2.4s
+ return vabal_high_u16(accum, lhs, rhs);
+}
+
+uint64x2_t test_vabal_high_u32(uint64x2_t accum, uint32x4_t lhs, uint32x4_t rhs) {
+ // CHECK: uabal2.2d
+ return vabal_high_u32(accum, lhs, rhs);
+}
+
+int32x4_t test_vqdmlal_high_s16(int32x4_t accum, int16x8_t lhs, int16x8_t rhs) {
+ // CHECK: sqdmlal2.4s
+ return vqdmlal_high_s16(accum, lhs, rhs);
+}
+
+int64x2_t test_vqdmlal_high_s32(int64x2_t accum, int32x4_t lhs, int32x4_t rhs) {
+ // CHECK: sqdmlal2.2d
+ return vqdmlal_high_s32(accum, lhs, rhs);
+}
+
+int32x4_t test_vqdmlsl_high_s16(int32x4_t accum, int16x8_t lhs, int16x8_t rhs) {
+ // CHECK: sqdmlsl2.4s
+ return vqdmlsl_high_s16(accum, lhs, rhs);
+}
+
+int64x2_t test_vqdmlsl_high_s32(int64x2_t accum, int32x4_t lhs, int32x4_t rhs) {
+ // CHECK: sqdmlsl2.2d
+ return vqdmlsl_high_s32(accum, lhs, rhs);
+}
+
+int32x4_t test_vqdmull_high_s16(int16x8_t lhs, int16x8_t rhs) {
+ // CHECK: sqdmull2.4s
+ return vqdmull_high_s16(lhs, rhs);
+}
+
+int64x2_t test_vqdmull_high_s32(int32x4_t lhs, int32x4_t rhs) {
+ // CHECK: sqdmull2.2d
+ return vqdmull_high_s32(lhs, rhs);
+}
+
+int16x8_t test_vshll_high_n_s8(int8x16_t in) {
+ // CHECK: sshll2.8h
+ return vshll_high_n_s8(in, 7);
+}
+
+int32x4_t test_vshll_high_n_s16(int16x8_t in) {
+ // CHECK: sshll2.4s
+ return vshll_high_n_s16(in, 15);
+}
+
+int64x2_t test_vshll_high_n_s32(int32x4_t in) {
+ // CHECK: sshll2.2d
+ return vshll_high_n_s32(in, 31);
+}
+
+uint16x8_t test_vshll_high_n_u8(uint8x16_t in) {
+ // CHECK: ushll2.8h
+ return vshll_high_n_u8(in, 7);
+}
+
+uint32x4_t test_vshll_high_n_u16(uint16x8_t in) {
+ // CHECK: ushll2.4s
+ return vshll_high_n_u16(in, 15);
+}
+
+uint64x2_t test_vshll_high_n_u32(uint32x4_t in) {
+ // CHECK: ushll2.2d
+ return vshll_high_n_u32(in, 31);
+}
+
+int16x8_t test_vshll_high_n_s8_max(int8x16_t in) {
+ // CHECK: shll2.8h
+ return vshll_high_n_s8(in, 8);
+}
+
+int32x4_t test_vshll_high_n_s16_max(int16x8_t in) {
+ // CHECK: shll2.4s
+ return vshll_high_n_s16(in, 16);
+}
+
+int64x2_t test_vshll_high_n_s32_max(int32x4_t in) {
+ // CHECK: shll2.2d
+ return vshll_high_n_s32(in, 32);
+}
+
+uint16x8_t test_vshll_high_n_u8_max(uint8x16_t in) {
+ // CHECK: shll2.8h
+ return vshll_high_n_u8(in, 8);
+}
+
+uint32x4_t test_vshll_high_n_u16_max(uint16x8_t in) {
+ // CHECK: shll2.4s
+ return vshll_high_n_u16(in, 16);
+}
+
+uint64x2_t test_vshll_high_n_u32_max(uint32x4_t in) {
+ // CHECK: shll2.2d
+ return vshll_high_n_u32(in, 32);
+}
+
+int16x8_t test_vsubl_high_s8(int8x16_t lhs, int8x16_t rhs) {
+ // CHECK: ssubl2.8h
+ return vsubl_high_s8(lhs, rhs);
+}
+
+int32x4_t test_vsubl_high_s16(int16x8_t lhs, int16x8_t rhs) {
+ // CHECK: ssubl2.4s
+ return vsubl_high_s16(lhs, rhs);
+}
+
+int64x2_t test_vsubl_high_s32(int32x4_t lhs, int32x4_t rhs) {
+ // CHECK: ssubl2.2d
+ return vsubl_high_s32(lhs, rhs);
+}
+
+uint16x8_t test_vsubl_high_u8(uint8x16_t lhs, uint8x16_t rhs) {
+ // CHECK: usubl2.8h
+ return vsubl_high_u8(lhs, rhs);
+}
+
+uint32x4_t test_vsubl_high_u16(uint16x8_t lhs, uint16x8_t rhs) {
+ // CHECK: usubl2.4s
+ return vsubl_high_u16(lhs, rhs);
+}
+
+uint64x2_t test_vsubl_high_u32(uint32x4_t lhs, uint32x4_t rhs) {
+ // CHECK: usubl2.2d
+ return vsubl_high_u32(lhs, rhs);
+}
+
+int8x16_t test_vrshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
+ // CHECK: rshrn2.16b
+ return vrshrn_high_n_s16(lowpart, input, 2);
+}
+
+int16x8_t test_vrshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
+ // CHECK: rshrn2.8h
+ return vrshrn_high_n_s32(lowpart, input, 2);
+}
+
+int32x4_t test_vrshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
+ // CHECK: rshrn2.4s
+ return vrshrn_high_n_s64(lowpart, input, 2);
+}
+
+uint8x16_t test_vrshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
+ // CHECK: rshrn2.16b
+ return vrshrn_high_n_u16(lowpart, input, 2);
+}
+
+uint16x8_t test_vrshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
+ // CHECK: rshrn2.8h
+ return vrshrn_high_n_u32(lowpart, input, 2);
+}
+
+uint32x4_t test_vrshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
+ // CHECK: rshrn2.4s
+ return vrshrn_high_n_u64(lowpart, input, 2);
+}
+
+int8x16_t test_vshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
+ // CHECK: shrn2.16b
+ return vshrn_high_n_s16(lowpart, input, 2);
+}
+
+int16x8_t test_vshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
+ // CHECK: shrn2.8h
+ return vshrn_high_n_s32(lowpart, input, 2);
+}
+
+int32x4_t test_vshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
+ // CHECK: shrn2.4s
+ return vshrn_high_n_s64(lowpart, input, 2);
+}
+
+uint8x16_t test_vshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
+ // CHECK: shrn2.16b
+ return vshrn_high_n_u16(lowpart, input, 2);
+}
+
+uint16x8_t test_vshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
+ // CHECK: shrn2.8h
+ return vshrn_high_n_u32(lowpart, input, 2);
+}
+
+uint32x4_t test_vshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
+ // CHECK: shrn2.4s
+ return vshrn_high_n_u64(lowpart, input, 2);
+}
+
+uint8x16_t test_vqshrun_high_n_s16(uint8x8_t lowpart, int16x8_t input) {
+ // CHECK: sqshrun2.16b
+ return vqshrun_high_n_s16(lowpart, input, 2);
+}
+
+uint16x8_t test_vqshrun_high_n_s32(uint16x4_t lowpart, int32x4_t input) {
+ // CHECK: sqshrun2.8h
+ return vqshrun_high_n_s32(lowpart, input, 2);
+}
+
+uint32x4_t test_vqshrun_high_n_s64(uint32x2_t lowpart, int64x2_t input) {
+ // CHECK: sqshrun2.4s
+ return vqshrun_high_n_s64(lowpart, input, 2);
+}
+
+uint8x16_t test_vqrshrun_high_n_s16(uint8x8_t lowpart, int16x8_t input) {
+ // CHECK: sqrshrun2.16b
+ return vqrshrun_high_n_s16(lowpart, input, 2);
+}
+
+uint16x8_t test_vqrshrun_high_n_s32(uint16x4_t lowpart, int32x4_t input) {
+ // CHECK: sqrshrun2.8h
+ return vqrshrun_high_n_s32(lowpart, input, 2);
+}
+
+uint32x4_t test_vqrshrun_high_n_s64(uint32x2_t lowpart, int64x2_t input) {
+ // CHECK: sqrshrun2.4s
+ return vqrshrun_high_n_s64(lowpart, input, 2);
+}
+
+int8x16_t test_vqshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
+ // CHECK: sqshrn2.16b
+ return vqshrn_high_n_s16(lowpart, input, 2);
+}
+
+int16x8_t test_vqshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
+ // CHECK: sqshrn2.8h
+ return vqshrn_high_n_s32(lowpart, input, 2);
+}
+
+int32x4_t test_vqshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
+ // CHECK: sqshrn2.4s
+ return vqshrn_high_n_s64(lowpart, input, 2);
+}
+
+uint8x16_t test_vqshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
+ // CHECK: uqshrn2.16b
+ return vqshrn_high_n_u16(lowpart, input, 2);
+}
+
+uint16x8_t test_vqshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
+ // CHECK: uqshrn2.8h
+ return vqshrn_high_n_u32(lowpart, input, 2);
+}
+
+uint32x4_t test_vqshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
+ // CHECK: uqshrn2.4s
+ return vqshrn_high_n_u64(lowpart, input, 2);
+}
+
+int8x16_t test_vqrshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
+ // CHECK: sqrshrn2.16b
+ return vqrshrn_high_n_s16(lowpart, input, 2);
+}
+
+int16x8_t test_vqrshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
+ // CHECK: sqrshrn2.8h
+ return vqrshrn_high_n_s32(lowpart, input, 2);
+}
+
+int32x4_t test_vqrshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
+ // CHECK: sqrshrn2.4s
+ return vqrshrn_high_n_s64(lowpart, input, 2);
+}
+
+uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
+ // CHECK: uqrshrn2.16b
+ return vqrshrn_high_n_u16(lowpart, input, 2);
+}
+
+uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
+ // CHECK: uqrshrn2.8h
+ return vqrshrn_high_n_u32(lowpart, input, 2);
+}
+
+uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
+ // CHECK: uqrshrn2.4s
+ return vqrshrn_high_n_u64(lowpart, input, 2);
+}
+
+int8x16_t test_vaddhn_high_s16(int8x8_t lowpart, int16x8_t lhs, int16x8_t rhs) {
+ // CHECK: addhn2.16b v0, v1, v2
+ return vaddhn_high_s16(lowpart, lhs, rhs);
+}
+
+int16x8_t test_vaddhn_high_s32(int16x4_t lowpart, int32x4_t lhs, int32x4_t rhs) {
+ // CHECK: addhn2.8h v0, v1, v2
+ return vaddhn_high_s32(lowpart, lhs, rhs);
+}
+
+int32x4_t test_vaddhn_high_s64(int32x2_t lowpart, int64x2_t lhs, int64x2_t rhs) {
+ // CHECK: addhn2.4s v0, v1, v2
+ return vaddhn_high_s64(lowpart, lhs, rhs);
+}
+
+uint8x16_t test_vaddhn_high_u16(uint8x8_t lowpart, uint16x8_t lhs, uint16x8_t rhs) {
+ // CHECK: addhn2.16b v0, v1, v2
+ return vaddhn_high_u16(lowpart, lhs, rhs);
+}
+
+uint16x8_t test_vaddhn_high_u32(uint16x4_t lowpart, uint32x4_t lhs, uint32x4_t rhs) {
+ // CHECK: addhn2.8h v0, v1, v2
+ return vaddhn_high_u32(lowpart, lhs, rhs);
+}
+
+uint32x4_t test_vaddhn_high_u64(uint32x2_t lowpart, uint64x2_t lhs, uint64x2_t rhs) {
+ // CHECK: addhn2.4s v0, v1, v2
+ return vaddhn_high_u64(lowpart, lhs, rhs);
+}
+
+int8x16_t test_vraddhn_high_s16(int8x8_t lowpart, int16x8_t lhs, int16x8_t rhs) {
+ // CHECK: raddhn2.16b v0, v1, v2
+ return vraddhn_high_s16(lowpart, lhs, rhs);
+}
+
+int16x8_t test_vraddhn_high_s32(int16x4_t lowpart, int32x4_t lhs, int32x4_t rhs) {
+ // CHECK: raddhn2.8h v0, v1, v2
+ return vraddhn_high_s32(lowpart, lhs, rhs);
+}
+
+int32x4_t test_vraddhn_high_s64(int32x2_t lowpart, int64x2_t lhs, int64x2_t rhs) {
+ // CHECK: raddhn2.4s v0, v1, v2
+ return vraddhn_high_s64(lowpart, lhs, rhs);
+}
+
+uint8x16_t test_vraddhn_high_u16(uint8x8_t lowpart, uint16x8_t lhs, uint16x8_t rhs) {
+ // CHECK: raddhn2.16b v0, v1, v2
+ return vraddhn_high_u16(lowpart, lhs, rhs);
+}
+
+uint16x8_t test_vraddhn_high_u32(uint16x4_t lowpart, uint32x4_t lhs, uint32x4_t rhs) {
+ // CHECK: raddhn2.8h v0, v1, v2
+ return vraddhn_high_u32(lowpart, lhs, rhs);
+}
+
+uint32x4_t test_vraddhn_high_u64(uint32x2_t lowpart, uint64x2_t lhs, uint64x2_t rhs) {
+ // CHECK: raddhn2.4s v0, v1, v2
+ return vraddhn_high_u64(lowpart, lhs, rhs);
+}
+
+int8x16_t test_vmovn_high_s16(int8x8_t lowpart, int16x8_t wide) {
+ // CHECK: xtn2.16b v0, v1
+ return vmovn_high_s16(lowpart, wide);
+}
+
+int16x8_t test_vmovn_high_s32(int16x4_t lowpart, int32x4_t wide) {
+ // CHECK: xtn2.8h v0, v1
+ return vmovn_high_s32(lowpart, wide);
+}
+
+int32x4_t test_vmovn_high_s64(int32x2_t lowpart, int64x2_t wide) {
+ // CHECK: xtn2.4s v0, v1
+ return vmovn_high_s64(lowpart, wide);
+}
+
+uint8x16_t test_vmovn_high_u16(uint8x8_t lowpart, uint16x8_t wide) {
+ // CHECK: xtn2.16b v0, v1
+ return vmovn_high_u16(lowpart, wide);
+}
+
+uint16x8_t test_vmovn_high_u32(uint16x4_t lowpart, uint32x4_t wide) {
+ // CHECK: xtn2.8h v0, v1
+ return vmovn_high_u32(lowpart, wide);
+}
+
+uint32x4_t test_vmovn_high_u64(uint32x2_t lowpart, uint64x2_t wide) {
+ // CHECK: xtn2.4s v0, v1
+ return vmovn_high_u64(lowpart, wide);
+}
+
+int8x16_t test_vqmovn_high_s16(int8x8_t lowpart, int16x8_t wide) {
+ // CHECK: sqxtn2.16b v0, v1
+ return vqmovn_high_s16(lowpart, wide);
+}
+
+int16x8_t test_vqmovn_high_s32(int16x4_t lowpart, int32x4_t wide) {
+ // CHECK: sqxtn2.8h v0, v1
+ return vqmovn_high_s32(lowpart, wide);
+}
+
+int32x4_t test_vqmovn_high_s64(int32x2_t lowpart, int64x2_t wide) {
+ // CHECK: sqxtn2.4s v0, v1
+ return vqmovn_high_s64(lowpart, wide);
+}
+
+uint8x16_t test_vqmovn_high_u16(uint8x8_t lowpart, uint16x8_t wide) {
+ // CHECK: uqxtn2.16b v0, v1
+ return vqmovn_high_u16(lowpart, wide);
+}
+
+uint16x8_t test_vqmovn_high_u32(uint16x4_t lowpart, uint32x4_t wide) {
+ // CHECK: uqxtn2.8h v0, v1
+ return vqmovn_high_u32(lowpart, wide);
+}
+
+uint32x4_t test_vqmovn_high_u64(uint32x2_t lowpart, uint64x2_t wide) {
+ // CHECK: uqxtn2.4s v0, v1
+ return vqmovn_high_u64(lowpart, wide);
+}
+
+uint8x16_t test_vqmovun_high_s16(uint8x8_t lowpart, int16x8_t wide) {
+ // CHECK: sqxtun2.16b v0, v1
+ return vqmovun_high_s16(lowpart, wide);
+}
+
+uint16x8_t test_vqmovun_high_s32(uint16x4_t lowpart, int32x4_t wide) {
+ // CHECK: sqxtun2.8h v0, v1
+ return vqmovun_high_s32(lowpart, wide);
+}
+
+uint32x4_t test_vqmovun_high_s64(uint32x2_t lowpart, int64x2_t wide) {
+ // CHECK: sqxtun2.4s v0, v1
+ return vqmovun_high_s64(lowpart, wide);
+}
+
+float32x4_t test_vcvtx_high_f32_f64(float32x2_t lowpart, float64x2_t wide) {
+ // CHECK: fcvtxn2 v0.4s, v1.2d
+ return vcvtx_high_f32_f64(lowpart, wide);
+}
+
+float64x2_t test_vcvt_f64_f32(float32x2_t x) {
+ // CHECK: fcvtl v0.2d, v0.2s
+ return vcvt_f64_f32(x);
+}
+
+float64x2_t test_vcvt_high_f64_f32(float32x4_t x) {
+ // CHECK: fcvtl2 v0.2d, v0.4s
+ return vcvt_high_f64_f32(x);
+}
+
+float32x2_t test_vcvt_f32_f64(float64x2_t v) {
+ // CHECK: fcvtn v0.2s, v0.2d
+ return vcvt_f32_f64(v);
+}
+
+float32x4_t test_vcvt_high_f32_f64(float32x2_t x, float64x2_t v) {
+ // CHECK: fcvtn2 v0.4s, v1.2d
+ return vcvt_high_f32_f64(x, v);
+}
+
+float32x2_t test_vcvtx_f32_f64(float64x2_t v) {
+ // CHECK: fcvtxn v0.2s, v0.2d
+ return vcvtx_f32_f64(v);
+}
diff --git a/clang/test/CodeGen/arm64_vCMP.c b/clang/test/CodeGen/arm64_vCMP.c
new file mode 100644
index 00000000000..6216f40e1ac
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vCMP.c
@@ -0,0 +1,108 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+
+// Test ARM64 SIMD compare and absolute-value intrinsics
+
+#include <arm_neon.h>
+
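+// Editorial usage sketch (a hypothetical helper, not one of the checked
+// tests): the scalar compares below return an all-ones/all-zero mask (the
+// sext of an i1 in the IR checks), so the result can drive bitwise selection:
+static uint64_t select_by_mask_sketch(int64_t a, int64_t b, uint64_t x, uint64_t y) {
+ uint64_t m = (uint64_t)vceqd_s64(a, b); // 0 or ~0ULL
+ return (x & m) | (y & ~m); // x if a == b, else y
+}
+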
+int64x2_t test_vabsq_s64(int64x2_t a1) {
+ // CHECK: test_vabsq_s64
+ return vabsq_s64(a1);
+ // CHECK: llvm.arm64.neon.abs.v2i64
+ // CHECK-NEXT: ret
+}
+
+int64_t test_vceqd_s64(int64_t a1, int64_t a2) {
+ // CHECK: test_vceqd_s64
+ return vceqd_s64(a1, a2);
+ // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp eq i64 %a1, %a2
+ // CHECK: sext i1 [[BIT]] to i64
+}
+
+int64_t test_vceqd_f64(float64_t a1, float64_t a2) {
+ // CHECK: test_vceqd_f64
+ return vceqd_f64(a1, a2);
+ // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = fcmp oeq double %a1, %a2
+ // CHECK: sext i1 [[BIT]] to i64
+}
+
+uint64_t test_vcgtd_u64(uint64_t a1, uint64_t a2) {
+ // CHECK: test_vcgtd_u64
+ return vcgtd_u64(a1, a2);
+ // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp ugt i64 %a1, %a2
+ // CHECK: sext i1 [[BIT]] to i64
+}
+
+uint64_t test_vcled_u64(uint64_t a1, uint64_t a2) {
+ // CHECK: test_vcled_u64
+ return vcled_u64(a1, a2);
+ // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp ule i64 %a1, %a2
+ // CHECK: sext i1 [[BIT]] to i64
+}
+
+int64_t test_vceqzd_s64(int64_t a1) {
+ // CHECK: test_vceqzd_s64
+ return vceqzd_s64(a1);
+ // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp eq i64 %a1, 0
+ // CHECK: sext i1 [[BIT]] to i64
+}
+
+uint64x2_t test_vceqq_u64(uint64x2_t a1, uint64x2_t a2) {
+ // CHECK: test_vceqq_u64
+ return vceqq_u64(a1, a2);
+ // CHECK: icmp eq <2 x i64> %a1, %a2
+}
+
+uint64x2_t test_vcgeq_s64(int64x2_t a1, int64x2_t a2) {
+ // CHECK: test_vcgeq_s64
+ return vcgeq_s64(a1, a2);
+ // CHECK: icmp sge <2 x i64> %a1, %a2
+}
+
+uint64x2_t test_vcgeq_u64(uint64x2_t a1, uint64x2_t a2) {
+ // CHECK: test_vcgeq_u64
+ return vcgeq_u64(a1, a2);
+ // CHECK: icmp uge <2 x i64> %a1, %a2
+}
+
+uint64x2_t test_vcgtq_s64(int64x2_t a1, int64x2_t a2) {
+ // CHECK: test_vcgtq_s64
+ return vcgtq_s64(a1, a2);
+ // CHECK: icmp sgt <2 x i64> %a1, %a2
+}
+
+uint64x2_t test_vcgtq_u64(uint64x2_t a1, uint64x2_t a2) {
+ // CHECK: test_vcgtq_u64
+ return vcgtq_u64(a1, a2);
+ // CHECK: icmp ugt <2 x i64> %a1, %a2
+}
+
+uint64x2_t test_vcleq_s64(int64x2_t a1, int64x2_t a2) {
+ // CHECK: test_vcleq_s64
+ return vcleq_s64(a1, a2);
+ // CHECK: icmp sle <2 x i64> %a1, %a2
+}
+
+uint64x2_t test_vcleq_u64(uint64x2_t a1, uint64x2_t a2) {
+ // CHECK: test_vcleq_u64
+ return vcleq_u64(a1, a2);
+ // CHECK: icmp ule <2 x i64> %a1, %a2
+}
+
+uint64x2_t test_vcltq_s64(int64x2_t a1, int64x2_t a2) {
+ // CHECK: test_vcltq_s64
+ return vcltq_s64(a1, a2);
+ // CHECK: icmp slt <2 x i64> %a1, %a2
+}
+
+uint64x2_t test_vcltq_u64(uint64x2_t a1, uint64x2_t a2) {
+ // CHECK: test_vcltq_u64
+ return vcltq_u64(a1, a2);
+ // CHECK: icmp ult <2 x i64> %a1, %a2
+}
+
+int64x2_t test_vqabsq_s64(int64x2_t a1) {
+ // CHECK: test_vqabsq_s64
+ return vqabsq_s64(a1);
+ // CHECK: llvm.arm64.neon.sqabs.v2i64(<2 x i64> %a1)
+ // CHECK-NEXT: ret
+}
diff --git a/clang/test/CodeGen/arm64_vLdStNum_lane.c b/clang/test/CodeGen/arm64_vLdStNum_lane.c
new file mode 100644
index 00000000000..6c4415845a6
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vLdStNum_lane.c
@@ -0,0 +1,141 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// Test the ARM64 SIMD lane load and store intrinsics for N-element structures
+
+#include <arm_neon.h>
+
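+// Editorial semantics sketch (a hypothetical helper, not one of the checked
+// tests): a lane load reads N consecutive elements from memory and writes
+// one lane in each of the N sub-vectors, e.g. vld2q_lane_s64 at lane 1:
+static int64x2x2_t ld2q_lane_ref_sketch(const int64_t *p, int64x2x2_t acc) {
+ acc.val[0] = vsetq_lane_s64(p[0], acc.val[0], 1);
+ acc.val[1] = vsetq_lane_s64(p[1], acc.val[1], 1);
+ return acc; // behaviourally like vld2q_lane_s64(p, acc, 1)
+}
+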
+int64x2x2_t test_vld2q_lane_s64(const void * a1, int64x2x2_t a2) {
+ // CHECK: test_vld2q_lane_s64
+ return vld2q_lane_s64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.ld2lane.v2i64.p0i8
+}
+
+uint64x2x2_t test_vld2q_lane_u64(const void * a1, uint64x2x2_t a2) {
+ // CHECK: test_vld2q_lane_u64
+ return vld2q_lane_u64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.ld2lane.v2i64.p0i8
+}
+
+int64x1x2_t test_vld2_lane_s64(const void * a1, int64x1x2_t a2) {
+ // CHECK: test_vld2_lane_s64
+ return vld2_lane_s64(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld2lane.v1i64.p0i8
+}
+
+uint64x1x2_t test_vld2_lane_u64(const void * a1, uint64x1x2_t a2) {
+ // CHECK: test_vld2_lane_u64
+ return vld2_lane_u64(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld2lane.v1i64.p0i8
+}
+
+poly8x16x2_t test_vld2q_lane_p8(const void * a1, poly8x16x2_t a2) {
+ // CHECK: test_vld2q_lane_p8
+ return vld2q_lane_p8(a1, a2, 0);
+ // CHECK: extractvalue {{.*}} 0{{ *$}}
+ // CHECK: extractvalue {{.*}} 1{{ *$}}
+}
+
+uint8x16x2_t test_vld2q_lane_u8(const void * a1, uint8x16x2_t a2) {
+ // CHECK: test_vld2q_lane_u8
+ return vld2q_lane_u8(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld2lane.v16i8.p0i8
+}
+
+int64x2x3_t test_vld3q_lane_s64(const void * a1, int64x2x3_t a2) {
+ // CHECK: test_vld3q_lane_s64
+ return vld3q_lane_s64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.ld3lane.v2i64.p0i8
+}
+
+uint64x2x3_t test_vld3q_lane_u64(const void * a1, uint64x2x3_t a2) {
+ // CHECK: test_vld3q_lane_u64
+ return vld3q_lane_u64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.ld3lane.v2i64.p0i8
+}
+
+int64x1x3_t test_vld3_lane_s64(const void * a1, int64x1x3_t a2) {
+ // CHECK: test_vld3_lane_s64
+ return vld3_lane_s64(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld3lane.v1i64.p0i8
+}
+
+uint64x1x3_t test_vld3_lane_u64(const void * a1, uint64x1x3_t a2) {
+ // CHECK: test_vld3_lane_u64
+ return vld3_lane_u64(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld3lane.v1i64.p0i8
+}
+
+int8x8x3_t test_vld3_lane_s8(const void * a1, int8x8x3_t a2) {
+ // CHECK: test_vld3_lane_s8
+ return vld3_lane_s8(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld3lane.v8i8.p0i8
+}
+
+poly8x16x3_t test_vld3q_lane_p8(const void * a1, poly8x16x3_t a2) {
+ // CHECK: test_vld3q_lane_p8
+ return vld3q_lane_p8(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld3lane.v16i8.p0i8
+}
+
+uint8x16x3_t test_vld3q_lane_u8(const void * a1, uint8x16x3_t a2) {
+ // CHECK: test_vld3q_lane_u8
+ return vld3q_lane_u8(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld3lane.v16i8.p0i8
+}
+
+int64x2x4_t test_vld4q_lane_s64(const void * a1, int64x2x4_t a2) {
+ // CHECK: test_vld4q_lane_s64
+ return vld4q_lane_s64(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld4lane.v2i64.p0i8
+}
+
+uint64x2x4_t test_vld4q_lane_u64(const void * a1, uint64x2x4_t a2) {
+ // CHECK: test_vld4q_lane_u64
+ return vld4q_lane_u64(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld4lane.v2i64.p0i8
+}
+
+int64x1x4_t test_vld4_lane_s64(const void * a1, int64x1x4_t a2) {
+ // CHECK: test_vld4_lane_s64
+ return vld4_lane_s64(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld4lane.v1i64.p0i8
+}
+
+uint64x1x4_t test_vld4_lane_u64(const void * a1, uint64x1x4_t a2) {
+ // CHECK: test_vld4_lane_u64
+ return vld4_lane_u64(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld4lane.v1i64.p0i8
+}
+
+int8x8x4_t test_vld4_lane_s8(const void * a1, int8x8x4_t a2) {
+ // CHECK: test_vld4_lane_s8
+ return vld4_lane_s8(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld4lane.v8i8.p0i8
+}
+
+uint8x8x4_t test_vld4_lane_u8(const void * a1, uint8x8x4_t a2) {
+ // CHECK: test_vld4_lane_u8
+ return vld4_lane_u8(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld4lane.v8i8.p0i8
+}
+
+poly8x16x4_t test_vld4q_lane_p8(const void * a1, poly8x16x4_t a2) {
+ // CHECK: test_vld4q_lane_p8
+ return vld4q_lane_p8(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld4lane.v16i8.p0i8
+}
+
+int8x16x4_t test_vld4q_lane_s8(const void * a1, int8x16x4_t a2) {
+ // CHECK: test_vld4q_lane_s8
+ return vld4q_lane_s8(a1, a2, 0);
+ // CHECK: extractvalue {{.*}} 0{{ *$}}
+ // CHECK: extractvalue {{.*}} 1{{ *$}}
+ // CHECK: extractvalue {{.*}} 2{{ *$}}
+ // CHECK: extractvalue {{.*}} 3{{ *$}}
+}
+
+uint8x16x4_t test_vld4q_lane_u8(const void * a1, uint8x16x4_t a2) {
+ // CHECK: test_vld4q_lane_u8
+ return vld4q_lane_u8(a1, a2, 0);
+ // CHECK: llvm.arm64.neon.ld4lane.v16i8.p0i8
+}
+
diff --git a/clang/test/CodeGen/arm64_vMaxMin.c b/clang/test/CodeGen/arm64_vMaxMin.c
new file mode 100644
index 00000000000..8689bca701e
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vMaxMin.c
@@ -0,0 +1,207 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - %s | FileCheck -check-prefix=CHECK-CODEGEN %s
+// REQUIRES: arm64-registered-target
+// Test ARM64 SIMD max/min intrinsics
+
+#include <arm_neon.h>
+
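+// Editorial reference sketch (a hypothetical helper, not one of the checked
+// tests): an across-vector reduction can be modelled with log2(N) pairwise
+// steps, which is what the CHECK-CODEGEN lines for the 2-lane cases expect:
+static uint8_t vminv_ref_sketch(uint8x8_t a) {
+ uint8x8_t m = vpmin_u8(a, a); // 8 lanes -> 4 meaningful lanes
+ m = vpmin_u8(m, m); // -> 2 meaningful lanes
+ m = vpmin_u8(m, m); // -> lane 0 holds the minimum
+ return vget_lane_u8(m, 0); // same value as vminv_u8(a)
+}
+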
+// Test a representative sample of 8- and 16-bit, signed and unsigned, 64- and 128-bit reductions
+int8_t test_vmaxv_s8(int8x8_t a1) {
+ // CHECK: test_vmaxv_s8
+ return vmaxv_s8(a1);
+ // CHECK: llvm.arm64.neon.smaxv.i32.v8i8
+}
+
+uint16_t test_vminvq_u16(uint16x8_t a1) {
+ // CHECK: test_vminvq_u16
+ return vminvq_u16(a1);
+ // CHECK: llvm.arm64.neon.uminv.i32.v8i16
+}
+
+// Test a representative sample of 8- and 16-bit, signed and unsigned, 64- and 128-bit pairwise operations
+uint8x8_t test_vmin_u8(uint8x8_t a1, uint8x8_t a2) {
+ // CHECK: test_vmin_u8
+ return vmin_u8(a1, a2);
+ // CHECK: llvm.arm64.neon.umin.v8i8
+}
+
+uint8x16_t test_vminq_u8(uint8x16_t a1, uint8x16_t a2) {
+ // CHECK: test_vminq_u8
+ return vminq_u8(a1, a2);
+ // CHECK: llvm.arm64.neon.umin.v16i8
+}
+
+int16x8_t test_vmaxq_s16(int16x8_t a1, int16x8_t a2) {
+ // CHECK: test_vmaxq_s16
+ return vmaxq_s16(a1, a2);
+ // CHECK: llvm.arm64.neon.smax.v8i16
+}
+
+// Test the more complicated cases of [suf]32 and f64
+float64x2_t test_vmaxq_f64(float64x2_t a1, float64x2_t a2) {
+ // CHECK: test_vmaxq_f64
+ return vmaxq_f64(a1, a2);
+ // CHECK: llvm.arm64.neon.fmax.v2f64
+}
+
+float32x4_t test_vmaxq_f32(float32x4_t a1, float32x4_t a2) {
+ // CHECK: test_vmaxq_f32
+ return vmaxq_f32(a1, a2);
+ // CHECK: llvm.arm64.neon.fmax.v4f32
+}
+
+float64x2_t test_vminq_f64(float64x2_t a1, float64x2_t a2) {
+ // CHECK: test_vminq_f64
+ return vminq_f64(a1, a2);
+ // CHECK: llvm.arm64.neon.fmin.v2f64
+}
+
+float32x2_t test_vmax_f32(float32x2_t a1, float32x2_t a2) {
+ // CHECK: test_vmax_f32
+ return vmax_f32(a1, a2);
+ // CHECK: llvm.arm64.neon.fmax.v2f32
+}
+
+int32x2_t test_vmax_s32(int32x2_t a1, int32x2_t a2) {
+ // CHECK: test_vmax_s32
+ return vmax_s32(a1, a2);
+ // CHECK: llvm.arm64.neon.smax.v2i32
+}
+
+uint32x2_t test_vmin_u32(uint32x2_t a1, uint32x2_t a2) {
+ // CHECK: test_vmin_u32
+ return vmin_u32(a1, a2);
+ // CHECK: llvm.arm64.neon.umin.v2i32
+}
+
+float32_t test_vmaxnmv_f32(float32x2_t a1) {
+ // CHECK: test_vmaxnmv_f32
+ return vmaxnmv_f32(a1);
+ // CHECK: llvm.arm64.neon.fmaxnmv.f32.v2f32
+ // CHECK-NEXT: ret
+}
+
+// this doesn't translate into a valid instruction, regardless of what the
+// ARM doc says.
+#if 0
+float64_t test_vmaxnmvq_f64(float64x2_t a1) {
+ // CHECK@ test_vmaxnmvq_f64
+ return vmaxnmvq_f64(a1);
+ // CHECK@ llvm.arm64.neon.saddlv.i64.v2i32
+ // CHECK-NEXT@ ret
+}
+#endif
+
+float32_t test_vmaxnmvq_f32(float32x4_t a1) {
+ // CHECK: test_vmaxnmvq_f32
+ return vmaxnmvq_f32(a1);
+ // CHECK: llvm.arm64.neon.fmaxnmv.f32.v4f32
+ // CHECK-NEXT: ret
+}
+
+float32_t test_vmaxv_f32(float32x2_t a1) {
+ // CHECK: test_vmaxv_f32
+ return vmaxv_f32(a1);
+ // CHECK: llvm.arm64.neon.fmaxv.f32.v2f32
+ // FIXME check that the 2nd and 3rd arguments are the same V register below
+ // CHECK-CODEGEN: fmaxp.2s
+ // CHECK-NEXT: ret
+}
+
+int32_t test_vmaxv_s32(int32x2_t a1) {
+ // CHECK: test_vmaxv_s32
+ return vmaxv_s32(a1);
+ // CHECK: llvm.arm64.neon.smaxv.i32.v2i32
+ // FIXME check that the 2nd and 3rd arguments are the same V register below
+ // CHECK-CODEGEN: smaxp.2s
+ // CHECK-NEXT: ret
+}
+
+uint32_t test_vmaxv_u32(uint32x2_t a1) {
+ // CHECK: test_vmaxv_u32
+ return vmaxv_u32(a1);
+ // CHECK: llvm.arm64.neon.umaxv.i32.v2i32
+ // FIXME check that the 2nd and 3rd arguments are the same V register below
+ // CHECK-CODEGEN: umaxp.2s
+ // CHECK-NEXT: ret
+}
+
+// FIXME punt on this for now; don't forget to fix CHECKs
+#if 0
+float64_t test_vmaxvq_f64(float64x2_t a1) {
+ // CHECK@ test_vmaxvq_f64
+ return vmaxvq_f64(a1);
+ // CHECK@ llvm.arm64.neon.fmaxv.i64.v2f64
+ // CHECK-NEXT@ ret
+}
+#endif
+
+float32_t test_vmaxvq_f32(float32x4_t a1) {
+ // CHECK: test_vmaxvq_f32
+ return vmaxvq_f32(a1);
+ // CHECK: llvm.arm64.neon.fmaxv.f32.v4f32
+ // CHECK-NEXT: ret
+}
+
+float32_t test_vminnmv_f32(float32x2_t a1) {
+ // CHECK: test_vminnmv_f32
+ return vminnmv_f32(a1);
+ // CHECK: llvm.arm64.neon.fminnmv.f32.v2f32
+ // CHECK-NEXT: ret
+}
+
+float32_t test_vminvq_f32(float32x4_t a1) {
+ // CHECK: test_vminvq_f32
+ return vminvq_f32(a1);
+ // CHECK: llvm.arm64.neon.fminv.f32.v4f32
+ // CHECK-NEXT: ret
+}
+
+// this doesn't translate into a valid instruction, regardless of what the ARM
+// doc says.
+#if 0
+float64_t test_vminnmvq_f64(float64x2_t a1) {
+ // CHECK@ test_vminnmvq_f64
+ return vminnmvq_f64(a1);
+ // CHECK@ llvm.arm64.neon.saddlv.i64.v2i32
+ // CHECK-NEXT@ ret
+}
+#endif
+
+float32_t test_vminnmvq_f32(float32x4_t a1) {
+ // CHECK: test_vminnmvq_f32
+ return vminnmvq_f32(a1);
+ // CHECK: llvm.arm64.neon.fminnmv.f32.v4f32
+ // CHECK-NEXT: ret
+}
+
+float32_t test_vminv_f32(float32x2_t a1) {
+ // CHECK: test_vminv_f32
+ return vminv_f32(a1);
+ // CHECK: llvm.arm64.neon.fminv.f32.v2f32
+ // CHECK-NEXT: ret
+}
+
+int32_t test_vminv_s32(int32x2_t a1) {
+ // CHECK: test_vminv_s32
+ return vminv_s32(a1);
+ // CHECK: llvm.arm64.neon.sminv.i32.v2i32
+ // CHECK-CODEGEN: sminp.2s
+ // CHECK-NEXT: ret
+}
+
+uint32_t test_vminv_u32(uint32x2_t a1) {
+ // CHECK: test_vminv_u32
+ return vminv_u32(a1);
+ // CHECK: llvm.arm64.neon.uminv.i32.v2i32
+}
+
+// FIXME punt on this for now; don't forget to fix CHECKs
+#if 0
+float64_t test_vminvq_f64(float64x2_t a1) {
+ // CHECK@ test_vminvq_f64
+ return vminvq_f64(a1);
+ // CHECK@ llvm.arm64.neon.saddlv.i64.v2i32
+ // CHECK-NEXT@ ret
+}
+#endif
diff --git a/clang/test/CodeGen/arm64_vadd.c b/clang/test/CodeGen/arm64_vadd.c
new file mode 100644
index 00000000000..acec888e260
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vadd.c
@@ -0,0 +1,102 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// Test ARM64 SIMD add intrinsics
+
+#include <arm_neon.h>
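+
+// Editorial semantics sketch (a hypothetical helper, not one of the checked
+// tests): vaddv* sums all lanes into a scalar of the same width, while
+// vaddlv* widens first, so the 2 x i32 variant cannot wrap:
+static int64_t vaddlv_ref_sketch(int32x2_t v) {
+ return (int64_t)vget_lane_s32(v, 0) + (int64_t)vget_lane_s32(v, 1);
+}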
+int64_t test_vaddlv_s32(int32x2_t a1) {
+ // CHECK: test_vaddlv_s32
+ return vaddlv_s32(a1);
+ // CHECK: llvm.arm64.neon.saddlv.i64.v2i32
+ // CHECK-NEXT: ret
+}
+
+uint64_t test_vaddlv_u32(uint32x2_t a1) {
+ // CHECK: test_vaddlv_u32
+ return vaddlv_u32(a1);
+ // CHECK: llvm.arm64.neon.uaddlv.i64.v2i32
+ // CHECK-NEXT: ret
+}
+
+int8_t test_vaddv_s8(int8x8_t a1) {
+ // CHECK: test_vaddv_s8
+ return vaddv_s8(a1);
+ // CHECK: llvm.arm64.neon.saddv.i32.v8i8
+ // don't check for return here (there's a trunc?)
+}
+
+int16_t test_vaddv_s16(int16x4_t a1) {
+ // CHECK: test_vaddv_s16
+ return vaddv_s16(a1);
+ // CHECK: llvm.arm64.neon.saddv.i32.v4i16
+ // don't check for return here (there's a trunc?)
+}
+
+int32_t test_vaddv_s32(int32x2_t a1) {
+ // CHECK: test_vaddv_s32
+ return vaddv_s32(a1);
+ // CHECK: llvm.arm64.neon.saddv.i32.v2i32
+ // CHECK-NEXT: ret
+}
+
+uint8_t test_vaddv_u8(uint8x8_t a1) {
+ // CHECK: test_vaddv_u8
+ return vaddv_u8(a1);
+ // CHECK: llvm.arm64.neon.uaddv.i32.v8i8
+ // don't check for return here (there's a trunc?)
+}
+
+uint16_t test_vaddv_u16(uint16x4_t a1) {
+ // CHECK: test_vaddv_u16
+ return vaddv_u16(a1);
+ // CHECK: llvm.arm64.neon.uaddv.i32.v4i16
+ // don't check for return here (there's a trunc?)
+}
+
+uint32_t test_vaddv_u32(uint32x2_t a1) {
+ // CHECK: test_vaddv_u32
+ return vaddv_u32(a1);
+ // CHECK: llvm.arm64.neon.uaddv.i32.v2i32
+ // CHECK-NEXT: ret
+}
+
+int8_t test_vaddvq_s8(int8x16_t a1) {
+ // CHECK: test_vaddvq_s8
+ return vaddvq_s8(a1);
+ // CHECK: llvm.arm64.neon.saddv.i32.v16i8
+ // don't check for return here (there's a trunc?)
+}
+
+int16_t test_vaddvq_s16(int16x8_t a1) {
+ // CHECK: test_vaddvq_s16
+ return vaddvq_s16(a1);
+ // CHECK: llvm.arm64.neon.saddv.i32.v8i16
+ // don't check for return here (there's a trunc?)
+}
+
+int32_t test_vaddvq_s32(int32x4_t a1) {
+ // CHECK: test_vaddvq_s32
+ return vaddvq_s32(a1);
+ // CHECK: llvm.arm64.neon.saddv.i32.v4i32
+ // CHECK-NEXT: ret
+}
+
+uint8_t test_vaddvq_u8(uint8x16_t a1) {
+ // CHECK: test_vaddvq_u8
+ return vaddvq_u8(a1);
+ // CHECK: llvm.arm64.neon.uaddv.i32.v16i8
+ // don't check for return here (there's a trunc?)
+}
+
+uint16_t test_vaddvq_u16(uint16x8_t a1) {
+ // CHECK: test_vaddvq_u16
+ return vaddvq_u16(a1);
+ // CHECK: llvm.arm64.neon.uaddv.i32.v8i16
+ // don't check for return here (there's a trunc?)
+}
+
+uint32_t test_vaddvq_u32(uint32x4_t a1) {
+ // CHECK: test_vaddvq_u32
+ return vaddvq_u32(a1);
+ // CHECK: llvm.arm64.neon.uaddv.i32.v4i32
+ // CHECK-NEXT: ret
+}
+
diff --git a/clang/test/CodeGen/arm64_vca.c b/clang/test/CodeGen/arm64_vca.c
new file mode 100644
index 00000000000..28109c806c5
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vca.c
@@ -0,0 +1,59 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// Test ARM64 vector compare absolute intrinsics
+
+#include <arm_neon.h>
+
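+// Editorial note (the helper is hypothetical, not one of the checked tests):
+// there is no "absolute compare <=" instruction, so vcale/vcalt are emitted
+// as facge/facgt with swapped operands, hence the {{.*a2,.*a1}} checks below:
+static uint32x2_t vcale_ref_sketch(float32x2_t a, float32x2_t b) {
+ return vcage_f32(b, a); // |a| <= |b| is the same as |b| >= |a|
+}
+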
+uint32x2_t test_vcale_f32(float32x2_t a1, float32x2_t a2) {
+ // CHECK: test_vcale_f32
+ return vcale_f32(a1, a2);
+ // CHECK: llvm.arm64.neon.facge.v2i32.v2f32
+ // no check for ret here, as there is a bitcast
+}
+
+uint32x4_t test_vcaleq_f32(float32x4_t a1, float32x4_t a2) {
+ // CHECK: test_vcaleq_f32
+ return vcaleq_f32(a1, a2);
+ // CHECK: llvm.arm64.neon.facge.v4i32.v4f32{{.*a2,.*a1}}
+ // no check for ret here, as there is a bitcast
+}
+
+uint32x2_t test_vcalt_f32(float32x2_t a1, float32x2_t a2) {
+ // CHECK: test_vcalt_f32
+ return vcalt_f32(a1, a2);
+ // CHECK: llvm.arm64.neon.facgt.v2i32.v2f32{{.*a2,.*a1}}
+ // no check for ret here, as there is a bitcast
+}
+
+uint32x4_t test_vcaltq_f32(float32x4_t a1, float32x4_t a2) {
+ // CHECK: test_vcaltq_f32
+ return vcaltq_f32(a1, a2);
+ // CHECK: llvm.arm64.neon.facgt.v4i32.v4f32{{.*a2,.*a1}}
+}
+
+uint64x2_t test_vcagtq_f64(float64x2_t a1, float64x2_t a2) {
+ // CHECK: test_vcagtq_f64
+ return vcagtq_f64(a1, a2);
+ // CHECK: llvm.arm64.neon.facgt.v2i64.v2f64{{.*a1,.*a2}}
+ // no check for ret here, as there is a bitcast
+}
+
+uint64x2_t test_vcaltq_f64(float64x2_t a1, float64x2_t a2) {
+ // CHECK: test_vcaltq_f64
+ return vcaltq_f64(a1, a2);
+ // CHECK: llvm.arm64.neon.facgt.v2i64.v2f64{{.*a2,.*a1}}
+ // no check for ret here, as there is a bitcast
+}
+
+uint64x2_t test_vcageq_f64(float64x2_t a1, float64x2_t a2) {
+ // CHECK: test_vcageq_f64
+ return vcageq_f64(a1, a2);
+ // CHECK: llvm.arm64.neon.facge.v2i64.v2f64{{.*a1,.*a2}}
+ // no check for ret here, as there is a bitcast
+}
+
+uint64x2_t test_vcaleq_f64(float64x2_t a1, float64x2_t a2) {
+ // CHECK: test_vcaleq_f64
+ return vcaleq_f64(a1, a2);
+ // CHECK: llvm.arm64.neon.facge.v2i64.v2f64{{.*a2,.*a1}}
+ // no check for ret here, as there is a bitcast
+}
diff --git a/clang/test/CodeGen/arm64_vcopy.c b/clang/test/CodeGen/arm64_vcopy.c
new file mode 100644
index 00000000000..d01b7d0d351
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vcopy.c
@@ -0,0 +1,69 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+
+// Test ARM64 SIMD copy vector element to vector element: vcopyq_lane*
+
+#include <arm_neon.h>
+
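+// Editorial semantics sketch (a hypothetical helper, not one of the checked
+// tests): vcopyq_laneq_s8(a, i, b, j) yields a with lane i replaced by lane
+// j of b, so destination lane i in the shuffle masks below reads index 16 + j:
+static int8x16_t vcopy_ref_sketch(int8x16_t a, int8x16_t b) {
+ return vsetq_lane_s8(vgetq_lane_s8(b, 13), a, 3); // == vcopyq_laneq_s8(a, 3, b, 13)
+}
+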
+int8x16_t test_vcopyq_laneq_s8(int8x16_t a1, int8x16_t a2) {
+ // CHECK-LABEL: test_vcopyq_laneq_s8
+ return vcopyq_laneq_s8(a1, (int64_t) 3, a2, (int64_t) 13);
+ // CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> <i32 0, i32 1, i32 2, i32 29, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+}
+
+uint8x16_t test_vcopyq_laneq_u8(uint8x16_t a1, uint8x16_t a2) {
+ // CHECK-LABEL: test_vcopyq_laneq_u8
+ return vcopyq_laneq_u8(a1, (int64_t) 3, a2, (int64_t) 13);
+ // CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> <i32 0, i32 1, i32 2, i32 29, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+}
+
+int16x8_t test_vcopyq_laneq_s16(int16x8_t a1, int16x8_t a2) {
+ // CHECK-LABEL: test_vcopyq_laneq_s16
+ return vcopyq_laneq_s16(a1, (int64_t) 3, a2, (int64_t) 7);
+ // CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> <i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6, i32 7>
+
+}
+
+uint16x8_t test_vcopyq_laneq_u16(uint16x8_t a1, uint16x8_t a2) {
+ // CHECK-LABEL: test_vcopyq_laneq_u16
+ return vcopyq_laneq_u16(a1, (int64_t) 3, a2, (int64_t) 7);
+ // CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> <i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6, i32 7>
+
+}
+
+int32x4_t test_vcopyq_laneq_s32(int32x4_t a1, int32x4_t a2) {
+ // CHECK-LABEL: test_vcopyq_laneq_s32
+ return vcopyq_laneq_s32(a1, (int64_t) 3, a2, (int64_t) 3);
+ // CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+}
+
+uint32x4_t test_vcopyq_laneq_u32(uint32x4_t a1, uint32x4_t a2) {
+ // CHECK-LABEL: test_vcopyq_laneq_u32
+ return vcopyq_laneq_u32(a1, (int64_t) 3, a2, (int64_t) 3);
+ // CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+}
+
+int64x2_t test_vcopyq_laneq_s64(int64x2_t a1, int64x2_t a2) {
+ // CHECK-LABEL: test_vcopyq_laneq_s64
+ return vcopyq_laneq_s64(a1, (int64_t) 0, a2, (int64_t) 1);
+ // CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> <i32 3, i32 1>
+}
+
+uint64x2_t test_vcopyq_laneq_u64(uint64x2_t a1, uint64x2_t a2) {
+ // CHECK-LABEL: test_vcopyq_laneq_u64
+ return vcopyq_laneq_u64(a1, (int64_t) 0, a2, (int64_t) 1);
+ // CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> <i32 3, i32 1>
+}
+
+float32x4_t test_vcopyq_laneq_f32(float32x4_t a1, float32x4_t a2) {
+ // CHECK-LABEL: test_vcopyq_laneq_f32
+ return vcopyq_laneq_f32(a1, 0, a2, 3);
+ // CHECK: shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
+}
+
+float64x2_t test_vcopyq_laneq_f64(float64x2_t a1, float64x2_t a2) {
+ // CHECK-LABEL: test_vcopyq_laneq_f64
+ return vcopyq_laneq_f64(a1, 0, a2, 1);
+ // CHECK: shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 3, i32 1>
+}
+
diff --git a/clang/test/CodeGen/arm64_vcreate.c b/clang/test/CodeGen/arm64_vcreate.c
new file mode 100644
index 00000000000..6447dc4d253
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vcreate.c
@@ -0,0 +1,23 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// Test ARM64 SIMD vcreate intrinsics
+
+#include <arm_neon.h>
+
+float32x2_t test_vcreate_f32(uint64_t a1) {
+ // CHECK: test_vcreate_f32
+ return vcreate_f32(a1);
+ // CHECK: bitcast {{.*}} to <2 x float>
+ // CHECK-NEXT: ret
+}
+
+// FIXME enable when scalar_to_vector in backend is fixed. Also, change
+// CHECK@ to CHECK<colon> and CHECK-NEXT@ to CHECK-NEXT<colon>
+/*
+float64x1_t test_vcreate_f64(uint64_t a1) {
+ // CHECK@ test_vcreate_f64
+ return vcreate_f64(a1);
+ // CHECK@ llvm.arm64.neon.saddlv.i64.v2i32
+ // CHECK-NEXT@ ret
+}
+*/
diff --git a/clang/test/CodeGen/arm64_vcvtfp.c b/clang/test/CodeGen/arm64_vcvtfp.c
new file mode 100644
index 00000000000..030fe772e42
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vcvtfp.c
@@ -0,0 +1,48 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+
+#include <arm_neon.h>
+
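+// Editorial equivalence sketch (a hypothetical helper, not one of the
+// checked tests): the _high conversions operate on the top two f32 lanes,
+// matching the shufflevector of indices <2, 3> checked below:
+static float64x2_t cvt_high_ref_sketch(float32x4_t x) {
+ return vcvt_f64_f32(vget_high_f32(x)); // same result as vcvt_high_f64_f32(x)
+}
+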
+float64x2_t test_vcvt_f64_f32(float32x2_t x) {
+ // CHECK-LABEL: test_vcvt_f64_f32
+ return vcvt_f64_f32(x);
+ // CHECK: fpext <2 x float> {{%.*}} to <2 x double>
+ // CHECK-NEXT: ret
+}
+
+float64x2_t test_vcvt_high_f64_f32(float32x4_t x) {
+ // CHECK-LABEL: test_vcvt_high_f64_f32
+ return vcvt_high_f64_f32(x);
+ // CHECK: [[HIGH:%.*]] = shufflevector <4 x float> {{%.*}}, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+ // CHECK-NEXT: fpext <2 x float> [[HIGH]] to <2 x double>
+ // CHECK-NEXT: ret
+}
+
+float32x2_t test_vcvt_f32_f64(float64x2_t v) {
+ // CHECK: test_vcvt_f32_f64
+ return vcvt_f32_f64(v);
+ // CHECK: fptrunc <2 x double> {{%.*}} to <2 x float>
+ // CHECK-NEXT: ret
+}
+
+float32x4_t test_vcvt_high_f32_f64(float32x2_t x, float64x2_t v) {
+ // CHECK: test_vcvt_high_f32_f64
+ return vcvt_high_f32_f64(x, v);
+ // CHECK: [[TRUNC:%.*]] = fptrunc <2 x double> {{.*}} to <2 x float>
+ // CHECK-NEXT: shufflevector <2 x float> {{.*}}, <2 x float> [[TRUNC]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK-NEXT: ret
+}
+
+float32x2_t test_vcvtx_f32_f64(float64x2_t v) {
+ // CHECK: test_vcvtx_f32_f64
+ return vcvtx_f32_f64(v);
+ // CHECK: llvm.arm64.neon.fcvtxn.v2f32.v2f64
+ // CHECK-NEXT: ret
+}
+
+float32x4_t test_vcvtx_high_f32_f64(float32x2_t x, float64x2_t v) {
+ // CHECK: test_vcvtx_high_f32_f64
+ return vcvtx_high_f32_f64(x, v);
+ // CHECK: llvm.arm64.neon.fcvtxn.v2f32.v2f64
+ // CHECK: shufflevector
+ // CHECK-NEXT: ret
+}
diff --git a/clang/test/CodeGen/arm64_vdup.c b/clang/test/CodeGen/arm64_vdup.c
new file mode 100644
index 00000000000..f9773e4f476
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vdup.c
@@ -0,0 +1,42 @@
+// RUN: %clang -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// Test ARM64 SIMD duplicate lane and n intrinsics
+
+#include <arm_neon.h>
+
+void test_vdup_lane_s64(int64x1_t a1) {
+ // CHECK-LABEL: test_vdup_lane_s64
+ vdup_lane_s64(a1, 0);
+ // CHECK: shufflevector
+}
+
+void test_vdup_lane_u64(uint64x1_t a1) {
+ // CHECK-LABEL: test_vdup_lane_u64
+ vdup_lane_u64(a1, 0);
+ // CHECK: shufflevector
+}
+
+// Uncomment the following code once scalar_to_vector in the backend
+// works (for 64 bit?). Change the "CHECK@" to "CHECK<colon>"
+/*
+float64x1_t test_vdup_n_f64(float64_t a1) {
+ // CHECK-LABEL@ test_vdup_n_f64
+ return vdup_n_f64(a1);
+ // match that an element is inserted into part 0
+ // CHECK@ insertelement {{.*, i32 0 *$}}
+}
+*/
+
+float16x8_t test_vdupq_n_f16(float16_t *a1) {
+ // CHECK-LABEL: test_vdupq_n_f16
+ return vdupq_n_f16(*a1);
+ // match that an element is inserted into lanes 0-7. The backend had better
+ // turn that into a single dup instruction
+ // CHECK: insertelement {{.*, i32 0 *$}}
+ // CHECK: insertelement {{.*, i32 1 *$}}
+ // CHECK: insertelement {{.*, i32 2 *$}}
+ // CHECK: insertelement {{.*, i32 3 *$}}
+ // CHECK: insertelement {{.*, i32 4 *$}}
+ // CHECK: insertelement {{.*, i32 5 *$}}
+ // CHECK: insertelement {{.*, i32 6 *$}}
+ // CHECK: insertelement {{.*, i32 7 *$}}
+}
diff --git a/clang/test/CodeGen/arm64_vdupq_n_f64.c b/clang/test/CodeGen/arm64_vdupq_n_f64.c
new file mode 100644
index 00000000000..139bfffb8d8
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vdupq_n_f64.c
@@ -0,0 +1,88 @@
+// RUN: %clang -O3 -target arm64-apple-ios7 -ffreestanding -S -o - %s | FileCheck %s
+// RUN: %clang -O3 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | \
+// RUN: FileCheck -check-prefix=CHECK-IR %s
+// REQUIRES: arm64-registered-target
+
+/// Test vdupq_n_f64 and vmovq_n_f64 ARM64 intrinsics
+// <rdar://problem/11778405> ARM64: vdupq_n_f64 and vdupq_lane_f64 intrinsics
+// missing
+
+
+#include <arm_neon.h>
+
+// vdupq_n_f64 -> dup.2d v0, v0[0]
+//
+float64x2_t test_vdupq_n_f64(float64_t w)
+{
+ return vdupq_n_f64(w);
+ // CHECK-LABEL: test_vdupq_n_f64:
+ // CHECK: dup.2d v0, v0[0]
+ // CHECK-NEXT: ret
+}
+
+// might as well test this while we're here
+// vdupq_n_f32 -> dup.4s v0, v0[0]
+float32x4_t test_vdupq_n_f32(float32_t w)
+{
+ return vdupq_n_f32(w);
+ // CHECK-LABEL: test_vdupq_n_f32:
+ // CHECK: dup.4s v0, v0[0]
+ // CHECK-NEXT: ret
+}
+
+// vdupq_lane_f64 -> dup.2d v0, v0[0]
+// This was in <rdar://problem/11778405>, but had already been implemented;
+// test it anyway.
+float64x2_t test_vdupq_lane_f64(float64x1_t V)
+{
+ return vdupq_lane_f64(V, 0);
+ // CHECK-LABEL: test_vdupq_lane_f64:
+ // CHECK: dup.2d v0, v0[0]
+ // CHECK-NEXT: ret
+}
+
+// vmovq_n_f64 -> dup Vd.2d,X0
+// This wasn't in <rdar://problem/11778405>, but it sits between the vdup tests.
+float64x2_t test_vmovq_n_f64(float64_t w)
+{
+ return vmovq_n_f64(w);
+ // CHECK-LABEL: test_vmovq_n_f64:
+ // CHECK: dup.2d v0, v0[0]
+ // CHECK-NEXT: ret
+}
+
+float16x4_t test_vmov_n_f16(float16_t *a1)
+{
+ // CHECK-IR-LABEL: test_vmov_n_f16
+ return vmov_n_f16(*a1);
+ // CHECK-IR: insertelement {{.*}} i32 0{{ *$}}
+ // CHECK-IR: insertelement {{.*}} i32 1{{ *$}}
+ // CHECK-IR: insertelement {{.*}} i32 2{{ *$}}
+ // CHECK-IR: insertelement {{.*}} i32 3{{ *$}}
+}
+
+// Disable until scalar problem in backend is fixed. Change CHECK-IR@ to
+// CHECK-IR<colon>
+/*
+float64x1_t test_vmov_n_f64(float64_t a1)
+{
+ // CHECK-IR@ test_vmov_n_f64
+ return vmov_n_f64(a1);
+ // CHECK-IR@ insertelement {{.*}} i32 0{{ *$}}
+}
+*/
+
+float16x8_t test_vmovq_n_f16(float16_t *a1)
+{
+ // CHECK-IR-LABEL: test_vmovq_n_f16
+ return vmovq_n_f16(*a1);
+ // CHECK-IR: insertelement {{.*}} i32 0{{ *$}}
+ // CHECK-IR: insertelement {{.*}} i32 1{{ *$}}
+ // CHECK-IR: insertelement {{.*}} i32 2{{ *$}}
+ // CHECK-IR: insertelement {{.*}} i32 3{{ *$}}
+ // CHECK-IR: insertelement {{.*}} i32 4{{ *$}}
+ // CHECK-IR: insertelement {{.*}} i32 5{{ *$}}
+ // CHECK-IR: insertelement {{.*}} i32 6{{ *$}}
+ // CHECK-IR: insertelement {{.*}} i32 7{{ *$}}
+}
+
diff --git a/clang/test/CodeGen/arm64_vecCmpBr.c b/clang/test/CodeGen/arm64_vecCmpBr.c
new file mode 100644
index 00000000000..3337df74a46
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vecCmpBr.c
@@ -0,0 +1,111 @@
+// RUN: %clang -O3 -target arm64-apple-ios7 -S -ffreestanding %s -o - | FileCheck %s
+// REQUIRES: arm64-registered-target
+// test code generation for <rdar://problem/11487757>
+#include <arm_neon.h>
+
+unsigned bar();
+
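+// The tests below rely on unsigned reductions: some lane of a is zero iff
+// vminv*(a) == 0, and some lane is non-zero iff vmaxv*(a) != 0. A derived
+// editorial usage sketch (a hypothetical helper, not one of the checked tests):
+static unsigned all_lanes_equal_sketch(uint8x8_t a, uint8x8_t b) {
+ return vminv_u8(vceq_u8(a, b)) != 0; // every lane compared equal
+}
+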
+// Branch if any lane of V0 is zero; 64 bit => !min
+unsigned anyZero64(uint16x4_t a) {
+// CHECK: anyZero64:
+// CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
+// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
+// CHECK: [[LABEL]]:
+// CHECK-NEXT: b {{_bar|bar}}
+ if (!vminv_u8(a))
+ return bar();
+ return 0;
+}
+
+// Branch if any lane of V0 is zero; 128 bit => !min
+unsigned anyZero128(uint16x8_t a) {
+// CHECK: anyZero128:
+// CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
+// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
+// CHECK: [[LABEL]]:
+// CHECK-NEXT: b {{_bar|bar}}
+ if (!vminvq_u8(a))
+ return bar();
+ return 0;
+}
+
+// Branch if any lane of V0 is non-zero; 64 bit => max
+unsigned anyNonZero64(uint16x4_t a) {
+// CHECK: anyNonZero64:
+// CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
+// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
+// CHECK: [[LABEL]]:
+// CHECK-NEXT: movz w0, #0
+ if (vmaxv_u8(a))
+ return bar();
+ return 0;
+}
+
+// Branch if any lane of V0 is non-zero; 128 bit => max
+unsigned anyNonZero128(uint16x8_t a) {
+// CHECK: anyNonZero128:
+// CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
+// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
+// CHECK: [[LABEL]]:
+// CHECK-NEXT: movz w0, #0
+ if (vmaxvq_u8(a))
+ return bar();
+ return 0;
+}
+
+// Branch if all lanes of V0 are zero; 64 bit => !max
+unsigned allZero64(uint16x4_t a) {
+// CHECK: allZero64:
+// CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
+// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
+// CHECK: [[LABEL]]:
+// CHECK-NEXT: b {{_bar|bar}}
+ if (!vmaxv_u8(a))
+ return bar();
+ return 0;
+}
+
+// Branch if all lanes of V0 are zero; 128 bit => !max
+unsigned allZero128(uint16x8_t a) {
+// CHECK: allZero128:
+// CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
+// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
+// CHECK: [[LABEL]]:
+// CHECK-NEXT: b {{_bar|bar}}
+ if (!vmaxvq_u8(a))
+ return bar();
+ return 0;
+}
+
+// Branch if all lanes of V0 are non-zero; 64 bit => min
+unsigned allNonZero64(uint16x4_t a) {
+// CHECK: allNonZero64:
+// CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
+// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
+// CHECK: [[LABEL]]:
+// CHECK-NEXT: movz w0, #0
+ if (vminv_u8(a))
+ return bar();
+ return 0;
+}
+
+// Branch if all lanes of V0 are non-zero; 128 bit => min
+unsigned allNonZero128(uint16x8_t a) {
+// CHECK: allNonZero128:
+// CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
+// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
+// CHECK: [[LABEL]]:
+// CHECK-NEXT: movz w0, #0
+ if (vminvq_u8(a))
+ return bar();
+ return 0;
+}
+
diff --git a/clang/test/CodeGen/arm64_vext.c b/clang/test/CodeGen/arm64_vext.c
new file mode 100644
index 00000000000..2ae8fc518ff
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vext.c
@@ -0,0 +1,239 @@
+// RUN: %clang -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+
+// Test ARM64 vector extract intrinsics.
+// This file can double as a backend test by adding a second RUN line that
+// passes -check-prefix=CHECK-CODEGEN to FileCheck.
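+// For example (an editorial sketch, deliberately not spelled as a live RUN
+// directive; it would also need a REQUIRES line for arm64-registered-target):
+//   %clang -target arm64-apple-ios7 -ffreestanding -S -o - %s \
+//     | FileCheck -check-prefix=CHECK-CODEGEN %s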
+
+#include <arm_neon.h>
+
+void test_vext_s8()
+{
+ // CHECK: test_vext_s8
+ int8x8_t xS8x8;
+ xS8x8 = vext_s8(xS8x8, xS8x8, 1);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_s8:
+ // CHECK-CODEGEN: {{ext.8.*#1}}
+}
+
+void test_vext_u8()
+{
+ // CHECK: test_vext_u8
+ uint8x8_t xU8x8;
+ xU8x8 = vext_u8(xU8x8, xU8x8, 2);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_u8:
+ // CHECK-CODEGEN: {{ext.8.*#2}}
+}
+
+void test_vext_p8()
+{
+ // CHECK: test_vext_p8
+ poly8x8_t xP8x8;
+ xP8x8 = vext_p8(xP8x8, xP8x8, 3);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_p8:
+ // CHECK-CODEGEN: {{ext.8.*#3}}
+}
+
+void test_vext_s16()
+{
+ // CHECK: test_vext_s16
+ int16x4_t xS16x4;
+ xS16x4 = vext_s16(xS16x4, xS16x4, 1);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_s16:
+ // CHECK-CODEGEN: {{ext.8.*#2}}
+}
+
+void test_vext_u16()
+{
+ // CHECK: test_vext_u16
+ uint16x4_t xU16x4;
+ xU16x4 = vext_u16(xU16x4, xU16x4, 2);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_u16:
+ // CHECK-CODEGEN: {{ext.8.*#4}}
+}
+
+void test_vext_p16()
+{
+ // CHECK: test_vext_p16
+ poly16x4_t xP16x4;
+ xP16x4 = vext_p16(xP16x4, xP16x4, 3);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_p16:
+ // CHECK-CODEGEN: {{ext.8.*#6}}
+}
+
+void test_vext_s32()
+{
+ // CHECK: test_vext_s32
+ int32x2_t xS32x2;
+ xS32x2 = vext_s32(xS32x2, xS32x2, 1);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_s32:
+ // CHECK-CODEGEN: {{ext.8.*#4}}
+}
+
+void test_vext_u32()
+{
+ // CHECK: test_vext_u32
+ uint32x2_t xU32x2;
+ xU32x2 = vext_u32(xU32x2, xU32x2, 1);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_u32:
+ // CHECK-CODEGEN: {{ext.8.*#4}}
+}
+
+void test_vext_f32()
+{
+ // CHECK: test_vext_f32
+ float32x2_t xF32x2;
+ xF32x2 = vext_f32(xF32x2, xF32x2, 1);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_f32:
+ // CHECK-CODEGEN: {{ext.8.*#4}}
+}
+
+void test_vext_s64()
+{
+ // CHECK: test_vext_s64
+ int64x1_t xS64x1;
+ // FIXME: don't use 1 as the index, and don't check for now; clang may have a bug.
+ xS64x1 = vext_s64(xS64x1, xS64x1, /*1*/0);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_s64:
+ // CHECK_FIXME: {{ext.8.*#0}}
+}
+
+void test_vext_u64()
+{
+ // CHECK: test_vext_u64
+ uint64x1_t xU64x1;
+ // FIXME: don't use 1 as the index, and don't check for now; clang may have a bug.
+ xU64x1 = vext_u64(xU64x1, xU64x1, /*1*/0);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vext_u64:
+ // CHECK_FIXME: {{ext.8.*#0}}
+}
+
+void test_vextq_s8()
+{
+ // CHECK: test_vextq_s8
+ int8x16_t xS8x16;
+ xS8x16 = vextq_s8(xS8x16, xS8x16, 4);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_s8:
+ // CHECK-CODEGEN: {{ext.16.*#4}}
+}
+
+void test_vextq_u8()
+{
+ // CHECK: test_vextq_u8
+ uint8x16_t xU8x16;
+ xU8x16 = vextq_u8(xU8x16, xU8x16, 5);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_u8:
+ // CHECK-CODEGEN: {{ext.16.*#5}}
+}
+
+void test_vextq_p8()
+{
+ // CHECK: test_vextq_p8
+ poly8x16_t xP8x16;
+ xP8x16 = vextq_p8(xP8x16, xP8x16, 6);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_p8:
+ // CHECK-CODEGEN: {{ext.16.*#6}}
+}
+
+void test_vextq_s16()
+{
+ // CHECK: test_vextq_s16
+ int16x8_t xS16x8;
+ xS16x8 = vextq_s16(xS16x8, xS16x8, 7);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_s16:
+ // CHECK-CODEGEN: {{ext.16.*#14}}
+}
+
+void test_vextq_u16()
+{
+ // CHECK: test_vextq_u16
+ uint16x8_t xU16x8;
+ xU16x8 = vextq_u16(xU16x8, xU16x8, 4);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_u16:
+ // CHECK-CODEGEN: {{ext.16.*#8}}
+}
+
+void test_vextq_p16()
+{
+ // CHECK: test_vextq_p16
+ poly16x8_t xP16x8;
+ xP16x8 = vextq_p16(xP16x8, xP16x8, 5);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_p16:
+ // CHECK-CODEGEN: {{ext.16.*#10}}
+}
+
+void test_vextq_s32()
+{
+ // CHECK: test_vextq_s32
+ int32x4_t xS32x4;
+ xS32x4 = vextq_s32(xS32x4, xS32x4, 1);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_s32:
+ // CHECK-CODEGEN: {{ext.16.*#4}}
+}
+
+void test_vextq_u32()
+{
+ // CHECK: test_vextq_u32
+ uint32x4_t xU32x4;
+ xU32x4 = vextq_u32(xU32x4, xU32x4, 2);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_u32:
+ // CHECK-CODEGEN: {{ext.16.*#8}}
+}
+
+void test_vextq_f32()
+{
+ // CHECK: test_vextq_f32
+ float32x4_t xF32x4;
+ xF32x4 = vextq_f32(xF32x4, xF32x4, 3);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_f32:
+ // CHECK-CODEGEN: {{ext.16.*#12}}
+}
+
+void test_vextq_s64()
+{
+ // CHECK: test_vextq_s64
+ int64x2_t xS64x2;
+ xS64x2 = vextq_s64(xS64x2, xS64x2, 1);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_s64:
+ // CHECK-CODEGEN: {{ext.16.*#8}}
+}
+
+void test_vextq_u64()
+{
+ // CHECK: test_vextq_u64
+ uint64x2_t xU64x2;
+ xU64x2 = vextq_u64(xU64x2, xU64x2, 1);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_u64:
+ // CHECK-CODEGEN: {{ext.16.*#8}}
+}
+
+void test_vextq_f64()
+{
+ // CHECK: test_vextq_f64
+ float64x2_t xF64x2;
+ xF64x2 = vextq_f64(xF64x2, xF64x2, 1);
+ // CHECK: shufflevector
+ // CHECK-CODEGEN: test_vextq_f64:
+ // CHECK-CODEGEN: {{ext.16.*#8}}
+}
diff --git a/clang/test/CodeGen/arm64_vfma.c b/clang/test/CodeGen/arm64_vfma.c
new file mode 100644
index 00000000000..b18b59c9a15
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vfma.c
@@ -0,0 +1,136 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// Test ARM64 SIMD fused multiply add intrinsics
+
+#include <arm_neon.h>
+
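+// Editorial relationship sketch (a hypothetical helper, not one of the
+// checked tests): the fused multiply-subtract forms compute a1 - a2 * a3,
+// i.e. an fma with one operand negated, which is the fsub the vfms checks expect:
+static float32x2_t vfms_ref_sketch(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
+ return vfma_f32(a1, vneg_f32(a2), a3); // behaviourally vfms_f32(a1, a2, a3)
+}
+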
+float32x2_t test_vfma_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
+ // CHECK: test_vfma_f32
+ return vfma_f32(a1, a2, a3);
+ // CHECK: llvm.fma.v2f32({{.*a2, .*a3, .*a1}})
+ // CHECK-NEXT: ret
+}
+
+float32x4_t test_vfmaq_f32(float32x4_t a1, float32x4_t a2, float32x4_t a3) {
+ // CHECK: test_vfmaq_f32
+ return vfmaq_f32(a1, a2, a3);
+ // CHECK: llvm.fma.v4f32({{.*a2, .*a3, .*a1}})
+ // CHECK-NEXT: ret
+}
+
+float64x2_t test_vfmaq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
+ // CHECK: test_vfmaq_f64
+ return vfmaq_f64(a1, a2, a3);
+ // CHECK: llvm.fma.v2f64({{.*a2, .*a3, .*a1}})
+ // CHECK-NEXT: ret
+}
+
+float32x2_t test_vfma_lane_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
+ // CHECK: test_vfma_lane_f32
+ return vfma_lane_f32(a1, a2, a3, 1);
+ // NB: the test below is deliberately loose, so that we don't depend too much
+ // upon the exact IR used to select lane 1 (usually a shufflevector)
+ // CHECK: llvm.fma.v2f32(<2 x float> %a2, <2 x float> {{.*}}, <2 x float> %a1)
+ // CHECK-NEXT: ret
+}
+
+float32x4_t test_vfmaq_lane_f32(float32x4_t a1, float32x4_t a2, float32x2_t a3) {
+ // CHECK: test_vfmaq_lane_f32
+ return vfmaq_lane_f32(a1, a2, a3, 1);
+ // NB: the test below is deliberately loose, so that we don't depend too much
+ // upon the exact IR used to select lane 1 (usually a shufflevector)
+ // CHECK: llvm.fma.v4f32(<4 x float> %a2, <4 x float> {{.*}}, <4 x float> %a1)
+ // CHECK-NEXT: ret
+}
+
+float64x2_t test_vfmaq_lane_f64(float64x2_t a1, float64x2_t a2, float64x1_t a3) {
+ // CHECK: test_vfmaq_lane_f64
+ return vfmaq_lane_f64(a1, a2, a3, 0);
+ // NB: the test below is deliberately loose, so that we don't depend too much
+ // upon the exact IR used to select lane 0 (usually a shufflevector)
+ // CHECK: llvm.fma.v2f64(<2 x double> %a2, <2 x double> {{.*}}, <2 x double> %a1)
+ // CHECK-NEXT: ret
+}
+
+float32x2_t test_vfma_n_f32(float32x2_t a1, float32x2_t a2, float32_t a3) {
+ // CHECK: test_vfma_n_f32
+ return vfma_n_f32(a1, a2, a3);
+ // NB: the test below is deliberately loose, so that we don't depend too much
+ // upon the exact IR used to select lane 0 (usually two insertelements)
+ // CHECK: llvm.fma.v2f32
+ // CHECK-NEXT: ret
+}
+
+float32x4_t test_vfmaq_n_f32(float32x4_t a1, float32x4_t a2, float32_t a3) {
+ // CHECK: test_vfmaq_n_f32
+ return vfmaq_n_f32(a1, a2, a3);
+ // NB: the test below is deliberately loose, so that we don't depend too much
+ // upon the exact IR used to select lane 0 (usually four insertelements)
+ // CHECK: llvm.fma.v4f32
+ // CHECK-NEXT: ret
+}
+
+float64x2_t test_vfmaq_n_f64(float64x2_t a1, float64x2_t a2, float64_t a3) {
+ // CHECK: test_vfmaq_n_f64
+ return vfmaq_n_f64(a1, a2, a3);
+ // NB: the test below is deliberately loose, so that we don't depend too much
+ // upon the exact IR used to select lane 0 (usually two insertelements)
+ // CHECK: llvm.fma.v2f64
+ // CHECK-NEXT: ret
+}
+
+float32x2_t test_vfms_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
+ // CHECK: test_vfms_f32
+ return vfms_f32(a1, a2, a3);
+ // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a2
+ // CHECK: llvm.fma.v2f32(<2 x float> %a3, <2 x float> [[NEG]], <2 x float> %a1)
+ // CHECK-NEXT: ret
+}
+
+float32x4_t test_vfmsq_f32(float32x4_t a1, float32x4_t a2, float32x4_t a3) {
+ // CHECK: test_vfmsq_f32
+ return vfmsq_f32(a1, a2, a3);
+ // CHECK: [[NEG:%.*]] = fsub <4 x float> {{.*}}, %a2
+ // CHECK: llvm.fma.v4f32(<4 x float> %a3, <4 x float> [[NEG]], <4 x float> %a1)
+ // CHECK-NEXT: ret
+}
+
+float64x2_t test_vfmsq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
+ // CHECK: test_vfmsq_f64
+ return vfmsq_f64(a1, a2, a3);
+ // CHECK: [[NEG:%.*]] = fsub <2 x double> {{.*}}, %a2
+ // CHECK: llvm.fma.v2f64(<2 x double> %a3, <2 x double> [[NEG]], <2 x double> %a1)
+ // CHECK-NEXT: ret
+}
+
+float32x2_t test_vfms_lane_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
+ // CHECK: test_vfms_lane_f32
+ return vfms_lane_f32(a1, a2, a3, 1);
+ // NB: the test below is deliberately loose, so that we don't depend too much
+ // upon the exact IR used to select lane 1 (usually a shufflevector)
+ // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a3
+ // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[NEG]]
+ // CHECK: llvm.fma.v2f32(<2 x float> {{.*}}, <2 x float> [[LANE]], <2 x float> %a1)
+ // CHECK-NEXT: ret
+}
+
+float32x4_t test_vfmsq_lane_f32(float32x4_t a1, float32x4_t a2, float32x2_t a3) {
+ // CHECK: test_vfmsq_lane_f32
+ return vfmsq_lane_f32(a1, a2, a3, 1);
+ // NB: the test below is deliberately loose, so that we don't depend too much
+ // upon the exact IR used to select lane 1 (usually a shufflevector)
+ // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a3
+ // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[NEG]]
+ // CHECK: llvm.fma.v4f32(<4 x float> {{.*}}, <4 x float> [[LANE]], <4 x float> %a1)
+ // CHECK-NEXT: ret
+}
+
+float64x2_t test_vfmsq_lane_f64(float64x2_t a1, float64x2_t a2, float64x1_t a3) {
+ // CHECK: test_vfmsq_lane_f64
+ return vfmsq_lane_f64(a1, a2, a3, 0);
+ // NB: the test below is deliberately loose, so that we don't depend too much
+ // upon the exact IR used to select lane 0 (usually a shufflevector)
+ // CHECK: [[NEG:%.*]] = fsub <1 x double> {{.*}}, %a3
+ // CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[NEG]]
+ // CHECK: llvm.fma.v2f64(<2 x double> {{.*}}, <2 x double> [[LANE]], <2 x double> %a1)
+ // CHECK-NEXT: ret
+}
diff --git a/clang/test/CodeGen/arm64_vget.c b/clang/test/CodeGen/arm64_vget.c
new file mode 100644
index 00000000000..3a7a41ae6e4
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vget.c
@@ -0,0 +1,13 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// Test ARM64 SIMD vget intrinsics
+
+#include <arm_neon.h>
+
+float64_t test_vget_lane_f64(float64x1_t a1) {
+ // CHECK: test_vget_lane_f64
+ // NB: only lane 0 is valid here; float64x1_t has a single lane.
+ return vget_lane_f64(a1, 0);
+ // CHECK: extractelement {{.*}} i32 0
+ // CHECK-NEXT: ret
+}
+
diff --git a/clang/test/CodeGen/arm64_vneg.c b/clang/test/CodeGen/arm64_vneg.c
new file mode 100644
index 00000000000..0af4ca3a342
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vneg.c
@@ -0,0 +1,18 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// Test ARM64 SIMD negate and saturating negate intrinsics
+
+#include <arm_neon.h>
+
+int64x2_t test_vnegq_s64(int64x2_t a1) {
+ // CHECK: test_vnegq_s64
+ return vnegq_s64(a1);
+ // CHECK: sub <2 x i64> zeroinitializer, %a1
+ // CHECK-NEXT: ret
+}
+
+int64x2_t test_vqnegq_s64(int64x2_t a1) {
+ // CHECK: test_vqnegq_s64
+ return vqnegq_s64(a1);
+ // CHECK: llvm.arm64.neon.sqneg.v2i64
+ // CHECK-NEXT: ret
+}
diff --git a/clang/test/CodeGen/arm64_vqmov.c b/clang/test/CodeGen/arm64_vqmov.c
new file mode 100644
index 00000000000..8e960b1dd63
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vqmov.c
@@ -0,0 +1,77 @@
+// RUN: %clang -O3 -target arm64-apple-ios7 -ffreestanding -S -o - %s | FileCheck %s
+// REQUIRES: arm64-registered-target
+/// Test vqmov[u]n_high_{s,u}{16,32,64} ARM64 intrinsics
+
+#include <arm_neon.h>
+
+// vqmovn_high_s16 -> SQXTN2 Vd.16b,Vn.8h
+int8x16_t test_vqmovn_high_s16(int8x8_t Vdlow, int16x8_t Vn)
+{
+ return vqmovn_high_s16(Vdlow, Vn);
+ // CHECK: test_vqmovn_high_s16:
+ // CHECK: sqxtn2.16b {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
+}
+
+// vqmovun_high_s16 -> SQXTUN2 Vd.16b,Vn.8h
+uint8x16_t test_vqmovun_high_s16(uint8x8_t Vdlow, uint16x8_t Vn)
+{
+ return vqmovun_high_s16(Vdlow, Vn);
+ // CHECK: test_vqmovun_high_s16:
+ // CHECK: sqxtun2.16b {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
+}
+
+// vqmovn_high_s32 -> SQXTN2 Vd.8h,Vn.4s
+int16x8_t test_vqmovn_high_s32(int16x4_t Vdlow, int32x4_t Vn)
+{
+ return vqmovn_high_s32(Vdlow, Vn);
+ // CHECK: test_vqmovn_high_s32:
+ // CHECK: sqxtn2.8h {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
+}
+
+// vqmovn_high_u32 -> UQXTN2 Vd.8h,Vn.4s
+uint16x8_t test_vqmovn_high_u32(uint16x4_t Vdlow, uint32x4_t Vn)
+{
+ return vqmovn_high_u32(Vdlow, Vn);
+ // CHECK: test_vqmovn_high_u32:
+ // CHECK: uqxtn2.8h {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
+}
+
+// vqmovn_high_s64 -> SQXTN2 Vd.4s,Vn.2d
+int32x4_t test_vqmovn_high_s64(int32x2_t Vdlow, int64x2_t Vn)
+{
+ return vqmovn_high_s64(Vdlow, Vn);
+ // CHECK: test_vqmovn_high_s64:
+ // CHECK: sqxtn2.4s {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
+}
+
+// vqmovn_high_u64 -> UQXTN2 Vd.4s,Vn.2d
+uint32x4_t test_vqmovn_high_u64(uint32x2_t Vdlow, uint64x2_t Vn)
+{
+ return vqmovn_high_u64(Vdlow, Vn);
+ // CHECK: test_vqmovn_high_u64:
+ // CHECK: uqxtn2.4s {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
+}
+
+// vqmovn_high_u16 -> UQXTN2 Vd.16b,Vn.8h
+uint8x16_t test_vqmovn_high_u16(uint8x8_t Vdlow, uint16x8_t Vn)
+{
+ return vqmovn_high_u16(Vdlow, Vn);
+ // CHECK: test_vqmovn_high_u16:
+ // CHECK: uqxtn2.16b {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
+}
+
+// vqmovun_high_s32 -> SQXTUN2 Vd.8h,Vn.4s
+uint16x8_t test_vqmovun_high_s32(uint16x4_t Vdlow, uint32x4_t Vn)
+{
+ return vqmovun_high_s32(Vdlow, Vn);
+ // CHECK: test_vqmovun_high_s32:
+ // CHECK: sqxtun2.8h {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
+}
+
+// vqmovun_high_s64 -> SQXTUN2 Vd.4s,Vn.2d
+uint32x4_t test_vqmovun_high_s64(uint32x2_t Vdlow, uint64x2_t Vn)
+{
+ return vqmovun_high_s64(Vdlow, Vn);
+ // CHECK: test_vqmovun_high_s64:
+ // CHECK: sqxtun2.4s {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
+}
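+
+// For reference, each *_high op above concatenates the existing low half
+// with a saturated narrow of every source lane. A scalar model of the
+// signed-to-signed saturation step (illustrative only, not checked):
+static inline signed char sqxtn_model_s16(short x) {
+ return x > 127 ? 127 : x < -128 ? -128 : (signed char)x;
+}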
diff --git a/clang/test/CodeGen/arm64_vrecps.c b/clang/test/CodeGen/arm64_vrecps.c
new file mode 100644
index 00000000000..becfe618369
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vrecps.c
@@ -0,0 +1,26 @@
+// RUN: %clang -O3 -target arm64-apple-ios7 -ffreestanding -S -o - %s | FileCheck %s
+// REQUIRES: arm64-registered-target
+/// Test vrecpss_f32, vrecpsd_f64 ARM64 intrinsics
+
+
+#include <arm_neon.h>
+
+// vrecpss_f32 -> FRECPS Sd,Sa,Sb
+//
+float32_t test_vrecpss_f32(float32_t Vdlow, float32_t Vn)
+{
+ return vrecpss_f32(Vdlow, Vn);
+ // CHECK: test_vrecpss_f32:
+ // CHECK: frecps s0, s0, s1
+ // CHECK-NEXT: ret
+}
+
+// vrecpsd_f64 -> FRECPS Dd,Da,Db
+//
+float64_t test_vrecpsd_f64(float64_t Vdlow, float64_t Vn)
+{
+ return vrecpsd_f64(Vdlow, Vn);
+ // CHECK: test_vrecpsd_f64:
+ // CHECK: frecps d0, d0, d1
+ // CHECK-NEXT: ret
+}
diff --git a/clang/test/CodeGen/arm64_vset_lane.c b/clang/test/CodeGen/arm64_vset_lane.c
new file mode 100644
index 00000000000..c2acc4cd783
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vset_lane.c
@@ -0,0 +1,31 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// Test ARM64 SIMD set lane intrinsics (incomplete: the f64 case below is disabled)
+
+#include <arm_neon.h>
+
+float16x4_t test_vset_lane_f16(float16_t *a1, float16x4_t a2) {
+ // CHECK-LABEL: test_vset_lane_f16
+ return vset_lane_f16(*a1, a2, 1);
+ // CHECK: insertelement <4 x i16> {{.*}}, i32 1
+}
+
+float16x8_t test_vsetq_lane_f16(float16_t *a1, float16x8_t a2) {
+ // CHECK-LABEL: test_vsetq_lane_f16
+ return vsetq_lane_f16(*a1, a2, 4);
+ // CHECK: insertelement <8 x i16> {{.*}}, i32 4
+}
+
+// problem with scalar_to_vector in backend. Punt for now
+#if 0
+float64x1_t test_vset_lane_f64(float64_t a1, float64x1_t a2) {
+ // CHECK-LABEL@ test_vset_lane_f64
+ return vset_lane_f64(a1, a2, 0);
+ // CHECK@ insertelement <1 x double> %a2, double %a1, i32 0
+}
+#endif
+
+float64x2_t test_vsetq_lane_f64(float64_t a1, float64x2_t a2) {
+ // CHECK-LABEL: test_vsetq_lane_f64
+ return vsetq_lane_f64(a1, a2, 0);
+ // CHECK: insertelement <2 x double> %a2, double %a1, i32 0
+}
diff --git a/clang/test/CodeGen/arm64_vshift.c b/clang/test/CodeGen/arm64_vshift.c
new file mode 100644
index 00000000000..cac4d12bd91
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vshift.c
@@ -0,0 +1,357 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -ffreestanding -emit-llvm -o - -O1 %s | FileCheck %s
+#include <arm_neon.h>
+
+int8x8_t test_vqshl_n_s8(int8x8_t in) {
+ // CHECK-LABEL: @test_vqshl_n_s8
+ // CHECK: call <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8> %in, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ return vqshl_n_s8(in, 1);
+}
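+
+// For reference, a saturating left shift clamps at the type limits instead
+// of wrapping. A scalar model of the s8 case (illustrative only; the vector
+// tests here are checked through the sqshl/uqshl intrinsics):
+static inline signed char sqshl_model_s8(signed char x, int n) {
+ int shifted = (int)x << n; // widen before shifting
+ return shifted > 127 ? 127 : shifted < -128 ? -128 : (signed char)shifted;
+}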
+
+int16x4_t test_vqshl_n_s16(int16x4_t in) {
+ // CHECK-LABEL: @test_vqshl_n_s16
+ // CHECK: call <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ return vqshl_n_s16(in, 1);
+}
+
+int32x2_t test_vqshl_n_s32(int32x2_t in) {
+ // CHECK-LABEL: @test_vqshl_n_s32
+ // CHECK: call <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>)
+ return vqshl_n_s32(in, 1);
+}
+
+int64x1_t test_vqshl_n_s64(int64x1_t in) {
+ // CHECK-LABEL: @test_vqshl_n_s64
+ // CHECK: call <1 x i64> @llvm.arm64.neon.sqshl.v1i64(<1 x i64> %in, <1 x i64> <i64 1>)
+ return vqshl_n_s64(in, 1);
+}
+
+
+int8x16_t test_vqshlq_n_s8(int8x16_t in) {
+ // CHECK-LABEL: @test_vqshlq_n_s8
+ // CHECK: call <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8> %in, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ return vqshlq_n_s8(in, 1);
+}
+
+int16x8_t test_vqshlq_n_s16(int16x8_t in) {
+ // CHECK-LABEL: @test_vqshlq_n_s16
+ // CHECK: call <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16> %in, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ return vqshlq_n_s16(in, 1);
+}
+
+int32x4_t test_vqshlq_n_s32(int32x4_t in) {
+ // CHECK-LABEL: @test_vqshlq_n_s32
+ // CHECK: call <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32> %in, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ return vqshlq_n_s32(in, 1);
+}
+
+int64x2_t test_vqshlq_n_s64(int64x2_t in) {
+ // CHECK-LABEL: @test_vqshlq_n_s64
+ // CHECK: call <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64> %in, <2 x i64> <i64 1, i64 1>
+ return vqshlq_n_s64(in, 1);
+}
+
+uint8x8_t test_vqshl_n_u8(uint8x8_t in) {
+ // CHECK-LABEL: @test_vqshl_n_u8
+ // CHECK: call <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8> %in, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ return vqshl_n_u8(in, 1);
+}
+
+uint16x4_t test_vqshl_n_u16(uint16x4_t in) {
+ // CHECK-LABEL: @test_vqshl_n_u16
+ // CHECK: call <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ return vqshl_n_u16(in, 1);
+}
+
+uint32x2_t test_vqshl_n_u32(uint32x2_t in) {
+ // CHECK-LABEL: @test_vqshl_n_u32
+ // CHECK: call <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>)
+ return vqshl_n_u32(in, 1);
+}
+
+uint64x1_t test_vqshl_n_u64(uint64x1_t in) {
+ // CHECK-LABEL: @test_vqshl_n_u64
+ // CHECK: call <1 x i64> @llvm.arm64.neon.uqshl.v1i64(<1 x i64> %in, <1 x i64> <i64 1>)
+ return vqshl_n_u64(in, 1);
+}
+
+uint8x16_t test_vqshlq_n_u8(uint8x16_t in) {
+ // CHECK-LABEL: @test_vqshlq_n_u8
+ // CHECK: call <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8> %in, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ return vqshlq_n_u8(in, 1);
+}
+
+uint16x8_t test_vqshlq_n_u16(uint16x8_t in) {
+ // CHECK-LABEL: @test_vqshlq_n_u16
+ // CHECK: call <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16> %in, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ return vqshlq_n_u16(in, 1);
+}
+
+uint32x4_t test_vqshlq_n_u32(uint32x4_t in) {
+ // CHECK-LABEL: @test_vqshlq_n_u32
+ // CHECK: call <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32> %in, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ return vqshlq_n_u32(in, 1);
+}
+
+uint64x2_t test_vqshlq_n_u64(uint64x2_t in) {
+ // CHECK-LABEL: @test_vqshlq_n_u64
+ // CHECK: call <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64> %in, <2 x i64> <i64 1, i64 1>
+ return vqshlq_n_u64(in, 1);
+}
+
+int8x8_t test_vrshr_n_s8(int8x8_t in) {
+ // CHECK-LABEL: @test_vrshr_n_s8
+ // CHECK: call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ return vrshr_n_s8(in, 1);
+}
+
+int16x4_t test_vrshr_n_s16(int16x4_t in) {
+ // CHECK-LABEL: @test_vrshr_n_s16
+ // CHECK: call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ return vrshr_n_s16(in, 1);
+}
+
+int32x2_t test_vrshr_n_s32(int32x2_t in) {
+ // CHECK-LABEL: @test_vrshr_n_s32
+ // CHECK: call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
+ return vrshr_n_s32(in, 1);
+}
+
+int64x1_t test_vrshr_n_s64(int64x1_t in) {
+ // CHECK-LABEL: @test_vrshr_n_s64
+ // CHECK: call <1 x i64> @llvm.arm64.neon.srshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
+ return vrshr_n_s64(in, 1);
+}
+
+
+int8x16_t test_vrshrq_n_s8(int8x16_t in) {
+ // CHECK-LABEL: @test_vrshrq_n_s8
+ // CHECK: call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ return vrshrq_n_s8(in, 1);
+}
+
+int16x8_t test_vrshrq_n_s16(int16x8_t in) {
+ // CHECK-LABEL: @test_vrshrq_n_s16
+ // CHECK: call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ return vrshrq_n_s16(in, 1);
+}
+
+int32x4_t test_vrshrq_n_s32(int32x4_t in) {
+ // CHECK-LABEL: @test_vrshrq_n_s32
+ // CHECK: call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ return vrshrq_n_s32(in, 1);
+}
+
+int64x2_t test_vrshrq_n_s64(int64x2_t in) {
+ // CHECK-LABEL: @test_vrshrq_n_s64
+ // CHECK: call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>
+ return vrshrq_n_s64(in, 1);
+}
+
+uint8x8_t test_vrshr_n_u8(uint8x8_t in) {
+ // CHECK-LABEL: @test_vrshr_n_u8
+ // CHECK: call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ return vrshr_n_u8(in, 1);
+}
+
+uint16x4_t test_vrshr_n_u16(uint16x4_t in) {
+ // CHECK-LABEL: @test_vrshr_n_u16
+ // CHECK: call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ return vrshr_n_u16(in, 1);
+}
+
+uint32x2_t test_vrshr_n_u32(uint32x2_t in) {
+ // CHECK-LABEL: @test_vrshr_n_u32
+ // CHECK: call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
+ return vrshr_n_u32(in, 1);
+}
+
+uint64x1_t test_vrshr_n_u64(uint64x1_t in) {
+ // CHECK-LABEL: @test_vrshr_n_u64
+ // CHECK: call <1 x i64> @llvm.arm64.neon.urshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
+ return vrshr_n_u64(in, 1);
+}
+
+uint8x16_t test_vrshrq_n_u8(uint8x16_t in) {
+ // CHECK-LABEL: @test_vrshrq_n_u8
+ // CHECK: call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ return vrshrq_n_u8(in, 1);
+}
+
+uint16x8_t test_vrshrq_n_u16(uint16x8_t in) {
+ // CHECK-LABEL: @test_vrshrq_n_u16
+ // CHECK: call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ return vrshrq_n_u16(in, 1);
+}
+
+uint32x4_t test_vrshrq_n_u32(uint32x4_t in) {
+ // CHECK-LABEL: @test_vrshrq_n_u32
+ // CHECK: call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ return vrshrq_n_u32(in, 1);
+}
+
+uint64x2_t test_vrshrq_n_u64(uint64x2_t in) {
+ // CHECK-LABEL: @test_vrshrq_n_u64
+ // CHECK: call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>
+ return vrshrq_n_u64(in, 1);
+}
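+
+// For reference, a rounding shift right by n adds 1 << (n - 1) before
+// shifting; the IR above expresses it as an (s|u)rshl by -n. A scalar
+// model of the signed case (illustrative only, not checked):
+static inline int vrshr_model_s32(int x, int n) {
+ return (int)(((long long)x + (1LL << (n - 1))) >> n); // round half up
+}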
+
+int8x8_t test_vqshlu_n_s8(int8x8_t in) {
+ // CHECK-LABEL: @test_vqshlu_n_s8
+ // CHECK: call <8 x i8> @llvm.arm64.neon.sqshlu.v8i8(<8 x i8> %in, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ return vqshlu_n_s8(in, 1);
+}
+
+int16x4_t test_vqshlu_n_s16(int16x4_t in) {
+ // CHECK-LABEL: @test_vqshlu_n_s16
+ // CHECK: call <4 x i16> @llvm.arm64.neon.sqshlu.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ return vqshlu_n_s16(in, 1);
+}
+
+int32x2_t test_vqshlu_n_s32(int32x2_t in) {
+ // CHECK-LABEL: @test_vqshlu_n_s32
+ // CHECK: call <2 x i32> @llvm.arm64.neon.sqshlu.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>)
+ return vqshlu_n_s32(in, 1);
+}
+
+int64x1_t test_vqshlu_n_s64(int64x1_t in) {
+ // CHECK-LABEL: @test_vqshlu_n_s64
+ // CHECK: call <1 x i64> @llvm.arm64.neon.sqshlu.v1i64(<1 x i64> %in, <1 x i64> <i64 1>)
+ return vqshlu_n_s64(in, 1);
+}
+
+
+int8x16_t test_vqshluq_n_s8(int8x16_t in) {
+ // CHECK-LABEL: @test_vqshluq_n_s8
+ // CHECK: call <16 x i8> @llvm.arm64.neon.sqshlu.v16i8(<16 x i8> %in, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ return vqshluq_n_s8(in, 1);
+}
+
+int16x8_t test_vqshluq_n_s16(int16x8_t in) {
+ // CHECK-LABEL: @test_vqshluq_n_s16
+ // CHECK: call <8 x i16> @llvm.arm64.neon.sqshlu.v8i16(<8 x i16> %in, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ return vqshluq_n_s16(in, 1);
+}
+
+int32x4_t test_vqshluq_n_s32(int32x4_t in) {
+ // CHECK-LABEL: @test_vqshluq_n_s32
+ // CHECK: call <4 x i32> @llvm.arm64.neon.sqshlu.v4i32(<4 x i32> %in, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ return vqshluq_n_s32(in, 1);
+}
+
+int64x2_t test_vqshluq_n_s64(int64x2_t in) {
+ // CHECK-LABEL: @test_vqshluq_n_s64
+ // CHECK: call <2 x i64> @llvm.arm64.neon.sqshlu.v2i64(<2 x i64> %in, <2 x i64> <i64 1, i64 1>
+ return vqshluq_n_s64(in, 1);
+}
+
+int8x8_t test_vrsra_n_s8(int8x8_t acc, int8x8_t in) {
+ // CHECK-LABEL: @test_vrsra_n_s8
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ // CHECK: add <8 x i8> [[TMP]], %acc
+ return vrsra_n_s8(acc, in, 1);
+}
+
+int16x4_t test_vrsra_n_s16(int16x4_t acc, int16x4_t in) {
+ // CHECK-LABEL: @test_vrsra_n_s16
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ // CHECK: add <4 x i16> [[TMP]], %acc
+ return vrsra_n_s16(acc, in, 1);
+}
+
+int32x2_t test_vrsra_n_s32(int32x2_t acc, int32x2_t in) {
+ // CHECK-LABEL: @test_vrsra_n_s32
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
+ // CHECK: add <2 x i32> [[TMP]], %acc
+ return vrsra_n_s32(acc, in, 1);
+}
+
+int64x1_t test_vrsra_n_s64(int64x1_t acc, int64x1_t in) {
+ // CHECK-LABEL: @test_vrsra_n_s64
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <1 x i64> @llvm.arm64.neon.srshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
+ // CHECK: add <1 x i64> [[TMP]], %acc
+ return vrsra_n_s64(acc, in, 1);
+}
+
+int8x16_t test_vrsraq_n_s8(int8x16_t acc, int8x16_t in) {
+ // CHECK-LABEL: @test_vrsraq_n_s8
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ // CHECK: add <16 x i8> [[TMP]], %acc
+ return vrsraq_n_s8(acc, in, 1);
+}
+
+int16x8_t test_vrsraq_n_s16(int16x8_t acc, int16x8_t in) {
+ // CHECK-LABEL: @test_vrsraq_n_s16
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ // CHECK: add <8 x i16> [[TMP]], %acc
+ return vrsraq_n_s16(acc, in, 1);
+}
+
+int32x4_t test_vrsraq_n_s32(int32x4_t acc, int32x4_t in) {
+ // CHECK-LABEL: @test_vrsraq_n_s32
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ // CHECK: add <4 x i32> [[TMP]], %acc
+ return vrsraq_n_s32(acc, in, 1);
+}
+
+int64x2_t test_vrsraq_n_s64(int64x2_t acc, int64x2_t in) {
+ // CHECK-LABEL: @test_vrsraq_n_s64
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>)
+ // CHECK: add <2 x i64> [[TMP]], %acc
+ return vrsraq_n_s64(acc, in, 1);
+}
+
+uint8x8_t test_vrsra_n_u8(uint8x8_t acc, uint8x8_t in) {
+ // CHECK-LABEL: @test_vrsra_n_u8
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ // CHECK: add <8 x i8> [[TMP]], %acc
+ return vrsra_n_u8(acc, in, 1);
+}
+
+uint16x4_t test_vrsra_n_u16(uint16x4_t acc, uint16x4_t in) {
+ // CHECK-LABEL: @test_vrsra_n_u16
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ // CHECK: add <4 x i16> [[TMP]], %acc
+ return vrsra_n_u16(acc, in, 1);
+}
+
+uint32x2_t test_vrsra_n_u32(uint32x2_t acc, uint32x2_t in) {
+ // CHECK-LABEL: @test_vrsra_n_u32
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
+ // CHECK: add <2 x i32> [[TMP]], %acc
+ return vrsra_n_u32(acc, in, 1);
+}
+
+uint64x1_t test_vrsra_n_u64(uint64x1_t acc, uint64x1_t in) {
+ // CHECK-LABEL: @test_vrsra_n_u64
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <1 x i64> @llvm.arm64.neon.urshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
+ // CHECK: add <1 x i64> [[TMP]], %acc
+ return vrsra_n_u64(acc, in, 1);
+}
+
+uint8x16_t test_vrsraq_n_u8(uint8x16_t acc, uint8x16_t in) {
+ // CHECK-LABEL: @test_vrsraq_n_u8
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ // CHECK: add <16 x i8> [[TMP]], %acc
+ return vrsraq_n_u8(acc, in, 1);
+}
+
+uint16x8_t test_vrsraq_n_u16(uint16x8_t acc, uint16x8_t in) {
+ // CHECK-LABEL: @test_vrsraq_n_u16
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ // CHECK: add <8 x i16> [[TMP]], %acc
+ return vrsraq_n_u16(acc, in, 1);
+}
+
+uint32x4_t test_vrsraq_n_u32(uint32x4_t acc, uint32x4_t in) {
+ // CHECK-LABEL: @test_vrsraq_n_u32
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ // CHECK: add <4 x i32> [[TMP]], %acc
+ return vrsraq_n_u32(acc, in, 1);
+}
+
+uint64x2_t test_vrsraq_n_u64(uint64x2_t acc, uint64x2_t in) {
+ // CHECK-LABEL: @test_vrsraq_n_u64
+ // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>)
+ // CHECK: add <2 x i64> [[TMP]], %acc
+ return vrsraq_n_u64(acc, in, 1);
+}
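+
+// For reference, vrsra simply accumulates the rounding shift into acc,
+// matching the (s|u)rshl-then-add IR above. Scalar model (illustrative
+// only, not checked):
+static inline int vrsra_model_s32(int acc, int x, int n) {
+ return acc + (int)(((long long)x + (1LL << (n - 1))) >> n);
+}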
diff --git a/clang/test/CodeGen/arm64_vsli.c b/clang/test/CodeGen/arm64_vsli.c
new file mode 100644
index 00000000000..7a1813d0c8f
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vsli.c
@@ -0,0 +1,148 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - %s | \
+// RUN: FileCheck -check-prefix=CHECK_CODEGEN %s
+// REQUIRES: arm64-registered-target
+// Test ARM64 SIMD vector shift left and insert: vsli[q]_n_*
+
+#include <arm_neon.h>
+
+int8x8_t test_vsli_n_s8(int8x8_t a1, int8x8_t a2) {
+ // CHECK: test_vsli_n_s8
+ return vsli_n_s8(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsli.v8i8
+ // CHECK_CODEGEN: sli.8b v0, v1, #3
+}
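+
+// For reference, SLI shifts the second operand left by #n and inserts it
+// over the first, preserving only the low n bits of the first operand.
+// Scalar model of the 8-bit case (illustrative only, not checked):
+static inline unsigned char sli_model_u8(unsigned char a1, unsigned char a2, int n) {
+ unsigned char kept = (unsigned char)((1u << n) - 1u); // low n bits of a1 survive
+ return (unsigned char)(((unsigned)a2 << n) | (a1 & kept));
+}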
+
+int16x4_t test_vsli_n_s16(int16x4_t a1, int16x4_t a2) {
+ // CHECK: test_vsli_n_s16
+ return vsli_n_s16(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsli.v4i16
+ // CHECK_CODEGEN: sli.4h v0, v1, #3
+}
+
+int32x2_t test_vsli_n_s32(int32x2_t a1, int32x2_t a2) {
+ // CHECK: test_vsli_n_s32
+ return vsli_n_s32(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v2i32
+ // CHECK_CODEGEN: sli.2s v0, v1, #1
+}
+
+int64x1_t test_vsli_n_s64(int64x1_t a1, int64x1_t a2) {
+ // CHECK: test_vsli_n_s64
+ return vsli_n_s64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v1i64
+ // CHECK_CODEGEN: sli d0, d1, #1
+}
+
+uint8x8_t test_vsli_n_u8(uint8x8_t a1, uint8x8_t a2) {
+ // CHECK: test_vsli_n_u8
+ return vsli_n_u8(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsli.v8i8
+ // CHECK_CODEGEN: sli.8b v0, v1, #3
+}
+
+uint16x4_t test_vsli_n_u16(uint16x4_t a1, uint16x4_t a2) {
+ // CHECK: test_vsli_n_u16
+ return vsli_n_u16(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsli.v4i16
+ // CHECK_CODEGEN: sli.4h v0, v1, #3
+}
+
+uint32x2_t test_vsli_n_u32(uint32x2_t a1, uint32x2_t a2) {
+ // CHECK: test_vsli_n_u32
+ return vsli_n_u32(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v2i32
+ // CHECK_CODEGEN: sli.2s v0, v1, #1
+}
+
+uint64x1_t test_vsli_n_u64(uint64x1_t a1, uint64x1_t a2) {
+ // CHECK: test_vsli_n_u64
+ return vsli_n_u64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v1i64
+ // CHECK_CODEGEN: sli d0, d1, #1
+}
+
+poly8x8_t test_vsli_n_p8(poly8x8_t a1, poly8x8_t a2) {
+ // CHECK: test_vsli_n_p8
+ return vsli_n_p8(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v8i8
+ // CHECK_CODEGEN: sli.8b v0, v1, #1
+}
+
+poly16x4_t test_vsli_n_p16(poly16x4_t a1, poly16x4_t a2) {
+ // CHECK: test_vsli_n_p16
+ return vsli_n_p16(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v4i16
+ // CHECK_CODEGEN: sli.4h v0, v1, #1
+}
+
+int8x16_t test_vsliq_n_s8(int8x16_t a1, int8x16_t a2) {
+ // CHECK: test_vsliq_n_s8
+ return vsliq_n_s8(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsli.v16i8
+ // CHECK_CODEGEN: sli.16b v0, v1, #3
+}
+
+int16x8_t test_vsliq_n_s16(int16x8_t a1, int16x8_t a2) {
+ // CHECK: test_vsliq_n_s16
+ return vsliq_n_s16(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsli.v8i16
+ // CHECK_CODEGEN: sli.8h v0, v1, #3
+}
+
+int32x4_t test_vsliq_n_s32(int32x4_t a1, int32x4_t a2) {
+ // CHECK: test_vsliq_n_s32
+ return vsliq_n_s32(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v4i32
+ // CHECK_CODEGEN: sli.4s v0, v1, #1
+}
+
+int64x2_t test_vsliq_n_s64(int64x2_t a1, int64x2_t a2) {
+ // CHECK: test_vsliq_n_s64
+ return vsliq_n_s64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v2i64
+ // CHECK_CODEGEN: sli.2d v0, v1, #1
+}
+
+uint8x16_t test_vsliq_n_u8(uint8x16_t a1, uint8x16_t a2) {
+ // CHECK: test_vsliq_n_u8
+ return vsliq_n_u8(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsli.v16i8
+ // CHECK_CODEGEN: sli.16b v0, v1, #3
+}
+
+uint16x8_t test_vsliq_n_u16(uint16x8_t a1, uint16x8_t a2) {
+ // CHECK: test_vsliq_n_u16
+ return vsliq_n_u16(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsli.v8i16
+ // CHECK_CODEGEN: sli.8h v0, v1, #3
+}
+
+uint32x4_t test_vsliq_n_u32(uint32x4_t a1, uint32x4_t a2) {
+ // CHECK: test_vsliq_n_u32
+ return vsliq_n_u32(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v4i32
+ // CHECK_CODEGEN: sli.4s v0, v1, #1
+}
+
+uint64x2_t test_vsliq_n_u64(uint64x2_t a1, uint64x2_t a2) {
+ // CHECK: test_vsliq_n_u64
+ return vsliq_n_u64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v2i64
+ // CHECK_CODEGEN: sli.2d v0, v1, #1
+}
+
+poly8x16_t test_vsliq_n_p8(poly8x16_t a1, poly8x16_t a2) {
+ // CHECK: test_vsliq_n_p8
+ return vsliq_n_p8(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v16i8
+ // CHECK_CODEGEN: sli.16b v0, v1, #1
+}
+
+poly16x8_t test_vsliq_n_p16(poly16x8_t a1, poly16x8_t a2) {
+ // CHECK: test_vsliq_n_p16
+ return vsliq_n_p16(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsli.v8i16
+ // CHECK_CODEGEN: sli.8h v0, v1, #1
+}
+
diff --git a/clang/test/CodeGen/arm64_vsri.c b/clang/test/CodeGen/arm64_vsri.c
new file mode 100644
index 00000000000..2a722877525
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vsri.c
@@ -0,0 +1,149 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - %s | \
+// RUN: FileCheck -check-prefix=CHECK_CODEGEN %s
+// REQUIRES: arm64-registered-target
+
+// Test ARM64 SIMD vector shift right and insert: vsri[q]_n_*
+
+#include <arm_neon.h>
+
+int8x8_t test_vsri_n_s8(int8x8_t a1, int8x8_t a2) {
+ // CHECK: test_vsri_n_s8
+ return vsri_n_s8(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsri.v8i8
+ // CHECK_CODEGEN: sri.8b v0, v1, #3
+}
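+
+// For reference, SRI shifts the second operand right by #n and inserts it
+// over the first, preserving only the high n bits of the first operand.
+// Scalar model of the 8-bit case (illustrative only, not checked):
+static inline unsigned char sri_model_u8(unsigned char a1, unsigned char a2, int n) {
+ unsigned char kept = (unsigned char)(0xffu << (8 - n)); // high n bits of a1 survive
+ return (unsigned char)((a1 & kept) | (a2 >> n));
+}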
+
+int16x4_t test_vsri_n_s16(int16x4_t a1, int16x4_t a2) {
+ // CHECK: test_vsri_n_s16
+ return vsri_n_s16(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsri.v4i16
+ // CHECK_CODEGEN: sri.4h v0, v1, #3
+}
+
+int32x2_t test_vsri_n_s32(int32x2_t a1, int32x2_t a2) {
+ // CHECK: test_vsri_n_s32
+ return vsri_n_s32(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v2i32
+ // CHECK_CODEGEN: sri.2s v0, v1, #1
+}
+
+int64x1_t test_vsri_n_s64(int64x1_t a1, int64x1_t a2) {
+ // CHECK: test_vsri_n_s64
+ return vsri_n_s64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v1i64
+ // CHECK_CODEGEN: sri d0, d1, #1
+}
+
+uint8x8_t test_vsri_n_u8(uint8x8_t a1, uint8x8_t a2) {
+ // CHECK: test_vsri_n_u8
+ return vsri_n_u8(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsri.v8i8
+ // CHECK_CODEGEN: sri.8b v0, v1, #3
+}
+
+uint16x4_t test_vsri_n_u16(uint16x4_t a1, uint16x4_t a2) {
+ // CHECK: test_vsri_n_u16
+ return vsri_n_u16(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsri.v4i16
+ // CHECK_CODEGEN: sri.4h v0, v1, #3
+}
+
+uint32x2_t test_vsri_n_u32(uint32x2_t a1, uint32x2_t a2) {
+ // CHECK: test_vsri_n_u32
+ return vsri_n_u32(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v2i32
+ // CHECK_CODEGEN: sri.2s v0, v1, #1
+}
+
+uint64x1_t test_vsri_n_u64(uint64x1_t a1, uint64x1_t a2) {
+ // CHECK: test_vsri_n_u64
+ return vsri_n_u64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v1i64
+ // CHECK_CODEGEN: sri d0, d1, #1
+}
+
+poly8x8_t test_vsri_n_p8(poly8x8_t a1, poly8x8_t a2) {
+ // CHECK: test_vsri_n_p8
+ return vsri_n_p8(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v8i8
+ // CHECK_CODEGEN: sri.8b v0, v1, #1
+}
+
+poly16x4_t test_vsri_n_p16(poly16x4_t a1, poly16x4_t a2) {
+ // CHECK: test_vsri_n_p16
+ return vsri_n_p16(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v4i16
+ // CHECK_CODEGEN: sri.4h v0, v1, #1
+}
+
+int8x16_t test_vsriq_n_s8(int8x16_t a1, int8x16_t a2) {
+ // CHECK: test_vsriq_n_s8
+ return vsriq_n_s8(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsri.v16i8
+ // CHECK_CODEGEN: sri.16b v0, v1, #3
+}
+
+int16x8_t test_vsriq_n_s16(int16x8_t a1, int16x8_t a2) {
+ // CHECK: test_vsriq_n_s16
+ return vsriq_n_s16(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsri.v8i16
+ // CHECK_CODEGEN: sri.8h v0, v1, #3
+}
+
+int32x4_t test_vsriq_n_s32(int32x4_t a1, int32x4_t a2) {
+ // CHECK: test_vsriq_n_s32
+ return vsriq_n_s32(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v4i32
+ // CHECK_CODEGEN: sri.4s v0, v1, #1
+}
+
+int64x2_t test_vsriq_n_s64(int64x2_t a1, int64x2_t a2) {
+ // CHECK: test_vsriq_n_s64
+ return vsriq_n_s64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v2i64
+ // CHECK_CODEGEN: sri.2d v0, v1, #1
+}
+
+uint8x16_t test_vsriq_n_u8(uint8x16_t a1, uint8x16_t a2) {
+ // CHECK: test_vsriq_n_u8
+ return vsriq_n_u8(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsri.v16i8
+ // CHECK_CODEGEN: sri.16b v0, v1, #3
+}
+
+uint16x8_t test_vsriq_n_u16(uint16x8_t a1, uint16x8_t a2) {
+ // CHECK: test_vsriq_n_u16
+ return vsriq_n_u16(a1, a2, 3);
+ // CHECK: llvm.arm64.neon.vsri.v8i16
+ // CHECK_CODEGEN: sri.8h v0, v1, #3
+}
+
+uint32x4_t test_vsriq_n_u32(uint32x4_t a1, uint32x4_t a2) {
+ // CHECK: test_vsriq_n_u32
+ return vsriq_n_u32(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v4i32
+ // CHECK_CODEGEN: sri.4s v0, v1, #1
+}
+
+uint64x2_t test_vsriq_n_u64(uint64x2_t a1, uint64x2_t a2) {
+ // CHECK: test_vsriq_n_u64
+ return vsriq_n_u64(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v2i64
+ // CHECK_CODEGEN: sri.2d v0, v1, #1
+}
+
+poly8x16_t test_vsriq_n_p8(poly8x16_t a1, poly8x16_t a2) {
+ // CHECK: test_vsriq_n_p8
+ return vsriq_n_p8(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v16i8
+ // CHECK_CODEGEN: sri.16b v0, v1, #1
+}
+
+poly16x8_t test_vsriq_n_p16(poly16x8_t a1, poly16x8_t a2) {
+ // CHECK: test_vsriq_n_p16
+ return vsriq_n_p16(a1, a2, 1);
+ // CHECK: llvm.arm64.neon.vsri.v8i16
+ // CHECK_CODEGEN: sri.8h v0, v1, #1
+}
+
diff --git a/clang/test/CodeGen/arm64_vtst.c b/clang/test/CodeGen/arm64_vtst.c
new file mode 100644
index 00000000000..f259f0d0d94
--- /dev/null
+++ b/clang/test/CodeGen/arm64_vtst.c
@@ -0,0 +1,22 @@
+// RUN: %clang -O1 -target arm64-apple-ios7 -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// Test ARM64 SIMD comparison test intrinsics
+
+#include <arm_neon.h>
+
+uint64x2_t test_vtstq_s64(int64x2_t a1, int64x2_t a2) {
+ // CHECK: test_vtstq_s64
+ return vtstq_s64(a1, a2);
+ // CHECK: [[COMMONBITS:%[A-Za-z0-9.]+]] = and <2 x i64> %a1, %a2
+ // CHECK: [[MASK:%[A-Za-z0-9.]+]] = icmp ne <2 x i64> [[COMMONBITS]], zeroinitializer
+ // CHECK: [[RES:%[A-Za-z0-9.]+]] = sext <2 x i1> [[MASK]] to <2 x i64>
+ // CHECK: ret <2 x i64> [[RES]]
+}
+
+uint64x2_t test_vtstq_u64(uint64x2_t a1, uint64x2_t a2) {
+ // CHECK: test_vtstq_u64
+ return vtstq_u64(a1, a2);
+ // CHECK: [[COMMONBITS:%[A-Za-z0-9.]+]] = and <2 x i64> %a1, %a2
+ // CHECK: [[MASK:%[A-Za-z0-9.]+]] = icmp ne <2 x i64> [[COMMONBITS]], zeroinitializer
+ // CHECK: [[RES:%[A-Za-z0-9.]+]] = sext <2 x i1> [[MASK]] to <2 x i64>
+ // CHECK: ret <2 x i64> [[RES]]
+}
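+
+// For reference, VTST yields an all-ones lane whenever the two inputs share
+// a set bit, exactly the and/icmp ne/sext sequence checked above. Scalar
+// model (illustrative only, not checked):
+static inline unsigned long long vtst_model_u64(unsigned long long a, unsigned long long b) {
+ return (a & b) ? ~0ULL : 0ULL;
+}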
diff --git a/clang/test/CodeGen/asm_arm64.c b/clang/test/CodeGen/asm_arm64.c
new file mode 100644
index 00000000000..0eaa347a851
--- /dev/null
+++ b/clang/test/CodeGen/asm_arm64.c
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -emit-llvm -o - %s | FileCheck %s
+
+// rdar://9167275
+
+int t1()
+{
+ int x;
+ __asm__("mov %0, 7" : "=r" (x));
+ return x;
+}
+
+long t2()
+{
+ long x;
+ __asm__("mov %0, 7" : "=r" (x));
+ return x;
+}
+
+long t3()
+{
+ long x;
+ __asm__("mov %w0, 7" : "=r" (x));
+ return x;
+}
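+
+// NB: the "w" template modifier above selects the 32-bit (wN) view of the
+// operand register; a plain %0 on arm64 uses the 64-bit (xN) view.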
+
+// rdar://9281206
+
+void t4(long op) {
+ long x1;
+ asm ("mov x0, %1; svc #0;" : "=r"(x1) :"r"(op),"r"(x1) :"x0" );
+}
+
+// rdar://9394290
+
+float t5(float x) {
+ __asm__("fadd %0, %0, %0" : "+w" (x));
+ return x;
+}
+
+// rdar://9865712
+void t6 (void *f, int g) {
+ // CHECK: t6
+ // CHECK: call void asm "str $1, $0", "=*Q,r"
+ asm("str %1, %0" : "=Q"(f) : "r"(g));
+}
diff --git a/clang/test/CodeGen/atomic-arm64.c b/clang/test/CodeGen/atomic-arm64.c
new file mode 100644
index 00000000000..147b570e21c
--- /dev/null
+++ b/clang/test/CodeGen/atomic-arm64.c
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple=arm64-apple-ios7 | FileCheck %s
+
+// Memory ordering values.
+enum {
+ memory_order_relaxed = 0,
+ memory_order_consume = 1,
+ memory_order_acquire = 2,
+ memory_order_release = 3,
+ memory_order_acq_rel = 4,
+ memory_order_seq_cst = 5
+};
+
+typedef struct { void *a, *b; } pointer_pair_t;
+typedef struct { void *a, *b, *c, *d; } pointer_quad_t;
+
+// rdar://13489679
+
+extern _Atomic(_Bool) a_bool;
+extern _Atomic(float) a_float;
+extern _Atomic(void*) a_pointer;
+extern _Atomic(pointer_pair_t) a_pointer_pair;
+extern _Atomic(pointer_quad_t) a_pointer_quad;
+
+// CHECK: define void @test0()
+// CHECK: [[TEMP:%.*]] = alloca i8, align 1
+// CHECK-NEXT: store i8 1, i8* [[TEMP]]
+// CHECK-NEXT: [[T0:%.*]] = load i8* [[TEMP]], align 1
+// CHECK-NEXT: store atomic i8 [[T0]], i8* @a_bool seq_cst, align 1
+void test0() {
+ __c11_atomic_store(&a_bool, 1, memory_order_seq_cst);
+}
+
+// CHECK: define void @test1()
+// CHECK: [[TEMP:%.*]] = alloca float, align 4
+// CHECK-NEXT: store float 3.000000e+00, float* [[TEMP]]
+// CHECK-NEXT: [[T0:%.*]] = bitcast float* [[TEMP]] to i32*
+// CHECK-NEXT: [[T1:%.*]] = load i32* [[T0]], align 4
+// CHECK-NEXT: store atomic i32 [[T1]], i32* bitcast (float* @a_float to i32*) seq_cst, align 4
+void test1() {
+ __c11_atomic_store(&a_float, 3, memory_order_seq_cst);
+}
+
+// CHECK: define void @test2()
+// CHECK: [[TEMP:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: store i8* @a_bool, i8** [[TEMP]]
+// CHECK-NEXT: [[T0:%.*]] = bitcast i8** [[TEMP]] to i64*
+// CHECK-NEXT: [[T1:%.*]] = load i64* [[T0]], align 8
+// CHECK-NEXT: store atomic i64 [[T1]], i64* bitcast (i8** @a_pointer to i64*) seq_cst, align 8
+void test2() {
+ __c11_atomic_store(&a_pointer, &a_bool, memory_order_seq_cst);
+}
+
+// CHECK: define void @test3(
+// CHECK: [[PAIR:%.*]] = alloca [[PAIR_T:%.*]], align 8
+// CHECK-NEXT: [[TEMP:%.*]] = alloca [[PAIR_T]], align 8
+// CHECK: llvm.memcpy
+// CHECK-NEXT: [[T0:%.*]] = bitcast [[PAIR_T]]* [[TEMP]] to i128*
+// CHECK-NEXT: [[T1:%.*]] = load i128* [[T0]], align 16
+// CHECK-NEXT: store atomic i128 [[T1]], i128* bitcast ([[PAIR_T]]* @a_pointer_pair to i128*) seq_cst, align 16
+void test3(pointer_pair_t pair) {
+ __c11_atomic_store(&a_pointer_pair, pair, memory_order_seq_cst);
+}
+
+// CHECK: define void @test4([[QUAD_T:%.*]]*
+// CHECK: [[TEMP:%.*]] = alloca [[QUAD_T:%.*]], align 8
+// CHECK-NEXT: [[T0:%.*]] = bitcast [[QUAD_T]]* [[TEMP]] to i8*
+// CHECK-NEXT: [[T1:%.*]] = bitcast [[QUAD_T]]* {{%.*}} to i8*
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 32, i32 8, i1 false)
+// CHECK-NEXT: [[T0:%.*]] = bitcast [[QUAD_T]]* [[TEMP]] to i8*
+// CHECK-NEXT: call void @__atomic_store(i64 32, i8* bitcast ([[QUAD_T]]* @a_pointer_quad to i8*), i8* [[T0]], i32 5)
+void test4(pointer_quad_t quad) {
+ __c11_atomic_store(&a_pointer_quad, quad, memory_order_seq_cst);
+}
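+
+// NB: everything up to the 16-byte pointer pair is lowered to a single
+// inline atomic store; the 32-byte quad exceeds the inline-atomic width and
+// falls back to the __atomic_store libcall, as checked above.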
diff --git a/clang/test/CodeGen/blockstret.c b/clang/test/CodeGen/blockstret.c
index d5dae6f3a61..edd1218b13e 100644
--- a/clang/test/CodeGen/blockstret.c
+++ b/clang/test/CodeGen/blockstret.c
@@ -1,13 +1,20 @@
// RUN: %clang_cc1 -fblocks -triple x86_64-apple-darwin9 %s -emit-llvm -o - | FileCheck %s -check-prefix=X64
// RUN: %clang_cc1 -fblocks -triple i686-apple-darwin9 %s -emit-llvm -o - | FileCheck %s -check-prefix=X32
+// RUN: %clang_cc1 -fblocks -triple arm64-apple-darwin %s -emit-llvm -o - | FileCheck %s -check-prefix=ARM64
// X64: internal constant {{.*}} { i8** @_NSConcreteGlobalBlock, i32 1879048192
-// X64: store i32 1610612736, i32* %want
+// X64: store i32 1610612736, i32* %want
// X32: @_NSConcreteGlobalBlock, i32 1879048192, i32 0,
// X32: store i32 1610612736, i32* %want
// rdar://7677537
+
+// ARM64: @_NSConcreteGlobalBlock, i32 1342177280, i32 0,
+// ARM64: store i32 1610612736, i32* %want
+
+// rdar://9757126
+
int printf(const char *, ...);
void *malloc(__SIZE_TYPE__ size);
diff --git a/clang/test/CodeGen/builtins-arm-exclusive.c b/clang/test/CodeGen/builtins-arm-exclusive.c
index 7eccb9e27bb..3d6cc5433ec 100644
--- a/clang/test/CodeGen/builtins-arm-exclusive.c
+++ b/clang/test/CodeGen/builtins-arm-exclusive.c
@@ -1,5 +1,6 @@
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-linux-gnueabi -fno-signed-char -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -O3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARM64
// Make sure the canonical use works before going into smaller details:
int atomic_inc(int *addr) {
@@ -12,46 +13,80 @@ int atomic_inc(int *addr) {
return OldVal;
}
-// CHECK: @atomic_inc
+// CHECK-LABEL: @atomic_inc
// CHECK: [[OLDVAL:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
// CHECK: [[INC:%.*]] = add nsw i32 [[OLDVAL]], 1
// CHECK: [[FAILURE:%.*]] = tail call i32 @llvm.arm.strex.p0i32(i32 [[INC]], i32* %addr)
// CHECK: [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0
// CHECK: br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}}
+// CHECK-ARM64-LABEL: @atomic_inc
+// CHECK-ARM64: [[OLDVAL:%.*]] = tail call i64 @llvm.arm64.ldxr.p0i32(i32* %addr)
+// CHECK-ARM64: [[INC:%.*]] = add i64 [[OLDVAL]], 1
+// CHECK-ARM64: [[TRUNC:%.*]] = and i64 [[INC]], 4294967295
+// CHECK-ARM64: [[FAILURE:%.*]] = tail call i32 @llvm.arm64.stxr.p0i32(i64 [[TRUNC]], i32* %addr)
+// CHECK-ARM64: [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0
+// CHECK-ARM64: br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}}
+
struct Simple {
char a, b;
};
int test_ldrex(char *addr, long long *addr64, float *addrfloat) {
-// CHECK: @test_ldrex
+// CHECK-LABEL: @test_ldrex
+// CHECK-ARM64-LABEL: @test_ldrex
int sum = 0;
sum += __builtin_arm_ldrex(addr);
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i8(i8* %addr)
// CHECK: and i32 [[INTRES]], 255
+// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.arm64.ldxr.p0i8(i8* %addr)
+// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
+// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24
+// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24
+
sum += __builtin_arm_ldrex((short *)addr);
// CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]])
// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16
// CHECK: ashr exact i32 [[TMPSEXT]], 16
+// CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
+// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.arm64.ldxr.p0i16(i16* [[ADDR16]])
+// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
+// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16
+// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16
+
sum += __builtin_arm_ldrex((int *)addr);
// CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
// CHECK: call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
+// CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
+// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.arm64.ldxr.p0i32(i32* [[ADDR32]])
+// CHECK-ARM64: trunc i64 [[INTRES]] to i32
+
sum += __builtin_arm_ldrex((long long *)addr);
// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
+// CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
+// CHECK-ARM64: call i64 @llvm.arm64.ldxr.p0i64(i64* [[ADDR64]])
+
sum += __builtin_arm_ldrex(addr64);
// CHECK: [[ADDR64_AS8:%.*]] = bitcast i64* %addr64 to i8*
// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* [[ADDR64_AS8]])
+// CHECK-ARM64: call i64 @llvm.arm64.ldxr.p0i64(i64* %addr64)
+
sum += __builtin_arm_ldrex(addrfloat);
// CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]])
// CHECK: bitcast i32 [[INTRES]] to float
+// CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
+// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.arm64.ldxr.p0i32(i32* [[INTADDR]])
+// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
+// CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float
+
sum += __builtin_arm_ldrex((double *)addr);
// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
// CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1
@@ -62,51 +97,110 @@ int test_ldrex(char *addr, long long *addr64, float *addrfloat) {
// CHECK: [[INTRES:%.*]] = or i64 [[RESHIHI]], [[RESLO64]]
// CHECK: bitcast i64 [[INTRES]] to double
+// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.arm64.ldxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: bitcast i64 [[INTRES]] to double
+
sum += *__builtin_arm_ldrex((int **)addr);
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
// CHECK: inttoptr i32 [[INTRES]] to i32*
+// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.arm64.ldxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: inttoptr i64 [[INTRES]] to i32*
+
sum += __builtin_arm_ldrex((struct Simple **)addr)->a;
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
// CHECK: inttoptr i32 [[INTRES]] to %struct.Simple*
+// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.arm64.ldxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple*
return sum;
}
int test_strex(char *addr) {
-// CHECK: @test_strex
+// CHECK-LABEL: @test_strex
+// CHECK-ARM64-LABEL: @test_strex
int res = 0;
struct Simple var = {0};
res |= __builtin_arm_strex(4, addr);
// CHECK: call i32 @llvm.arm.strex.p0i8(i32 4, i8* %addr)
+// CHECK-ARM64: call i32 @llvm.arm64.stxr.p0i8(i64 4, i8* %addr)
+
res |= __builtin_arm_strex(42, (short *)addr);
// CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
// CHECK: call i32 @llvm.arm.strex.p0i16(i32 42, i16* [[ADDR16]])
+// CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
+// CHECK-ARM64: call i32 @llvm.arm64.stxr.p0i16(i64 42, i16* [[ADDR16]])
+
res |= __builtin_arm_strex(42, (int *)addr);
// CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
// CHECK: call i32 @llvm.arm.strex.p0i32(i32 42, i32* [[ADDR32]])
+// CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
+// CHECK-ARM64: call i32 @llvm.arm64.stxr.p0i32(i64 42, i32* [[ADDR32]])
+
res |= __builtin_arm_strex(42, (long long *)addr);
// CHECK: call i32 @llvm.arm.strexd(i32 42, i32 0, i8* %addr)
+// CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
+// CHECK-ARM64: call i32 @llvm.arm64.stxr.p0i64(i64 42, i64* [[ADDR64]])
+
res |= __builtin_arm_strex(2.71828f, (float *)addr);
// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[ADDR32]])
+// CHECK-ARM64: call i32 @llvm.arm64.stxr.p0i32(i64 1076754509, i32* [[ADDR32]])
+
res |= __builtin_arm_strex(3.14159, (double *)addr);
// CHECK: call i32 @llvm.arm.strexd(i32 -266631570, i32 1074340345, i8* %addr)
+// CHECK-ARM64: call i32 @llvm.arm64.stxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]])
+
res |= __builtin_arm_strex(&var, (struct Simple **)addr);
// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32
// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]])
+// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64
+// CHECK-ARM64: call i32 @llvm.arm64.stxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]])
+
return res;
}
void test_clrex() {
-// CHECK: @test_clrex
+// CHECK-LABEL: @test_clrex
+// CHECK-ARM64-LABEL: @test_clrex
__builtin_arm_clrex();
// CHECK: call void @llvm.arm.clrex()
+// CHECK-ARM64: call void @llvm.arm64.clrex()
+}
+
+#ifdef __aarch64__
+// 128-bit tests
+
+__int128 test_ldrex_128(__int128 *addr) {
+// CHECK-ARM64-LABEL: @test_ldrex_128
+
+ return __builtin_arm_ldrex(addr);
+// CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
+// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.arm64.ldxp(i8* [[ADDR8]])
+// CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1
+// CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0
+// CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128
+// CHECK-ARM64: [[RESLO64:%.*]] = zext i64 [[RESLO]] to i128
+// CHECK-ARM64: [[RESHIHI:%.*]] = shl nuw i128 [[RESHI64]], 64
+// CHECK-ARM64: [[INTRES:%.*]] = or i128 [[RESHIHI]], [[RESLO64]]
+// CHECK-ARM64: ret i128 [[INTRES]]
+}
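+
+// For reference, the 128-bit result is rebuilt from the ldxp pair as
+// (hi << 64) | lo, exactly as the checks above describe. Illustrative model:
+static inline __int128 ldxp_model(unsigned long long lo, unsigned long long hi) {
+ return ((__int128)hi << 64) | lo;
+}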
+
+int test_strex_128(__int128 *addr, __int128 val) {
+// CHECK-ARM64-LABEL: @test_strex_128
+
+ return __builtin_arm_strex(val, addr);
+// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64
+// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64
+// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64
+// CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
+// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.arm64.stxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]])
}
+#endif
diff --git a/clang/test/CodeGen/builtins-arm64.c b/clang/test/CodeGen/builtins-arm64.c
new file mode 100644
index 00000000000..fcda92e4fc6
--- /dev/null
+++ b/clang/test/CodeGen/builtins-arm64.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -O3 -emit-llvm -o - %s | FileCheck %s
+
+void f0(void *a, void *b) {
+ __clear_cache(a,b);
+// CHECK: call {{.*}} @__clear_cache
+}