Diffstat (limited to 'llvm/test')
31 files changed, 1834 insertions, 160 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-aapcs.ll b/llvm/test/CodeGen/AArch64/arm64-aapcs.ll index 79a25c1e3b6..7887facb9ac 100644 --- a/llvm/test/CodeGen/AArch64/arm64-aapcs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-aapcs.ll @@ -25,7 +25,7 @@ define [2 x i64] @test_i64x2_align(i32, [2 x i64] %arg, i32 %after) { @var64 = global i64 0, align 8 ; Check stack slots are 64-bit at all times. -define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, +define void @test_stack_slots([8 x i64], i1 %bool, i8 %char, i16 %short, i32 %int, i64 %long) { ; CHECK-LABEL: test_stack_slots: ; CHECK-DAG: ldr w[[ext1:[0-9]+]], [sp, #24] diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll index 727c189721f..05f467e1934 100644 --- a/llvm/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll +++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll @@ -1,4 +1,5 @@ ; RUN: llc -o - %s -mtriple=arm64-apple-ios -O3 -aarch64-enable-collect-loh | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64_32-apple-watchos -O3 -aarch64-enable-collect-loh | FileCheck %s ; Check that the LOH analysis does not crash when the analysed chain ; contains instructions that are filtered out. ; diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll index 773286ef1d7..962e36ddb61 100644 --- a/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll +++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll @@ -1,4 +1,5 @@ ; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64_32-apple-ios -O2 | FileCheck %s ; Test case for <rdar://problem/15942912>. ; AdrpAddStr cannot be used when the store uses the same ; register as address and value.
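As an editorial aside (a sketch, not part of the diff; register and symbol names invented): the AdrpAddStr restriction above exists because the linker hint rewrites the add+str pair, which is only sound when the stored value lives in a different register from the address:

;   Lloh0:  adrp x8, _var@PAGE
;   Lloh1:  add  x8, x8, _var@PAGEOFF
;   Lloh2:  str  x8, [x8]        ; x8 is both the address and the stored value
; A ".loh AdrpAddStr Lloh0, Lloh1, Lloh2" hint would let the linker delete the
; add and fold the offset into the str, silently changing the value stored
; from x8, so the hint must not be emitted for this pattern.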
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll index eb3607dd437..816e5a7cc6f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll +++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -1,4 +1,5 @@ ; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64_32-apple-watchos -O2 | FileCheck %s ; RUN: llc -o - %s -mtriple=arm64-linux-gnu -O2 | FileCheck %s --check-prefix=CHECK-ELF ; CHECK-ELF-NOT: .loh @@ -60,9 +61,9 @@ if.end4: ; preds = %if.then2, %if.then, ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i32 @getC() { @@ -76,9 +77,9 @@ define i32 @getC() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsw x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldrsw x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i64 @getSExtC() { @@ -94,10 +95,10 @@ define i64 @getSExtC() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] -; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], [x[[LDRGOT_REG]]] ; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 -; CHECK-NEXT: str [[ADD]], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str [[ADD]], [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] define void @getSeveralC(i32 %t) { @@ -114,9 +115,9 @@ entry: ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define void @setC(i32 %t) { @@ -142,7 +143,7 @@ entry: ; CHECK-NEXT: ret ; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] define i32 @getInternalCPlus4() { - %addr = getelementptr i32, i32* @InternalC, i32 4 + %addr = getelementptr inbounds i32, i32* @InternalC, i32 4 %res = load i32, i32* %addr, align 4 ret i32 %res } @@ -159,7 +160,7 @@ define i32 @getInternalCPlus4() { ; CHECK-NEXT: ret ; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] define i64 @getSExtInternalCPlus4() { - %addr = getelementptr i32, i32* @InternalC, i32 4 + %addr =
getelementptr inbounds i32, i32* @InternalC, i32 4 %res = load i32, i32* %addr, align 4 %sextres = sext i32 %res to i64 ret i64 %sextres @@ -180,7 +181,7 @@ define i64 @getSExtInternalCPlus4() { ; CHECK: .loh AdrpAdd [[ADRP_LABEL]], [[ADDGOT_LABEL]] define void @getSeveralInternalCPlus4(i32 %t) { entry: - %addr = getelementptr i32, i32* @InternalC, i32 4 + %addr = getelementptr inbounds i32, i32* @InternalC, i32 4 %tmp = load i32, i32* %addr, align 4 %add = add nsw i32 %tmp, %t store i32 %add, i32* %addr, align 4 @@ -200,7 +201,7 @@ entry: ; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] define void @setInternalCPlus4(i32 %t) { entry: - %addr = getelementptr i32, i32* @InternalC, i32 4 + %addr = getelementptr inbounds i32, i32* @InternalC, i32 4 store i32 %t, i32* %addr, align 4 ret void } @@ -276,8 +277,8 @@ entry: ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] -; CHECK-NEXT: ldrb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldrb w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] define i8 @getD() { @@ -289,9 +290,9 @@ define i8 @getD() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: strb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: strb w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setD(i8 %t) { @@ -305,9 +306,9 @@ define void @setD(i8 %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldrsb w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i32 @getSExtD() { @@ -322,9 +323,9 @@ define i32 @getSExtD() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsb x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldrsb x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i64 @getSExt64D() { @@ -341,8 +342,8 @@ define i64 @getSExt64D() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] -; CHECK-NEXT: ldrh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldrh w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh 
AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] define i16 @getE() { @@ -356,9 +357,9 @@ define i16 @getE() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldrsh w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i32 @getSExtE() { @@ -371,9 +372,9 @@ define i32 @getSExtE() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: strh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: strh w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setE(i16 %t) { @@ -387,9 +388,9 @@ define void @setE(i16 %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsh x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldrsh x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i64 @getSExt64E() { @@ -406,9 +407,9 @@ define i64 @getSExt64E() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i64 @getF() { @@ -420,9 +421,9 @@ define i64 @getF() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setF(i64 %t) { @@ -438,9 +439,9 @@ define void @setF(i64 %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr s0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr s0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], 
[[LDR_LABEL]] define float @getG() { @@ -452,9 +453,9 @@ define float @getG() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str s0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str s0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setG(float %t) { @@ -470,9 +471,9 @@ define void @setG(float %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr h0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr h0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define half @getH() { @@ -484,9 +485,9 @@ define half @getH() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str h0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str h0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setH(half %t) { @@ -502,9 +503,9 @@ define void @setH(half %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define double @getI() { @@ -516,9 +517,9 @@ define double @getI() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setI(double %t) { @@ -534,9 +535,9 @@ define void @setI(double %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define <2 x i32> 
@getJ() { @@ -548,9 +549,9 @@ define <2 x i32> @getJ() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setJ(<2 x i32> %t) { @@ -566,9 +567,9 @@ define void @setJ(<2 x i32> %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr q0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr q0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define <4 x i32> @getK() { @@ -580,9 +581,9 @@ define <4 x i32> @getK() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str q0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str q0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setK(<4 x i32> %t) { @@ -598,9 +599,9 @@ define void @setK(<4 x i32> %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr b0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr b0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define <1 x i8> @getL() { @@ -612,11 +613,11 @@ define <1 x i8> @getL() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] ; CHECK-NEXT: ; kill ; Ultimately we should generate str b0, but right now, we match the vector ; variant which does not allow folding the immediate into the store.
-; CHECK-NEXT: st1.b { v0 }[0], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: st1.b { v0 }[0], [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] define void @setL(<1 x i8> %t) { diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll index 7dcd6e25ae1..018a1143fc3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-redzone | FileCheck %s +; RUN: llc < %s -mtriple=arm64_32-apple-ios -aarch64-redzone | FileCheck %s define i64* @store64(i64* %ptr, i64 %index, i64 %spacing) { ; CHECK-LABEL: store64: diff --git a/llvm/test/CodeGen/AArch64/arm64-stacksave.ll b/llvm/test/CodeGen/AArch64/arm64-stacksave.ll index a79e99ba323..13d4ae23db6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-stacksave.ll +++ b/llvm/test/CodeGen/AArch64/arm64-stacksave.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -verify-coalescing +; RUN: llc -mtriple=arm64-apple-macosx10.8.0 < %s -verify-coalescing +; RUN: llc -mtriple=arm64_32-apple-ios9.0 < %s -verify-coalescing ; <rdar://problem/11522048> -target triple = "arm64-apple-macosx10.8.0" ; Verify that we can handle spilling the stack pointer without attempting ; spilling it directly. diff --git a/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll b/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll new file mode 100644 index 00000000000..5995de2942e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll @@ -0,0 +1,44 @@ +; RUN: llc -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s + +; If %base < 96 then the sum will not wrap (in an unsigned sense), but "ldr w0, +; [x0, #-96]" would. +define i32 @test_valid_wrap(i32 %base) { +; CHECK-LABEL: test_valid_wrap: +; CHECK: sub w[[ADDR:[0-9]+]], w0, #96 +; CHECK: ldr w0, [x[[ADDR]]] + + %newaddr = add nuw i32 %base, -96 + %ptr = inttoptr i32 %newaddr to i32* + %val = load i32, i32* %ptr + ret i32 %val +} + +define i8 @test_valid_wrap_optimizable(i8* %base) { +; CHECK-LABEL: test_valid_wrap_optimizable: +; CHECK: ldurb w0, [x0, #-96] + + %newaddr = getelementptr inbounds i8, i8* %base, i32 -96 + %val = load i8, i8* %newaddr + ret i8 %val +} + +define i8 @test_valid_wrap_optimizable1(i8* %base, i32 %offset) { +; CHECK-LABEL: test_valid_wrap_optimizable1: +; CHECK: ldrb w0, [x0, w1, sxtw] + + %newaddr = getelementptr inbounds i8, i8* %base, i32 %offset + %val = load i8, i8* %newaddr + ret i8 %val +} + +; +define i8 @test_valid_wrap_optimizable2(i8* %base, i32 %offset) { +; CHECK-LABEL: test_valid_wrap_optimizable2: +; CHECK: sxtw x[[OFFSET:[0-9]+]], w1 +; CHECK: mov w[[BASE:[0-9]+]], #-100 +; CHECK: ldrb w0, [x[[OFFSET]], x[[BASE]]] + + %newaddr = getelementptr inbounds i8, i8* inttoptr(i32 -100 to i8*), i32 %offset + %val = load i8, i8* %newaddr + ret i8 %val +} diff --git a/llvm/test/CodeGen/AArch64/arm64_32-atomics.ll b/llvm/test/CodeGen/AArch64/arm64_32-atomics.ll new file mode 100644 index 00000000000..34682e82f3d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-atomics.ll @@ -0,0 +1,261 @@ +; RUN: llc -mtriple=arm64_32-apple-ios7.0 -o - %s | FileCheck %s + +define i8 @test_load_8(i8* %addr) { +; CHECK-LABEL: test_load_8: +; CHECK: ldarb w0, [x0] + %val = load atomic i8, i8* %addr seq_cst, align 1 + ret i8 %val +} + +define i16 @test_load_16(i16* %addr) { +; CHECK-LABEL: test_load_16: +; CHECK: ldarh w0, [x0] + %val = load atomic i16, i16* %addr acquire, align 2 + ret i16 %val +} + +define i32 @test_load_32(i32* %addr) {
+; CHECK-LABEL: test_load_32: +; CHECK: ldar w0, [x0] + %val = load atomic i32, i32* %addr seq_cst, align 4 + ret i32 %val +} + +define i64 @test_load_64(i64* %addr) { +; CHECK-LABEL: test_load_64: +; CHECK: ldar x0, [x0] + %val = load atomic i64, i64* %addr seq_cst, align 8 + ret i64 %val +} + +define i8* @test_load_ptr(i8** %addr) { +; CHECK-LABEL: test_load_ptr: +; CHECK: ldar w0, [x0] + %val = load atomic i8*, i8** %addr seq_cst, align 8 + ret i8* %val +} + +define void @test_store_8(i8* %addr) { +; CHECK-LABEL: test_store_8: +; CHECK: stlrb wzr, [x0] + store atomic i8 0, i8* %addr seq_cst, align 1 + ret void +} + +define void @test_store_16(i16* %addr) { +; CHECK-LABEL: test_store_16: +; CHECK: stlrh wzr, [x0] + store atomic i16 0, i16* %addr seq_cst, align 2 + ret void +} + +define void @test_store_32(i32* %addr) { +; CHECK-LABEL: test_store_32: +; CHECK: stlr wzr, [x0] + store atomic i32 0, i32* %addr seq_cst, align 4 + ret void +} + +define void @test_store_64(i64* %addr) { +; CHECK-LABEL: test_store_64: +; CHECK: stlr xzr, [x0] + store atomic i64 0, i64* %addr seq_cst, align 8 + ret void +} + +define void @test_store_ptr(i8** %addr) { +; CHECK-LABEL: test_store_ptr: +; CHECK: stlr wzr, [x0] + store atomic i8* null, i8** %addr seq_cst, align 8 + ret void +} + +declare i64 @llvm.aarch64.ldxr.p0i8(i8* %addr) +declare i64 @llvm.aarch64.ldxr.p0i16(i16* %addr) +declare i64 @llvm.aarch64.ldxr.p0i32(i32* %addr) +declare i64 @llvm.aarch64.ldxr.p0i64(i64* %addr) + +define i8 @test_ldxr_8(i8* %addr) { +; CHECK-LABEL: test_ldxr_8: +; CHECK: ldxrb w0, [x0] + + %val = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr) + %val8 = trunc i64 %val to i8 + ret i8 %val8 +} + +define i16 @test_ldxr_16(i16* %addr) { +; CHECK-LABEL: test_ldxr_16: +; CHECK: ldxrh w0, [x0] + + %val = call i64 @llvm.aarch64.ldxr.p0i16(i16* %addr) + %val16 = trunc i64 %val to i16 + ret i16 %val16 +} + +define i32 @test_ldxr_32(i32* %addr) { +; CHECK-LABEL: test_ldxr_32: +; CHECK: ldxr w0, [x0] + + %val = call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr) + %val32 = trunc i64 %val to i32 + ret i32 %val32 +} + +define i64 @test_ldxr_64(i64* %addr) { +; CHECK-LABEL: test_ldxr_64: +; CHECK: ldxr x0, [x0] + + %val = call i64 @llvm.aarch64.ldxr.p0i64(i64* %addr) + ret i64 %val +} + +declare i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr) +declare i64 @llvm.aarch64.ldaxr.p0i16(i16* %addr) +declare i64 @llvm.aarch64.ldaxr.p0i32(i32* %addr) +declare i64 @llvm.aarch64.ldaxr.p0i64(i64* %addr) + +define i8 @test_ldaxr_8(i8* %addr) { +; CHECK-LABEL: test_ldaxr_8: +; CHECK: ldaxrb w0, [x0] + + %val = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr) + %val8 = trunc i64 %val to i8 + ret i8 %val8 +} + +define i16 @test_ldaxr_16(i16* %addr) { +; CHECK-LABEL: test_ldaxr_16: +; CHECK: ldaxrh w0, [x0] + + %val = call i64 @llvm.aarch64.ldaxr.p0i16(i16* %addr) + %val16 = trunc i64 %val to i16 + ret i16 %val16 +} + +define i32 @test_ldaxr_32(i32* %addr) { +; CHECK-LABEL: test_ldaxr_32: +; CHECK: ldaxr w0, [x0] + + %val = call i64 @llvm.aarch64.ldaxr.p0i32(i32* %addr) + %val32 = trunc i64 %val to i32 + ret i32 %val32 +} + +define i64 @test_ldaxr_64(i64* %addr) { +; CHECK-LABEL: test_ldaxr_64: +; CHECK: ldaxr x0, [x0] + + %val = call i64 @llvm.aarch64.ldaxr.p0i64(i64* %addr) + ret i64 %val +} + +declare i32 @llvm.aarch64.stxr.p0i8(i64, i8*) +declare i32 @llvm.aarch64.stxr.p0i16(i64, i16*) +declare i32 @llvm.aarch64.stxr.p0i32(i64, i32*) +declare i32 @llvm.aarch64.stxr.p0i64(i64, i64*) + +define i32 @test_stxr_8(i8* %addr, i8 %val) { +; CHECK-LABEL: test_stxr_8: +; CHECK:
stxrb [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i8 %val to i64 + %success = call i32 @llvm.aarch64.stxr.p0i8(i64 %extval, i8* %addr) + ret i32 %success +} + +define i32 @test_stxr_16(i16* %addr, i16 %val) { +; CHECK-LABEL: test_stxr_16: +; CHECK: stxrh [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i16 %val to i64 + %success = call i32 @llvm.aarch64.stxr.p0i16(i64 %extval, i16* %addr) + ret i32 %success +} + +define i32 @test_stxr_32(i32* %addr, i32 %val) { +; CHECK-LABEL: test_stxr_32: +; CHECK: stxr [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i32 %val to i64 + %success = call i32 @llvm.aarch64.stxr.p0i32(i64 %extval, i32* %addr) + ret i32 %success +} + +define i32 @test_stxr_64(i64* %addr, i64 %val) { +; CHECK-LABEL: test_stxr_64: +; CHECK: stxr [[TMP:w[0-9]+]], x1, [x0] +; CHECK: mov w0, [[TMP]] + + %success = call i32 @llvm.aarch64.stxr.p0i64(i64 %val, i64* %addr) + ret i32 %success +} + +declare i32 @llvm.aarch64.stlxr.p0i8(i64, i8*) +declare i32 @llvm.aarch64.stlxr.p0i16(i64, i16*) +declare i32 @llvm.aarch64.stlxr.p0i32(i64, i32*) +declare i32 @llvm.aarch64.stlxr.p0i64(i64, i64*) + +define i32 @test_stlxr_8(i8* %addr, i8 %val) { +; CHECK-LABEL: test_stlxr_8: +; CHECK: stlxrb [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i8 %val to i64 + %success = call i32 @llvm.aarch64.stlxr.p0i8(i64 %extval, i8* %addr) + ret i32 %success +} + +define i32 @test_stlxr_16(i16* %addr, i16 %val) { +; CHECK-LABEL: test_stlxr_16: +; CHECK: stlxrh [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i16 %val to i64 + %success = call i32 @llvm.aarch64.stlxr.p0i16(i64 %extval, i16* %addr) + ret i32 %success +} + +define i32 @test_stlxr_32(i32* %addr, i32 %val) { +; CHECK-LABEL: test_stlxr_32: +; CHECK: stlxr [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i32 %val to i64 + %success = call i32 @llvm.aarch64.stlxr.p0i32(i64 %extval, i32* %addr) + ret i32 %success +} + +define i32 @test_stlxr_64(i64* %addr, i64 %val) { +; CHECK-LABEL: test_stlxr_64: +; CHECK: stlxr [[TMP:w[0-9]+]], x1, [x0] +; CHECK: mov w0, [[TMP]] + + %success = call i32 @llvm.aarch64.stlxr.p0i64(i64 %val, i64* %addr) + ret i32 %success +} + +define {i8*, i1} @test_cmpxchg_ptr(i8** %addr, i8* %cmp, i8* %new) { +; CHECK-LABEL: test_cmpxchg_ptr: +; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[OLD:w[0-9]+]], [x0] +; CHECK: cmp [[OLD]], w1 +; CHECK: b.ne [[DONE:LBB[0-9]+_[0-9]+]] +; CHECK: stlxr [[SUCCESS:w[0-9]+]], w2, [x0] +; CHECK: cbnz [[SUCCESS]], [[LOOP]] + +; CHECK: mov w1, #1 +; CHECK: mov w0, [[OLD]] +; CHECK: ret + +; CHECK: [[DONE]]: +; CHECK: clrex +; CHECK: mov w1, wzr +; CHECK: mov w0, [[OLD]] +; CHECK: ret + %res = cmpxchg i8** %addr, i8* %cmp, i8* %new acq_rel acquire + ret {i8*, i1} %res +} diff --git a/llvm/test/CodeGen/AArch64/arm64_32-fastisel.ll b/llvm/test/CodeGen/AArch64/arm64_32-fastisel.ll new file mode 100644 index 00000000000..15baad215a1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-fastisel.ll @@ -0,0 +1,28 @@ +; RUN: llc -mtriple=arm64_32-apple-ios -O0 -fast-isel %s -o - | FileCheck %s +@var = global i8* null + +define void @test_store_release_ptr() { +; CHECK-LABEL: test_store_release_ptr +; CHECK: mov [[ZERO:w[0-9]+]], wzr +; CHECK: stlr [[ZERO]] + store atomic i8* null, i8** @var release, align 4 + br label %next + +next: + ret void +} + +declare [2 x i32] @callee() + +define void @test_struct_return(i32* %addr) { +; CHECK-LABEL: test_struct_return: +; 
CHECK: bl _callee +; CHECK-DAG: lsr [[HI:x[0-9]+]], x0, #32 +; CHECK-DAG: str w0 + %res = call [2 x i32] @callee() + %res.0 = extractvalue [2 x i32] %res, 0 + store i32 %res.0, i32* %addr + %res.1 = extractvalue [2 x i32] %res, 1 + store i32 %res.1, i32* %addr + ret void +} diff --git a/llvm/test/CodeGen/AArch64/arm64_32-frame-pointers.ll b/llvm/test/CodeGen/AArch64/arm64_32-frame-pointers.ll new file mode 100644 index 00000000000..34f5d9b3160 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-frame-pointers.ll @@ -0,0 +1,26 @@ +; RUN: llc -mtriple=arm64_32-apple-ios8.0 %s -o - | FileCheck %s + +; We're provoking LocalStackSlotAllocation to create some shared frame bases +; here: it wants multiple <fi#N> using instructions that can be satisfied by a +; single base, but not within the addressing-mode. +; +; When that happens it's important that we don't mix our pointer sizes +; (e.g. try to create an ldr from a w-register base). +define i8 @test_register_wrangling() { +; CHECK-LABEL: test_register_wrangling: +; CHECK: add [[TMP:x[0-9]+]], sp, +; CHECK: add x[[BASE:[0-9]+]], [[TMP]], +; CHECK: ldrb {{w[0-9]+}}, [x[[BASE]], #1] +; CHECK: ldrb {{w[0-9]+}}, [x[[BASE]]] + + %var1 = alloca i8, i32 4100 + %var3 = alloca i8 + %dummy = alloca i8, i32 4100 + + %var1p1 = getelementptr i8, i8* %var1, i32 1 + %val1 = load i8, i8* %var1 + %val2 = load i8, i8* %var3 + + %sum = add i8 %val1, %val2 + ret i8 %sum +} diff --git a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll new file mode 100644 index 00000000000..21c49d38877 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll @@ -0,0 +1,61 @@ +; RUN: opt -codegenprepare -mtriple=arm64_32-apple-ios %s -S -o - | FileCheck %s + +define void @test_simple_sink(i1* %base, i64 %offset) { +; CHECK-LABEL: @test_simple_sink +; CHECK: next: +; CHECK: [[BASE8:%.*]] = bitcast i1* %base to i8* +; CHECK: [[ADDR8:%.*]] = getelementptr i8, i8* [[BASE8]], i64 %offset +; CHECK: [[ADDR:%.*]] = bitcast i8* [[ADDR8]] to i1* +; CHECK: load volatile i1, i1* [[ADDR]] + %addr = getelementptr i1, i1* %base, i64 %offset + %tst = load i1, i1* %addr + br i1 %tst, label %next, label %end + +next: + load volatile i1, i1* %addr + ret void + +end: + ret void +} + +define void @test_inbounds_sink(i1* %base, i64 %offset) { +; CHECK-LABEL: @test_inbounds_sink +; CHECK: next: +; CHECK: [[BASE8:%.*]] = bitcast i1* %base to i8* +; CHECK: [[ADDR8:%.*]] = getelementptr inbounds i8, i8* [[BASE8]], i64 %offset +; CHECK: [[ADDR:%.*]] = bitcast i8* [[ADDR8]] to i1* +; CHECK: load volatile i1, i1* [[ADDR]] + %addr = getelementptr inbounds i1, i1* %base, i64 %offset + %tst = load i1, i1* %addr + br i1 %tst, label %next, label %end + +next: + load volatile i1, i1* %addr + ret void + +end: + ret void +} + +; No address derived via an add can be guaranteed inbounds +define void @test_add_sink(i1* %base, i64 %offset) { +; CHECK-LABEL: @test_add_sink +; CHECK: next: +; CHECK: [[BASE8:%.*]] = bitcast i1* %base to i8* +; CHECK: [[ADDR8:%.*]] = getelementptr i8, i8* [[BASE8]], i64 %offset +; CHECK: [[ADDR:%.*]] = bitcast i8* [[ADDR8]] to i1* +; CHECK: load volatile i1, i1* [[ADDR]] + %base64 = ptrtoint i1* %base to i64 + %addr64 = add nsw nuw i64 %base64, %offset + %addr = inttoptr i64 %addr64 to i1* + %tst = load i1, i1* %addr + br i1 %tst, label %next, label %end + +next: + load volatile i1, i1* %addr + ret void + +end: + ret void +} diff --git a/llvm/test/CodeGen/AArch64/arm64_32-memcpy.ll b/llvm/test/CodeGen/AArch64/arm64_32-memcpy.ll new 
file mode 100644 index 00000000000..f484a2fe651 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-memcpy.ll @@ -0,0 +1,66 @@ +; RUN: llc -mtriple=arm64_32-apple-ios9.0 -o - %s | FileCheck %s + +define i64 @test_memcpy(i64* %addr, i8* %src, i1 %tst) minsize { +; CHECK-LABEL: test_memcpy: +; CHECK: ldr [[VAL64:x[0-9]+]], [x0] +; [...] +; CHECK: and x0, [[VAL64]], #0xffffffff +; CHECK: bl _memcpy + + %val64 = load i64, i64* %addr + br i1 %tst, label %true, label %false + +true: + ret i64 %val64 + +false: + %val32 = trunc i64 %val64 to i32 + %val.ptr = inttoptr i32 %val32 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %val.ptr, i8* %src, i32 128, i32 0, i1 1) + ret i64 undef +} + +define i64 @test_memmove(i64* %addr, i8* %src, i1 %tst) minsize { +; CHECK-LABEL: test_memmove: +; CHECK: ldr [[VAL64:x[0-9]+]], [x0] +; [...] +; CHECK: and x0, [[VAL64]], #0xffffffff +; CHECK: bl _memmove + + %val64 = load i64, i64* %addr + br i1 %tst, label %true, label %false + +true: + ret i64 %val64 + +false: + %val32 = trunc i64 %val64 to i32 + %val.ptr = inttoptr i32 %val32 to i8* + call void @llvm.memmove.p0i8.p0i8.i32(i8* %val.ptr, i8* %src, i32 128, i32 0, i1 1) + ret i64 undef +} + +define i64 @test_memset(i64* %addr, i8* %src, i1 %tst) minsize { +; CHECK-LABEL: test_memset: +; CHECK: ldr [[VAL64:x[0-9]+]], [x0] +; [...] +; CHECK: and x0, [[VAL64]], #0xffffffff +; CHECK: bl _memset + + %val64 = load i64, i64* %addr + br i1 %tst, label %true, label %false + +true: + ret i64 %val64 + +false: + %val32 = trunc i64 %val64 to i32 + %val.ptr = inttoptr i32 %val32 to i8* + call void @llvm.memset.p0i8.i32(i8* %val.ptr, i8 42, i32 256, i32 0, i1 1) + ret i64 undef +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) +declare void @llvm.memmove.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) +declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1) + diff --git a/llvm/test/CodeGen/AArch64/arm64_32-neon.ll b/llvm/test/CodeGen/AArch64/arm64_32-neon.ll new file mode 100644 index 00000000000..9a1ecb2bc16 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-neon.ll @@ -0,0 +1,198 @@ +; RUN: llc -mtriple=arm64_32-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s + +define <2 x double> @test_insert_elt(<2 x double> %vec, double %val) { +; CHECK-LABEL: test_insert_elt: +; CHECK: mov.d v0[0], v1[0] + %res = insertelement <2 x double> %vec, double %val, i32 0 + ret <2 x double> %res +} + +define void @test_split_16B(<4 x float> %val, <4 x float>* %addr) { +; CHECK-LABEL: test_split_16B: +; CHECK: str q0, [x0] + store <4 x float> %val, <4 x float>* %addr, align 8 + ret void +} + +define void @test_split_16B_splat(<4 x i32>, <4 x i32>* %addr) { +; CHECK-LABEL: test_split_16B_splat: +; CHECK: str {{q[0-9]+}} + + %vec.tmp0 = insertelement <4 x i32> undef, i32 42, i32 0 + %vec.tmp1 = insertelement <4 x i32> %vec.tmp0, i32 42, i32 1 + %vec.tmp2 = insertelement <4 x i32> %vec.tmp1, i32 42, i32 2 + %vec = insertelement <4 x i32> %vec.tmp2, i32 42, i32 3 + + store <4 x i32> %vec, <4 x i32>* %addr, align 8 + ret void +} + + +%vec = type <2 x double> + +declare {%vec, %vec} @llvm.aarch64.neon.ld2r.v2f64.p0i8(i8*) +define {%vec, %vec} @test_neon_load(i8* %addr) { +; CHECK-LABEL: test_neon_load: +; CHECK: ld2r.2d { v0, v1 }, [x0] + %res = call {%vec, %vec} @llvm.aarch64.neon.ld2r.v2f64.p0i8(i8* %addr) + ret {%vec, %vec} %res +} + +declare {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0i8(%vec, %vec, i64, i8*) +define {%vec, %vec} @test_neon_load_lane(i8* %addr, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_load_lane: 
+; CHECK: ld2.d { v0, v1 }[0], [x0] + %res = call {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0i8(%vec %in1, %vec %in2, i64 0, i8* %addr) + ret {%vec, %vec} %res +} + +declare void @llvm.aarch64.neon.st2.v2f64.p0i8(%vec, %vec, i8*) +define void @test_neon_store(i8* %addr, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_store: +; CHECK: st2.2d { v0, v1 }, [x0] + call void @llvm.aarch64.neon.st2.v2f64.p0i8(%vec %in1, %vec %in2, i8* %addr) + ret void +} + +declare void @llvm.aarch64.neon.st2lane.v2f64.p0i8(%vec, %vec, i64, i8*) +define void @test_neon_store_lane(i8* %addr, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_store_lane: +; CHECK: st2.d { v0, v1 }[1], [x0] + call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(%vec %in1, %vec %in2, i64 1, i8* %addr) + ret void +} + +declare {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0i8(i8*) +define {{%vec, %vec}, i8*} @test_neon_load_post(i8* %addr, i32 %offset) { +; CHECK-LABEL: test_neon_load_post: +; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 +; CHECK: ld2.2d { v0, v1 }, [x0], [[OFFSET]] + + %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0i8(i8* %addr) + + %addr.new = getelementptr inbounds i8, i8* %addr, i32 %offset + + %res.tmp = insertvalue {{%vec, %vec}, i8*} undef, {%vec, %vec} %vecs, 0 + %res = insertvalue {{%vec, %vec}, i8*} %res.tmp, i8* %addr.new, 1 + ret {{%vec, %vec}, i8*} %res +} + +define {{%vec, %vec}, i8*} @test_neon_load_post_lane(i8* %addr, i32 %offset, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_load_post_lane: +; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 +; CHECK: ld2.d { v0, v1 }[1], [x0], [[OFFSET]] + + %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0i8(%vec %in1, %vec %in2, i64 1, i8* %addr) + + %addr.new = getelementptr inbounds i8, i8* %addr, i32 %offset + + %res.tmp = insertvalue {{%vec, %vec}, i8*} undef, {%vec, %vec} %vecs, 0 + %res = insertvalue {{%vec, %vec}, i8*} %res.tmp, i8* %addr.new, 1 + ret {{%vec, %vec}, i8*} %res +} + +define i8* @test_neon_store_post(i8* %addr, i32 %offset, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_store_post: +; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 +; CHECK: st2.2d { v0, v1 }, [x0], [[OFFSET]] + + call void @llvm.aarch64.neon.st2.v2f64.p0i8(%vec %in1, %vec %in2, i8* %addr) + + %addr.new = getelementptr inbounds i8, i8* %addr, i32 %offset + + ret i8* %addr.new +} + +define i8* @test_neon_store_post_lane(i8* %addr, i32 %offset, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_store_post_lane: +; CHECK: sxtw [[OFFSET:x[0-9]+]], w1 +; CHECK: st2.d { v0, v1 }[0], [x0], [[OFFSET]] + + call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(%vec %in1, %vec %in2, i64 0, i8* %addr) + + %addr.new = getelementptr inbounds i8, i8* %addr, i32 %offset + + ret i8* %addr.new +} + +; ld1 is slightly different because it goes via ISelLowering of normal IR ops +; rather than an intrinsic. 
+define {%vec, double*} @test_neon_ld1_post_lane(double* %addr, i32 %offset, %vec %in) { +; CHECK-LABEL: test_neon_ld1_post_lane: +; CHECK: sbfiz [[OFFSET:x[0-9]+]], x1, #3, #32 +; CHECK: ld1.d { v0 }[0], [x0], [[OFFSET]] + + %loaded = load double, double* %addr, align 8 + %newvec = insertelement %vec %in, double %loaded, i32 0 + + %addr.new = getelementptr inbounds double, double* %addr, i32 %offset + + %res.tmp = insertvalue {%vec, double*} undef, %vec %newvec, 0 + %res = insertvalue {%vec, double*} %res.tmp, double* %addr.new, 1 + + ret {%vec, double*} %res +} + +define {{%vec, %vec}, i8*} @test_neon_load_post_exact(i8* %addr) { +; CHECK-LABEL: test_neon_load_post_exact: +; CHECK: ld2.2d { v0, v1 }, [x0], #32 + + %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0i8(i8* %addr) + + %addr.new = getelementptr inbounds i8, i8* %addr, i32 32 + + %res.tmp = insertvalue {{%vec, %vec}, i8*} undef, {%vec, %vec} %vecs, 0 + %res = insertvalue {{%vec, %vec}, i8*} %res.tmp, i8* %addr.new, 1 + ret {{%vec, %vec}, i8*} %res +} + +define {%vec, double*} @test_neon_ld1_post_lane_exact(double* %addr, %vec %in) { +; CHECK-LABEL: test_neon_ld1_post_lane_exact: +; CHECK: ld1.d { v0 }[0], [x0], #8 + + %loaded = load double, double* %addr, align 8 + %newvec = insertelement %vec %in, double %loaded, i32 0 + + %addr.new = getelementptr inbounds double, double* %addr, i32 1 + + %res.tmp = insertvalue {%vec, double*} undef, %vec %newvec, 0 + %res = insertvalue {%vec, double*} %res.tmp, double* %addr.new, 1 + + ret {%vec, double*} %res +} + +; As in the general load/store case, this GEP has defined semantics when the +; address wraps. We cannot use post-indexed addressing. +define {%vec, double*} @test_neon_ld1_notpost_lane_exact(double* %addr, %vec %in) { +; CHECK-LABEL: test_neon_ld1_notpost_lane_exact: +; CHECK-NOT: ld1.d { {{v[0-9]+}} }[0], [{{x[0-9]+|sp}}], #8 +; CHECK: add w0, w0, #8 +; CHECK: ret + + %loaded = load double, double* %addr, align 8 + %newvec = insertelement %vec %in, double %loaded, i32 0 + + %addr.new = getelementptr double, double* %addr, i32 1 + + %res.tmp = insertvalue {%vec, double*} undef, %vec %newvec, 0 + %res = insertvalue {%vec, double*} %res.tmp, double* %addr.new, 1 + + ret {%vec, double*} %res +} + +define {%vec, double*} @test_neon_ld1_notpost_lane(double* %addr, i32 %offset, %vec %in) { +; CHECK-LABEL: test_neon_ld1_notpost_lane: +; CHECK-NOT: ld1.d { {{v[0-9]+}} }[0], [{{x[0-9]+|sp}}], {{x[0-9]+|sp}} +; CHECK: add w0, w0, w1, lsl #3 +; CHECK: ret + + %loaded = load double, double* %addr, align 8 + %newvec = insertelement %vec %in, double %loaded, i32 0 + + %addr.new = getelementptr double, double* %addr, i32 %offset + + %res.tmp = insertvalue {%vec, double*} undef, %vec %newvec, 0 + %res = insertvalue {%vec, double*} %res.tmp, double* %addr.new, 1 + + ret {%vec, double*} %res +} diff --git a/llvm/test/CodeGen/AArch64/arm64_32-null.ll b/llvm/test/CodeGen/AArch64/arm64_32-null.ll new file mode 100644 index 00000000000..9d62c56248b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-null.ll @@ -0,0 +1,28 @@ +; RUN: llc -fast-isel=true -global-isel=false -O0 -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s +; RUN: llc -fast-isel=false -global-isel=false -O0 -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s + +define void @test_store(i8** %p) { +; CHECK-LABEL: test_store: +; CHECK: mov [[R1:w[0-9]+]], wzr +; CHECK: str [[R1]], [x0] + + store i8* null, i8** %p + ret void +} + +define void @test_phi(i8** %p) { +; CHECK-LABEL: test_phi: +; CHECK: mov [[R1:x[0-9]+]], xzr +; 
CHECK: str [[R1]], [sp] +; CHECK: b [[BB:LBB[0-9_]+]] +; CHECK: [[BB]]: +; CHECK: ldr x0, [sp] +; CHECK: str w0, [x{{.*}}] + +bb0: + br label %bb1 +bb1: + %tmp0 = phi i8* [ null, %bb0 ] + store i8* %tmp0, i8** %p + ret void +} diff --git a/llvm/test/CodeGen/AArch64/arm64_32-pointer-extend.ll b/llvm/test/CodeGen/AArch64/arm64_32-pointer-extend.ll new file mode 100644 index 00000000000..74b88305b57 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-pointer-extend.ll @@ -0,0 +1,49 @@ +; RUN: llc -mtriple=arm64_32-apple-ios7.0 %s -o - | FileCheck %s + +define void @pass_pointer(i64 %in) { +; CHECK-LABEL: pass_pointer: +; CHECK: and x0, x0, #0xffffffff +; CHECK: bl _take_pointer + + %in32 = trunc i64 %in to i32 + %ptr = inttoptr i32 %in32 to i8* + call i64 @take_pointer(i8* %ptr) + ret void +} + +define i64 @take_pointer(i8* %ptr) nounwind { +; CHECK-LABEL: take_pointer: +; CHECK-NEXT: %bb.0 +; CHECK-NEXT: ret + + %val = ptrtoint i8* %ptr to i32 + %res = zext i32 %val to i64 + ret i64 %res +} + +define i32 @callee_ptr_stack_slot([8 x i64], i8*, i32 %val) { +; CHECK-LABEL: callee_ptr_stack_slot: +; CHECK: ldr w0, [sp, #4] + + ret i32 %val +} + +define void @caller_ptr_stack_slot(i8* %ptr) { +; CHECK-LABEL: caller_ptr_stack_slot: +; CHECK-DAG: mov [[VAL:w[0-9]]], #42 +; CHECK: stp w0, [[VAL]], [sp] + + call i32 @callee_ptr_stack_slot([8 x i64] undef, i8* %ptr, i32 42) + ret void +} + +define i8* @return_ptr(i64 %in, i64 %r) { +; CHECK-LABEL: return_ptr: +; CHECK: sdiv [[VAL64:x[0-9]+]], x0, x1 +; CHECK: and x0, [[VAL64]], #0xffffffff + + %sum = sdiv i64 %in, %r + %sum32 = trunc i64 %sum to i32 + %res = inttoptr i32 %sum32 to i8* + ret i8* %res +} diff --git a/llvm/test/CodeGen/AArch64/arm64_32-stack-pointers.ll b/llvm/test/CodeGen/AArch64/arm64_32-stack-pointers.ll new file mode 100644 index 00000000000..a233e3416c1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-stack-pointers.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=arm64_32-apple-ios9.0 -o - %s | FileCheck %s + +declare void @callee([8 x i64], i8*, i8*) + +; Make sure we don't accidentally store X0 or XZR, which might well +; clobber other arguments or data. 
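Spelling out the hazard (an editorial note, not part of the diff; the stp below is the code the test expects):

;   On arm64_32 each i8* argument occupies a 4-byte stack slot, so:
;   str xzr, [sp]       ; wrong: a 64-bit store also clobbers %in's slot at [sp, #4]
;   stp wzr, w0, [sp]   ; right: two 32-bit stores, one per argument slot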
+define void @test_stack_ptr_32bits(i8* %in) { +; CHECK-LABEL: test_stack_ptr_32bits: +; CHECK-DAG: stp wzr, w0, [sp] + + call void @callee([8 x i64] undef, i8* null, i8* %in) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/arm64_32-tls.ll b/llvm/test/CodeGen/AArch64/arm64_32-tls.ll new file mode 100644 index 00000000000..fada715304c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-tls.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s + +define i32 @test_thread_local() { +; CHECK-LABEL: test_thread_local: +; CHECK: adrp x[[TMP:[0-9]+]], _var@TLVPPAGE +; CHECK: ldr w0, [x[[TMP]], _var@TLVPPAGEOFF] +; CHECK: ldr w[[DEST:[0-9]+]], [x0] +; CHECK: blr x[[DEST]] + + %val = load i32, i32* @var + ret i32 %val +} + +@var = thread_local global i32 zeroinitializer + +; CHECK: .tbss _var$tlv$init, 4, 2 + +; CHECK-LABEL: __DATA,__thread_vars +; CHECK: _var: +; CHECK: .long __tlv_bootstrap +; CHECK: .long 0 +; CHECK: .long _var$tlv$init diff --git a/llvm/test/CodeGen/AArch64/arm64_32-va.ll b/llvm/test/CodeGen/AArch64/arm64_32-va.ll new file mode 100644 index 00000000000..94ff4716139 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32-va.ll @@ -0,0 +1,56 @@ +; RUN: llc -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s + +define void @test_va_copy(i8* %dst, i8* %src) { +; CHECK-LABEL: test_va_copy: +; CHECK: ldr [[PTR:w[0-9]+]], [x1] +; CHECK: str [[PTR]], [x0] + + call void @llvm.va_copy(i8* %dst, i8* %src) + ret void +} + +define void @test_va_start(i32, ...) { +; CHECK-LABEL: test_va_start +; CHECK: add x[[LIST:[0-9]+]], sp, #16 +; CHECK: str w[[LIST]], + %slot = alloca i8*, align 4 + %list = bitcast i8** %slot to i8* + call void @llvm.va_start(i8* %list) + ret void +} + +define void @test_va_start_odd([8 x i64], i32, ...) 
{ +; CHECK-LABEL: test_va_start_odd: +; CHECK: add x[[LIST:[0-9]+]], sp, #20 +; CHECK: str w[[LIST]], + %slot = alloca i8*, align 4 + %list = bitcast i8** %slot to i8* + call void @llvm.va_start(i8* %list) + ret void +} + +define i8* @test_va_arg(i8** %list) { +; CHECK-LABEL: test_va_arg: +; CHECK: ldr w[[LOC:[0-9]+]], [x0] +; CHECK: add [[NEXTLOC:w[0-9]+]], w[[LOC]], #4 +; CHECK: str [[NEXTLOC]], [x0] +; CHECK: ldr w0, [x[[LOC]]] + %res = va_arg i8** %list, i8* + ret i8* %res +} + +define i8* @really_test_va_arg(i8** %list, i1 %tst) { +; CHECK-LABEL: really_test_va_arg: +; CHECK: ldr w[[LOC:[0-9]+]], [x0] +; CHECK: add [[NEXTLOC:w[0-9]+]], w[[LOC]], #4 +; CHECK: str [[NEXTLOC]], [x0] +; CHECK: ldr w[[VAARG:[0-9]+]], [x[[LOC]]] +; CHECK: csel x0, x[[VAARG]], xzr + %tmp = va_arg i8** %list, i8* + %res = select i1 %tst, i8* %tmp, i8* null + ret i8* %res +} + +declare void @llvm.va_start(i8*) + +declare void @llvm.va_copy(i8*, i8*) diff --git a/llvm/test/CodeGen/AArch64/arm64_32.ll b/llvm/test/CodeGen/AArch64/arm64_32.ll new file mode 100644 index 00000000000..5fd619409a1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64_32.ll @@ -0,0 +1,715 @@ +; RUN: llc -mtriple=arm64_32-apple-ios7.0 %s -filetype=obj -o - -disable-post-ra -frame-pointer=all | \ +; RUN: llvm-objdump -private-headers - | \ +; RUN: FileCheck %s --check-prefix=CHECK-MACHO +; RUN: llc -mtriple=arm64_32-apple-ios7.0 %s -o - -aarch64-enable-atomic-cfg-tidy=0 -disable-post-ra -frame-pointer=all | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OPT +; RUN: llc -mtriple=arm64_32-apple-ios7.0 %s -o - -fast-isel -aarch64-enable-atomic-cfg-tidy=0 -disable-post-ra -frame-pointer=all | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FAST + +; CHECK-MACHO: Mach header +; CHECK-MACHO: MH_MAGIC ARM64_32 V8 + +@var64 = global i64 zeroinitializer, align 8 +@var32 = global i32 zeroinitializer, align 4 + +@var_got = external global i8 + +define i32* @test_global_addr() { +; CHECK-LABEL: test_global_addr: +; CHECK: adrp [[PAGE:x[0-9]+]], _var32@PAGE +; CHECK: add x0, [[PAGE]], _var32@PAGEOFF + ret i32* @var32 +} + +; ADRP is necessarily 64-bit. The important point to check is that, however that +; gets truncated to 32-bits, it's free. No need to zero out higher bits of that +; register. +define i64 @test_global_addr_extension() { +; CHECK-LABEL: test_global_addr_extension: +; CHECK: adrp [[PAGE:x[0-9]+]], _var32@PAGE +; CHECK: add x0, [[PAGE]], _var32@PAGEOFF +; CHECK-NOT: and +; CHECK: ret + + ret i64 ptrtoint(i32* @var32 to i64) +} + +define i32 @test_global_value() { +; CHECK-LABEL: test_global_value: +; CHECK: adrp x[[PAGE:[0-9]+]], _var32@PAGE +; CHECK: ldr w0, [x[[PAGE]], _var32@PAGEOFF] + %val = load i32, i32* @var32, align 4 + ret i32 %val +} + +; Because the addition may wrap, it is not safe to use "ldr w0, [xN, #32]" here. +define i32 @test_unsafe_indexed_add() { +; CHECK-LABEL: test_unsafe_indexed_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_plus_32 = add i32 %addr_int, 32 + %addr = inttoptr i32 %addr_plus_32 to i32* + %val = load i32, i32* %addr, align 4 + ret i32 %val +} + +; Since we've promised there is no unsigned overflow, @var32 must be at least +; 32-bytes below 2^32, and we can use the load this time. 
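An editorial gloss on the wrap-safety comments in this file (not part of the diff):

;   The AArch64 addressing modes compute a 64-bit sum: zext(base) + offset.
;   "add nuw" promises base + offset does not wrap past 2^32, so
;   zext(base + offset) == zext(base) + offset and folding the offset into
;   the load computes the same address. "add nsw" only constrains the
;   sign-extended sum, which says nothing about the zero-extended one,
;   hence the "unsafe" cases below.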
+define i32 @test_safe_indexed_add() { +; CHECK-LABEL: test_safe_indexed_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i64 + %addr_plus_32 = add nuw i64 %addr_int, 32 + %addr = inttoptr i64 %addr_plus_32 to i32* + %val = load i32, i32* %addr, align 4 + ret i32 %val +} + +define i32 @test_safe_indexed_or(i32 %in) { +; CHECK-LABEL: test_safe_indexed_or: +; CHECK: and [[TMP:w[0-9]+]], {{w[0-9]+}}, #0xfffffff0 +; CHECK: orr w[[ADDR:[0-9]+]], [[TMP]], #0x4 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = and i32 %in, -16 + %addr_plus_4 = or i32 %addr_int, 4 + %addr = inttoptr i32 %addr_plus_4 to i32* + %val = load i32, i32* %addr, align 4 + ret i32 %val +} + + +; Promising nsw is not sufficient because the addressing mode basically +; calculates "zext(base) + zext(offset)" and nsw only guarantees +; "sext(base) + sext(offset) == base + offset". +define i32 @test_unsafe_nsw_indexed_add() { +; CHECK-LABEL: test_unsafe_nsw_indexed_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 +; CHECK-NOT: ubfx +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_plus_32 = add nsw i32 %addr_int, 32 + %addr = inttoptr i32 %addr_plus_32 to i32* + %val = load i32, i32* %addr, align 4 + ret i32 %val +} + +; Because the addition may wrap, it is not safe to use "ldr w0, [xN, #32]" here. +define i32 @test_unsafe_unscaled_add() { +; CHECK-LABEL: test_unsafe_unscaled_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_plus_3 = add i32 %addr_int, 3 + %addr = inttoptr i32 %addr_plus_3 to i32* + %val = load i32, i32* %addr, align 1 + ret i32 %val +} + +; Since we've promised there is no unsigned overflow, @var32 must be at least +; 32-bytes below 2^32, and we can use the load this time. +define i32 @test_safe_unscaled_add() { +; CHECK-LABEL: test_safe_unscaled_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_plus_3 = add nuw i32 %addr_int, 3 + %addr = inttoptr i32 %addr_plus_3 to i32* + %val = load i32, i32* %addr, align 1 + ret i32 %val +} + +; Promising nsw is not sufficient because the addressing mode basically +; calculates "zext(base) + zext(offset)" and nsw only guarantees +; "sext(base) + sext(offset) == base + offset". +define i32 @test_unsafe_nsw_unscaled_add() { +; CHECK-LABEL: test_unsafe_nsw_unscaled_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 +; CHECK-NOT: ubfx +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_plus_3 = add nsw i32 %addr_int, 3 + %addr = inttoptr i32 %addr_plus_3 to i32* + %val = load i32, i32* %addr, align 1 + ret i32 %val +} + +; Because the addition may wrap, it is not safe to use "ldur w0, [xN, #-3]" +; here. 
+define i32 @test_unsafe_negative_unscaled_add() { +; CHECK-LABEL: test_unsafe_negative_unscaled_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: sub w[[ADDR:[0-9]+]], w[[VAR32]], #3 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_minus_3 = add i32 %addr_int, -3 + %addr = inttoptr i32 %addr_minus_3 to i32* + %val = load i32, i32* %addr, align 1 + ret i32 %val +} + +define i8* @test_got_addr() { +; CHECK-LABEL: test_got_addr: +; CHECK: adrp x[[PAGE:[0-9]+]], _var_got@GOTPAGE +; CHECK: ldr w0, [x[[PAGE]], _var_got@GOTPAGEOFF] + ret i8* @var_got +} + +define float @test_va_arg_f32(i8** %list) { +; CHECK-LABEL: test_va_arg_f32: + +; CHECK: ldr w[[START:[0-9]+]], [x0] +; CHECK: add [[AFTER:w[0-9]+]], w[[START]], #8 +; CHECK: str [[AFTER]], [x0] + + ; Floating point arguments get promoted to double as per C99. +; CHECK: ldr [[DBL:d[0-9]+]], [x[[START]]] +; CHECK: fcvt s0, [[DBL]] + %res = va_arg i8** %list, float + ret float %res +} + +; Interesting point is that the slot is 4 bytes. +define i8 @test_va_arg_i8(i8** %list) { +; CHECK-LABEL: test_va_arg_i8: + +; CHECK: ldr w[[START:[0-9]+]], [x0] +; CHECK: add [[AFTER:w[0-9]+]], w[[START]], #4 +; CHECK: str [[AFTER]], [x0] + + ; i8 gets promoted to int (again, as per C99). +; CHECK: ldr w0, [x[[START]]] + + %res = va_arg i8** %list, i8 + ret i8 %res +} + +; Interesting point is that the slot needs aligning (again, min size is 4 +; bytes). +define i64 @test_va_arg_i64(i64** %list) { +; CHECK-LABEL: test_va_arg_i64: + + ; Update the list for the next user (minimum slot size is 4, but the actual + ; argument is 8 which had better be reflected!) +; CHECK: ldr w[[UNALIGNED_START:[0-9]+]], [x0] +; CHECK: add [[ALIGN_TMP:x[0-9]+]], x[[UNALIGNED_START]], #7 +; CHECK: and x[[START:[0-9]+]], [[ALIGN_TMP]], #0x1fffffff8 +; CHECK: add w[[AFTER:[0-9]+]], w[[START]], #8 +; CHECK: str w[[AFTER]], [x0] + +; CHECK: ldr x0, [x[[START]]] + + %res = va_arg i64** %list, i64 + ret i64 %res +} + +declare void @bar(...) +define void @test_va_call(i8 %l, i8 %r, float %in, i8* %ptr) { +; CHECK-LABEL: test_va_call: +; CHECK: add [[SUM:w[0-9]+]], {{w[0-9]+}}, w1 + +; CHECK-DAG: str w2, [sp, #32] +; CHECK-DAG: str xzr, [sp, #24] +; CHECK-DAG: str s0, [sp, #16] +; CHECK-DAG: str xzr, [sp, #8] +; CHECK-DAG: str [[SUM]], [sp] + + ; Add them to ensure real promotion occurs. + %sum = add i8 %l, %r + call void(...) 
@bar(i8 %sum, i64 0, float %in, double 0.0, i8* %ptr) ret void } + +declare i8* @llvm.frameaddress(i32) + +define i8* @test_frameaddr() { +; CHECK-LABEL: test_frameaddr: +; CHECK: ldr {{w0|x0}}, [x29] + %val = call i8* @llvm.frameaddress(i32 1) + ret i8* %val +} + +declare i8* @llvm.returnaddress(i32) + +define i8* @test_toplevel_returnaddr() { +; CHECK-LABEL: test_toplevel_returnaddr: +; CHECK: mov x0, x30 + %val = call i8* @llvm.returnaddress(i32 0) + ret i8* %val +} + +define i8* @test_deep_returnaddr() { +; CHECK-LABEL: test_deep_returnaddr: +; CHECK: ldr x[[FRAME_REC:[0-9]+]], [x29] +; CHECK: ldr x0, [x[[FRAME_REC]], #8] + %val = call i8* @llvm.returnaddress(i32 1) + ret i8* %val +} + +define void @test_indirect_call(void()* %func) { +; CHECK-LABEL: test_indirect_call: +; CHECK: blr x0 + call void() %func() + ret void +} + +; Safe to use the unextended address here +define void @test_indirect_safe_call(i32* %weird_funcs) { +; CHECK-LABEL: test_indirect_safe_call: +; CHECK: add w[[ADDR32:[0-9]+]], w0, #4 +; CHECK-OPT-NOT: ubfx +; CHECK: blr x[[ADDR32]] + %addr = getelementptr i32, i32* %weird_funcs, i32 1 + %func = bitcast i32* %addr to void()* + call void() %func() + ret void +} + +declare void @simple() +define void @test_simple_tail_call() { +; CHECK-LABEL: test_simple_tail_call: +; CHECK: b _simple + tail call void @simple() + ret void +} + +define void @test_indirect_tail_call(void()* %func) { +; CHECK-LABEL: test_indirect_tail_call: +; CHECK: br x0 + tail call void() %func() + ret void +} + +; Safe to use the unextended address here +define void @test_indirect_safe_tail_call(i32* %weird_funcs) { +; CHECK-LABEL: test_indirect_safe_tail_call: +; CHECK: add w[[ADDR32:[0-9]+]], w0, #4 +; CHECK-OPT-NOT: ubfx +; CHECK-OPT: br x[[ADDR32]] + %addr = getelementptr i32, i32* %weird_funcs, i32 1 + %func = bitcast i32* %addr to void()* + tail call void() %func() + ret void +} + +; For the "armv7k" slice, Clang will be emitting some small structs as [N x +; i32]. For ABI compatibility with arm64_32 these need to be passed in *X* +; registers (e.g. [2 x i32] would be packed into a single register). + +define i32 @test_in_smallstruct_low([3 x i32] %in) { +; CHECK-LABEL: test_in_smallstruct_low: +; CHECK: mov x0, x1 + %val = extractvalue [3 x i32] %in, 2 + ret i32 %val +} + +define i32 @test_in_smallstruct_high([3 x i32] %in) { +; CHECK-LABEL: test_in_smallstruct_high: +; CHECK: lsr x0, x0, #32 + %val = extractvalue [3 x i32] %in, 1 + ret i32 %val +} + +; The 64-bit DarwinPCS ABI has the quirk that structs on the stack are always +; 64-bit aligned. This must not happen for arm64_32 since otherwise va_arg will +; be incompatible with the armv7k ABI.
+define i32 @test_in_smallstruct_stack([8 x i64], i32, [3 x i32] %in) { +; CHECK-LABEL: test_in_smallstruct_stack: +; CHECK: ldr w0, [sp, #4] + %val = extractvalue [3 x i32] %in, 0 + ret i32 %val +} + +define [2 x i32] @test_ret_smallstruct([3 x i32] %in) { +; CHECK-LABEL: test_ret_smallstruct: +; CHECK: mov x0, #1 +; CHECK: movk x0, #2, lsl #32 + + ret [2 x i32] [i32 1, i32 2] +} + +declare void @smallstruct_callee([4 x i32]) +define void @test_call_smallstruct() { +; CHECK-LABEL: test_call_smallstruct: +; CHECK: mov x0, #1 +; CHECK: movk x0, #2, lsl #32 +; CHECK: mov x1, #3 +; CHECK: movk x1, #4, lsl #32 +; CHECK: bl _smallstruct_callee + + call void @smallstruct_callee([4 x i32] [i32 1, i32 2, i32 3, i32 4]) + ret void +} + +declare void @smallstruct_callee_stack([8 x i64], i32, [2 x i32]) +define void @test_call_smallstruct_stack() { +; CHECK-LABEL: test_call_smallstruct_stack: +; CHECK: mov [[VAL:x[0-9]+]], #1 +; CHECK: movk [[VAL]], #2, lsl #32 +; CHECK: stur [[VAL]], [sp, #4] + + call void @smallstruct_callee_stack([8 x i64] undef, i32 undef, [2 x i32] [i32 1, i32 2]) + ret void +} + +declare [3 x i32] @returns_smallstruct() +define i32 @test_use_smallstruct_low() { +; CHECK-LABEL: test_use_smallstruct_low: +; CHECK: bl _returns_smallstruct +; CHECK: mov x0, x1 + + %struct = call [3 x i32] @returns_smallstruct() + %val = extractvalue [3 x i32] %struct, 2 + ret i32 %val +} + +define i32 @test_use_smallstruct_high() { +; CHECK-LABEL: test_use_smallstruct_high: +; CHECK: bl _returns_smallstruct +; CHECK: lsr x0, x0, #32 + + %struct = call [3 x i32] @returns_smallstruct() + %val = extractvalue [3 x i32] %struct, 1 + ret i32 %val +} + +; If a small struct can't be allocated to x0-x7, the remaining registers should +; be marked as unavailable and subsequent GPR arguments should also be on the +; stack. Obviously the struct itself should be passed entirely on the stack. 
+define i32 @test_smallstruct_padding([7 x i64], [4 x i32] %struct, i32 %in) { +; CHECK-LABEL: test_smallstruct_padding: +; CHECK-DAG: ldr [[IN:w[0-9]+]], [sp, #16] +; CHECK-DAG: ldr [[LHS:w[0-9]+]], [sp] +; CHECK: add w0, [[LHS]], [[IN]] + %lhs = extractvalue [4 x i32] %struct, 0 + %sum = add i32 %lhs, %in + ret i32 %sum +} + +declare void @take_small_smallstruct(i64, [1 x i32]) +define void @test_small_smallstruct() { +; CHECK-LABEL: test_small_smallstruct: +; CHECK-DAG: mov w0, #1 +; CHECK-DAG: mov w1, #2 +; CHECK: bl _take_small_smallstruct + call void @take_small_smallstruct(i64 1, [1 x i32] [i32 2]) + ret void +} + +define void @test_bare_frameaddr(i8** %addr) { +; CHECK-LABEL: test_bare_frameaddr: +; CHECK: add x[[LOCAL:[0-9]+]], sp, #{{[0-9]+}} +; CHECK: str w[[LOCAL]], + + %ptr = alloca i8 + store i8* %ptr, i8** %addr, align 4 + ret void +} + +define void @test_sret_use([8 x i64]* sret %out) { +; CHECK-LABEL: test_sret_use: +; CHECK: str xzr, [x8] + %addr = getelementptr [8 x i64], [8 x i64]* %out, i32 0, i32 0 + store i64 0, i64* %addr + ret void +} + +define i64 @test_sret_call() { +; CHECK-LABEL: test_sret_call: +; CHECK: mov x8, sp +; CHECK: bl _test_sret_use + %arr = alloca [8 x i64] + call void @test_sret_use([8 x i64]* sret %arr) + + %addr = getelementptr [8 x i64], [8 x i64]* %arr, i32 0, i32 0 + %val = load i64, i64* %addr + ret i64 %val +} + +define double @test_constpool() { +; CHECK-LABEL: test_constpool: +; CHECK: adrp x[[PAGE:[0-9]+]], [[POOL:lCPI[0-9]+_[0-9]+]]@PAGE +; CHECK: ldr d0, [x[[PAGE]], [[POOL]]@PAGEOFF] + ret double 1.0e-6 +} + +define i8* @test_blockaddress() { +; CHECK-LABEL: test_blockaddress: +; CHECK: [[BLOCK:Ltmp[0-9]+]]: +; CHECK: adrp [[PAGE:x[0-9]+]], [[BLOCK]]@PAGE +; CHECK: add x0, [[PAGE]], [[BLOCK]]@PAGEOFF + br label %dest +dest: + ret i8* blockaddress(@test_blockaddress, %dest) +} + +define i8* @test_indirectbr(i8* %dest) { +; CHECK-LABEL: test_indirectbr: +; CHECK: br x0 + indirectbr i8* %dest, [label %true, label %false] + +true: + ret i8* blockaddress(@test_indirectbr, %true) +false: + ret i8* blockaddress(@test_indirectbr, %false) +} + +; ISelDAGToDAG tries to fold an offset FI load (in this case var+4) into the +; actual load instruction. This needs to be done slightly carefully since we +; claim the FI in the process -- it doesn't need extending. 
+define float @test_frameindex_offset_load() { +; CHECK-LABEL: test_frameindex_offset_load: +; CHECK: ldr s0, [sp, #4] + %arr = alloca float, i32 4, align 8 + %addr = getelementptr inbounds float, float* %arr, i32 1 + + %val = load float, float* %addr, align 4 + ret float %val +} + +define void @test_unaligned_frameindex_offset_store() { +; CHECK-LABEL: test_unaligned_frameindex_offset_store: +; CHECK: mov x[[TMP:[0-9]+]], sp +; CHECK: orr w[[ADDR:[0-9]+]], w[[TMP]], #0x2 +; CHECK: mov [[VAL:w[0-9]+]], #42 +; CHECK: str [[VAL]], [x[[ADDR]]] + %arr = alloca [4 x i32] + + %addr.int = ptrtoint [4 x i32]* %arr to i32 + %addr.nextint = add nuw i32 %addr.int, 2 + %addr.next = inttoptr i32 %addr.nextint to i32* + store i32 42, i32* %addr.next + ret void +} + + +define {i64, i64*} @test_pre_idx(i64* %addr) { +; CHECK-LABEL: test_pre_idx: + +; CHECK: add w[[ADDR:[0-9]+]], w0, #8 +; CHECK: ldr x0, [x[[ADDR]]] + %addr.int = ptrtoint i64* %addr to i32 + %addr.next.int = add nuw i32 %addr.int, 8 + %addr.next = inttoptr i32 %addr.next.int to i64* + %val = load i64, i64* %addr.next + + %tmp = insertvalue {i64, i64*} undef, i64 %val, 0 + %res = insertvalue {i64, i64*} %tmp, i64* %addr.next, 1 + + ret {i64, i64*} %res +} + +; Forming a post-indexed load is invalid here since the GEP needs to work when +; %addr wraps round to 0. +define {i64, i64*} @test_invalid_pre_idx(i64* %addr) { +; CHECK-LABEL: test_invalid_pre_idx: +; CHECK: add w1, w0, #8 +; CHECK: ldr x0, [x1] + %addr.next = getelementptr i64, i64* %addr, i32 1 + %val = load i64, i64* %addr.next + + %tmp = insertvalue {i64, i64*} undef, i64 %val, 0 + %res = insertvalue {i64, i64*} %tmp, i64* %addr.next, 1 + + ret {i64, i64*} %res +} + +declare void @callee([8 x i32]*) +define void @test_stack_guard() ssp { +; CHECK-LABEL: test_stack_guard: +; CHECK: adrp x[[GUARD_GOTPAGE:[0-9]+]], ___stack_chk_guard@GOTPAGE +; CHECK: ldr w[[GUARD_ADDR:[0-9]+]], [x[[GUARD_GOTPAGE]], ___stack_chk_guard@GOTPAGEOFF] +; CHECK: ldr [[GUARD_VAL:w[0-9]+]], [x[[GUARD_ADDR]]] +; CHECK: stur [[GUARD_VAL]], [x29, #[[GUARD_OFFSET:-[0-9]+]]] + +; CHECK: add x0, sp, #{{[0-9]+}} +; CHECK: bl _callee + +; CHECK-OPT: adrp x[[GUARD_GOTPAGE:[0-9]+]], ___stack_chk_guard@GOTPAGE +; CHECK-OPT: ldr w[[GUARD_ADDR:[0-9]+]], [x[[GUARD_GOTPAGE]], ___stack_chk_guard@GOTPAGEOFF] +; CHECK-OPT: ldr [[GUARD_VAL:w[0-9]+]], [x[[GUARD_ADDR]]] +; CHECK-OPT: ldur [[NEW_VAL:w[0-9]+]], [x29, #[[GUARD_OFFSET]]] +; CHECK-OPT: cmp [[GUARD_VAL]], [[NEW_VAL]] +; CHECK-OPT: b.ne [[FAIL:LBB[0-9]+_[0-9]+]] + +; CHECK-OPT: [[FAIL]]: +; CHECK-OPT-NEXT: bl ___stack_chk_fail + %arr = alloca [8 x i32] + call void @callee([8 x i32]* %arr) + ret void +} + +declare i32 @__gxx_personality_v0(...) 
+declare void @eat_landingpad_args(i32, i8*, i32) +@_ZTI8Whatever = external global i8 +define void @test_landingpad_marshalling() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: test_landingpad_marshalling: +; CHECK-OPT: mov x2, x1 +; CHECK-OPT: mov x1, x0 +; CHECK: bl _eat_landingpad_args + invoke void @callee([8 x i32]* undef) to label %done unwind label %lpad + +lpad: ; preds = %entry + %exc = landingpad { i8*, i32 } + catch i8* @_ZTI8Whatever + %pointer = extractvalue { i8*, i32 } %exc, 0 + %selector = extractvalue { i8*, i32 } %exc, 1 + call void @eat_landingpad_args(i32 undef, i8* %pointer, i32 %selector) + ret void + +done: + ret void +} + +define void @test_dynamic_stackalloc() { +; CHECK-LABEL: test_dynamic_stackalloc: +; CHECK: sub [[REG:x[0-9]+]], sp, #32 +; CHECK: mov sp, [[REG]] +; CHECK-OPT-NOT: ubfx +; CHECK: bl _callee + br label %next + +next: + %val = alloca [8 x i32] + call void @callee([8 x i32]* %val) + ret void +} + +define void @test_asm_memory(i32* %base.addr) { +; CHECK-LABEL: test_asm_memory: +; CHECK: add w[[ADDR:[0-9]+]], w0, #4 +; CHECK: str wzr, [x[[ADDR]] + %addr = getelementptr i32, i32* %base.addr, i32 1 + call void asm sideeffect "str wzr, $0", "*m"(i32* %addr) + ret void +} + +define void @test_unsafe_asm_memory(i64 %val) { +; CHECK-LABEL: test_unsafe_asm_memory: +; CHECK: and x[[ADDR:[0-9]+]], x0, #0xffffffff +; CHECK: str wzr, [x[[ADDR]]] + %addr_int = trunc i64 %val to i32 + %addr = inttoptr i32 %addr_int to i32* + call void asm sideeffect "str wzr, $0", "*m"(i32* %addr) + ret void +} + +define [9 x i8*] @test_demoted_return(i8* %in) { +; CHECK-LABEL: test_demoted_return: +; CHECK: str w0, [x8, #32] + %res = insertvalue [9 x i8*] undef, i8* %in, 8 + ret [9 x i8*] %res +} + +define i8* @test_inttoptr(i64 %in) { +; CHECK-LABEL: test_inttoptr: +; CHECK: and x0, x0, #0xffffffff + %res = inttoptr i64 %in to i8* + ret i8* %res +} + +declare i32 @llvm.get.dynamic.area.offset.i32() +define i32 @test_dynamic_area() { +; CHECK-LABEL: test_dynamic_area: +; CHECK: mov w0, wzr + %res = call i32 @llvm.get.dynamic.area.offset.i32() + ret i32 %res +} + +define void @test_pointer_vec_store(<2 x i8*>* %addr) { +; CHECK-LABEL: test_pointer_vec_store: +; CHECK: str xzr, [x0] +; CHECK-NOT: str +; CHECK-NOT: stp + + store <2 x i8*> zeroinitializer, <2 x i8*>* %addr, align 16 + ret void +} + +define <2 x i8*> @test_pointer_vec_load(<2 x i8*>* %addr) { +; CHECK-LABEL: test_pointer_vec_load: +; CHECK: ldr d[[TMP:[0-9]+]], [x0] +; CHECK: ushll.2d v0, v[[TMP]], #0 + %val = load <2 x i8*>, <2 x i8*>* %addr, align 16 + ret <2 x i8*> %val +} + +define void @test_inline_asm_mem_pointer(i32* %in) { +; CHECK-LABEL: test_inline_asm_mem_pointer: +; CHECK: str w0, + tail call void asm sideeffect "ldr x0, $0", "rm"(i32* %in) + ret void +} + + +define void @test_struct_hi(i32 %hi) nounwind { +; CHECK-LABEL: test_struct_hi: +; CHECK: mov w[[IN:[0-9]+]], w0 +; CHECK: bl _get_int +; CHECK-NEXT: bfi x0, x[[IN]], #32, #32 +; CHECK-NEXT: bl _take_pair + %val.64 = call i64 @get_int() + %val.32 = trunc i64 %val.64 to i32 + + %pair.0 = insertvalue [2 x i32] undef, i32 %val.32, 0 + %pair.1 = insertvalue [2 x i32] %pair.0, i32 %hi, 1 + call void @take_pair([2 x i32] %pair.1) + + ret void +} +declare void @take_pair([2 x i32]) +declare i64 @get_int() + +define i1 @test_icmp_ptr(i8* %in) { +; CHECK-LABEL: test_icmp_ptr +; CHECK: ubfx x0, x0, #31, #1 + %res = icmp slt i8* %in, null + ret i1 %res +} + +define void @test_multiple_icmp_ptr(i8* %l, i8* %r) { +; 
CHECK-LABEL: test_multiple_icmp_ptr: +; CHECK: tbnz w0, #31, [[FALSEBB:LBB[0-9]+_[0-9]+]] +; CHECK: tbnz w1, #31, [[FALSEBB]] + %tst1 = icmp sgt i8* %l, inttoptr (i32 -1 to i8*) + %tst2 = icmp sgt i8* %r, inttoptr (i32 -1 to i8*) + %tst = and i1 %tst1, %tst2 + br i1 %tst, label %true, label %false + +true: + call void(...) @bar() + ret void + +false: + ret void +} + +define { [18 x i8] }* @test_gep_nonpow2({ [18 x i8] }* %a0, i32 %a1) { +; CHECK-LABEL: test_gep_nonpow2: +; CHECK: mov w[[SIZE:[0-9]+]], #18 +; CHECK-NEXT: smaddl x0, w1, w[[SIZE]], x0 +; CHECK-NEXT: ret + %tmp0 = getelementptr inbounds { [18 x i8] }, { [18 x i8] }* %a0, i32 %a1 + ret { [18 x i8] }* %tmp0 +} + +define void @test_bzero(i64 %in) { +; CHECK-LABEL: test_bzero: +; CHECK-DAG: lsr x1, x0, #32 +; CHECK-DAG: and x0, x0, #0xffffffff +; CHECK: bl _bzero + + %ptr.i32 = trunc i64 %in to i32 + %size.64 = lshr i64 %in, 32 + %size = trunc i64 %size.64 to i32 + %ptr = inttoptr i32 %ptr.i32 to i8* + tail call void @llvm.memset.p0i8.i32(i8* align 4 %ptr, i8 0, i32 %size, i1 false) + ret void +} + +declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1) diff --git a/llvm/test/CodeGen/AArch64/fastcc-reserved.ll b/llvm/test/CodeGen/AArch64/fastcc-reserved.ll index b5e03f08280..a463e622179 100644 --- a/llvm/test/CodeGen/AArch64/fastcc-reserved.ll +++ b/llvm/test/CodeGen/AArch64/fastcc-reserved.ll @@ -4,7 +4,7 @@ ; call-frame is not reserved (hence disable-fp-elim), but where ; callee-pop can occur (hence tailcallopt). -declare fastcc void @will_pop([8 x i32], i32 %val) +declare fastcc void @will_pop([8 x i64], i32 %val) define fastcc void @foo(i32 %in) { ; CHECK-LABEL: foo: @@ -18,7 +18,7 @@ define fastcc void @foo(i32 %in) { ; Reserve space for call-frame: ; CHECK: str w{{[0-9]+}}, [sp, #-16]! - call fastcc void @will_pop([8 x i32] undef, i32 42) + call fastcc void @will_pop([8 x i64] undef, i32 42) ; CHECK: bl will_pop ; Since @will_pop is fastcc with tailcallopt, it will put the stack @@ -31,7 +31,7 @@ define fastcc void @foo(i32 %in) { ret void } -declare void @wont_pop([8 x i32], i32 %val) +declare void @wont_pop([8 x i64], i32 %val) define void @foo1(i32 %in) { ; CHECK-LABEL: foo1: @@ -44,7 +44,7 @@ define void @foo1(i32 %in) { ; Reserve space for call-frame ; CHECK: str w{{[0-9]+}}, [sp, #-16]! - call void @wont_pop([8 x i32] undef, i32 42) + call void @wont_pop([8 x i64] undef, i32 42) ; CHECK: bl wont_pop ; This time we *do* need to unreserve the call-frame diff --git a/llvm/test/CodeGen/AArch64/fastcc.ll b/llvm/test/CodeGen/AArch64/fastcc.ll index d4e116134cd..fbdbf60ac8f 100644 --- a/llvm/test/CodeGen/AArch64/fastcc.ll +++ b/llvm/test/CodeGen/AArch64/fastcc.ll @@ -18,7 +18,7 @@ define fastcc void @func_stack0() { ; CHECK-TAIL: str w{{[0-9]+}}, [sp] - call fastcc void @func_stack8([8 x i32] undef, i32 42) + call fastcc void @func_stack8([8 x i64] undef, i32 42) ; CHECK: bl func_stack8 ; CHECK-NOT: sub sp, sp, ; CHECK-NOT: [sp, #{{[-0-9]+}}]! @@ -28,7 +28,7 @@ define fastcc void @func_stack0() { ; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! 
- call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) + call fastcc void @func_stack32([8 x i64] undef, i128 0, i128 9) ; CHECK: bl func_stack32 ; CHECK-NOT: sub sp, sp, @@ -56,7 +56,7 @@ define fastcc void @func_stack0() { ; CHECK-TAIL-NEXT: ret } -define fastcc void @func_stack8([8 x i32], i32 %stacked) { +define fastcc void @func_stack8([8 x i64], i32 %stacked) { ; CHECK-LABEL: func_stack8: ; CHECK: sub sp, sp, #48 ; CHECK: stp x29, x30, [sp, #32] @@ -71,7 +71,7 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) { ; CHECK-TAIL: str w{{[0-9]+}}, [sp] - call fastcc void @func_stack8([8 x i32] undef, i32 42) + call fastcc void @func_stack8([8 x i64] undef, i32 42) ; CHECK: bl func_stack8 ; CHECK-NOT: sub sp, sp, ; CHECK-NOT: [sp, #{{[-0-9]+}}]! @@ -82,7 +82,7 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) { ; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! - call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) + call fastcc void @func_stack32([8 x i64] undef, i128 0, i128 9) ; CHECK: bl func_stack32 ; CHECK-NOT: sub sp, sp, @@ -109,7 +109,7 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) { ; CHECK-TAIL-NEXT: ret } -define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) { +define fastcc void @func_stack32([8 x i64], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32: ; CHECK: add x29, sp, #32 @@ -117,7 +117,7 @@ define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) { ; CHECK-TAIL: add x29, sp, #32 - call fastcc void @func_stack8([8 x i32] undef, i32 42) + call fastcc void @func_stack8([8 x i64] undef, i32 42) ; CHECK: bl func_stack8 ; CHECK-NOT: sub sp, sp, ; CHECK-NOT: [sp, #{{[-0-9]+}}]! @@ -127,7 +127,7 @@ define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) { ; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! - call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) + call fastcc void @func_stack32([8 x i64] undef, i128 0, i128 9) ; CHECK: bl func_stack32 ; CHECK-NOT: sub sp, sp, @@ -155,7 +155,7 @@ define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) { } ; Check that arg stack pop is done after callee-save restore when no frame pointer is used. -define fastcc void @func_stack32_leaf([8 x i32], i128 %stacked0, i128 %stacked1) { +define fastcc void @func_stack32_leaf([8 x i64], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32_leaf: ; CHECK: str x20, [sp, #-16]! ; CHECK: nop @@ -186,7 +186,7 @@ define fastcc void @func_stack32_leaf([8 x i32], i128 %stacked0, i128 %stacked1) } ; Check that arg stack pop is done after callee-save restore when no frame pointer is used. -define fastcc void @func_stack32_leaf_local([8 x i32], i128 %stacked0, i128 %stacked1) { +define fastcc void @func_stack32_leaf_local([8 x i64], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32_leaf_local: ; CHECK: sub sp, sp, #32 ; CHECK-NEXT: str x20, [sp, #16] @@ -222,7 +222,7 @@ define fastcc void @func_stack32_leaf_local([8 x i32], i128 %stacked0, i128 %sta } ; Check that arg stack pop is done after callee-save restore when no frame pointer is used. 
-define fastcc void @func_stack32_leaf_local_nocs([8 x i32], i128 %stacked0, i128 %stacked1) { +define fastcc void @func_stack32_leaf_local_nocs([8 x i64], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32_leaf_local_nocs: ; CHECK: sub sp, sp, #16 ; CHECK: add sp, sp, #16 diff --git a/llvm/test/CodeGen/AArch64/jump-table-32.ll b/llvm/test/CodeGen/AArch64/jump-table-32.ll new file mode 100644 index 00000000000..339a44fc95a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/jump-table-32.ll @@ -0,0 +1,42 @@ +; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64_32-apple-ios7.0 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s + +define i32 @test_jumptable(i32 %in) { +; CHECK: test_jumptable + + switch i32 %in, label %def [ + i32 0, label %lbl1 + i32 1, label %lbl2 + i32 2, label %lbl3 + i32 4, label %lbl4 + ] +; CHECK: adrp [[JTPAGE:x[0-9]+]], LJTI0_0@PAGE +; CHECK: mov w[[INDEX:[0-9]+]], w0 +; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], LJTI0_0@PAGEOFF +; CHECK: adr [[BASE_BLOCK:x[0-9]+]], LBB0_2 +; CHECK: ldrb w[[OFFSET:[0-9]+]], [x[[JT]], x[[INDEX]]] +; CHECK: add [[DEST:x[0-9]+]], [[BASE_BLOCK]], x[[OFFSET]], lsl #2 +; CHECK: br [[DEST]] + +def: + ret i32 0 + +lbl1: + ret i32 1 + +lbl2: + ret i32 2 + +lbl3: + ret i32 4 + +lbl4: + ret i32 8 + +} + +; CHECK: LJTI0_0: +; CHECK-NEXT: .byte +; CHECK-NEXT: .byte +; CHECK-NEXT: .byte +; CHECK-NEXT: .byte +; CHECK-NEXT: .byte diff --git a/llvm/test/CodeGen/AArch64/sibling-call.ll b/llvm/test/CodeGen/AArch64/sibling-call.ll index be59f27fa85..a9e0225187e 100644 --- a/llvm/test/CodeGen/AArch64/sibling-call.ll +++ b/llvm/test/CodeGen/AArch64/sibling-call.ll @@ -1,8 +1,8 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-ldst-opt=0 | FileCheck %s declare void @callee_stack0() -declare void @callee_stack8([8 x i32], i64) -declare void @callee_stack16([8 x i32], i64, i64) +declare void @callee_stack8([8 x i64], i64) +declare void @callee_stack16([8 x i64], i64, i64) define void @caller_to0_from0() nounwind { ; CHECK-LABEL: caller_to0_from0: @@ -12,7 +12,7 @@ define void @caller_to0_from0() nounwind { ; CHECK-NEXT: b callee_stack0 } -define void @caller_to0_from8([8 x i32], i64) nounwind{ +define void @caller_to0_from8([8 x i64], i64) nounwind{ ; CHECK-LABEL: caller_to0_from8: ; CHECK-NEXT: // %bb. @@ -26,51 +26,51 @@ define void @caller_to8_from0() { ; Caller isn't going to clean up any extra stack we allocate, so it ; can't be a tail call. - tail call void @callee_stack8([8 x i32] undef, i64 42) + tail call void @callee_stack8([8 x i64] undef, i64 42) ret void ; CHECK: bl callee_stack8 } -define void @caller_to8_from8([8 x i32], i64 %a) { +define void @caller_to8_from8([8 x i64], i64 %a) { ; CHECK-LABEL: caller_to8_from8: ; CHECK-NOT: sub sp, sp, ; This should reuse our stack area for the 42 - tail call void @callee_stack8([8 x i32] undef, i64 42) + tail call void @callee_stack8([8 x i64] undef, i64 42) ret void ; CHECK: str {{x[0-9]+}}, [sp] ; CHECK-NEXT: b callee_stack8 } -define void @caller_to16_from8([8 x i32], i64 %a) { +define void @caller_to16_from8([8 x i64], i64 %a) { ; CHECK-LABEL: caller_to16_from8: ; Shouldn't be a tail call: we can't use SP+8 because our caller might ; have something there. This may sound obvious but the implementation does ; some funky aligning. 
- tail call void @callee_stack16([8 x i32] undef, i64 undef, i64 undef) + tail call void @callee_stack16([8 x i64] undef, i64 undef, i64 undef) ; CHECK: bl callee_stack16 ret void } -define void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { +define void @caller_to8_from24([8 x i64], i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: caller_to8_from24: ; CHECK-NOT: sub sp, sp ; Reuse our area, putting "42" at incoming sp - tail call void @callee_stack8([8 x i32] undef, i64 42) + tail call void @callee_stack8([8 x i64] undef, i64 42) ret void ; CHECK: str {{x[0-9]+}}, [sp] ; CHECK-NEXT: b callee_stack8 } -define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { +define void @caller_to16_from16([8 x i64], i64 %a, i64 %b) { ; CHECK-LABEL: caller_to16_from16: ; CHECK-NOT: sub sp, sp, ; Here we want to make sure that both loads happen before the stores: ; otherwise either %a or %b will be wrongly clobbered. - tail call void @callee_stack16([8 x i32] undef, i64 %b, i64 %a) + tail call void @callee_stack16([8 x i64] undef, i64 %b, i64 %a) ret void ; CHECK: ldr [[VAL0:x[0-9]+]], diff --git a/llvm/test/CodeGen/AArch64/swift-return.ll b/llvm/test/CodeGen/AArch64/swift-return.ll index 3d0bed4c934..2bf5e379b37 100644 --- a/llvm/test/CodeGen/AArch64/swift-return.ll +++ b/llvm/test/CodeGen/AArch64/swift-return.ll @@ -1,5 +1,7 @@ ; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s ; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-O0 +; RUN: llc -verify-machineinstrs -mtriple=arm64_32-apple-ios -o - %s | FileCheck %s +; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=arm64_32-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-O0 ; CHECK-LABEL: test1 ; CHECK: bl _gen @@ -8,7 +10,7 @@ ; CHECK-O0-LABEL: test1 ; CHECK-O0: bl _gen ; CHECK-O0: sxth [[TMP:w.*]], w0 -; CHECK-O0: add w8, [[TMP]], w1, sxtb +; CHECK-O0: add {{w[0-9]+}}, [[TMP]], w1, sxtb define i16 @test1(i32) { entry: %call = call swiftcc { i16, i8 } @gen(i32 %0) diff --git a/llvm/test/CodeGen/AArch64/swiftcc.ll b/llvm/test/CodeGen/AArch64/swiftcc.ll index 43249542715..fb74fe4a6b1 100644 --- a/llvm/test/CodeGen/AArch64/swiftcc.ll +++ b/llvm/test/CodeGen/AArch64/swiftcc.ll @@ -1,5 +1,7 @@ ; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s ; RUN: llc -O0 -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=arm64_32-apple-ios -o - %s | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=arm64_32-apple-ios -o - %s | FileCheck %s ; CHECK: t1 ; CHECK: fadd s0, s0, s1 diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll index 823599d5de6..93529d1ee86 100644 --- a/llvm/test/CodeGen/AArch64/swifterror.ll +++ b/llvm/test/CodeGen/AArch64/swifterror.ll @@ -1,5 +1,7 @@ -; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -frame-pointer=all -enable-shrink-wrap=false < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-APPLE %s -; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -frame-pointer=all -O0 -fast-isel < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-O0 %s +; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -frame-pointer=all -enable-shrink-wrap=false < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap 
--check-prefix=CHECK-APPLE --check-prefix=CHECK-APPLE-AARCH64 %s +; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -frame-pointer=all -O0 -fast-isel < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-O0 --check-prefix=CHECK-O0-AARCH64 %s +; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -frame-pointer=all -enable-shrink-wrap=false < %s -mtriple=arm64_32-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-APPLE --check-prefix=CHECK-APPLE-ARM64_32 %s +; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -O0 -fast-isel < %s -mtriple=arm64_32-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-O0 --check-prefix=CHECK-O0-ARM64_32 %s declare i8* @malloc(i64) declare void @free(i8*) @@ -40,7 +42,8 @@ define float @caller(i8* %error_ref) { ; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl {{.*}}foo ; CHECK-APPLE: mov x0, x21 -; CHECK-APPLE: cbnz x21 +; CHECK-APPLE-AARCH64: cbnz x21 +; CHECK-APPLE-ARM64_32: cbnz w0 ; Access part of the error object and save it to error_ref ; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x0, #8] ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] @@ -50,7 +53,8 @@ define float @caller(i8* %error_ref) { ; CHECK-O0: mov x21 ; CHECK-O0: bl {{.*}}foo ; CHECK-O0: mov [[ID:x[0-9]+]], x21 -; CHECK-O0: cbnz x21 +; CHECK-O0-AARCH64: cbnz x21 +; CHECK-O0-ARM64_32: cmp x21, #0 entry: %error_ptr_ref = alloca swifterror %swift_error* store %swift_error* null, %swift_error** %error_ptr_ref @@ -76,7 +80,8 @@ define float @caller2(i8* %error_ref) { ; CHECK-APPLE: fmov [[CMP:s[0-9]+]], #1.0 ; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl {{.*}}foo -; CHECK-APPLE: cbnz x21 +; CHECK-APPLE-AARCH64: cbnz x21 +; CHECK-APPLE-ARM64_32: cbnz w21 ; CHECK-APPLE: fcmp s0, [[CMP]] ; CHECK-APPLE: b.le ; Access part of the error object and save it to error_ref @@ -89,7 +94,8 @@ define float @caller2(i8* %error_ref) { ; CHECK-O0: mov x21 ; CHECK-O0: bl {{.*}}foo ; CHECK-O0: mov [[ID:x[0-9]+]], x21 -; CHECK-O0: cbnz x21 +; CHECK-O0-AARCH64: cbnz x21 +; CHECK-O0-ARM64_32: cmp x21, #0 entry: %error_ptr_ref = alloca swifterror %swift_error* br label %bb_loop @@ -171,29 +177,52 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float ; CHECK-APPLE: mov x21, x0 ; CHECK-APPLE: ret -; CHECK-O0-LABEL: foo_loop: +; CHECK-O0-AARCH64-LABEL: foo_loop: ; spill x21 -; CHECK-O0: str x21, [sp, [[SLOT:#[0-9]+]]] -; CHECK-O0: b [[BB1:[A-Za-z0-9_]*]] -; CHECK-O0: [[BB1]]: -; CHECK-O0: ldr x0, [sp, [[SLOT]]] -; CHECK-O0: str x0, [sp, [[SLOT2:#[0-9]+]]] -; CHECK-O0: cbz {{.*}}, [[BB2:[A-Za-z0-9_]*]] -; CHECK-O0: mov w{{.*}}, #16 -; CHECK-O0: malloc -; CHECK-O0: mov [[ID:x[0-9]+]], x0 -; CHECK-O0: strb w{{.*}}, [{{.*}}[[ID]], #8] +; CHECK-O0-AARCH64: str x21, [sp, [[SLOT:#[0-9]+]]] +; CHECK-O0-AARCH64: b [[BB1:[A-Za-z0-9_]*]] +; CHECK-O0-AARCH64: [[BB1]]: +; CHECK-O0-AARCH64: ldr x0, [sp, [[SLOT]]] +; CHECK-O0-AARCH64: str x0, [sp, [[SLOT2:#[0-9]+]]] +; CHECK-O0-AARCH64: cbz {{.*}}, [[BB2:[A-Za-z0-9_]*]] +; CHECK-O0-AARCH64: mov w{{.*}}, #16 +; CHECK-O0-AARCH64: malloc +; CHECK-O0-AARCH64: mov [[ID:x[0-9]+]], x0 +; CHECK-O0-AARCH64: strb w{{.*}}, [{{.*}}[[ID]], #8] ; spill x0 -; CHECK-O0: str x0, [sp, [[SLOT2]]] -; CHECK-O0:[[BB2]]: -; CHECK-O0: ldr x0, [sp, [[SLOT2]]] -; CHECK-O0: fcmp -; CHECK-O0: str x0, [sp] -; CHECK-O0: b.le [[BB1]] +; CHECK-O0-AARCH64: str x0, [sp, [[SLOT2]]] +; CHECK-O0-AARCH64:[[BB2]]: +; CHECK-O0-AARCH64: ldr x0, [sp, 
[[SLOT2]]] +; CHECK-O0-AARCH64: fcmp +; CHECK-O0-AARCH64: str x0, [sp] +; CHECK-O0-AARCH64: b.le [[BB1]] ; reload from stack -; CHECK-O0: ldr [[ID3:x[0-9]+]], [sp] -; CHECK-O0: mov x21, [[ID3]] -; CHECK-O0: ret +; CHECK-O0-AARCH64: ldr [[ID3:x[0-9]+]], [sp] +; CHECK-O0-AARCH64: mov x21, [[ID3]] +; CHECK-O0-AARCH64: ret + +; CHECK-O0-ARM64_32-LABEL: foo_loop: +; spill x21 +; CHECK-O0-ARM64_32: str x21, [sp, [[SLOT:#[0-9]+]]] +; CHECK-O0-ARM64_32: b [[BB1:[A-Za-z0-9_]*]] +; CHECK-O0-ARM64_32: [[BB1]]: +; CHECK-O0-ARM64_32: ldr x0, [sp, [[SLOT]]] +; CHECK-O0-ARM64_32: str x0, [sp, [[SLOT2:#[0-9]+]]] +; CHECK-O0-ARM64_32: cbz {{.*}}, [[BB2:[A-Za-z0-9_]*]] +; CHECK-O0-ARM64_32: mov w{{.*}}, #16 +; CHECK-O0-ARM64_32: malloc +; CHECK-O0-ARM64_32: mov {{.*}}, x0 +; CHECK-O0-ARM64_32: strb w{{.*}}, +; CHECK-O0-ARM64_32:[[BB2]]: +; CHECK-O0-ARM64_32: ldr x0, [sp, [[SLOT2]]] +; CHECK-O0-ARM64_32: fcmp +; CHECK-O0-ARM64_32: str x0, [sp[[OFFSET:.*]]] +; CHECK-O0-ARM64_32: b.le [[BB1]] +; reload from stack +; CHECK-O0-ARM64_32: ldr [[ID3:x[0-9]+]], [sp[[OFFSET]]] +; CHECK-O0-ARM64_32: mov x21, [[ID3]] +; CHECK-O0-ARM64_32: ret + entry: br label %bb_loop @@ -261,7 +290,8 @@ define float @caller3(i8* %error_ref) { ; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl {{.*}}foo_sret ; CHECK-APPLE: mov x0, x21 -; CHECK-APPLE: cbnz x21 +; CHECK-APPLE-AARCH64: cbnz x21 +; CHECK-APPLE-ARM64_32: cbnz w0 ; Access part of the error object and save it to error_ref ; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x0, #8] ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] @@ -273,7 +303,8 @@ define float @caller3(i8* %error_ref) { ; CHECK-O0: mov x21 ; CHECK-O0: bl {{.*}}foo_sret ; CHECK-O0: mov [[ID2:x[0-9]+]], x21 -; CHECK-O0: cbnz x21 +; CHECK-O0-AARCH64: cbnz x21 +; CHECK-O0-ARM64_32: cmp x21, #0 ; Access part of the error object and save it to error_ref ; reload from stack ; CHECK-O0: ldrb [[CODE:w[0-9]+]] @@ -306,20 +337,22 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) 
{ ; CHECK-APPLE-LABEL: foo_vararg: ; CHECK-APPLE: mov w0, #16 ; CHECK-APPLE: malloc -; CHECK-APPLE-DAG: mov [[ID:w[0-9]+]], #1 -; CHECK-APPLE-DAG: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 -; CHECK-APPLE-DAG: strb [[ID]], [x0, #8] ; First vararg -; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #16] +; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP:x[0-9]+]], #16] +; CHECK-APPLE-AARCH64: mov [[ID:w[0-9]+]], #1 +; CHECK-APPLE-AARCH64: add [[ARGS:x[0-9]+]], [[TMP]], #16 +; CHECK-APPLE-AARCH64: strb [[ID]], [x0, #8] ; Second vararg -; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #24] -; CHECK-APPLE-DAG: add {{x[0-9]+}}, {{x[0-9]+}}, #16 +; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #24] ; Third vararg -; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #32] +; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #32] + +; CHECK-APPLE-ARM64_32: mov [[ID:w[0-9]+]], #1 +; CHECK-APPLE-ARM64_32: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 +; CHECK-APPLE-ARM64_32: strb [[ID]], [x0, #8] + -; CHECK-APPLE: mov x21, x0 -; CHECK-APPLE-NOT: x21 entry: %call = call i8* @malloc(i64 16) %call.0 = bitcast i8* %call to %swift_error* @@ -347,18 +380,18 @@ entry: define float @caller4(i8* %error_ref) { ; CHECK-APPLE-LABEL: caller4: -; CHECK-APPLE: mov [[ID:x[0-9]+]], x0 -; CHECK-APPLE: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] -; CHECK-APPLE: str {{x[0-9]+}}, [sp] +; CHECK-APPLE-AARCH64: mov [[ID:x[0-9]+]], x0 +; CHECK-APPLE-AARCH64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] +; CHECK-APPLE-AARCH64: str {{x[0-9]+}}, [sp] -; CHECK-APPLE: mov x21, xzr -; CHECK-APPLE: bl {{.*}}foo_vararg -; CHECK-APPLE: mov x0, x21 -; CHECK-APPLE: cbnz x21 +; CHECK-APPLE-AARCH64: mov x21, xzr +; CHECK-APPLE-AARCH64: bl {{.*}}foo_vararg +; CHECK-APPLE-AARCH64: mov x0, x21 +; CHECK-APPLE-AARCH64: cbnz x21 ; Access part of the error object and save it to error_ref -; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x0, #8] -; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] -; CHECK-APPLE: bl {{.*}}free +; CHECK-APPLE-AARCH64: ldrb [[CODE:w[0-9]+]], [x0, #8] +; CHECK-APPLE-AARCH64: strb [[CODE]], [{{.*}}[[ID]]] +; CHECK-APPLE-AARCH64: bl {{.*}}free entry: %error_ptr_ref = alloca swifterror %swift_error* store %swift_error* null, %swift_error** %error_ptr_ref diff --git a/llvm/test/CodeGen/AArch64/swiftself.ll b/llvm/test/CodeGen/AArch64/swiftself.ll index 063085636b3..a13fbb8d5a6 100644 --- a/llvm/test/CodeGen/AArch64/swiftself.ll +++ b/llvm/test/CodeGen/AArch64/swiftself.ll @@ -1,6 +1,7 @@ -; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT --check-prefix=OPTAARCH64 %s ; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=aarch64-unknown-linux-gnu -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-unknown-linux-gnu -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT --check-prefix=OPTAARCH64 %s +; RUN: llc -verify-machineinstrs -mtriple=arm64_32-apple-ios -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT --check-prefix=OPTARM64_32 %s ; Parameter with swiftself should be allocated to x20. ; CHECK-LABEL: swiftself_param: @@ -48,8 +49,9 @@ define void @swiftself_passthrough(i8* swiftself %addr0) { ; We can use a tail call if the callee swiftself is the same as the caller one. 
; This should also work with fast-isel. ; CHECK-LABEL: swiftself_tail: -; CHECK: b {{_?}}swiftself_param -; CHECK-NOT: ret +; OPTAARCH64: b {{_?}}swiftself_param +; OPTAARCH64-NOT: ret +; OPTARM64_32: bl {{_?}}swiftself_param define i8* @swiftself_tail(i8* swiftself %addr0) { call void asm sideeffect "", "~{x20}"() %res = tail call i8* @swiftself_param(i8* swiftself %addr0) @@ -71,12 +73,19 @@ define i8* @swiftself_notail(i8* swiftself %addr0, i8* %addr1) nounwind { ; we normally would. We marked the first parameter with swiftself which means it ; will no longer be passed in x0. declare swiftcc i8* @thisreturn_attribute(i8* returned swiftself) -; OPT-LABEL: swiftself_nothisreturn: -; OPT-DAG: ldr x20, [x20] -; OPT-DAG: mov [[CSREG:x[1-9].*]], x8 -; OPT: bl {{_?}}thisreturn_attribute -; OPT: str x0, {{\[}}[[CSREG]] -; OPT: ret +; OPTAARCH64-LABEL: swiftself_nothisreturn: +; OPTAARCH64-DAG: ldr x20, [x20] +; OPTAARCH64-DAG: mov [[CSREG:x[1-9].*]], x8 +; OPTAARCH64: bl {{_?}}thisreturn_attribute +; OPTAARCH64: str x0, {{\[}}[[CSREG]] +; OPTAARCH64: ret + +; OPTARM64_32-LABEL: swiftself_nothisreturn: +; OPTARM64_32-DAG: ldr w20, [x20] +; OPTARM64_32-DAG: mov [[CSREG:x[1-9].*]], x8 +; OPTARM64_32: bl {{_?}}thisreturn_attribute +; OPTARM64_32: str w0, {{\[}}[[CSREG]] +; OPTARM64_32: ret define hidden swiftcc void @swiftself_nothisreturn(i8** noalias nocapture sret, i8** noalias nocapture readonly swiftself) { entry: %2 = load i8*, i8** %1, align 8 diff --git a/llvm/test/CodeGen/AArch64/tail-call.ll b/llvm/test/CodeGen/AArch64/tail-call.ll index fe3e7d7c6a6..fafad4b0c58 100644 --- a/llvm/test/CodeGen/AArch64/tail-call.ll +++ b/llvm/test/CodeGen/AArch64/tail-call.ll @@ -2,8 +2,8 @@ ; RUN: llc -global-isel -global-isel-abort=2 -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s --check-prefixes=GISEL,COMMON declare fastcc void @callee_stack0() -declare fastcc void @callee_stack8([8 x i32], i64) -declare fastcc void @callee_stack16([8 x i32], i64, i64) +declare fastcc void @callee_stack8([8 x i64], i64) +declare fastcc void @callee_stack16([8 x i64], i64, i64) declare extern_weak fastcc void @callee_weak() define fastcc void @caller_to0_from0() nounwind { @@ -16,7 +16,7 @@ define fastcc void @caller_to0_from0() nounwind { ; COMMON-NEXT: b callee_stack0 } -define fastcc void @caller_to0_from8([8 x i32], i64) { +define fastcc void @caller_to0_from8([8 x i64], i64) { ; COMMON-LABEL: caller_to0_from8: tail call fastcc void @callee_stack0() @@ -32,33 +32,33 @@ define fastcc void @caller_to8_from0() { ; Key point is that the "42" should go #16 below incoming stack ; pointer (we didn't have arg space to reuse). - tail call fastcc void @callee_stack8([8 x i32] undef, i64 42) + tail call fastcc void @callee_stack8([8 x i64] undef, i64 42) ret void ; COMMON: str {{x[0-9]+}}, [sp, #16]! ; COMMON-NEXT: b callee_stack8 } -define fastcc void @caller_to8_from8([8 x i32], i64 %a) { +define fastcc void @caller_to8_from8([8 x i64], i64 %a) { ; COMMON-LABEL: caller_to8_from8: ; COMMON: sub sp, sp, #16 ; Key point is that the "%a" should go at SP on entry. - tail call fastcc void @callee_stack8([8 x i32] undef, i64 42) + tail call fastcc void @callee_stack8([8 x i64] undef, i64 42) ret void ; COMMON: str {{x[0-9]+}}, [sp, #16]! 
; COMMON-NEXT: b callee_stack8 } -define fastcc void @caller_to16_from8([8 x i32], i64 %a) { +define fastcc void @caller_to16_from8([8 x i64], i64 %a) { ; COMMON-LABEL: caller_to16_from8: ; COMMON: sub sp, sp, #16 ; Important point is that the call reuses the "dead" argument space ; above %a on the stack. If it tries to go below incoming-SP then the ; callee will not deallocate the space, even in fastcc. - tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2) + tail call fastcc void @callee_stack16([8 x i64] undef, i64 42, i64 2) ; COMMON: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]! ; COMMON-NEXT: b callee_stack16 @@ -66,12 +66,12 @@ define fastcc void @caller_to16_from8([8 x i32], i64 %a) { } -define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { +define fastcc void @caller_to8_from24([8 x i64], i64 %a, i64 %b, i64 %c) { ; COMMON-LABEL: caller_to8_from24: ; COMMON: sub sp, sp, #16 ; Key point is that the "%a" should go at #16 above SP on entry. - tail call fastcc void @callee_stack8([8 x i32] undef, i64 42) + tail call fastcc void @callee_stack8([8 x i64] undef, i64 42) ret void ; COMMON: str {{x[0-9]+}}, [sp, #32]! @@ -79,13 +79,13 @@ define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { } -define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { +define fastcc void @caller_to16_from16([8 x i64], i64 %a, i64 %b) { ; COMMON-LABEL: caller_to16_from16: ; COMMON: sub sp, sp, #16 ; Here we want to make sure that both loads happen before the stores: ; otherwise either %a or %b will be wrongly clobbered. - tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a) + tail call fastcc void @callee_stack16([8 x i64] undef, i64 %b, i64 %a) ret void ; COMMON: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index 51522e1d12e..8edd867ff16 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -27,8 +27,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; AARCH-NEXT: orr w10, w10, w11 ; AARCH-NEXT: orr w9, w10, w9 ; AARCH-NEXT: mul x0, x0, x2 -; AARCH-NEXT: mov x1, x8 -; AARCH-NEXT: mov w2, w9 +; AARCH-DAG: mov x1, x8 +; AARCH-DAG: mov w2, w9 ; AARCH-NEXT: ret start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git a/llvm/test/CodeGen/AArch64/win64_vararg.ll b/llvm/test/CodeGen/AArch64/win64_vararg.ll index 50f7bf5907b..38775f2874f 100644 --- a/llvm/test/CodeGen/AArch64/win64_vararg.ll +++ b/llvm/test/CodeGen/AArch64/win64_vararg.ll @@ -261,11 +261,11 @@ define i32 @snprintf(i8*, i64, i8*, ...) 
local_unnamed_addr #5 { ; CHECK-DAG: mov w6, w3 ; CHECK-DAG: mov [[REG1:w[0-9]+]], w2 ; CHECK: mov w2, w1 -; CHECK: str w4, [sp] ; CHECK: fmov x1, d0 ; CHECK: fmov x3, d1 ; CHECK: fmov x5, d2 ; CHECK: fmov x7, d3 +; CHECK: str w4, [sp] ; CHECK: mov w4, [[REG1]] ; CHECK: str x30, [sp, #16] ; CHECK: str d4, [sp, #8] diff --git a/llvm/test/MC/AArch64/arm64_32-compact-unwind.s b/llvm/test/MC/AArch64/arm64_32-compact-unwind.s new file mode 100644 index 00000000000..59d882ae3a5 --- /dev/null +++ b/llvm/test/MC/AArch64/arm64_32-compact-unwind.s @@ -0,0 +1,15 @@ +; RUN: llvm-mc -triple=arm64_32-ios7.0 -filetype=obj %s -o %t +; RUN: llvm-objdump -s %t | FileCheck %s + +; The compact unwind format in ILP32 mode is pretty much the same, except +; references to addresses (function, personality, LSDA) are pointer-sized. + +; CHECK: Contents of section __compact_unwind: +; CHECK-NEXT: 0004 00000000 04000000 00000002 00000000 +; CHECK-NEXT: 0014 00000000 + .globl _test_compact_unwind + .align 2 +_test_compact_unwind: + .cfi_startproc + ret + .cfi_endproc
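
; The common thread in the tests above: under arm64_32 (ILP32), pointers are
; 32 bits wide, so pointer values live in W registers and a pointer built from
; a 64-bit integer must have its top half cleared before use. A minimal
; standalone sketch of that rule, based on the lowering the test_inttoptr
; check above expects; the function names here are illustrative, not part of
; the patch.

define i8* @sketch_ptr_from_i64(i64 %in) {
  ; Expected arm64_32 lowering (cf. test_inttoptr): and x0, x0, #0xffffffff
  %p = inttoptr i64 %in to i8*
  ret i8* %p
}

define i32 @sketch_int_from_ptr(i8* %p) {
  ; In the other direction the W register already holds the full 32-bit
  ; pointer, so no masking should be needed.
  %v = ptrtoint i8* %p to i32
  ret i32 %v
}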

