-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp            8
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp              18
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZInstrFP.td                       16
-rw-r--r--  llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll                  131
-rw-r--r--  llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll                  131
-rw-r--r--  llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll  806
-rw-r--r--  llvm/test/CodeGen/X86/fp-intrinsics.ll                          24
-rw-r--r--  llvm/test/CodeGen/X86/fp128-cast-strict.ll                      12
-rw-r--r--  llvm/test/CodeGen/X86/fp128-libcalls-strict.ll                 144
-rw-r--r--  llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll       72
10 files changed, 769 insertions(+), 593 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 0aeb3c14aa3..9952d4d9ac9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6888,7 +6888,10 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), ValueVTs);
ValueVTs.push_back(MVT::Other); // Out chain
- SDValue Chain = getRoot();
+ // We do not need to serialize constrained FP intrinsics against
+ // each other or against (nonvolatile) loads, so they can be
+ // chained like loads.
+ SDValue Chain = DAG.getRoot();
SmallVector<SDValue, 4> Opers;
Opers.push_back(Chain);
if (FPI.isUnaryOp()) {
@@ -6926,8 +6929,9 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
}
assert(Result.getNode()->getNumValues() == 2);
+ // See above -- chain is handled like for loads here.
SDValue OutChain = Result.getValue(1);
- DAG.setRoot(OutChain);
+ PendingLoads.push_back(OutChain);
SDValue FPResult = Result.getValue(0);
setValue(&FPI, FPResult);
}
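
The effect of chaining constrained FP intrinsics like loads can be seen on a
small example. A minimal IR sketch (the function name @two_muls is hypothetical,
not part of this commit):

declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)

; %m0 and %m1 are independent; with this change each takes the current root as
; its in chain and pushes its out chain onto PendingLoads, so the scheduler may
; emit them in either order instead of serializing %m1 after %m0.
define double @two_muls(double %a, double %b, double %c) #0 {
  %m0 = call double @llvm.experimental.constrained.fmul.f64(
                       double %a, double %b,
                       metadata !"round.dynamic",
                       metadata !"fpexcept.strict") #0
  %m1 = call double @llvm.experimental.constrained.fmul.f64(
                       double %a, double %c,
                       metadata !"round.dynamic",
                       metadata !"fpexcept.strict") #0
  %res = call double @llvm.experimental.constrained.fmul.f64(
                       double %m0, double %m1,
                       metadata !"round.dynamic",
                       metadata !"fpexcept.strict") #0
  ret double %res
}

attributes #0 = { strictfp }

The out chains of %m0 and %m1 are merged into a TokenFactor the next time the
root is requested, just as PendingLoads is flushed for ordinary loads.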
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index a03f7923d71..b16d4af86a6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -3171,13 +3171,19 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
case OPC_CheckFoldableChainNode: {
assert(NodeStack.size() != 1 && "No parent node");
// Verify that all intermediate nodes between the root and this one have
- // a single use.
+ // a single use (ignoring chains, which are handled in UpdateChains).
bool HasMultipleUses = false;
- for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i)
- if (!NodeStack[i].getNode()->hasOneUse()) {
- HasMultipleUses = true;
- break;
- }
+ for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i) {
+ unsigned NNonChainUses = 0;
+ SDNode *NS = NodeStack[i].getNode();
+ for (auto UI = NS->use_begin(), UE = NS->use_end(); UI != UE; ++UI)
+ if (UI.getUse().getValueType() != MVT::Other)
+ if (++NNonChainUses > 1) {
+ HasMultipleUses = true;
+ break;
+ }
+ if (HasMultipleUses) break;
+ }
if (HasMultipleUses) break;
// Check to see that the target thinks this is profitable to fold and that
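
Counting only non-chain uses matters because a strict FP node always produces a
chain result that stays live. A minimal IR sketch of the shape this unblocks
(the function name @mul_ext_mem is hypothetical; compare f2 in
fp-strict-mul-02.ll below):

declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata)
declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)

; When folding the load into the multiply, the STRICT_FP_EXTEND of %f2 is the
; intermediate node between the STRICT_FMUL root and the load. Its value result
; has a single use (the fmul), but its chain result is used as well, so the old
; hasOneUse() test rejected the fold and patterns like SystemZ's MDEB could not
; match.
define double @mul_ext_mem(float %f1, float *%ptr) #0 {
  %f2 = load float, float *%ptr
  %f1x = call double @llvm.experimental.constrained.fpext.f64.f32(
                       float %f1, metadata !"fpexcept.strict") #0
  %f2x = call double @llvm.experimental.constrained.fpext.f64.f32(
                       float %f2, metadata !"fpexcept.strict") #0
  %res = call double @llvm.experimental.constrained.fmul.f64(
                       double %f1x, double %f2x,
                       metadata !"round.dynamic",
                       metadata !"fpexcept.strict") #0
  ret double %res
}

attributes #0 = { strictfp }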
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 9c95e8aec94..3a185e538be 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -467,16 +467,16 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
// f64 multiplication of two FP32 registers.
let Uses = [FPC], mayRaiseFPException = 1 in
def MDEBR : BinaryRRE<"mdebr", 0xB30C, null_frag, FP64, FP32>;
-def : Pat<(any_fmul (f64 (fpextend FP32:$src1)),
- (f64 (fpextend FP32:$src2))),
+def : Pat<(any_fmul (f64 (any_fpextend FP32:$src1)),
+ (f64 (any_fpextend FP32:$src2))),
(MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
FP32:$src1, subreg_h32), FP32:$src2)>;
// f64 multiplication of an FP32 register and an f32 memory.
let Uses = [FPC], mayRaiseFPException = 1 in
def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
-def : Pat<(any_fmul (f64 (fpextend FP32:$src1)),
- (f64 (extloadf32 bdxaddr12only:$addr))),
+def : Pat<(any_fmul (f64 (any_fpextend FP32:$src1)),
+ (f64 (any_extloadf32 bdxaddr12only:$addr))),
(MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32),
bdxaddr12only:$addr)>;
@@ -484,8 +484,8 @@ def : Pat<(any_fmul (f64 (fpextend FP32:$src1)),
let Uses = [FPC], mayRaiseFPException = 1 in
def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>;
let Predicates = [FeatureNoVectorEnhancements1] in
- def : Pat<(any_fmul (f128 (fpextend FP64:$src1)),
- (f128 (fpextend FP64:$src2))),
+ def : Pat<(any_fmul (f128 (any_fpextend FP64:$src1)),
+ (f128 (any_fpextend FP64:$src2))),
(MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
FP64:$src1, subreg_h64), FP64:$src2)>;
@@ -493,8 +493,8 @@ let Predicates = [FeatureNoVectorEnhancements1] in
let Uses = [FPC], mayRaiseFPException = 1 in
def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
let Predicates = [FeatureNoVectorEnhancements1] in
- def : Pat<(any_fmul (f128 (fpextend FP64:$src1)),
- (f128 (extloadf64 bdxaddr12only:$addr))),
+ def : Pat<(any_fmul (f128 (any_fpextend FP64:$src1)),
+ (f128 (any_extloadf64 bdxaddr12only:$addr))),
(MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
bdxaddr12only:$addr)>;
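
The any_fmul, any_fpextend, and any_extload fragments match both the plain and
the strict form of each node, so a single pattern now covers fmul-of-fpext as
well as the constrained intrinsics. A minimal IR sketch of the non-strict form
the first pattern above handles (the function name is hypothetical; the strict
form is exercised by f1 in fp-strict-mul-02.ll below):

; Plain fpext/fmul, matched via the non-strict alternatives of the fragments.
define double @mul_ext_plain(float %f1, float %f2) {
  %f1x = fpext float %f1 to double
  %f2x = fpext float %f2 to double
  %res = fmul double %f1x, %f2x
  ret double %res
}

Both this and its strictfp counterpart are expected to select the same MDEBR
instruction.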
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll
index 33e865d3d93..a5cd0b8c0bc 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll
@@ -1,6 +1,4 @@
; Test strict multiplication of two f32s, producing an f64 result.
-; FIXME: We should use llvm.experimental.constrained.fpext, but we currently
-; cannot match a combination of two strict operations in ISel.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
@@ -8,14 +6,19 @@ declare float @foo()
declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata)
; Check register multiplication.
define double @f1(float %f1, float %f2) #0 {
; CHECK-LABEL: f1:
; CHECK: mdebr %f0, %f2
; CHECK: br %r14
- %f1x = fpext float %f1 to double
- %f2x = fpext float %f2 to double
+ %f1x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f2,
+ metadata !"fpexcept.strict") #0
%res = call double @llvm.experimental.constrained.fmul.f64(
double %f1x, double %f2x,
metadata !"round.dynamic",
@@ -29,8 +32,12 @@ define double @f2(float %f1, float *%ptr) #0 {
; CHECK: mdeb %f0, 0(%r2)
; CHECK: br %r14
%f2 = load float, float *%ptr
- %f1x = fpext float %f1 to double
- %f2x = fpext float %f2 to double
+ %f1x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f2,
+ metadata !"fpexcept.strict") #0
%res = call double @llvm.experimental.constrained.fmul.f64(
double %f1x, double %f2x,
metadata !"round.dynamic",
@@ -45,8 +52,12 @@ define double @f3(float %f1, float *%base) #0 {
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 1023
%f2 = load float, float *%ptr
- %f1x = fpext float %f1 to double
- %f2x = fpext float %f2 to double
+ %f1x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f2,
+ metadata !"fpexcept.strict") #0
%res = call double @llvm.experimental.constrained.fmul.f64(
double %f1x, double %f2x,
metadata !"round.dynamic",
@@ -63,8 +74,12 @@ define double @f4(float %f1, float *%base) #0 {
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 1024
%f2 = load float, float *%ptr
- %f1x = fpext float %f1 to double
- %f2x = fpext float %f2 to double
+ %f1x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f2,
+ metadata !"fpexcept.strict") #0
%res = call double @llvm.experimental.constrained.fmul.f64(
double %f1x, double %f2x,
metadata !"round.dynamic",
@@ -80,8 +95,12 @@ define double @f5(float %f1, float *%base) #0 {
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 -1
%f2 = load float, float *%ptr
- %f1x = fpext float %f1 to double
- %f2x = fpext float %f2 to double
+ %f1x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f2,
+ metadata !"fpexcept.strict") #0
%res = call double @llvm.experimental.constrained.fmul.f64(
double %f1x, double %f2x,
metadata !"round.dynamic",
@@ -98,8 +117,12 @@ define double @f6(float %f1, float *%base, i64 %index) #0 {
%ptr1 = getelementptr float, float *%base, i64 %index
%ptr2 = getelementptr float, float *%ptr1, i64 100
%f2 = load float, float *%ptr2
- %f1x = fpext float %f1 to double
- %f2x = fpext float %f2 to double
+ %f1x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %f2,
+ metadata !"fpexcept.strict") #0
%res = call double @llvm.experimental.constrained.fmul.f64(
double %f1x, double %f2x,
metadata !"round.dynamic",
@@ -195,8 +218,12 @@ define float @f7(float *%ptr0) #0 {
%ret = call float @foo() #0
- %accext0 = fpext float %ret to double
- %ext0 = fpext float %frob0 to double
+ %accext0 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %ret,
+ metadata !"fpexcept.strict") #0
+ %ext0 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %frob0,
+ metadata !"fpexcept.strict") #0
%mul0 = call double @llvm.experimental.constrained.fmul.f64(
double %accext0, double %ext0,
metadata !"round.dynamic",
@@ -210,8 +237,12 @@ define float @f7(float *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext1 = fpext float %trunc0 to double
- %ext1 = fpext float %frob1 to double
+ %accext1 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %trunc0,
+ metadata !"fpexcept.strict") #0
+ %ext1 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %frob1,
+ metadata !"fpexcept.strict") #0
%mul1 = call double @llvm.experimental.constrained.fmul.f64(
double %accext1, double %ext1,
metadata !"round.dynamic",
@@ -225,8 +256,12 @@ define float @f7(float *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext2 = fpext float %trunc1 to double
- %ext2 = fpext float %frob2 to double
+ %accext2 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %trunc1,
+ metadata !"fpexcept.strict") #0
+ %ext2 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %frob2,
+ metadata !"fpexcept.strict") #0
%mul2 = call double @llvm.experimental.constrained.fmul.f64(
double %accext2, double %ext2,
metadata !"round.dynamic",
@@ -240,8 +275,12 @@ define float @f7(float *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext3 = fpext float %trunc2 to double
- %ext3 = fpext float %frob3 to double
+ %accext3 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %trunc2,
+ metadata !"fpexcept.strict") #0
+ %ext3 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %frob3,
+ metadata !"fpexcept.strict") #0
%mul3 = call double @llvm.experimental.constrained.fmul.f64(
double %accext3, double %ext3,
metadata !"round.dynamic",
@@ -255,8 +294,12 @@ define float @f7(float *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext4 = fpext float %trunc3 to double
- %ext4 = fpext float %frob4 to double
+ %accext4 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %trunc3,
+ metadata !"fpexcept.strict") #0
+ %ext4 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %frob4,
+ metadata !"fpexcept.strict") #0
%mul4 = call double @llvm.experimental.constrained.fmul.f64(
double %accext4, double %ext4,
metadata !"round.dynamic",
@@ -270,8 +313,12 @@ define float @f7(float *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext5 = fpext float %trunc4 to double
- %ext5 = fpext float %frob5 to double
+ %accext5 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %trunc4,
+ metadata !"fpexcept.strict") #0
+ %ext5 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %frob5,
+ metadata !"fpexcept.strict") #0
%mul5 = call double @llvm.experimental.constrained.fmul.f64(
double %accext5, double %ext5,
metadata !"round.dynamic",
@@ -285,8 +332,12 @@ define float @f7(float *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext6 = fpext float %trunc5 to double
- %ext6 = fpext float %frob6 to double
+ %accext6 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %trunc5,
+ metadata !"fpexcept.strict") #0
+ %ext6 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %frob6,
+ metadata !"fpexcept.strict") #0
%mul6 = call double @llvm.experimental.constrained.fmul.f64(
double %accext6, double %ext6,
metadata !"round.dynamic",
@@ -300,8 +351,12 @@ define float @f7(float *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext7 = fpext float %trunc6 to double
- %ext7 = fpext float %frob7 to double
+ %accext7 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %trunc6,
+ metadata !"fpexcept.strict") #0
+ %ext7 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %frob7,
+ metadata !"fpexcept.strict") #0
%mul7 = call double @llvm.experimental.constrained.fmul.f64(
double %accext7, double %ext7,
metadata !"round.dynamic",
@@ -315,8 +370,12 @@ define float @f7(float *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext8 = fpext float %trunc7 to double
- %ext8 = fpext float %frob8 to double
+ %accext8 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %trunc7,
+ metadata !"fpexcept.strict") #0
+ %ext8 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %frob8,
+ metadata !"fpexcept.strict") #0
%mul8 = call double @llvm.experimental.constrained.fmul.f64(
double %accext8, double %ext8,
metadata !"round.dynamic",
@@ -330,8 +389,12 @@ define float @f7(float *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext9 = fpext float %trunc8 to double
- %ext9 = fpext float %frob9 to double
+ %accext9 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %trunc8,
+ metadata !"fpexcept.strict") #0
+ %ext9 = call double @llvm.experimental.constrained.fpext.f64.f32(
+ float %frob9,
+ metadata !"fpexcept.strict") #0
%mul9 = call double @llvm.experimental.constrained.fmul.f64(
double %accext9, double %ext9,
metadata !"round.dynamic",
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll
index 9a8c868ad15..fe41a6506ce 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll
@@ -1,12 +1,11 @@
; Test strict multiplication of two f64s, producing an f128 result.
-; FIXME: We should use llvm.experimental.constrained.fpext, but we currently
-; cannot match a combination of two strict operations in ISel.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
declare fp128 @llvm.experimental.constrained.fmul.f128(fp128, fp128, metadata, metadata)
declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
declare double @llvm.experimental.constrained.fptrunc.f64.f128(fp128, metadata, metadata)
+declare fp128 @llvm.experimental.constrained.fpext.f128.f64(double, metadata)
declare double @foo()
@@ -19,8 +18,12 @@ define void @f1(double %f1, double %dummy, double %f2, fp128 *%dst) #0 {
; CHECK: std %f0, 0(%r2)
; CHECK: std %f2, 8(%r2)
; CHECK: br %r14
- %f1x = fpext double %f1 to fp128
- %f2x = fpext double %f2 to fp128
+ %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f2,
+ metadata !"fpexcept.strict") #0
%res = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %f1x, fp128 %f2x,
metadata !"round.dynamic",
@@ -37,8 +40,12 @@ define void @f2(double %f1, double *%ptr, fp128 *%dst) #0 {
; CHECK: std %f2, 8(%r3)
; CHECK: br %r14
%f2 = load double, double *%ptr
- %f1x = fpext double %f1 to fp128
- %f2x = fpext double %f2 to fp128
+ %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f2,
+ metadata !"fpexcept.strict") #0
%res = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %f1x, fp128 %f2x,
metadata !"round.dynamic",
@@ -56,8 +63,12 @@ define void @f3(double %f1, double *%base, fp128 *%dst) #0 {
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 511
%f2 = load double, double *%ptr
- %f1x = fpext double %f1 to fp128
- %f2x = fpext double %f2 to fp128
+ %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f2,
+ metadata !"fpexcept.strict") #0
%res = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %f1x, fp128 %f2x,
metadata !"round.dynamic",
@@ -77,8 +88,12 @@ define void @f4(double %f1, double *%base, fp128 *%dst) #0 {
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 512
%f2 = load double, double *%ptr
- %f1x = fpext double %f1 to fp128
- %f2x = fpext double %f2 to fp128
+ %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f2,
+ metadata !"fpexcept.strict") #0
%res = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %f1x, fp128 %f2x,
metadata !"round.dynamic",
@@ -97,8 +112,12 @@ define void @f5(double %f1, double *%base, fp128 *%dst) #0 {
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 -1
%f2 = load double, double *%ptr
- %f1x = fpext double %f1 to fp128
- %f2x = fpext double %f2 to fp128
+ %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f2,
+ metadata !"fpexcept.strict") #0
%res = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %f1x, fp128 %f2x,
metadata !"round.dynamic",
@@ -118,8 +137,12 @@ define void @f6(double %f1, double *%base, i64 %index, fp128 *%dst) #0 {
%ptr1 = getelementptr double, double *%base, i64 %index
%ptr2 = getelementptr double, double *%ptr1, i64 100
%f2 = load double, double *%ptr2
- %f1x = fpext double %f1 to fp128
- %f2x = fpext double %f2 to fp128
+ %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f1,
+ metadata !"fpexcept.strict") #0
+ %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %f2,
+ metadata !"fpexcept.strict") #0
%res = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %f1x, fp128 %f2x,
metadata !"round.dynamic",
@@ -216,8 +239,12 @@ define double @f7(double *%ptr0) #0 {
%ret = call double @foo() #0
- %accext0 = fpext double %ret to fp128
- %ext0 = fpext double %frob0 to fp128
+ %accext0 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %ret,
+ metadata !"fpexcept.strict") #0
+ %ext0 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %frob0,
+ metadata !"fpexcept.strict") #0
%mul0 = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %accext0, fp128 %ext0,
metadata !"round.dynamic",
@@ -231,8 +258,12 @@ define double @f7(double *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext1 = fpext double %trunc0 to fp128
- %ext1 = fpext double %frob1 to fp128
+ %accext1 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %trunc0,
+ metadata !"fpexcept.strict") #0
+ %ext1 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %frob1,
+ metadata !"fpexcept.strict") #0
%mul1 = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %accext1, fp128 %ext1,
metadata !"round.dynamic",
@@ -246,8 +277,12 @@ define double @f7(double *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext2 = fpext double %trunc1 to fp128
- %ext2 = fpext double %frob2 to fp128
+ %accext2 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %trunc1,
+ metadata !"fpexcept.strict") #0
+ %ext2 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %frob2,
+ metadata !"fpexcept.strict") #0
%mul2 = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %accext2, fp128 %ext2,
metadata !"round.dynamic",
@@ -261,8 +296,12 @@ define double @f7(double *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext3 = fpext double %trunc2 to fp128
- %ext3 = fpext double %frob3 to fp128
+ %accext3 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %trunc2,
+ metadata !"fpexcept.strict") #0
+ %ext3 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %frob3,
+ metadata !"fpexcept.strict") #0
%mul3 = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %accext3, fp128 %ext3,
metadata !"round.dynamic",
@@ -276,8 +315,12 @@ define double @f7(double *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext4 = fpext double %trunc3 to fp128
- %ext4 = fpext double %frob4 to fp128
+ %accext4 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %trunc3,
+ metadata !"fpexcept.strict") #0
+ %ext4 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %frob4,
+ metadata !"fpexcept.strict") #0
%mul4 = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %accext4, fp128 %ext4,
metadata !"round.dynamic",
@@ -291,8 +334,12 @@ define double @f7(double *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext5 = fpext double %trunc4 to fp128
- %ext5 = fpext double %frob5 to fp128
+ %accext5 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %trunc4,
+ metadata !"fpexcept.strict") #0
+ %ext5 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %frob5,
+ metadata !"fpexcept.strict") #0
%mul5 = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %accext5, fp128 %ext5,
metadata !"round.dynamic",
@@ -306,8 +353,12 @@ define double @f7(double *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext6 = fpext double %trunc5 to fp128
- %ext6 = fpext double %frob6 to fp128
+ %accext6 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %trunc5,
+ metadata !"fpexcept.strict") #0
+ %ext6 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %frob6,
+ metadata !"fpexcept.strict") #0
%mul6 = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %accext6, fp128 %ext6,
metadata !"round.dynamic",
@@ -321,8 +372,12 @@ define double @f7(double *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext7 = fpext double %trunc6 to fp128
- %ext7 = fpext double %frob7 to fp128
+ %accext7 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %trunc6,
+ metadata !"fpexcept.strict") #0
+ %ext7 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %frob7,
+ metadata !"fpexcept.strict") #0
%mul7 = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %accext7, fp128 %ext7,
metadata !"round.dynamic",
@@ -336,8 +391,12 @@ define double @f7(double *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext8 = fpext double %trunc7 to fp128
- %ext8 = fpext double %frob8 to fp128
+ %accext8 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %trunc7,
+ metadata !"fpexcept.strict") #0
+ %ext8 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %frob8,
+ metadata !"fpexcept.strict") #0
%mul8 = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %accext8, fp128 %ext8,
metadata !"round.dynamic",
@@ -351,8 +410,12 @@ define double @f7(double *%ptr0) #0 {
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
- %accext9 = fpext double %trunc8 to fp128
- %ext9 = fpext double %frob9 to fp128
+ %accext9 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %trunc8,
+ metadata !"fpexcept.strict") #0
+ %ext9 = call fp128 @llvm.experimental.constrained.fpext.f128.f64(
+ double %frob9,
+ metadata !"fpexcept.strict") #0
%mul9 = call fp128 @llvm.experimental.constrained.fmul.f128(
fp128 %accext9, fp128 %ext9,
metadata !"round.dynamic",
diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
index 6c53ffc785d..931e11831f4 100644
--- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
@@ -33,11 +33,11 @@ define <2 x double> @constrained_vector_fdiv_v2f64() #0 {
; S390X-NEXT: larl %r1, .LCPI1_0
; S390X-NEXT: ldeb %f1, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI1_1
-; S390X-NEXT: ldeb %f2, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI1_2
; S390X-NEXT: ldeb %f0, 0(%r1)
-; S390X-NEXT: ddbr %f2, %f1
+; S390X-NEXT: larl %r1, .LCPI1_2
+; S390X-NEXT: ldeb %f2, 0(%r1)
; S390X-NEXT: ddbr %f0, %f1
+; S390X-NEXT: ddbr %f2, %f1
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fdiv_v2f64:
@@ -63,14 +63,14 @@ define <3 x float> @constrained_vector_fdiv_v3f32() #0 {
; S390X-NEXT: larl %r1, .LCPI2_0
; S390X-NEXT: le %f1, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI2_1
-; S390X-NEXT: le %f4, 0(%r1)
+; S390X-NEXT: le %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI2_2
; S390X-NEXT: le %f2, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI2_3
-; S390X-NEXT: le %f0, 0(%r1)
-; S390X-NEXT: debr %f4, %f1
-; S390X-NEXT: debr %f2, %f1
+; S390X-NEXT: le %f4, 0(%r1)
; S390X-NEXT: debr %f0, %f1
+; S390X-NEXT: debr %f2, %f1
+; S390X-NEXT: debr %f4, %f1
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fdiv_v3f32:
@@ -100,20 +100,18 @@ entry:
define void @constrained_vector_fdiv_v3f64(<3 x double>* %a) #0 {
; S390X-LABEL: constrained_vector_fdiv_v3f64:
; S390X: # %bb.0: # %entry
-; S390X-NEXT: ld %f0, 16(%r2)
-; S390X-NEXT: ld %f1, 8(%r2)
-; S390X-NEXT: larl %r1, .LCPI3_0
-; S390X-NEXT: ldeb %f2, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI3_1
-; S390X-NEXT: ldeb %f3, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI3_2
-; S390X-NEXT: ldeb %f4, 0(%r1)
-; S390X-NEXT: ddbr %f3, %f1
-; S390X-NEXT: ddb %f2, 0(%r2)
-; S390X-NEXT: ddbr %f4, %f0
-; S390X-NEXT: std %f4, 16(%r2)
-; S390X-NEXT: std %f3, 8(%r2)
-; S390X-NEXT: std %f2, 0(%r2)
+; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: larl %r1, .LCPI3_0
+; S390X-NEXT: ldeb %f2, 0(%r1)
+; S390X-NEXT: ddb %f1, 0(%r2)
+; S390X-NEXT: ddb %f0, 8(%r2)
+; S390X-NEXT: ddb %f2, 16(%r2)
+; S390X-NEXT: std %f1, 0(%r2)
+; S390X-NEXT: std %f0, 8(%r2)
+; S390X-NEXT: std %f2, 16(%r2)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fdiv_v3f64:
@@ -145,17 +143,17 @@ define <4 x double> @constrained_vector_fdiv_v4f64() #0 {
; S390X-NEXT: larl %r1, .LCPI4_0
; S390X-NEXT: ldeb %f1, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI4_1
-; S390X-NEXT: ldeb %f6, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI4_2
-; S390X-NEXT: ldeb %f4, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI4_3
; S390X-NEXT: ldeb %f2, 0(%r1)
+; S390X-NEXT: larl %r1, .LCPI4_3
+; S390X-NEXT: ldeb %f4, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI4_4
-; S390X-NEXT: ldeb %f0, 0(%r1)
-; S390X-NEXT: ddbr %f6, %f1
-; S390X-NEXT: ddbr %f4, %f1
-; S390X-NEXT: ddbr %f2, %f1
+; S390X-NEXT: ldeb %f6, 0(%r1)
; S390X-NEXT: ddbr %f0, %f1
+; S390X-NEXT: ddbr %f2, %f1
+; S390X-NEXT: ddbr %f4, %f1
+; S390X-NEXT: ddbr %f6, %f1
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fdiv_v4f64:
@@ -164,10 +162,10 @@ define <4 x double> @constrained_vector_fdiv_v4f64() #0 {
; SZ13-NEXT: vl %v0, 0(%r1), 3
; SZ13-NEXT: larl %r1, .LCPI4_1
; SZ13-NEXT: vl %v1, 0(%r1), 3
-; SZ13-NEXT: vfddb %v26, %v1, %v0
+; SZ13-NEXT: vfddb %v24, %v1, %v0
; SZ13-NEXT: larl %r1, .LCPI4_2
; SZ13-NEXT: vl %v1, 0(%r1), 3
-; SZ13-NEXT: vfddb %v24, %v1, %v0
+; SZ13-NEXT: vfddb %v26, %v1, %v0
; SZ13-NEXT: br %r14
entry:
%div = call <4 x double> @llvm.experimental.constrained.fdiv.v4f64(
@@ -244,7 +242,8 @@ define <2 x double> @constrained_vector_frem_v2f64() #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: ldr %f2, %f8
; S390X-NEXT: brasl %r14, fmod@PLT
-; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f9
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -316,8 +315,9 @@ define <3 x float> @constrained_vector_frem_v3f32() #0 {
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: ler %f2, %f8
; S390X-NEXT: brasl %r14, fmodf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f9
; S390X-NEXT: ler %f2, %f10
-; S390X-NEXT: ler %f4, %f9
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -383,8 +383,8 @@ define void @constrained_vector_frem_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f2, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f2, 16(%r2)
; S390X-NEXT: larl %r1, .LCPI8_0
; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: ld %f9, 8(%r2)
@@ -401,9 +401,9 @@ define void @constrained_vector_frem_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: ldr %f2, %f8
; S390X-NEXT: brasl %r14, fmod@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -499,9 +499,10 @@ define <4 x double> @constrained_vector_frem_v4f64() #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: ldr %f2, %f8
; S390X-NEXT: brasl %r14, fmod@PLT
-; S390X-NEXT: ldr %f2, %f11
-; S390X-NEXT: ldr %f4, %f10
-; S390X-NEXT: ldr %f6, %f9
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f9
+; S390X-NEXT: ldr %f2, %f10
+; S390X-NEXT: ldr %f4, %f11
; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload
@@ -588,13 +589,13 @@ define <2 x double> @constrained_vector_fmul_v2f64() #0 {
; S390X-LABEL: constrained_vector_fmul_v2f64:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI11_0
-; S390X-NEXT: ldeb %f2, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI11_1
; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI11_2
-; S390X-NEXT: ldeb %f0, 0(%r1)
-; S390X-NEXT: mdbr %f2, %f1
+; S390X-NEXT: ldeb %f2, 0(%r1)
; S390X-NEXT: mdbr %f0, %f1
+; S390X-NEXT: mdbr %f2, %f1
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fmul_v2f64:
@@ -618,15 +619,15 @@ define <3 x float> @constrained_vector_fmul_v3f32() #0 {
; S390X-LABEL: constrained_vector_fmul_v3f32:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI12_0
-; S390X-NEXT: le %f0, 0(%r1)
+; S390X-NEXT: le %f4, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI12_1
-; S390X-NEXT: ler %f4, %f0
-; S390X-NEXT: meeb %f4, 0(%r1)
+; S390X-NEXT: ler %f0, %f4
+; S390X-NEXT: meeb %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI12_2
-; S390X-NEXT: ler %f2, %f0
+; S390X-NEXT: ler %f2, %f4
; S390X-NEXT: meeb %f2, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI12_3
-; S390X-NEXT: meeb %f0, 0(%r1)
+; S390X-NEXT: meeb %f4, 0(%r1)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fmul_v3f32:
@@ -656,17 +657,16 @@ entry:
define void @constrained_vector_fmul_v3f64(<3 x double>* %a) #0 {
; S390X-LABEL: constrained_vector_fmul_v3f64:
; S390X: # %bb.0: # %entry
-; S390X-NEXT: ld %f0, 8(%r2)
; S390X-NEXT: larl %r1, .LCPI13_0
-; S390X-NEXT: ld %f1, 0(%r1)
-; S390X-NEXT: ld %f2, 16(%r2)
-; S390X-NEXT: mdbr %f0, %f1
-; S390X-NEXT: ldr %f3, %f1
-; S390X-NEXT: mdb %f3, 0(%r2)
-; S390X-NEXT: mdbr %f2, %f1
-; S390X-NEXT: std %f2, 16(%r2)
-; S390X-NEXT: std %f0, 8(%r2)
-; S390X-NEXT: std %f3, 0(%r2)
+; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldr %f1, %f0
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: mdb %f0, 0(%r2)
+; S390X-NEXT: mdb %f2, 8(%r2)
+; S390X-NEXT: mdb %f1, 16(%r2)
+; S390X-NEXT: std %f0, 0(%r2)
+; S390X-NEXT: std %f2, 8(%r2)
+; S390X-NEXT: std %f1, 16(%r2)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fmul_v3f64:
@@ -678,8 +678,8 @@ define void @constrained_vector_fmul_v3f64(<3 x double>* %a) #0 {
; SZ13-NEXT: vl %v2, 0(%r1), 3
; SZ13-NEXT: mdb %f1, 16(%r2)
; SZ13-NEXT: vfmdb %v0, %v2, %v0
-; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: vst %v0, 0(%r2), 4
+; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, <3 x double>* %a
@@ -697,19 +697,19 @@ define <4 x double> @constrained_vector_fmul_v4f64() #0 {
; S390X-LABEL: constrained_vector_fmul_v4f64:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI14_0
-; S390X-NEXT: ldeb %f6, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI14_1
; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI14_2
-; S390X-NEXT: ldeb %f4, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI14_3
; S390X-NEXT: ldeb %f2, 0(%r1)
+; S390X-NEXT: larl %r1, .LCPI14_3
+; S390X-NEXT: ldeb %f4, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI14_4
-; S390X-NEXT: ldeb %f0, 0(%r1)
-; S390X-NEXT: mdbr %f6, %f1
-; S390X-NEXT: mdbr %f4, %f1
-; S390X-NEXT: mdbr %f2, %f1
+; S390X-NEXT: ldeb %f6, 0(%r1)
; S390X-NEXT: mdbr %f0, %f1
+; S390X-NEXT: mdbr %f2, %f1
+; S390X-NEXT: mdbr %f4, %f1
+; S390X-NEXT: mdbr %f6, %f1
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fmul_v4f64:
@@ -719,9 +719,9 @@ define <4 x double> @constrained_vector_fmul_v4f64() #0 {
; SZ13-NEXT: larl %r1, .LCPI14_1
; SZ13-NEXT: vl %v1, 0(%r1), 3
; SZ13-NEXT: larl %r1, .LCPI14_2
-; SZ13-NEXT: vfmdb %v26, %v1, %v0
-; SZ13-NEXT: vl %v0, 0(%r1), 3
; SZ13-NEXT: vfmdb %v24, %v1, %v0
+; SZ13-NEXT: vl %v0, 0(%r1), 3
+; SZ13-NEXT: vfmdb %v26, %v1, %v0
; SZ13-NEXT: br %r14
entry:
%mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64(
@@ -763,13 +763,12 @@ define <2 x double> @constrained_vector_fadd_v2f64() #0 {
; S390X-LABEL: constrained_vector_fadd_v2f64:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI16_0
-; S390X-NEXT: ld %f1, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI16_2
; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI16_1
-; S390X-NEXT: ldr %f2, %f1
+; S390X-NEXT: ld %f2, 0(%r1)
+; S390X-NEXT: adbr %f0, %f2
+; S390X-NEXT: larl %r1, .LCPI16_2
; S390X-NEXT: adb %f2, 0(%r1)
-; S390X-NEXT: adbr %f0, %f1
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fadd_v2f64:
@@ -793,14 +792,15 @@ define <3 x float> @constrained_vector_fadd_v3f32() #0 {
; S390X-LABEL: constrained_vector_fadd_v3f32:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI17_0
-; S390X-NEXT: le %f0, 0(%r1)
-; S390X-NEXT: lzer %f4
-; S390X-NEXT: aebr %f4, %f0
+; S390X-NEXT: le %f1, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI17_1
-; S390X-NEXT: ler %f2, %f0
-; S390X-NEXT: aeb %f2, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI17_2
+; S390X-NEXT: ler %f2, %f1
+; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: aeb %f0, 0(%r1)
+; S390X-NEXT: larl %r1, .LCPI17_2
+; S390X-NEXT: aeb %f2, 0(%r1)
+; S390X-NEXT: lzer %f4
+; S390X-NEXT: aebr %f4, %f1
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fadd_v3f32:
@@ -829,17 +829,16 @@ entry:
define void @constrained_vector_fadd_v3f64(<3 x double>* %a) #0 {
; S390X-LABEL: constrained_vector_fadd_v3f64:
; S390X: # %bb.0: # %entry
-; S390X-NEXT: ld %f0, 8(%r2)
; S390X-NEXT: larl %r1, .LCPI18_0
-; S390X-NEXT: ld %f1, 0(%r1)
-; S390X-NEXT: ld %f2, 16(%r2)
-; S390X-NEXT: adbr %f0, %f1
-; S390X-NEXT: ldr %f3, %f1
-; S390X-NEXT: adb %f3, 0(%r2)
-; S390X-NEXT: adbr %f2, %f1
-; S390X-NEXT: std %f2, 16(%r2)
-; S390X-NEXT: std %f0, 8(%r2)
-; S390X-NEXT: std %f3, 0(%r2)
+; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldr %f1, %f0
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: adb %f0, 0(%r2)
+; S390X-NEXT: adb %f2, 8(%r2)
+; S390X-NEXT: adb %f1, 16(%r2)
+; S390X-NEXT: std %f0, 0(%r2)
+; S390X-NEXT: std %f2, 8(%r2)
+; S390X-NEXT: std %f1, 16(%r2)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fadd_v3f64:
@@ -851,8 +850,8 @@ define void @constrained_vector_fadd_v3f64(<3 x double>* %a) #0 {
; SZ13-NEXT: vl %v2, 0(%r1), 3
; SZ13-NEXT: adb %f1, 16(%r2)
; SZ13-NEXT: vfadb %v0, %v2, %v0
-; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: vst %v0, 0(%r2), 4
+; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, <3 x double>* %a
@@ -870,19 +869,18 @@ define <4 x double> @constrained_vector_fadd_v4f64() #0 {
; S390X-LABEL: constrained_vector_fadd_v4f64:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI19_0
-; S390X-NEXT: ld %f1, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI19_1
-; S390X-NEXT: ldr %f2, %f1
-; S390X-NEXT: ldr %f6, %f1
-; S390X-NEXT: adb %f6, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI19_2
-; S390X-NEXT: ldeb %f4, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI19_4
; S390X-NEXT: ldeb %f0, 0(%r1)
+; S390X-NEXT: larl %r1, .LCPI19_1
+; S390X-NEXT: ld %f6, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI19_3
+; S390X-NEXT: ldeb %f4, 0(%r1)
+; S390X-NEXT: adbr %f0, %f6
+; S390X-NEXT: larl %r1, .LCPI19_2
+; S390X-NEXT: ldr %f2, %f6
; S390X-NEXT: adb %f2, 0(%r1)
-; S390X-NEXT: adbr %f4, %f1
-; S390X-NEXT: adbr %f0, %f1
+; S390X-NEXT: adbr %f4, %f6
+; S390X-NEXT: larl %r1, .LCPI19_4
+; S390X-NEXT: adb %f6, 0(%r1)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fadd_v4f64:
@@ -892,9 +890,9 @@ define <4 x double> @constrained_vector_fadd_v4f64() #0 {
; SZ13-NEXT: larl %r1, .LCPI19_1
; SZ13-NEXT: vl %v1, 0(%r1), 3
; SZ13-NEXT: larl %r1, .LCPI19_2
-; SZ13-NEXT: vfadb %v26, %v1, %v0
-; SZ13-NEXT: vl %v0, 0(%r1), 3
; SZ13-NEXT: vfadb %v24, %v1, %v0
+; SZ13-NEXT: vl %v0, 0(%r1), 3
+; SZ13-NEXT: vfadb %v26, %v1, %v0
; SZ13-NEXT: br %r14
entry:
%add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64(
@@ -935,12 +933,12 @@ entry:
define <2 x double> @constrained_vector_fsub_v2f64() #0 {
; S390X-LABEL: constrained_vector_fsub_v2f64:
; S390X: # %bb.0: # %entry
+; S390X-NEXT: larl %r1, .LCPI21_1
+; S390X-NEXT: ld %f2, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI21_0
-; S390X-NEXT: ld %f0, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI21_2
; S390X-NEXT: ldeb %f1, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI21_1
-; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f2
+; S390X-NEXT: larl %r1, .LCPI21_2
; S390X-NEXT: sdb %f2, 0(%r1)
; S390X-NEXT: sdbr %f0, %f1
; S390X-NEXT: br %r14
@@ -965,13 +963,13 @@ define <3 x float> @constrained_vector_fsub_v3f32() #0 {
; S390X-LABEL: constrained_vector_fsub_v3f32:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI22_0
-; S390X-NEXT: le %f0, 0(%r1)
-; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: le %f4, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI22_1
-; S390X-NEXT: ler %f2, %f0
-; S390X-NEXT: seb %f2, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI22_2
+; S390X-NEXT: ler %f0, %f4
; S390X-NEXT: seb %f0, 0(%r1)
+; S390X-NEXT: larl %r1, .LCPI22_2
+; S390X-NEXT: ler %f2, %f4
+; S390X-NEXT: seb %f2, 0(%r1)
; S390X-NEXT: lzer %f1
; S390X-NEXT: sebr %f4, %f1
; S390X-NEXT: br %r14
@@ -1006,16 +1004,14 @@ define void @constrained_vector_fsub_v3f64(<3 x double>* %a) #0 {
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI23_0
; S390X-NEXT: ld %f0, 0(%r1)
-; S390X-NEXT: ld %f1, 8(%r2)
-; S390X-NEXT: ld %f2, 16(%r2)
-; S390X-NEXT: ldr %f3, %f0
-; S390X-NEXT: sdb %f3, 0(%r2)
-; S390X-NEXT: ldr %f4, %f0
-; S390X-NEXT: sdbr %f4, %f1
-; S390X-NEXT: sdbr %f0, %f2
-; S390X-NEXT: std %f0, 16(%r2)
-; S390X-NEXT: std %f4, 8(%r2)
-; S390X-NEXT: std %f3, 0(%r2)
+; S390X-NEXT: ldr %f1, %f0
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: sdb %f0, 0(%r2)
+; S390X-NEXT: sdb %f2, 8(%r2)
+; S390X-NEXT: sdb %f1, 16(%r2)
+; S390X-NEXT: std %f0, 0(%r2)
+; S390X-NEXT: std %f2, 8(%r2)
+; S390X-NEXT: std %f1, 16(%r2)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fsub_v3f64:
@@ -1025,8 +1021,8 @@ define void @constrained_vector_fsub_v3f64(<3 x double>* %a) #0 {
; SZ13-NEXT: sdb %f2, 16(%r2)
; SZ13-NEXT: vgmg %v1, 12, 10
; SZ13-NEXT: vfsdb %v0, %v1, %v0
-; SZ13-NEXT: std %f2, 16(%r2)
; SZ13-NEXT: vst %v0, 0(%r2), 4
+; SZ13-NEXT: std %f2, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, <3 x double>* %a
@@ -1043,21 +1039,21 @@ entry:
define <4 x double> @constrained_vector_fsub_v4f64() #0 {
; S390X-LABEL: constrained_vector_fsub_v4f64:
; S390X: # %bb.0: # %entry
-; S390X-NEXT: larl %r1, .LCPI24_0
-; S390X-NEXT: ld %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI24_1
-; S390X-NEXT: ldr %f6, %f0
-; S390X-NEXT: sdb %f6, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI24_2
+; S390X-NEXT: ld %f6, 0(%r1)
+; S390X-NEXT: larl %r1, .LCPI24_0
; S390X-NEXT: ldeb %f1, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI24_4
-; S390X-NEXT: ldeb %f3, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI24_3
-; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f6
+; S390X-NEXT: larl %r1, .LCPI24_2
+; S390X-NEXT: ldr %f2, %f6
; S390X-NEXT: sdb %f2, 0(%r1)
-; S390X-NEXT: ldr %f4, %f0
-; S390X-NEXT: sdbr %f4, %f1
-; S390X-NEXT: sdbr %f0, %f3
+; S390X-NEXT: larl %r1, .LCPI24_3
+; S390X-NEXT: ldeb %f3, 0(%r1)
+; S390X-NEXT: ldr %f4, %f6
+; S390X-NEXT: larl %r1, .LCPI24_4
+; S390X-NEXT: sdb %f6, 0(%r1)
+; S390X-NEXT: sdbr %f0, %f1
+; S390X-NEXT: sdbr %f4, %f3
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fsub_v4f64:
@@ -1066,9 +1062,9 @@ define <4 x double> @constrained_vector_fsub_v4f64() #0 {
; SZ13-NEXT: vl %v0, 0(%r1), 3
; SZ13-NEXT: vgmg %v1, 12, 10
; SZ13-NEXT: larl %r1, .LCPI24_1
-; SZ13-NEXT: vfsdb %v26, %v1, %v0
-; SZ13-NEXT: vl %v0, 0(%r1), 3
; SZ13-NEXT: vfsdb %v24, %v1, %v0
+; SZ13-NEXT: vl %v0, 0(%r1), 3
+; SZ13-NEXT: vfsdb %v26, %v1, %v0
; SZ13-NEXT: br %r14
entry:
%sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64(
@@ -1130,11 +1126,11 @@ define <3 x float> @constrained_vector_sqrt_v3f32() #0 {
; S390X-LABEL: constrained_vector_sqrt_v3f32:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI27_0
-; S390X-NEXT: sqeb %f4, 0(%r1)
+; S390X-NEXT: sqeb %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI27_1
; S390X-NEXT: sqeb %f2, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI27_2
-; S390X-NEXT: sqeb %f0, 0(%r1)
+; S390X-NEXT: sqeb %f4, 0(%r1)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_sqrt_v3f32:
@@ -1160,14 +1156,12 @@ entry:
define void @constrained_vector_sqrt_v3f64(<3 x double>* %a) #0 {
; S390X-LABEL: constrained_vector_sqrt_v3f64:
; S390X: # %bb.0: # %entry
-; S390X-NEXT: ld %f0, 8(%r2)
-; S390X-NEXT: ld %f1, 16(%r2)
-; S390X-NEXT: sqdb %f2, 0(%r2)
-; S390X-NEXT: sqdbr %f0, %f0
-; S390X-NEXT: sqdbr %f1, %f1
-; S390X-NEXT: std %f1, 16(%r2)
-; S390X-NEXT: std %f0, 8(%r2)
-; S390X-NEXT: std %f2, 0(%r2)
+; S390X-NEXT: sqdb %f0, 0(%r2)
+; S390X-NEXT: sqdb %f1, 8(%r2)
+; S390X-NEXT: sqdb %f2, 16(%r2)
+; S390X-NEXT: std %f0, 0(%r2)
+; S390X-NEXT: std %f1, 8(%r2)
+; S390X-NEXT: std %f2, 16(%r2)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_sqrt_v3f64:
@@ -1192,13 +1186,13 @@ define <4 x double> @constrained_vector_sqrt_v4f64() #0 {
; S390X-LABEL: constrained_vector_sqrt_v4f64:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI29_0
-; S390X-NEXT: sqdb %f6, 0(%r1)
+; S390X-NEXT: sqdb %f2, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI29_1
; S390X-NEXT: sqdb %f4, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI29_3
; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI29_2
-; S390X-NEXT: sqdb %f2, 0(%r1)
+; S390X-NEXT: sqdb %f6, 0(%r1)
; S390X-NEXT: sqdbr %f0, %f0
; S390X-NEXT: br %r14
;
@@ -1206,10 +1200,10 @@ define <4 x double> @constrained_vector_sqrt_v4f64() #0 {
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI29_0
; SZ13-NEXT: vl %v0, 0(%r1), 3
-; SZ13-NEXT: vfsqdb %v26, %v0
+; SZ13-NEXT: vfsqdb %v24, %v0
; SZ13-NEXT: larl %r1, .LCPI29_1
; SZ13-NEXT: vl %v0, 0(%r1), 3
-; SZ13-NEXT: vfsqdb %v24, %v0
+; SZ13-NEXT: vfsqdb %v26, %v0
; SZ13-NEXT: br %r14
entry:
%sqrt = call <4 x double> @llvm.experimental.constrained.sqrt.v4f64(
@@ -1285,7 +1279,8 @@ define <2 x double> @constrained_vector_pow_v2f64() #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: ldr %f2, %f8
; S390X-NEXT: brasl %r14, pow@PLT
-; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f9
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -1359,8 +1354,9 @@ define <3 x float> @constrained_vector_pow_v3f32() #0 {
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: ler %f2, %f8
; S390X-NEXT: brasl %r14, powf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f9
; S390X-NEXT: ler %f2, %f10
-; S390X-NEXT: ler %f4, %f9
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -1430,8 +1426,8 @@ define void @constrained_vector_pow_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: .cfi_offset %f11, -192
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: larl %r1, .LCPI33_0
; S390X-NEXT: ldeb %f9, 0(%r1)
; S390X-NEXT: ld %f10, 8(%r2)
@@ -1445,9 +1441,9 @@ define void @constrained_vector_pow_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ldr %f2, %f9
; S390X-NEXT: brasl %r14, pow@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f10, 8(%r13)
-; S390X-NEXT: std %f11, 0(%r13)
+; S390X-NEXT: std %f11, 16(%r13)
; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload
@@ -1548,9 +1544,10 @@ define <4 x double> @constrained_vector_pow_v4f64() #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: ldr %f2, %f8
; S390X-NEXT: brasl %r14, pow@PLT
-; S390X-NEXT: ldr %f2, %f11
-; S390X-NEXT: ldr %f4, %f10
-; S390X-NEXT: ldr %f6, %f9
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f9
+; S390X-NEXT: ldr %f2, %f10
+; S390X-NEXT: ldr %f4, %f11
; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload
@@ -1670,7 +1667,8 @@ define <2 x double> @constrained_vector_powi_v2f64() #0 {
; S390X-NEXT: lghi %r2, 3
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, __powidf2@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -1734,8 +1732,9 @@ define <3 x float> @constrained_vector_powi_v3f32() #0 {
; S390X-NEXT: lghi %r2, 3
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, __powisf2@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -1898,9 +1897,10 @@ define <4 x double> @constrained_vector_powi_v4f64() #0 {
; S390X-NEXT: lghi %r2, 3
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, __powidf2@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -2001,14 +2001,15 @@ define <2 x double> @constrained_vector_sin_v2f64() #0 {
; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
; S390X-NEXT: .cfi_offset %f8, -168
; S390X-NEXT: larl %r1, .LCPI41_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, sin@PLT
; S390X-NEXT: larl %r1, .LCPI41_1
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, sin@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -2066,8 +2067,9 @@ define <3 x float> @constrained_vector_sin_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, sinf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -2125,8 +2127,8 @@ define void @constrained_vector_sin_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, sin@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -2135,9 +2137,9 @@ define void @constrained_vector_sin_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, sin@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -2203,7 +2205,7 @@ define <4 x double> @constrained_vector_sin_v4f64() #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: larl %r1, .LCPI44_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, sin@PLT
; S390X-NEXT: larl %r1, .LCPI44_1
; S390X-NEXT: ld %f1, 0(%r1)
@@ -2216,13 +2218,14 @@ define <4 x double> @constrained_vector_sin_v4f64() #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, sin@PLT
; S390X-NEXT: larl %r1, .LCPI44_3
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f10, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, sin@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -2318,14 +2321,15 @@ define <2 x double> @constrained_vector_cos_v2f64() #0 {
; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
; S390X-NEXT: .cfi_offset %f8, -168
; S390X-NEXT: larl %r1, .LCPI46_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, cos@PLT
; S390X-NEXT: larl %r1, .LCPI46_1
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, cos@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -2383,8 +2387,9 @@ define <3 x float> @constrained_vector_cos_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, cosf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -2442,8 +2447,8 @@ define void @constrained_vector_cos_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, cos@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -2452,9 +2457,9 @@ define void @constrained_vector_cos_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, cos@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -2520,7 +2525,7 @@ define <4 x double> @constrained_vector_cos_v4f64() #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: larl %r1, .LCPI49_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, cos@PLT
; S390X-NEXT: larl %r1, .LCPI49_1
; S390X-NEXT: ld %f1, 0(%r1)
@@ -2533,13 +2538,14 @@ define <4 x double> @constrained_vector_cos_v4f64() #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, cos@PLT
; S390X-NEXT: larl %r1, .LCPI49_3
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f10, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, cos@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -2635,14 +2641,15 @@ define <2 x double> @constrained_vector_exp_v2f64() #0 {
; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
; S390X-NEXT: .cfi_offset %f8, -168
; S390X-NEXT: larl %r1, .LCPI51_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, exp@PLT
; S390X-NEXT: larl %r1, .LCPI51_1
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, exp@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -2700,8 +2707,9 @@ define <3 x float> @constrained_vector_exp_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, expf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -2759,8 +2767,8 @@ define void @constrained_vector_exp_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, exp@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -2769,9 +2777,9 @@ define void @constrained_vector_exp_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, exp@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -2837,7 +2845,7 @@ define <4 x double> @constrained_vector_exp_v4f64() #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: larl %r1, .LCPI54_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, exp@PLT
; S390X-NEXT: larl %r1, .LCPI54_1
; S390X-NEXT: ld %f1, 0(%r1)
@@ -2850,13 +2858,14 @@ define <4 x double> @constrained_vector_exp_v4f64() #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, exp@PLT
; S390X-NEXT: larl %r1, .LCPI54_3
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f10, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, exp@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -2952,14 +2961,15 @@ define <2 x double> @constrained_vector_exp2_v2f64() #0 {
; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
; S390X-NEXT: .cfi_offset %f8, -168
; S390X-NEXT: larl %r1, .LCPI56_0
-; S390X-NEXT: ldeb %f0, 0(%r1)
+; S390X-NEXT: ld %f0, 0(%r1)
; S390X-NEXT: brasl %r14, exp2@PLT
; S390X-NEXT: larl %r1, .LCPI56_1
-; S390X-NEXT: ld %f1, 0(%r1)
+; S390X-NEXT: ldeb %f1, 0(%r1)
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, exp2@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -3017,8 +3027,9 @@ define <3 x float> @constrained_vector_exp2_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, exp2f@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -3076,8 +3087,8 @@ define void @constrained_vector_exp2_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, exp2@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -3086,9 +3097,9 @@ define void @constrained_vector_exp2_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, exp2@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -3171,9 +3182,10 @@ define <4 x double> @constrained_vector_exp2_v4f64() #0 {
; S390X-NEXT: ldr %f10, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, exp2@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -3269,14 +3281,15 @@ define <2 x double> @constrained_vector_log_v2f64() #0 {
; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
; S390X-NEXT: .cfi_offset %f8, -168
; S390X-NEXT: larl %r1, .LCPI61_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, log@PLT
; S390X-NEXT: larl %r1, .LCPI61_1
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, log@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -3334,8 +3347,9 @@ define <3 x float> @constrained_vector_log_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, logf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -3393,8 +3407,8 @@ define void @constrained_vector_log_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, log@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -3403,9 +3417,9 @@ define void @constrained_vector_log_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, log@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -3471,7 +3485,7 @@ define <4 x double> @constrained_vector_log_v4f64() #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: larl %r1, .LCPI64_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, log@PLT
; S390X-NEXT: larl %r1, .LCPI64_1
; S390X-NEXT: ld %f1, 0(%r1)
@@ -3484,13 +3498,14 @@ define <4 x double> @constrained_vector_log_v4f64() #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, log@PLT
; S390X-NEXT: larl %r1, .LCPI64_3
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f10, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, log@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -3586,14 +3601,15 @@ define <2 x double> @constrained_vector_log10_v2f64() #0 {
; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
; S390X-NEXT: .cfi_offset %f8, -168
; S390X-NEXT: larl %r1, .LCPI66_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, log10@PLT
; S390X-NEXT: larl %r1, .LCPI66_1
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, log10@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -3651,8 +3667,9 @@ define <3 x float> @constrained_vector_log10_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, log10f@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -3710,8 +3727,8 @@ define void @constrained_vector_log10_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, log10@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -3720,9 +3737,9 @@ define void @constrained_vector_log10_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, log10@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -3788,7 +3805,7 @@ define <4 x double> @constrained_vector_log10_v4f64() #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: larl %r1, .LCPI69_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, log10@PLT
; S390X-NEXT: larl %r1, .LCPI69_1
; S390X-NEXT: ld %f1, 0(%r1)
@@ -3801,13 +3818,14 @@ define <4 x double> @constrained_vector_log10_v4f64() #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, log10@PLT
; S390X-NEXT: larl %r1, .LCPI69_3
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f10, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, log10@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -3903,14 +3921,15 @@ define <2 x double> @constrained_vector_log2_v2f64() #0 {
; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
; S390X-NEXT: .cfi_offset %f8, -168
; S390X-NEXT: larl %r1, .LCPI71_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, log2@PLT
; S390X-NEXT: larl %r1, .LCPI71_1
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, log2@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -3968,8 +3987,9 @@ define <3 x float> @constrained_vector_log2_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, log2f@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -4027,8 +4047,8 @@ define void @constrained_vector_log2_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, log2@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -4037,9 +4057,9 @@ define void @constrained_vector_log2_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, log2@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -4105,7 +4125,7 @@ define <4 x double> @constrained_vector_log2_v4f64() #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: larl %r1, .LCPI74_0
-; S390X-NEXT: ld %f0, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: brasl %r14, log2@PLT
; S390X-NEXT: larl %r1, .LCPI74_1
; S390X-NEXT: ld %f1, 0(%r1)
@@ -4118,13 +4138,14 @@ define <4 x double> @constrained_vector_log2_v4f64() #0 {
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, log2@PLT
; S390X-NEXT: larl %r1, .LCPI74_3
-; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: ldr %f10, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, log2@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -4200,11 +4221,11 @@ define <2 x double> @constrained_vector_rint_v2f64() #0 {
; S390X-LABEL: constrained_vector_rint_v2f64:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI76_0
-; S390X-NEXT: ldeb %f0, 0(%r1)
+; S390X-NEXT: ld %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI76_1
-; S390X-NEXT: ld %f1, 0(%r1)
-; S390X-NEXT: fidbr %f2, 0, %f0
-; S390X-NEXT: fidbr %f0, 0, %f1
+; S390X-NEXT: ldeb %f1, 0(%r1)
+; S390X-NEXT: fidbr %f0, 0, %f0
+; S390X-NEXT: fidbr %f2, 0, %f1
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_rint_v2f64:
@@ -4230,9 +4251,9 @@ define <3 x float> @constrained_vector_rint_v3f32() #0 {
; S390X-NEXT: le %f1, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI77_2
; S390X-NEXT: le %f3, 0(%r1)
-; S390X-NEXT: fiebr %f4, 0, %f0
+; S390X-NEXT: fiebr %f0, 0, %f0
; S390X-NEXT: fiebr %f2, 0, %f1
-; S390X-NEXT: fiebr %f0, 0, %f3
+; S390X-NEXT: fiebr %f4, 0, %f3
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_rint_v3f32:
@@ -4261,25 +4282,25 @@ define <3 x float> @constrained_vector_rint_v3f32() #0 {
define void @constrained_vector_rint_v3f64(<3 x double>* %a) #0 {
; S390X-LABEL: constrained_vector_rint_v3f64:
; S390X: # %bb.0: # %entry
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f1, 8(%r2)
-; S390X-NEXT: ld %f2, 16(%r2)
+; S390X-NEXT: ld %f2, 0(%r2)
; S390X-NEXT: fidbr %f0, 0, %f0
; S390X-NEXT: fidbr %f1, 0, %f1
; S390X-NEXT: fidbr %f2, 0, %f2
-; S390X-NEXT: std %f2, 16(%r2)
+; S390X-NEXT: std %f2, 0(%r2)
; S390X-NEXT: std %f1, 8(%r2)
-; S390X-NEXT: std %f0, 0(%r2)
+; S390X-NEXT: std %f0, 16(%r2)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_rint_v3f64:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: vl %v1, 0(%r2), 4
-; SZ13-NEXT: ld %f0, 16(%r2)
-; SZ13-NEXT: vfidb %v1, %v1, 0, 0
-; SZ13-NEXT: fidbra %f0, 0, %f0, 0
-; SZ13-NEXT: std %f0, 16(%r2)
-; SZ13-NEXT: vst %v1, 0(%r2), 4
+; SZ13-NEXT: vl %v0, 0(%r2), 4
+; SZ13-NEXT: ld %f1, 16(%r2)
+; SZ13-NEXT: vfidb %v0, %v0, 0, 0
+; SZ13-NEXT: fidbra %f1, 0, %f1, 0
+; SZ13-NEXT: vst %v0, 0(%r2), 4
+; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, <3 x double>* %a
@@ -4299,13 +4320,13 @@ define <4 x double> @constrained_vector_rint_v4f64() #0 {
; S390X-NEXT: larl %r1, .LCPI79_1
; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI79_2
-; S390X-NEXT: ld %f2, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI79_3
; S390X-NEXT: ld %f3, 0(%r1)
-; S390X-NEXT: fidbr %f6, 0, %f0
-; S390X-NEXT: fidbr %f4, 0, %f1
-; S390X-NEXT: fidbr %f2, 0, %f2
-; S390X-NEXT: fidbr %f0, 0, %f3
+; S390X-NEXT: larl %r1, .LCPI79_3
+; S390X-NEXT: ld %f5, 0(%r1)
+; S390X-NEXT: fidbr %f0, 0, %f0
+; S390X-NEXT: fidbr %f2, 0, %f1
+; S390X-NEXT: fidbr %f4, 0, %f3
+; S390X-NEXT: fidbr %f6, 0, %f5
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_rint_v4f64:
@@ -4366,14 +4387,15 @@ define <2 x double> @constrained_vector_nearbyint_v2f64() #0 {
; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
; S390X-NEXT: .cfi_offset %f8, -168
; S390X-NEXT: larl %r1, .LCPI81_0
-; S390X-NEXT: ldeb %f0, 0(%r1)
+; S390X-NEXT: ld %f0, 0(%r1)
; S390X-NEXT: brasl %r14, nearbyint@PLT
; S390X-NEXT: larl %r1, .LCPI81_1
-; S390X-NEXT: ld %f1, 0(%r1)
+; S390X-NEXT: ldeb %f1, 0(%r1)
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, nearbyint@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -4417,8 +4439,9 @@ define <3 x float> @constrained_vector_nearbyint_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, nearbyintf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -4463,8 +4486,8 @@ define void @constrained_vector_nearbyint_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, nearbyint@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -4473,9 +4496,9 @@ define void @constrained_vector_nearbyint_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, nearbyint@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -4484,12 +4507,12 @@ define void @constrained_vector_nearbyint_v3f64(<3 x double>* %a) #0 {
;
; SZ13-LABEL: constrained_vector_nearbyint_v3f64:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: vl %v1, 0(%r2), 4
-; SZ13-NEXT: ld %f0, 16(%r2)
-; SZ13-NEXT: vfidb %v1, %v1, 4, 0
-; SZ13-NEXT: fidbra %f0, 0, %f0, 4
-; SZ13-NEXT: std %f0, 16(%r2)
-; SZ13-NEXT: vst %v1, 0(%r2), 4
+; SZ13-NEXT: vl %v0, 0(%r2), 4
+; SZ13-NEXT: ld %f1, 16(%r2)
+; SZ13-NEXT: vfidb %v0, %v0, 4, 0
+; SZ13-NEXT: fidbra %f1, 0, %f1, 4
+; SZ13-NEXT: vst %v0, 0(%r2), 4
+; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, <3 x double>* %a
@@ -4533,9 +4556,10 @@ define <4 x double> @constrained_vector_nearbyint_v4f64() #0 {
; S390X-NEXT: ldr %f10, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, nearbyint@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -4622,7 +4646,8 @@ define <2 x double> @constrained_vector_maxnum_v2f64() #0 {
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, fmax@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -4675,10 +4700,10 @@ define <3 x float> @constrained_vector_maxnum_v3f32() #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: larl %r1, .LCPI87_0
-; S390X-NEXT: le %f0, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI87_1
; S390X-NEXT: le %f8, 0(%r1)
-; S390X-NEXT: ler %f2, %f8
+; S390X-NEXT: larl %r1, .LCPI87_1
+; S390X-NEXT: le %f2, 0(%r1)
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: brasl %r14, fmaxf@PLT
; S390X-NEXT: larl %r1, .LCPI87_2
; S390X-NEXT: le %f1, 0(%r1)
@@ -4688,12 +4713,14 @@ define <3 x float> @constrained_vector_maxnum_v3f32() #0 {
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, fmaxf@PLT
; S390X-NEXT: larl %r1, .LCPI87_4
-; S390X-NEXT: le %f2, 0(%r1)
+; S390X-NEXT: le %f1, 0(%r1)
; S390X-NEXT: ler %f10, %f0
-; S390X-NEXT: ler %f0, %f8
+; S390X-NEXT: ler %f0, %f1
+; S390X-NEXT: ler %f2, %f8
; S390X-NEXT: brasl %r14, fmaxf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f9
; S390X-NEXT: ler %f2, %f10
-; S390X-NEXT: ler %f4, %f9
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -4762,8 +4789,8 @@ define void @constrained_vector_log10_maxnum_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: larl %r1, .LCPI88_0
; S390X-NEXT: ldeb %f2, 0(%r1)
; S390X-NEXT: ld %f9, 8(%r2)
@@ -4778,9 +4805,9 @@ define void @constrained_vector_log10_maxnum_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, fmax@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -4878,9 +4905,10 @@ define <4 x double> @constrained_vector_maxnum_v4f64() #0 {
; S390X-NEXT: ldr %f10, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, fmax@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -5001,7 +5029,8 @@ define <2 x double> @constrained_vector_minnum_v2f64() #0 {
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, fmin@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -5054,10 +5083,10 @@ define <3 x float> @constrained_vector_minnum_v3f32() #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: larl %r1, .LCPI92_0
-; S390X-NEXT: le %f0, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI92_1
; S390X-NEXT: le %f8, 0(%r1)
-; S390X-NEXT: ler %f2, %f8
+; S390X-NEXT: larl %r1, .LCPI92_1
+; S390X-NEXT: le %f2, 0(%r1)
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: brasl %r14, fminf@PLT
; S390X-NEXT: larl %r1, .LCPI92_2
; S390X-NEXT: le %f1, 0(%r1)
@@ -5067,12 +5096,14 @@ define <3 x float> @constrained_vector_minnum_v3f32() #0 {
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, fminf@PLT
; S390X-NEXT: larl %r1, .LCPI92_4
-; S390X-NEXT: le %f2, 0(%r1)
+; S390X-NEXT: le %f1, 0(%r1)
; S390X-NEXT: ler %f10, %f0
-; S390X-NEXT: ler %f0, %f8
+; S390X-NEXT: ler %f0, %f1
+; S390X-NEXT: ler %f2, %f8
; S390X-NEXT: brasl %r14, fminf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f9
; S390X-NEXT: ler %f2, %f10
-; S390X-NEXT: ler %f4, %f9
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -5143,8 +5174,8 @@ define void @constrained_vector_minnum_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: .cfi_offset %f11, -192
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: larl %r1, .LCPI93_0
; S390X-NEXT: ldeb %f9, 0(%r1)
; S390X-NEXT: ld %f10, 8(%r2)
@@ -5158,9 +5189,9 @@ define void @constrained_vector_minnum_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ldr %f2, %f9
; S390X-NEXT: brasl %r14, fmin@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f10, 8(%r13)
-; S390X-NEXT: std %f11, 0(%r13)
+; S390X-NEXT: std %f11, 16(%r13)
; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload
@@ -5261,9 +5292,10 @@ define <4 x double> @constrained_vector_minnum_v4f64() #0 {
; S390X-NEXT: ldr %f10, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, fmin@PLT
-; S390X-NEXT: ldr %f2, %f10
-; S390X-NEXT: ldr %f4, %f9
-; S390X-NEXT: ldr %f6, %f8
+; S390X-NEXT: ldr %f6, %f0
+; S390X-NEXT: ldr %f0, %f8
+; S390X-NEXT: ldr %f2, %f9
+; S390X-NEXT: ldr %f4, %f10
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -5351,8 +5383,8 @@ define <2 x float> @constrained_vector_fptrunc_v2f64() #0 {
; S390X-NEXT: ld %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI96_1
; S390X-NEXT: ld %f1, 0(%r1)
-; S390X-NEXT: ledbr %f2, %f0
-; S390X-NEXT: ledbr %f0, %f1
+; S390X-NEXT: ledbr %f0, %f0
+; S390X-NEXT: ledbr %f2, %f1
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fptrunc_v2f64:
@@ -5423,13 +5455,13 @@ define <4 x float> @constrained_vector_fptrunc_v4f64() #0 {
; S390X-NEXT: larl %r1, .LCPI98_1
; S390X-NEXT: ld %f1, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI98_2
-; S390X-NEXT: ld %f2, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI98_3
; S390X-NEXT: ld %f3, 0(%r1)
-; S390X-NEXT: ledbr %f6, %f0
-; S390X-NEXT: ledbr %f4, %f1
-; S390X-NEXT: ledbr %f2, %f2
-; S390X-NEXT: ledbr %f0, %f3
+; S390X-NEXT: larl %r1, .LCPI98_3
+; S390X-NEXT: ld %f5, 0(%r1)
+; S390X-NEXT: ledbr %f0, %f0
+; S390X-NEXT: ledbr %f2, %f1
+; S390X-NEXT: ledbr %f4, %f3
+; S390X-NEXT: ledbr %f6, %f5
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fptrunc_v4f64:
@@ -5483,9 +5515,9 @@ define <2 x double> @constrained_vector_fpext_v2f32() #0 {
; S390X-LABEL: constrained_vector_fpext_v2f32:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI100_0
-; S390X-NEXT: ldeb %f2, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI100_1
; S390X-NEXT: ldeb %f0, 0(%r1)
+; S390X-NEXT: larl %r1, .LCPI100_1
+; S390X-NEXT: ldeb %f2, 0(%r1)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fpext_v2f32:
@@ -5510,13 +5542,13 @@ define void @constrained_vector_fpext_v3f64(<3 x float>* %src, <3 x double>* %de
; S390X-NEXT: sllg %r1, %r0, 32
; S390X-NEXT: ldgr %f0, %r1
; S390X-NEXT: nilf %r0, 0
-; S390X-NEXT: ldeb %f1, 8(%r2)
-; S390X-NEXT: ldgr %f2, %r0
-; S390X-NEXT: ldebr %f2, %f2
+; S390X-NEXT: ldgr %f1, %r0
+; S390X-NEXT: ldeb %f2, 8(%r2)
+; S390X-NEXT: ldebr %f1, %f1
; S390X-NEXT: ldebr %f0, %f0
-; S390X-NEXT: std %f1, 16(%r3)
; S390X-NEXT: std %f0, 8(%r3)
-; S390X-NEXT: std %f2, 0(%r3)
+; S390X-NEXT: std %f2, 16(%r3)
+; S390X-NEXT: std %f1, 0(%r3)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fpext_v3f64:
@@ -5544,13 +5576,13 @@ define <4 x double> @constrained_vector_fpext_v4f32() #0 {
; S390X-LABEL: constrained_vector_fpext_v4f32:
; S390X: # %bb.0: # %entry
; S390X-NEXT: larl %r1, .LCPI102_0
-; S390X-NEXT: ldeb %f6, 0(%r1)
+; S390X-NEXT: ldeb %f0, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI102_1
-; S390X-NEXT: ldeb %f4, 0(%r1)
-; S390X-NEXT: larl %r1, .LCPI102_2
; S390X-NEXT: ldeb %f2, 0(%r1)
+; S390X-NEXT: larl %r1, .LCPI102_2
+; S390X-NEXT: ldeb %f4, 0(%r1)
; S390X-NEXT: larl %r1, .LCPI102_3
-; S390X-NEXT: ldeb %f0, 0(%r1)
+; S390X-NEXT: ldeb %f6, 0(%r1)
; S390X-NEXT: br %r14
;
; SZ13-LABEL: constrained_vector_fpext_v4f32:
@@ -5620,7 +5652,8 @@ define <2 x double> @constrained_vector_ceil_v2f64() #0 {
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, ceil@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -5664,8 +5697,9 @@ define <3 x float> @constrained_vector_ceil_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, ceilf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -5709,8 +5743,8 @@ define void @constrained_vector_ceil_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, ceil@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -5719,9 +5753,9 @@ define void @constrained_vector_ceil_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, ceil@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -5730,12 +5764,12 @@ define void @constrained_vector_ceil_v3f64(<3 x double>* %a) #0 {
;
; SZ13-LABEL: constrained_vector_ceil_v3f64:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: vl %v1, 0(%r2), 4
-; SZ13-NEXT: ld %f0, 16(%r2)
-; SZ13-NEXT: vfidb %v1, %v1, 4, 6
-; SZ13-NEXT: fidbra %f0, 6, %f0, 4
-; SZ13-NEXT: std %f0, 16(%r2)
-; SZ13-NEXT: vst %v1, 0(%r2), 4
+; SZ13-NEXT: vl %v0, 0(%r2), 4
+; SZ13-NEXT: ld %f1, 16(%r2)
+; SZ13-NEXT: vfidb %v0, %v0, 4, 6
+; SZ13-NEXT: fidbra %f1, 6, %f1, 4
+; SZ13-NEXT: vst %v0, 0(%r2), 4
+; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, <3 x double>* %a
@@ -5794,7 +5828,8 @@ define <2 x double> @constrained_vector_floor_v2f64() #0 {
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, floor@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -5838,8 +5873,9 @@ define <3 x float> @constrained_vector_floor_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, floorf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -5883,8 +5919,8 @@ define void @constrained_vector_floor_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, floor@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -5893,9 +5929,9 @@ define void @constrained_vector_floor_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, floor@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -5904,12 +5940,12 @@ define void @constrained_vector_floor_v3f64(<3 x double>* %a) #0 {
;
; SZ13-LABEL: constrained_vector_floor_v3f64:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: vl %v1, 0(%r2), 4
-; SZ13-NEXT: ld %f0, 16(%r2)
-; SZ13-NEXT: vfidb %v1, %v1, 4, 7
-; SZ13-NEXT: fidbra %f0, 7, %f0, 4
-; SZ13-NEXT: std %f0, 16(%r2)
-; SZ13-NEXT: vst %v1, 0(%r2), 4
+; SZ13-NEXT: vl %v0, 0(%r2), 4
+; SZ13-NEXT: ld %f1, 16(%r2)
+; SZ13-NEXT: vfidb %v0, %v0, 4, 7
+; SZ13-NEXT: fidbra %f1, 7, %f1, 4
+; SZ13-NEXT: vst %v0, 0(%r2), 4
+; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, <3 x double>* %a
@@ -5967,7 +6003,8 @@ define <2 x double> @constrained_vector_round_v2f64() #0 {
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, round@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -6011,8 +6048,9 @@ define <3 x float> @constrained_vector_round_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, roundf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -6057,8 +6095,8 @@ define void @constrained_vector_round_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, round@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -6067,9 +6105,9 @@ define void @constrained_vector_round_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, round@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -6078,12 +6116,12 @@ define void @constrained_vector_round_v3f64(<3 x double>* %a) #0 {
;
; SZ13-LABEL: constrained_vector_round_v3f64:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: vl %v1, 0(%r2), 4
-; SZ13-NEXT: ld %f0, 16(%r2)
-; SZ13-NEXT: vfidb %v1, %v1, 4, 1
-; SZ13-NEXT: fidbra %f0, 1, %f0, 4
-; SZ13-NEXT: std %f0, 16(%r2)
-; SZ13-NEXT: vst %v1, 0(%r2), 4
+; SZ13-NEXT: vl %v0, 0(%r2), 4
+; SZ13-NEXT: ld %f1, 16(%r2)
+; SZ13-NEXT: vfidb %v0, %v0, 4, 1
+; SZ13-NEXT: fidbra %f1, 1, %f1, 4
+; SZ13-NEXT: vst %v0, 0(%r2), 4
+; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, <3 x double>* %a
@@ -6141,7 +6179,8 @@ define <2 x double> @constrained_vector_trunc_v2f64() #0 {
; S390X-NEXT: ldr %f8, %f0
; S390X-NEXT: ldr %f0, %f1
; S390X-NEXT: brasl %r14, trunc@PLT
-; S390X-NEXT: ldr %f2, %f8
+; S390X-NEXT: ldr %f2, %f0
+; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 280(%r15)
; S390X-NEXT: br %r14
@@ -6185,8 +6224,9 @@ define <3 x float> @constrained_vector_trunc_v3f32() #0 {
; S390X-NEXT: ler %f9, %f0
; S390X-NEXT: ler %f0, %f1
; S390X-NEXT: brasl %r14, truncf@PLT
+; S390X-NEXT: ler %f4, %f0
+; S390X-NEXT: ler %f0, %f8
; S390X-NEXT: ler %f2, %f9
-; S390X-NEXT: ler %f4, %f8
; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; S390X-NEXT: lmg %r14, %r15, 288(%r15)
@@ -6230,8 +6270,8 @@ define void @constrained_vector_trunc_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: .cfi_offset %f9, -176
; S390X-NEXT: .cfi_offset %f10, -184
; S390X-NEXT: lgr %r13, %r2
-; S390X-NEXT: ld %f8, 16(%r2)
-; S390X-NEXT: ld %f0, 0(%r2)
+; S390X-NEXT: ld %f8, 0(%r2)
+; S390X-NEXT: ld %f0, 16(%r2)
; S390X-NEXT: ld %f9, 8(%r2)
; S390X-NEXT: brasl %r14, trunc@PLT
; S390X-NEXT: ldr %f10, %f0
@@ -6240,9 +6280,9 @@ define void @constrained_vector_trunc_v3f64(<3 x double>* %a) #0 {
; S390X-NEXT: ldr %f9, %f0
; S390X-NEXT: ldr %f0, %f8
; S390X-NEXT: brasl %r14, trunc@PLT
-; S390X-NEXT: std %f0, 16(%r13)
+; S390X-NEXT: std %f0, 0(%r13)
; S390X-NEXT: std %f9, 8(%r13)
-; S390X-NEXT: std %f10, 0(%r13)
+; S390X-NEXT: std %f10, 16(%r13)
; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
@@ -6251,12 +6291,12 @@ define void @constrained_vector_trunc_v3f64(<3 x double>* %a) #0 {
;
; SZ13-LABEL: constrained_vector_trunc_v3f64:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: vl %v1, 0(%r2), 4
-; SZ13-NEXT: ld %f0, 16(%r2)
-; SZ13-NEXT: vfidb %v1, %v1, 4, 5
-; SZ13-NEXT: fidbra %f0, 5, %f0, 4
-; SZ13-NEXT: std %f0, 16(%r2)
-; SZ13-NEXT: vst %v1, 0(%r2), 4
+; SZ13-NEXT: vl %v0, 0(%r2), 4
+; SZ13-NEXT: ld %f1, 16(%r2)
+; SZ13-NEXT: vfidb %v0, %v0, 4, 5
+; SZ13-NEXT: fidbra %f1, 5, %f1, 4
+; SZ13-NEXT: vst %v0, 0(%r2), 4
+; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, <3 x double>* %a
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index 011d235c39f..6f7551a59ee 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -1104,10 +1104,10 @@ define i128 @f20s128(double %x) nounwind strictfp {
; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X87-NEXT: movl {{[0-9]+}}(%esp), %edx
; X87-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X87-NEXT: movl %edi, 8(%esi)
-; X87-NEXT: movl %edx, 12(%esi)
-; X87-NEXT: movl %eax, (%esi)
+; X87-NEXT: movl %edi, 12(%esi)
+; X87-NEXT: movl %edx, 8(%esi)
; X87-NEXT: movl %ecx, 4(%esi)
+; X87-NEXT: movl %eax, (%esi)
; X87-NEXT: movl %esi, %eax
; X87-NEXT: addl $36, %esp
; X87-NEXT: popl %esi
@@ -1130,10 +1130,10 @@ define i128 @f20s128(double %x) nounwind strictfp {
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT: movl %edi, 8(%esi)
-; X86-SSE-NEXT: movl %edx, 12(%esi)
-; X86-SSE-NEXT: movl %eax, (%esi)
+; X86-SSE-NEXT: movl %edi, 12(%esi)
+; X86-SSE-NEXT: movl %edx, 8(%esi)
; X86-SSE-NEXT: movl %ecx, 4(%esi)
+; X86-SSE-NEXT: movl %eax, (%esi)
; X86-SSE-NEXT: movl %esi, %eax
; X86-SSE-NEXT: addl $36, %esp
; X86-SSE-NEXT: popl %esi
@@ -1444,10 +1444,10 @@ define i128 @f20u128(double %x) nounwind strictfp {
; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X87-NEXT: movl {{[0-9]+}}(%esp), %edx
; X87-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X87-NEXT: movl %edi, 8(%esi)
-; X87-NEXT: movl %edx, 12(%esi)
-; X87-NEXT: movl %eax, (%esi)
+; X87-NEXT: movl %edi, 12(%esi)
+; X87-NEXT: movl %edx, 8(%esi)
; X87-NEXT: movl %ecx, 4(%esi)
+; X87-NEXT: movl %eax, (%esi)
; X87-NEXT: movl %esi, %eax
; X87-NEXT: addl $36, %esp
; X87-NEXT: popl %esi
@@ -1470,10 +1470,10 @@ define i128 @f20u128(double %x) nounwind strictfp {
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT: movl %edi, 8(%esi)
-; X86-SSE-NEXT: movl %edx, 12(%esi)
-; X86-SSE-NEXT: movl %eax, (%esi)
+; X86-SSE-NEXT: movl %edi, 12(%esi)
+; X86-SSE-NEXT: movl %edx, 8(%esi)
; X86-SSE-NEXT: movl %ecx, 4(%esi)
+; X86-SSE-NEXT: movl %eax, (%esi)
; X86-SSE-NEXT: movl %esi, %eax
; X86-SSE-NEXT: addl $36, %esp
; X86-SSE-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
index 2173ff369a9..e5fbe07334b 100644
--- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
@@ -396,10 +396,10 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -535,10 +535,10 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index 05b129ceeea..a02e9280c6c 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -42,10 +42,10 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -87,10 +87,10 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -132,10 +132,10 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -177,10 +177,10 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -226,10 +226,10 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -271,10 +271,10 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -312,10 +312,10 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -353,10 +353,10 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -394,10 +394,10 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -435,10 +435,10 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -476,10 +476,10 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -517,10 +517,10 @@ define fp128 @log(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -558,10 +558,10 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -599,10 +599,10 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -644,10 +644,10 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -689,10 +689,10 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -730,10 +730,10 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -775,10 +775,10 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -817,10 +817,10 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -858,10 +858,10 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -899,10 +899,10 @@ define fp128 @round(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -940,10 +940,10 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -981,10 +981,10 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
@@ -1022,10 +1022,10 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $20, %esp
; X86-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index e9b5e82d8de..9ba7074dc46 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -292,9 +292,9 @@ define <3 x double> @constrained_vector_frem_v3f64() #0 {
; CHECK-NEXT: callq fmod
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -1077,9 +1077,9 @@ define <3 x double> @constrained_vector_pow_v3f64() #0 {
; CHECK-NEXT: callq pow
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -1333,9 +1333,9 @@ define <3 x double> @constrained_vector_powi_v3f64() #0 {
; CHECK-NEXT: callq __powidf2
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -1570,9 +1570,9 @@ define <3 x double> @constrained_vector_sin_v3f64() #0 {
; CHECK-NEXT: callq sin
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -1794,9 +1794,9 @@ define <3 x double> @constrained_vector_cos_v3f64() #0 {
; CHECK-NEXT: callq cos
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -2018,9 +2018,9 @@ define <3 x double> @constrained_vector_exp_v3f64() #0 {
; CHECK-NEXT: callq exp
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -2242,9 +2242,9 @@ define <3 x double> @constrained_vector_exp2_v3f64() #0 {
; CHECK-NEXT: callq exp2
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -2466,9 +2466,9 @@ define <3 x double> @constrained_vector_log_v3f64() #0 {
; CHECK-NEXT: callq log
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -2690,9 +2690,9 @@ define <3 x double> @constrained_vector_log10_v3f64() #0 {
; CHECK-NEXT: callq log10
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -2914,9 +2914,9 @@ define <3 x double> @constrained_vector_log2_v3f64() #0 {
; CHECK-NEXT: callq log2
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -3116,9 +3116,9 @@ define <3 x double> @constrained_vector_rint_v3f64() #0 {
; CHECK-NEXT: callq rint
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -3286,9 +3286,9 @@ define <3 x double> @constrained_vector_nearby_v3f64() #0 {
; CHECK-NEXT: callq nearbyint
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -3495,9 +3495,9 @@ define <3 x double> @constrained_vector_max_v3f64() #0 {
; CHECK-NEXT: callq fmax
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -3750,9 +3750,9 @@ define <3 x double> @constrained_vector_min_v3f64() #0 {
; CHECK-NEXT: callq fmin
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -5237,9 +5237,9 @@ define <3 x double> @constrained_vector_ceil_v3f64() #0 {
; CHECK-NEXT: callq ceil
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -5369,9 +5369,9 @@ define <3 x double> @constrained_vector_floor_v3f64() #0 {
; CHECK-NEXT: callq floor
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -5523,9 +5523,9 @@ define <3 x double> @constrained_vector_round_v3f64() #0 {
; CHECK-NEXT: callq round
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -5667,9 +5667,9 @@ define <3 x double> @constrained_vector_trunc_v3f64() #0 {
; CHECK-NEXT: callq trunc
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
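
For reference, the @constrained_vector_*_v3f64 functions checked in the hunks
above are built from constrained FP intrinsic calls. A minimal LLVM IR sketch
of one such call follows; the function name, the operand %x, and the exact
metadata arguments shown are illustrative, not taken verbatim from the tests.

; Hypothetical example of a strict constrained sin call (sketch only).
declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata)

define double @sin_strict(double %x) #0 {
entry:
  ; Rounding mode and exception behavior are passed as metadata operands.
  %r = call double @llvm.experimental.constrained.sin.f64(double %x,
           metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret double %r
}

attributes #0 = { strictfp }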