diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6888,7 +6888,10 @@
   ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), ValueVTs);
   ValueVTs.push_back(MVT::Other); // Out chain
 
-  SDValue Chain = getRoot();
+  // We do not need to serialize constrained FP intrinsics against
+  // each other or against (nonvolatile) loads, so they can be
+  // chained like loads.
+  SDValue Chain = DAG.getRoot();
   SmallVector<SDValue, 4> Opers;
   Opers.push_back(Chain);
   if (FPI.isUnaryOp()) {
@@ -6926,8 +6929,9 @@
   }
 
   assert(Result.getNode()->getNumValues() == 2);
+  // See above -- chain is handled like for loads here.
   SDValue OutChain = Result.getValue(1);
-  DAG.setRoot(OutChain);
+  PendingLoads.push_back(OutChain);
   SDValue FPResult = Result.getValue(0);
   setValue(&FPI, FPResult);
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -3171,13 +3171,19 @@
     case OPC_CheckFoldableChainNode: {
      assert(NodeStack.size() != 1 && "No parent node");
      // Verify that all intermediate nodes between the root and this one have
-      // a single use.
+      // a single use (ignoring chains, which are handled in UpdateChains).
      bool HasMultipleUses = false;
-      for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i)
-        if (!NodeStack[i].getNode()->hasOneUse()) {
-          HasMultipleUses = true;
-          break;
-        }
+      for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i) {
+        unsigned NNonChainUses = 0;
+        SDNode *NS = NodeStack[i].getNode();
+        for (auto UI = NS->use_begin(), UE = NS->use_end(); UI != UE; ++UI)
+          if (UI.getUse().getValueType() != MVT::Other)
+            if (++NNonChainUses > 1) {
+              HasMultipleUses = true;
+              break;
+            }
+        if (HasMultipleUses) break;
+      }
      if (HasMultipleUses) break;

      // Check to see that the target thinks this is profitable to fold and that
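The two hunks above change how the out-chain of a constrained FP node rejoins the DAG: instead of becoming the new root (which would order every later chained node after it), it is parked on PendingLoads and only folded back into the root when ordering is actually required. A minimal sketch of that merge step, modeled on SelectionDAGBuilder::getRoot() as it looks around this change (simplified for illustration, not the verbatim implementation):

// Sketch: how out-chains parked on PendingLoads rejoin the root chain.
// Every node on PendingLoads hangs off the same incoming root, so constrained
// FP intrinsics chained this way are not ordered against each other or against
// ordinary loads; they are only merged when something needs the full chain.
SDValue SelectionDAGBuilder::getRoot() {
  if (PendingLoads.empty())
    return DAG.getRoot();               // Nothing pending; keep the old root.

  if (PendingLoads.size() == 1) {       // A single pending chain becomes the root.
    SDValue Root = PendingLoads[0];
    DAG.setRoot(Root);
    PendingLoads.clear();
    return Root;
  }

  // Otherwise merge all pending out-chains with one TokenFactor node.
  SDValue Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
                             PendingLoads);
  PendingLoads.clear();
  DAG.setRoot(Root);
  return Root;
}

This is also why the OPC_CheckFoldableChainNode check now counts only non-chain uses: a strict operand such as STRICT_FP_EXTEND carries an extra chain result whose users are fixed up later by UpdateChains, so those uses should not block folding it into the multiply.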
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -467,16 +467,16 @@
 // f64 multiplication of two FP32 registers.
 let Uses = [FPC], mayRaiseFPException = 1 in
   def MDEBR : BinaryRRE<"mdebr", 0xB30C, null_frag, FP64, FP32>;
-def : Pat<(any_fmul (f64 (fpextend FP32:$src1)),
-                    (f64 (fpextend FP32:$src2))),
+def : Pat<(any_fmul (f64 (any_fpextend FP32:$src1)),
+                    (f64 (any_fpextend FP32:$src2))),
           (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                                 FP32:$src1, subreg_h32), FP32:$src2)>;

 // f64 multiplication of an FP32 register and an f32 memory.
 let Uses = [FPC], mayRaiseFPException = 1 in
   def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
-def : Pat<(any_fmul (f64 (fpextend FP32:$src1)),
-                    (f64 (extloadf32 bdxaddr12only:$addr))),
+def : Pat<(any_fmul (f64 (any_fpextend FP32:$src1)),
+                    (f64 (any_extloadf32 bdxaddr12only:$addr))),
           (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                                FP32:$src1, subreg_h32),
                 bdxaddr12only:$addr)>;
@@ -484,8 +484,8 @@
 let Uses = [FPC], mayRaiseFPException = 1 in
   def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>;
 let Predicates = [FeatureNoVectorEnhancements1] in
-  def : Pat<(any_fmul (f128 (fpextend FP64:$src1)),
-                      (f128 (fpextend FP64:$src2))),
+  def : Pat<(any_fmul (f128 (any_fpextend FP64:$src1)),
+                      (f128 (any_fpextend FP64:$src2))),
             (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
                                   FP64:$src1, subreg_h64), FP64:$src2)>;
@@ -493,8 +493,8 @@
 let Uses = [FPC], mayRaiseFPException = 1 in
   def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
 let Predicates = [FeatureNoVectorEnhancements1] in
-  def : Pat<(any_fmul (f128 (fpextend FP64:$src1)),
-                      (f128 (extloadf64 bdxaddr12only:$addr))),
+  def : Pat<(any_fmul (f128 (any_fpextend FP64:$src1)),
+                      (f128 (any_extloadf64 bdxaddr12only:$addr))),
             (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
                                  FP64:$src1, subreg_h64),
                   bdxaddr12only:$addr)>;
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll
--- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll
@@ -1,6 +1,4 @@
 ; Test strict multiplication of two f32s, producing an f64 result.
-; FIXME: We should use llvm.experimental.constrained.fpext, but we currently
-; cannot match a combination of two strict operations in ISel.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s

@@ -8,14 +6,19 @@
 declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
 declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata)

 ; Check register multiplication.
define double @f1(float %f1, float %f2) #0 { ; CHECK-LABEL: f1: ; CHECK: mdebr %f0, %f2 ; CHECK: br %r14 - %f1x = fpext float %f1 to double - %f2x = fpext float %f2 to double + %f1x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f1, + metadata !"fpexcept.strict") #0 + %f2x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f2, + metadata !"fpexcept.strict") #0 %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", @@ -29,8 +32,12 @@ ; CHECK: mdeb %f0, 0(%r2) ; CHECK: br %r14 %f2 = load float, float *%ptr - %f1x = fpext float %f1 to double - %f2x = fpext float %f2 to double + %f1x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f1, + metadata !"fpexcept.strict") #0 + %f2x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f2, + metadata !"fpexcept.strict") #0 %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", @@ -45,8 +52,12 @@ ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %f2 = load float, float *%ptr - %f1x = fpext float %f1 to double - %f2x = fpext float %f2 to double + %f1x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f1, + metadata !"fpexcept.strict") #0 + %f2x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f2, + metadata !"fpexcept.strict") #0 %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", @@ -63,8 +74,12 @@ ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1024 %f2 = load float, float *%ptr - %f1x = fpext float %f1 to double - %f2x = fpext float %f2 to double + %f1x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f1, + metadata !"fpexcept.strict") #0 + %f2x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f2, + metadata !"fpexcept.strict") #0 %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", @@ -80,8 +95,12 @@ ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 -1 %f2 = load float, float *%ptr - %f1x = fpext float %f1 to double - %f2x = fpext float %f2 to double + %f1x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f1, + metadata !"fpexcept.strict") #0 + %f2x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f2, + metadata !"fpexcept.strict") #0 %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", @@ -98,8 +117,12 @@ %ptr1 = getelementptr float, float *%base, i64 %index %ptr2 = getelementptr float, float *%ptr1, i64 100 %f2 = load float, float *%ptr2 - %f1x = fpext float %f1 to double - %f2x = fpext float %f2 to double + %f1x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f1, + metadata !"fpexcept.strict") #0 + %f2x = call double @llvm.experimental.constrained.fpext.f64.f32( + float %f2, + metadata !"fpexcept.strict") #0 %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", @@ -195,8 +218,12 @@ %ret = call float @foo() #0 - %accext0 = fpext float %ret to double - %ext0 = fpext float %frob0 to double + %accext0 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %ret, + metadata !"fpexcept.strict") #0 + %ext0 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %frob0, + metadata !"fpexcept.strict") #0 %mul0 = call double 
@llvm.experimental.constrained.fmul.f64( double %accext0, double %ext0, metadata !"round.dynamic", @@ -210,8 +237,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext1 = fpext float %trunc0 to double - %ext1 = fpext float %frob1 to double + %accext1 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %trunc0, + metadata !"fpexcept.strict") #0 + %ext1 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %frob1, + metadata !"fpexcept.strict") #0 %mul1 = call double @llvm.experimental.constrained.fmul.f64( double %accext1, double %ext1, metadata !"round.dynamic", @@ -225,8 +256,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext2 = fpext float %trunc1 to double - %ext2 = fpext float %frob2 to double + %accext2 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %trunc1, + metadata !"fpexcept.strict") #0 + %ext2 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %frob2, + metadata !"fpexcept.strict") #0 %mul2 = call double @llvm.experimental.constrained.fmul.f64( double %accext2, double %ext2, metadata !"round.dynamic", @@ -240,8 +275,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext3 = fpext float %trunc2 to double - %ext3 = fpext float %frob3 to double + %accext3 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %trunc2, + metadata !"fpexcept.strict") #0 + %ext3 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %frob3, + metadata !"fpexcept.strict") #0 %mul3 = call double @llvm.experimental.constrained.fmul.f64( double %accext3, double %ext3, metadata !"round.dynamic", @@ -255,8 +294,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext4 = fpext float %trunc3 to double - %ext4 = fpext float %frob4 to double + %accext4 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %trunc3, + metadata !"fpexcept.strict") #0 + %ext4 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %frob4, + metadata !"fpexcept.strict") #0 %mul4 = call double @llvm.experimental.constrained.fmul.f64( double %accext4, double %ext4, metadata !"round.dynamic", @@ -270,8 +313,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext5 = fpext float %trunc4 to double - %ext5 = fpext float %frob5 to double + %accext5 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %trunc4, + metadata !"fpexcept.strict") #0 + %ext5 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %frob5, + metadata !"fpexcept.strict") #0 %mul5 = call double @llvm.experimental.constrained.fmul.f64( double %accext5, double %ext5, metadata !"round.dynamic", @@ -285,8 +332,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext6 = fpext float %trunc5 to double - %ext6 = fpext float %frob6 to double + %accext6 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %trunc5, + metadata !"fpexcept.strict") #0 + %ext6 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %frob6, + metadata !"fpexcept.strict") #0 %mul6 = call double @llvm.experimental.constrained.fmul.f64( double %accext6, double %ext6, metadata !"round.dynamic", @@ -300,8 +351,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext7 = fpext float %trunc6 to double - %ext7 = fpext float %frob7 to double + %accext7 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %trunc6, + metadata !"fpexcept.strict") #0 + %ext7 = call double 
@llvm.experimental.constrained.fpext.f64.f32( + float %frob7, + metadata !"fpexcept.strict") #0 %mul7 = call double @llvm.experimental.constrained.fmul.f64( double %accext7, double %ext7, metadata !"round.dynamic", @@ -315,8 +370,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext8 = fpext float %trunc7 to double - %ext8 = fpext float %frob8 to double + %accext8 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %trunc7, + metadata !"fpexcept.strict") #0 + %ext8 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %frob8, + metadata !"fpexcept.strict") #0 %mul8 = call double @llvm.experimental.constrained.fmul.f64( double %accext8, double %ext8, metadata !"round.dynamic", @@ -330,8 +389,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext9 = fpext float %trunc8 to double - %ext9 = fpext float %frob9 to double + %accext9 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %trunc8, + metadata !"fpexcept.strict") #0 + %ext9 = call double @llvm.experimental.constrained.fpext.f64.f32( + float %frob9, + metadata !"fpexcept.strict") #0 %mul9 = call double @llvm.experimental.constrained.fmul.f64( double %accext9, double %ext9, metadata !"round.dynamic", diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll @@ -1,12 +1,11 @@ ; Test strict multiplication of two f64s, producing an f128 result. -; FIXME: We should use llvm.experimental.constrained.fpext, but we currently -; cannot match a combination of two strict operations in ISel. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s declare fp128 @llvm.experimental.constrained.fmul.f128(fp128, fp128, metadata, metadata) declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fptrunc.f64.f128(fp128, metadata, metadata) +declare fp128 @llvm.experimental.constrained.fpext.f128.f64(double, metadata) declare double @foo() @@ -19,8 +18,12 @@ ; CHECK: std %f0, 0(%r2) ; CHECK: std %f2, 8(%r2) ; CHECK: br %r14 - %f1x = fpext double %f1 to fp128 - %f2x = fpext double %f2 to fp128 + %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f1, + metadata !"fpexcept.strict") #0 + %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f2, + metadata !"fpexcept.strict") #0 %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", @@ -37,8 +40,12 @@ ; CHECK: std %f2, 8(%r3) ; CHECK: br %r14 %f2 = load double, double *%ptr - %f1x = fpext double %f1 to fp128 - %f2x = fpext double %f2 to fp128 + %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f1, + metadata !"fpexcept.strict") #0 + %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f2, + metadata !"fpexcept.strict") #0 %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", @@ -56,8 +63,12 @@ ; CHECK: br %r14 %ptr = getelementptr double, double *%base, i64 511 %f2 = load double, double *%ptr - %f1x = fpext double %f1 to fp128 - %f2x = fpext double %f2 to fp128 + %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f1, + metadata !"fpexcept.strict") #0 + %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f2, + metadata !"fpexcept.strict") #0 %res = 
call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", @@ -77,8 +88,12 @@ ; CHECK: br %r14 %ptr = getelementptr double, double *%base, i64 512 %f2 = load double, double *%ptr - %f1x = fpext double %f1 to fp128 - %f2x = fpext double %f2 to fp128 + %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f1, + metadata !"fpexcept.strict") #0 + %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f2, + metadata !"fpexcept.strict") #0 %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", @@ -97,8 +112,12 @@ ; CHECK: br %r14 %ptr = getelementptr double, double *%base, i64 -1 %f2 = load double, double *%ptr - %f1x = fpext double %f1 to fp128 - %f2x = fpext double %f2 to fp128 + %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f1, + metadata !"fpexcept.strict") #0 + %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f2, + metadata !"fpexcept.strict") #0 %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", @@ -118,8 +137,12 @@ %ptr1 = getelementptr double, double *%base, i64 %index %ptr2 = getelementptr double, double *%ptr1, i64 100 %f2 = load double, double *%ptr2 - %f1x = fpext double %f1 to fp128 - %f2x = fpext double %f2 to fp128 + %f1x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f1, + metadata !"fpexcept.strict") #0 + %f2x = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %f2, + metadata !"fpexcept.strict") #0 %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", @@ -216,8 +239,12 @@ %ret = call double @foo() #0 - %accext0 = fpext double %ret to fp128 - %ext0 = fpext double %frob0 to fp128 + %accext0 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %ret, + metadata !"fpexcept.strict") #0 + %ext0 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %frob0, + metadata !"fpexcept.strict") #0 %mul0 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext0, fp128 %ext0, metadata !"round.dynamic", @@ -231,8 +258,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext1 = fpext double %trunc0 to fp128 - %ext1 = fpext double %frob1 to fp128 + %accext1 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %trunc0, + metadata !"fpexcept.strict") #0 + %ext1 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %frob1, + metadata !"fpexcept.strict") #0 %mul1 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext1, fp128 %ext1, metadata !"round.dynamic", @@ -246,8 +277,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext2 = fpext double %trunc1 to fp128 - %ext2 = fpext double %frob2 to fp128 + %accext2 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %trunc1, + metadata !"fpexcept.strict") #0 + %ext2 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %frob2, + metadata !"fpexcept.strict") #0 %mul2 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext2, fp128 %ext2, metadata !"round.dynamic", @@ -261,8 +296,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext3 = fpext double %trunc2 to fp128 - %ext3 = fpext double %frob3 to fp128 + %accext3 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %trunc2, + metadata !"fpexcept.strict") #0 + 
%ext3 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %frob3, + metadata !"fpexcept.strict") #0 %mul3 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext3, fp128 %ext3, metadata !"round.dynamic", @@ -276,8 +315,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext4 = fpext double %trunc3 to fp128 - %ext4 = fpext double %frob4 to fp128 + %accext4 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %trunc3, + metadata !"fpexcept.strict") #0 + %ext4 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %frob4, + metadata !"fpexcept.strict") #0 %mul4 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext4, fp128 %ext4, metadata !"round.dynamic", @@ -291,8 +334,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext5 = fpext double %trunc4 to fp128 - %ext5 = fpext double %frob5 to fp128 + %accext5 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %trunc4, + metadata !"fpexcept.strict") #0 + %ext5 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %frob5, + metadata !"fpexcept.strict") #0 %mul5 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext5, fp128 %ext5, metadata !"round.dynamic", @@ -306,8 +353,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext6 = fpext double %trunc5 to fp128 - %ext6 = fpext double %frob6 to fp128 + %accext6 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %trunc5, + metadata !"fpexcept.strict") #0 + %ext6 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %frob6, + metadata !"fpexcept.strict") #0 %mul6 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext6, fp128 %ext6, metadata !"round.dynamic", @@ -321,8 +372,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext7 = fpext double %trunc6 to fp128 - %ext7 = fpext double %frob7 to fp128 + %accext7 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %trunc6, + metadata !"fpexcept.strict") #0 + %ext7 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %frob7, + metadata !"fpexcept.strict") #0 %mul7 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext7, fp128 %ext7, metadata !"round.dynamic", @@ -336,8 +391,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext8 = fpext double %trunc7 to fp128 - %ext8 = fpext double %frob8 to fp128 + %accext8 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %trunc7, + metadata !"fpexcept.strict") #0 + %ext8 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %frob8, + metadata !"fpexcept.strict") #0 %mul8 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext8, fp128 %ext8, metadata !"round.dynamic", @@ -351,8 +410,12 @@ metadata !"round.dynamic", metadata !"fpexcept.strict") #0 - %accext9 = fpext double %trunc8 to fp128 - %ext9 = fpext double %frob9 to fp128 + %accext9 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %trunc8, + metadata !"fpexcept.strict") #0 + %ext9 = call fp128 @llvm.experimental.constrained.fpext.f128.f64( + double %frob9, + metadata !"fpexcept.strict") #0 %mul9 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext9, fp128 %ext9, metadata !"round.dynamic", diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll --- 
a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll @@ -33,11 +33,11 @@ ; S390X-NEXT: larl %r1, .LCPI1_0 ; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI1_1 -; S390X-NEXT: ldeb %f2, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI1_2 ; S390X-NEXT: ldeb %f0, 0(%r1) -; S390X-NEXT: ddbr %f2, %f1 +; S390X-NEXT: larl %r1, .LCPI1_2 +; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: ddbr %f0, %f1 +; S390X-NEXT: ddbr %f2, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fdiv_v2f64: @@ -63,14 +63,14 @@ ; S390X-NEXT: larl %r1, .LCPI2_0 ; S390X-NEXT: le %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI2_1 -; S390X-NEXT: le %f4, 0(%r1) +; S390X-NEXT: le %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI2_2 ; S390X-NEXT: le %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI2_3 -; S390X-NEXT: le %f0, 0(%r1) -; S390X-NEXT: debr %f4, %f1 -; S390X-NEXT: debr %f2, %f1 +; S390X-NEXT: le %f4, 0(%r1) ; S390X-NEXT: debr %f0, %f1 +; S390X-NEXT: debr %f2, %f1 +; S390X-NEXT: debr %f4, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fdiv_v3f32: @@ -100,20 +100,18 @@ define void @constrained_vector_fdiv_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_fdiv_v3f64: ; S390X: # %bb.0: # %entry -; S390X-NEXT: ld %f0, 16(%r2) -; S390X-NEXT: ld %f1, 8(%r2) -; S390X-NEXT: larl %r1, .LCPI3_0 -; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI3_1 -; S390X-NEXT: ldeb %f3, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI3_2 -; S390X-NEXT: ldeb %f4, 0(%r1) -; S390X-NEXT: ddbr %f3, %f1 -; S390X-NEXT: ddb %f2, 0(%r2) -; S390X-NEXT: ddbr %f4, %f0 -; S390X-NEXT: std %f4, 16(%r2) -; S390X-NEXT: std %f3, 8(%r2) -; S390X-NEXT: std %f2, 0(%r2) +; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI3_0 +; S390X-NEXT: ldeb %f2, 0(%r1) +; S390X-NEXT: ddb %f1, 0(%r2) +; S390X-NEXT: ddb %f0, 8(%r2) +; S390X-NEXT: ddb %f2, 16(%r2) +; S390X-NEXT: std %f1, 0(%r2) +; S390X-NEXT: std %f0, 8(%r2) +; S390X-NEXT: std %f2, 16(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fdiv_v3f64: @@ -145,17 +143,17 @@ ; S390X-NEXT: larl %r1, .LCPI4_0 ; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI4_1 -; S390X-NEXT: ldeb %f6, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI4_2 -; S390X-NEXT: ldeb %f4, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI4_3 ; S390X-NEXT: ldeb %f2, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI4_3 +; S390X-NEXT: ldeb %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI4_4 -; S390X-NEXT: ldeb %f0, 0(%r1) -; S390X-NEXT: ddbr %f6, %f1 -; S390X-NEXT: ddbr %f4, %f1 -; S390X-NEXT: ddbr %f2, %f1 +; S390X-NEXT: ldeb %f6, 0(%r1) ; S390X-NEXT: ddbr %f0, %f1 +; S390X-NEXT: ddbr %f2, %f1 +; S390X-NEXT: ddbr %f4, %f1 +; S390X-NEXT: ddbr %f6, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fdiv_v4f64: @@ -164,10 +162,10 @@ ; SZ13-NEXT: vl %v0, 0(%r1), 3 ; SZ13-NEXT: larl %r1, .LCPI4_1 ; SZ13-NEXT: vl %v1, 0(%r1), 3 -; SZ13-NEXT: vfddb %v26, %v1, %v0 +; SZ13-NEXT: vfddb %v24, %v1, %v0 ; SZ13-NEXT: larl %r1, .LCPI4_2 ; SZ13-NEXT: vl %v1, 0(%r1), 3 -; SZ13-NEXT: vfddb %v24, %v1, %v0 +; SZ13-NEXT: vfddb %v26, %v1, %v0 ; SZ13-NEXT: br %r14 entry: %div = call <4 x double> @llvm.experimental.constrained.fdiv.v4f64( @@ -244,7 +242,8 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: brasl %r14, fmod@PLT -; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f9 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte 
Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -316,8 +315,9 @@ ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: ler %f2, %f8 ; S390X-NEXT: brasl %r14, fmodf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f9 ; S390X-NEXT: ler %f2, %f10 -; S390X-NEXT: ler %f4, %f9 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -383,8 +383,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f2, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f2, 16(%r2) ; S390X-NEXT: larl %r1, .LCPI8_0 ; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: ld %f9, 8(%r2) @@ -401,9 +401,9 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: brasl %r14, fmod@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -499,9 +499,10 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: brasl %r14, fmod@PLT -; S390X-NEXT: ldr %f2, %f11 -; S390X-NEXT: ldr %f4, %f10 -; S390X-NEXT: ldr %f6, %f9 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f9 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f11 ; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload @@ -588,13 +589,13 @@ ; S390X-LABEL: constrained_vector_fmul_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI11_0 -; S390X-NEXT: ldeb %f2, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI11_1 ; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI11_2 -; S390X-NEXT: ldeb %f0, 0(%r1) -; S390X-NEXT: mdbr %f2, %f1 +; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: mdbr %f0, %f1 +; S390X-NEXT: mdbr %f2, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fmul_v2f64: @@ -618,15 +619,15 @@ ; S390X-LABEL: constrained_vector_fmul_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI12_0 -; S390X-NEXT: le %f0, 0(%r1) +; S390X-NEXT: le %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI12_1 -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: meeb %f4, 0(%r1) +; S390X-NEXT: ler %f0, %f4 +; S390X-NEXT: meeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI12_2 -; S390X-NEXT: ler %f2, %f0 +; S390X-NEXT: ler %f2, %f4 ; S390X-NEXT: meeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI12_3 -; S390X-NEXT: meeb %f0, 0(%r1) +; S390X-NEXT: meeb %f4, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fmul_v3f32: @@ -656,17 +657,16 @@ define void @constrained_vector_fmul_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_fmul_v3f64: ; S390X: # %bb.0: # %entry -; S390X-NEXT: ld %f0, 8(%r2) ; S390X-NEXT: larl %r1, .LCPI13_0 -; S390X-NEXT: ld %f1, 0(%r1) -; S390X-NEXT: ld %f2, 16(%r2) -; S390X-NEXT: mdbr %f0, %f1 -; S390X-NEXT: ldr %f3, %f1 -; S390X-NEXT: mdb %f3, 0(%r2) -; S390X-NEXT: mdbr %f2, %f1 -; S390X-NEXT: std %f2, 16(%r2) -; S390X-NEXT: std %f0, 8(%r2) -; S390X-NEXT: std %f3, 0(%r2) +; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldr %f1, %f0 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: mdb %f0, 0(%r2) +; S390X-NEXT: mdb %f2, 8(%r2) +; S390X-NEXT: mdb %f1, 16(%r2) +; S390X-NEXT: std %f0, 0(%r2) +; S390X-NEXT: 
std %f2, 8(%r2) +; S390X-NEXT: std %f1, 16(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fmul_v3f64: @@ -678,8 +678,8 @@ ; SZ13-NEXT: vl %v2, 0(%r1), 3 ; SZ13-NEXT: mdb %f1, 16(%r2) ; SZ13-NEXT: vfmdb %v0, %v2, %v0 -; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: vst %v0, 0(%r2), 4 +; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, <3 x double>* %a @@ -697,19 +697,19 @@ ; S390X-LABEL: constrained_vector_fmul_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI14_0 -; S390X-NEXT: ldeb %f6, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI14_1 ; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI14_2 -; S390X-NEXT: ldeb %f4, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI14_3 ; S390X-NEXT: ldeb %f2, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI14_3 +; S390X-NEXT: ldeb %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI14_4 -; S390X-NEXT: ldeb %f0, 0(%r1) -; S390X-NEXT: mdbr %f6, %f1 -; S390X-NEXT: mdbr %f4, %f1 -; S390X-NEXT: mdbr %f2, %f1 +; S390X-NEXT: ldeb %f6, 0(%r1) ; S390X-NEXT: mdbr %f0, %f1 +; S390X-NEXT: mdbr %f2, %f1 +; S390X-NEXT: mdbr %f4, %f1 +; S390X-NEXT: mdbr %f6, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fmul_v4f64: @@ -719,9 +719,9 @@ ; SZ13-NEXT: larl %r1, .LCPI14_1 ; SZ13-NEXT: vl %v1, 0(%r1), 3 ; SZ13-NEXT: larl %r1, .LCPI14_2 -; SZ13-NEXT: vfmdb %v26, %v1, %v0 -; SZ13-NEXT: vl %v0, 0(%r1), 3 ; SZ13-NEXT: vfmdb %v24, %v1, %v0 +; SZ13-NEXT: vl %v0, 0(%r1), 3 +; SZ13-NEXT: vfmdb %v26, %v1, %v0 ; SZ13-NEXT: br %r14 entry: %mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64( @@ -763,13 +763,12 @@ ; S390X-LABEL: constrained_vector_fadd_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI16_0 -; S390X-NEXT: ld %f1, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI16_2 ; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI16_1 -; S390X-NEXT: ldr %f2, %f1 +; S390X-NEXT: ld %f2, 0(%r1) +; S390X-NEXT: adbr %f0, %f2 +; S390X-NEXT: larl %r1, .LCPI16_2 ; S390X-NEXT: adb %f2, 0(%r1) -; S390X-NEXT: adbr %f0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fadd_v2f64: @@ -793,14 +792,15 @@ ; S390X-LABEL: constrained_vector_fadd_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI17_0 -; S390X-NEXT: le %f0, 0(%r1) -; S390X-NEXT: lzer %f4 -; S390X-NEXT: aebr %f4, %f0 +; S390X-NEXT: le %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI17_1 -; S390X-NEXT: ler %f2, %f0 -; S390X-NEXT: aeb %f2, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI17_2 +; S390X-NEXT: ler %f2, %f1 +; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: aeb %f0, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI17_2 +; S390X-NEXT: aeb %f2, 0(%r1) +; S390X-NEXT: lzer %f4 +; S390X-NEXT: aebr %f4, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fadd_v3f32: @@ -829,17 +829,16 @@ define void @constrained_vector_fadd_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_fadd_v3f64: ; S390X: # %bb.0: # %entry -; S390X-NEXT: ld %f0, 8(%r2) ; S390X-NEXT: larl %r1, .LCPI18_0 -; S390X-NEXT: ld %f1, 0(%r1) -; S390X-NEXT: ld %f2, 16(%r2) -; S390X-NEXT: adbr %f0, %f1 -; S390X-NEXT: ldr %f3, %f1 -; S390X-NEXT: adb %f3, 0(%r2) -; S390X-NEXT: adbr %f2, %f1 -; S390X-NEXT: std %f2, 16(%r2) -; S390X-NEXT: std %f0, 8(%r2) -; S390X-NEXT: std %f3, 0(%r2) +; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldr %f1, %f0 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: adb %f0, 0(%r2) +; S390X-NEXT: adb %f2, 8(%r2) +; S390X-NEXT: adb %f1, 16(%r2) +; S390X-NEXT: std %f0, 0(%r2) +; S390X-NEXT: std %f2, 8(%r2) +; S390X-NEXT: std %f1, 16(%r2) ; S390X-NEXT: br %r14 ; ; 
SZ13-LABEL: constrained_vector_fadd_v3f64: @@ -851,8 +850,8 @@ ; SZ13-NEXT: vl %v2, 0(%r1), 3 ; SZ13-NEXT: adb %f1, 16(%r2) ; SZ13-NEXT: vfadb %v0, %v2, %v0 -; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: vst %v0, 0(%r2), 4 +; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, <3 x double>* %a @@ -870,19 +869,18 @@ ; S390X-LABEL: constrained_vector_fadd_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI19_0 -; S390X-NEXT: ld %f1, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI19_1 -; S390X-NEXT: ldr %f2, %f1 -; S390X-NEXT: ldr %f6, %f1 -; S390X-NEXT: adb %f6, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI19_2 -; S390X-NEXT: ldeb %f4, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI19_4 ; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI19_1 +; S390X-NEXT: ld %f6, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI19_3 +; S390X-NEXT: ldeb %f4, 0(%r1) +; S390X-NEXT: adbr %f0, %f6 +; S390X-NEXT: larl %r1, .LCPI19_2 +; S390X-NEXT: ldr %f2, %f6 ; S390X-NEXT: adb %f2, 0(%r1) -; S390X-NEXT: adbr %f4, %f1 -; S390X-NEXT: adbr %f0, %f1 +; S390X-NEXT: adbr %f4, %f6 +; S390X-NEXT: larl %r1, .LCPI19_4 +; S390X-NEXT: adb %f6, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fadd_v4f64: @@ -892,9 +890,9 @@ ; SZ13-NEXT: larl %r1, .LCPI19_1 ; SZ13-NEXT: vl %v1, 0(%r1), 3 ; SZ13-NEXT: larl %r1, .LCPI19_2 -; SZ13-NEXT: vfadb %v26, %v1, %v0 -; SZ13-NEXT: vl %v0, 0(%r1), 3 ; SZ13-NEXT: vfadb %v24, %v1, %v0 +; SZ13-NEXT: vl %v0, 0(%r1), 3 +; SZ13-NEXT: vfadb %v26, %v1, %v0 ; SZ13-NEXT: br %r14 entry: %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64( @@ -935,12 +933,12 @@ define <2 x double> @constrained_vector_fsub_v2f64() #0 { ; S390X-LABEL: constrained_vector_fsub_v2f64: ; S390X: # %bb.0: # %entry +; S390X-NEXT: larl %r1, .LCPI21_1 +; S390X-NEXT: ld %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI21_0 -; S390X-NEXT: ld %f0, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI21_2 ; S390X-NEXT: ldeb %f1, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI21_1 -; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f2 +; S390X-NEXT: larl %r1, .LCPI21_2 ; S390X-NEXT: sdb %f2, 0(%r1) ; S390X-NEXT: sdbr %f0, %f1 ; S390X-NEXT: br %r14 @@ -965,13 +963,13 @@ ; S390X-LABEL: constrained_vector_fsub_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI22_0 -; S390X-NEXT: le %f0, 0(%r1) -; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: le %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI22_1 -; S390X-NEXT: ler %f2, %f0 -; S390X-NEXT: seb %f2, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI22_2 +; S390X-NEXT: ler %f0, %f4 ; S390X-NEXT: seb %f0, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI22_2 +; S390X-NEXT: ler %f2, %f4 +; S390X-NEXT: seb %f2, 0(%r1) ; S390X-NEXT: lzer %f1 ; S390X-NEXT: sebr %f4, %f1 ; S390X-NEXT: br %r14 @@ -1006,16 +1004,14 @@ ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI23_0 ; S390X-NEXT: ld %f0, 0(%r1) -; S390X-NEXT: ld %f1, 8(%r2) -; S390X-NEXT: ld %f2, 16(%r2) -; S390X-NEXT: ldr %f3, %f0 -; S390X-NEXT: sdb %f3, 0(%r2) -; S390X-NEXT: ldr %f4, %f0 -; S390X-NEXT: sdbr %f4, %f1 -; S390X-NEXT: sdbr %f0, %f2 -; S390X-NEXT: std %f0, 16(%r2) -; S390X-NEXT: std %f4, 8(%r2) -; S390X-NEXT: std %f3, 0(%r2) +; S390X-NEXT: ldr %f1, %f0 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: sdb %f0, 0(%r2) +; S390X-NEXT: sdb %f2, 8(%r2) +; S390X-NEXT: sdb %f1, 16(%r2) +; S390X-NEXT: std %f0, 0(%r2) +; S390X-NEXT: std %f2, 8(%r2) +; S390X-NEXT: std %f1, 16(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fsub_v3f64: @@ -1025,8 +1021,8 @@ ; SZ13-NEXT: sdb %f2, 16(%r2) ; SZ13-NEXT: vgmg %v1, 12, 10 ; SZ13-NEXT: vfsdb %v0, %v1, 
%v0 -; SZ13-NEXT: std %f2, 16(%r2) ; SZ13-NEXT: vst %v0, 0(%r2), 4 +; SZ13-NEXT: std %f2, 16(%r2) ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, <3 x double>* %a @@ -1043,21 +1039,21 @@ define <4 x double> @constrained_vector_fsub_v4f64() #0 { ; S390X-LABEL: constrained_vector_fsub_v4f64: ; S390X: # %bb.0: # %entry -; S390X-NEXT: larl %r1, .LCPI24_0 -; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI24_1 -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: sdb %f6, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI24_2 +; S390X-NEXT: ld %f6, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI24_0 ; S390X-NEXT: ldeb %f1, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI24_4 -; S390X-NEXT: ldeb %f3, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI24_3 -; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f6 +; S390X-NEXT: larl %r1, .LCPI24_2 +; S390X-NEXT: ldr %f2, %f6 ; S390X-NEXT: sdb %f2, 0(%r1) -; S390X-NEXT: ldr %f4, %f0 -; S390X-NEXT: sdbr %f4, %f1 -; S390X-NEXT: sdbr %f0, %f3 +; S390X-NEXT: larl %r1, .LCPI24_3 +; S390X-NEXT: ldeb %f3, 0(%r1) +; S390X-NEXT: ldr %f4, %f6 +; S390X-NEXT: larl %r1, .LCPI24_4 +; S390X-NEXT: sdb %f6, 0(%r1) +; S390X-NEXT: sdbr %f0, %f1 +; S390X-NEXT: sdbr %f4, %f3 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fsub_v4f64: @@ -1066,9 +1062,9 @@ ; SZ13-NEXT: vl %v0, 0(%r1), 3 ; SZ13-NEXT: vgmg %v1, 12, 10 ; SZ13-NEXT: larl %r1, .LCPI24_1 -; SZ13-NEXT: vfsdb %v26, %v1, %v0 -; SZ13-NEXT: vl %v0, 0(%r1), 3 ; SZ13-NEXT: vfsdb %v24, %v1, %v0 +; SZ13-NEXT: vl %v0, 0(%r1), 3 +; SZ13-NEXT: vfsdb %v26, %v1, %v0 ; SZ13-NEXT: br %r14 entry: %sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64( @@ -1130,11 +1126,11 @@ ; S390X-LABEL: constrained_vector_sqrt_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI27_0 -; S390X-NEXT: sqeb %f4, 0(%r1) +; S390X-NEXT: sqeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI27_1 ; S390X-NEXT: sqeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI27_2 -; S390X-NEXT: sqeb %f0, 0(%r1) +; S390X-NEXT: sqeb %f4, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_sqrt_v3f32: @@ -1160,14 +1156,12 @@ define void @constrained_vector_sqrt_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_sqrt_v3f64: ; S390X: # %bb.0: # %entry -; S390X-NEXT: ld %f0, 8(%r2) -; S390X-NEXT: ld %f1, 16(%r2) -; S390X-NEXT: sqdb %f2, 0(%r2) -; S390X-NEXT: sqdbr %f0, %f0 -; S390X-NEXT: sqdbr %f1, %f1 -; S390X-NEXT: std %f1, 16(%r2) -; S390X-NEXT: std %f0, 8(%r2) -; S390X-NEXT: std %f2, 0(%r2) +; S390X-NEXT: sqdb %f0, 0(%r2) +; S390X-NEXT: sqdb %f1, 8(%r2) +; S390X-NEXT: sqdb %f2, 16(%r2) +; S390X-NEXT: std %f0, 0(%r2) +; S390X-NEXT: std %f1, 8(%r2) +; S390X-NEXT: std %f2, 16(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_sqrt_v3f64: @@ -1192,13 +1186,13 @@ ; S390X-LABEL: constrained_vector_sqrt_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI29_0 -; S390X-NEXT: sqdb %f6, 0(%r1) +; S390X-NEXT: sqdb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI29_1 ; S390X-NEXT: sqdb %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI29_3 ; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI29_2 -; S390X-NEXT: sqdb %f2, 0(%r1) +; S390X-NEXT: sqdb %f6, 0(%r1) ; S390X-NEXT: sqdbr %f0, %f0 ; S390X-NEXT: br %r14 ; @@ -1206,10 +1200,10 @@ ; SZ13: # %bb.0: # %entry ; SZ13-NEXT: larl %r1, .LCPI29_0 ; SZ13-NEXT: vl %v0, 0(%r1), 3 -; SZ13-NEXT: vfsqdb %v26, %v0 +; SZ13-NEXT: vfsqdb %v24, %v0 ; SZ13-NEXT: larl %r1, .LCPI29_1 ; SZ13-NEXT: vl %v0, 0(%r1), 3 -; SZ13-NEXT: vfsqdb %v24, %v0 +; SZ13-NEXT: vfsqdb %v26, %v0 ; SZ13-NEXT: br %r14 entry: %sqrt = call <4 x double> 
@llvm.experimental.constrained.sqrt.v4f64( @@ -1285,7 +1279,8 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: brasl %r14, pow@PLT -; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f9 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -1359,8 +1354,9 @@ ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: ler %f2, %f8 ; S390X-NEXT: brasl %r14, powf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f9 ; S390X-NEXT: ler %f2, %f10 -; S390X-NEXT: ler %f4, %f9 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -1430,8 +1426,8 @@ ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: .cfi_offset %f11, -192 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: larl %r1, .LCPI33_0 ; S390X-NEXT: ldeb %f9, 0(%r1) ; S390X-NEXT: ld %f10, 8(%r2) @@ -1445,9 +1441,9 @@ ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ldr %f2, %f9 ; S390X-NEXT: brasl %r14, pow@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f10, 8(%r13) -; S390X-NEXT: std %f11, 0(%r13) +; S390X-NEXT: std %f11, 16(%r13) ; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload @@ -1548,9 +1544,10 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: brasl %r14, pow@PLT -; S390X-NEXT: ldr %f2, %f11 -; S390X-NEXT: ldr %f4, %f10 -; S390X-NEXT: ldr %f6, %f9 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f9 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f11 ; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload @@ -1670,7 +1667,8 @@ ; S390X-NEXT: lghi %r2, 3 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, __powidf2@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -1734,8 +1732,9 @@ ; S390X-NEXT: lghi %r2, 3 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, __powisf2@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -1898,9 +1897,10 @@ ; S390X-NEXT: lghi %r2, 3 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, __powidf2@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2001,14 +2001,15 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI41_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, sin@PLT ; S390X-NEXT: larl %r1, .LCPI41_1 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; 
S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, sin@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -2066,8 +2067,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, sinf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -2125,8 +2127,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, sin@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -2135,9 +2137,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, sin@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2203,7 +2205,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI44_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, sin@PLT ; S390X-NEXT: larl %r1, .LCPI44_1 ; S390X-NEXT: ld %f1, 0(%r1) @@ -2216,13 +2218,14 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, sin@PLT ; S390X-NEXT: larl %r1, .LCPI44_3 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, sin@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2318,14 +2321,15 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI46_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, cos@PLT ; S390X-NEXT: larl %r1, .LCPI46_1 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, cos@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -2383,8 +2387,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, cosf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -2442,8 +2447,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; 
S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, cos@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -2452,9 +2457,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, cos@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2520,7 +2525,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI49_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, cos@PLT ; S390X-NEXT: larl %r1, .LCPI49_1 ; S390X-NEXT: ld %f1, 0(%r1) @@ -2533,13 +2538,14 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, cos@PLT ; S390X-NEXT: larl %r1, .LCPI49_3 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, cos@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2635,14 +2641,15 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI51_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, exp@PLT ; S390X-NEXT: larl %r1, .LCPI51_1 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, exp@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -2700,8 +2707,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, expf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -2759,8 +2767,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, exp@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -2769,9 +2777,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, exp@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2837,7 +2845,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI54_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, exp@PLT ; S390X-NEXT: larl %r1, .LCPI54_1 ; 
S390X-NEXT: ld %f1, 0(%r1) @@ -2850,13 +2858,14 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, exp@PLT ; S390X-NEXT: larl %r1, .LCPI54_3 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, exp@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2952,14 +2961,15 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI56_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, exp2@PLT ; S390X-NEXT: larl %r1, .LCPI56_1 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, exp2@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -3017,8 +3027,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, exp2f@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -3076,8 +3087,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, exp2@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -3086,9 +3097,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, exp2@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3171,9 +3182,10 @@ ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, exp2@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3269,14 +3281,15 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI61_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log@PLT ; S390X-NEXT: larl %r1, .LCPI61_1 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ 
-3334,8 +3347,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, logf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -3393,8 +3407,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, log@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -3403,9 +3417,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, log@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3471,7 +3485,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI64_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log@PLT ; S390X-NEXT: larl %r1, .LCPI64_1 ; S390X-NEXT: ld %f1, 0(%r1) @@ -3484,13 +3498,14 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log@PLT ; S390X-NEXT: larl %r1, .LCPI64_3 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3586,14 +3601,15 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI66_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log10@PLT ; S390X-NEXT: larl %r1, .LCPI66_1 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log10@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -3651,8 +3667,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, log10f@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -3710,8 +3727,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, log10@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -3720,9 +3737,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, log10@PLT -; S390X-NEXT: std %f0, 
16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3788,7 +3805,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI69_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log10@PLT ; S390X-NEXT: larl %r1, .LCPI69_1 ; S390X-NEXT: ld %f1, 0(%r1) @@ -3801,13 +3818,14 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log10@PLT ; S390X-NEXT: larl %r1, .LCPI69_3 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log10@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3903,14 +3921,15 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI71_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log2@PLT ; S390X-NEXT: larl %r1, .LCPI71_1 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log2@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -3968,8 +3987,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, log2f@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -4027,8 +4047,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, log2@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -4037,9 +4057,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, log2@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4105,7 +4125,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI74_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log2@PLT ; S390X-NEXT: larl %r1, .LCPI74_1 ; S390X-NEXT: ld %f1, 0(%r1) @@ -4118,13 +4138,14 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log2@PLT ; S390X-NEXT: larl %r1, .LCPI74_3 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: 
brasl %r14, log2@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4200,11 +4221,11 @@ ; S390X-LABEL: constrained_vector_rint_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI76_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI76_1 -; S390X-NEXT: ld %f1, 0(%r1) -; S390X-NEXT: fidbr %f2, 0, %f0 -; S390X-NEXT: fidbr %f0, 0, %f1 +; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: fidbr %f0, 0, %f0 +; S390X-NEXT: fidbr %f2, 0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_rint_v2f64: @@ -4230,9 +4251,9 @@ ; S390X-NEXT: le %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI77_2 ; S390X-NEXT: le %f3, 0(%r1) -; S390X-NEXT: fiebr %f4, 0, %f0 +; S390X-NEXT: fiebr %f0, 0, %f0 ; S390X-NEXT: fiebr %f2, 0, %f1 -; S390X-NEXT: fiebr %f0, 0, %f3 +; S390X-NEXT: fiebr %f4, 0, %f3 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_rint_v3f32: @@ -4261,25 +4282,25 @@ define void @constrained_vector_rint_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_rint_v3f64: ; S390X: # %bb.0: # %entry -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f1, 8(%r2) -; S390X-NEXT: ld %f2, 16(%r2) +; S390X-NEXT: ld %f2, 0(%r2) ; S390X-NEXT: fidbr %f0, 0, %f0 ; S390X-NEXT: fidbr %f1, 0, %f1 ; S390X-NEXT: fidbr %f2, 0, %f2 -; S390X-NEXT: std %f2, 16(%r2) +; S390X-NEXT: std %f2, 0(%r2) ; S390X-NEXT: std %f1, 8(%r2) -; S390X-NEXT: std %f0, 0(%r2) +; S390X-NEXT: std %f0, 16(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_rint_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v1, 0(%r2), 4 -; SZ13-NEXT: ld %f0, 16(%r2) -; SZ13-NEXT: vfidb %v1, %v1, 0, 0 -; SZ13-NEXT: fidbra %f0, 0, %f0, 0 -; SZ13-NEXT: std %f0, 16(%r2) -; SZ13-NEXT: vst %v1, 0(%r2), 4 +; SZ13-NEXT: vl %v0, 0(%r2), 4 +; SZ13-NEXT: ld %f1, 16(%r2) +; SZ13-NEXT: vfidb %v0, %v0, 0, 0 +; SZ13-NEXT: fidbra %f1, 0, %f1, 0 +; SZ13-NEXT: vst %v0, 0(%r2), 4 +; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, <3 x double>* %a @@ -4299,13 +4320,13 @@ ; S390X-NEXT: larl %r1, .LCPI79_1 ; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI79_2 -; S390X-NEXT: ld %f2, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI79_3 ; S390X-NEXT: ld %f3, 0(%r1) -; S390X-NEXT: fidbr %f6, 0, %f0 -; S390X-NEXT: fidbr %f4, 0, %f1 -; S390X-NEXT: fidbr %f2, 0, %f2 -; S390X-NEXT: fidbr %f0, 0, %f3 +; S390X-NEXT: larl %r1, .LCPI79_3 +; S390X-NEXT: ld %f5, 0(%r1) +; S390X-NEXT: fidbr %f0, 0, %f0 +; S390X-NEXT: fidbr %f2, 0, %f1 +; S390X-NEXT: fidbr %f4, 0, %f3 +; S390X-NEXT: fidbr %f6, 0, %f5 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_rint_v4f64: @@ -4366,14 +4387,15 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI81_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, nearbyint@PLT ; S390X-NEXT: larl %r1, .LCPI81_1 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, nearbyint@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: 
lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -4417,8 +4439,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, nearbyintf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -4463,8 +4486,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, nearbyint@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -4473,9 +4496,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, nearbyint@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4484,12 +4507,12 @@ ; ; SZ13-LABEL: constrained_vector_nearbyint_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v1, 0(%r2), 4 -; SZ13-NEXT: ld %f0, 16(%r2) -; SZ13-NEXT: vfidb %v1, %v1, 4, 0 -; SZ13-NEXT: fidbra %f0, 0, %f0, 4 -; SZ13-NEXT: std %f0, 16(%r2) -; SZ13-NEXT: vst %v1, 0(%r2), 4 +; SZ13-NEXT: vl %v0, 0(%r2), 4 +; SZ13-NEXT: ld %f1, 16(%r2) +; SZ13-NEXT: vfidb %v0, %v0, 4, 0 +; SZ13-NEXT: fidbra %f1, 0, %f1, 4 +; SZ13-NEXT: vst %v0, 0(%r2), 4 +; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, <3 x double>* %a @@ -4533,9 +4556,10 @@ ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, nearbyint@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4622,7 +4646,8 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, fmax@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -4675,10 +4700,10 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI87_0 -; S390X-NEXT: le %f0, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI87_1 ; S390X-NEXT: le %f8, 0(%r1) -; S390X-NEXT: ler %f2, %f8 +; S390X-NEXT: larl %r1, .LCPI87_1 +; S390X-NEXT: le %f2, 0(%r1) +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: brasl %r14, fmaxf@PLT ; S390X-NEXT: larl %r1, .LCPI87_2 ; S390X-NEXT: le %f1, 0(%r1) @@ -4688,12 +4713,14 @@ ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, fmaxf@PLT ; S390X-NEXT: larl %r1, .LCPI87_4 -; S390X-NEXT: le %f2, 0(%r1) +; S390X-NEXT: le %f1, 0(%r1) ; S390X-NEXT: ler %f10, %f0 -; S390X-NEXT: ler %f0, %f8 +; S390X-NEXT: ler %f0, %f1 +; S390X-NEXT: ler %f2, %f8 ; S390X-NEXT: brasl %r14, fmaxf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f9 ; S390X-NEXT: ler %f2, %f10 -; S390X-NEXT: ler %f4, %f9 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte 
Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4762,8 +4789,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: larl %r1, .LCPI88_0 ; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: ld %f9, 8(%r2) @@ -4778,9 +4805,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, fmax@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4878,9 +4905,10 @@ ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, fmax@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5001,7 +5029,8 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, fmin@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -5054,10 +5083,10 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI92_0 -; S390X-NEXT: le %f0, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI92_1 ; S390X-NEXT: le %f8, 0(%r1) -; S390X-NEXT: ler %f2, %f8 +; S390X-NEXT: larl %r1, .LCPI92_1 +; S390X-NEXT: le %f2, 0(%r1) +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: brasl %r14, fminf@PLT ; S390X-NEXT: larl %r1, .LCPI92_2 ; S390X-NEXT: le %f1, 0(%r1) @@ -5067,12 +5096,14 @@ ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, fminf@PLT ; S390X-NEXT: larl %r1, .LCPI92_4 -; S390X-NEXT: le %f2, 0(%r1) +; S390X-NEXT: le %f1, 0(%r1) ; S390X-NEXT: ler %f10, %f0 -; S390X-NEXT: ler %f0, %f8 +; S390X-NEXT: ler %f0, %f1 +; S390X-NEXT: ler %f2, %f8 ; S390X-NEXT: brasl %r14, fminf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f9 ; S390X-NEXT: ler %f2, %f10 -; S390X-NEXT: ler %f4, %f9 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5143,8 +5174,8 @@ ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: .cfi_offset %f11, -192 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: larl %r1, .LCPI93_0 ; S390X-NEXT: ldeb %f9, 0(%r1) ; S390X-NEXT: ld %f10, 8(%r2) @@ -5158,9 +5189,9 @@ ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ldr %f2, %f9 ; S390X-NEXT: brasl %r14, fmin@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f10, 8(%r13) -; S390X-NEXT: std %f11, 0(%r13) +; S390X-NEXT: std %f11, 16(%r13) ; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload @@ -5261,9 +5292,10 @@ ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; 
S390X-NEXT: brasl %r14, fmin@PLT -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f9 -; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f9 +; S390X-NEXT: ldr %f4, %f10 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5351,8 +5383,8 @@ ; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI96_1 ; S390X-NEXT: ld %f1, 0(%r1) -; S390X-NEXT: ledbr %f2, %f0 -; S390X-NEXT: ledbr %f0, %f1 +; S390X-NEXT: ledbr %f0, %f0 +; S390X-NEXT: ledbr %f2, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fptrunc_v2f64: @@ -5423,13 +5455,13 @@ ; S390X-NEXT: larl %r1, .LCPI98_1 ; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI98_2 -; S390X-NEXT: ld %f2, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI98_3 ; S390X-NEXT: ld %f3, 0(%r1) -; S390X-NEXT: ledbr %f6, %f0 -; S390X-NEXT: ledbr %f4, %f1 -; S390X-NEXT: ledbr %f2, %f2 -; S390X-NEXT: ledbr %f0, %f3 +; S390X-NEXT: larl %r1, .LCPI98_3 +; S390X-NEXT: ld %f5, 0(%r1) +; S390X-NEXT: ledbr %f0, %f0 +; S390X-NEXT: ledbr %f2, %f1 +; S390X-NEXT: ledbr %f4, %f3 +; S390X-NEXT: ledbr %f6, %f5 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fptrunc_v4f64: @@ -5483,9 +5515,9 @@ ; S390X-LABEL: constrained_vector_fpext_v2f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI100_0 -; S390X-NEXT: ldeb %f2, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI100_1 ; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI100_1 +; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fpext_v2f32: @@ -5510,13 +5542,13 @@ ; S390X-NEXT: sllg %r1, %r0, 32 ; S390X-NEXT: ldgr %f0, %r1 ; S390X-NEXT: nilf %r0, 0 -; S390X-NEXT: ldeb %f1, 8(%r2) -; S390X-NEXT: ldgr %f2, %r0 -; S390X-NEXT: ldebr %f2, %f2 +; S390X-NEXT: ldgr %f1, %r0 +; S390X-NEXT: ldeb %f2, 8(%r2) +; S390X-NEXT: ldebr %f1, %f1 ; S390X-NEXT: ldebr %f0, %f0 -; S390X-NEXT: std %f1, 16(%r3) ; S390X-NEXT: std %f0, 8(%r3) -; S390X-NEXT: std %f2, 0(%r3) +; S390X-NEXT: std %f2, 16(%r3) +; S390X-NEXT: std %f1, 0(%r3) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fpext_v3f64: @@ -5544,13 +5576,13 @@ ; S390X-LABEL: constrained_vector_fpext_v4f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI102_0 -; S390X-NEXT: ldeb %f6, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI102_1 -; S390X-NEXT: ldeb %f4, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI102_2 ; S390X-NEXT: ldeb %f2, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI102_2 +; S390X-NEXT: ldeb %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI102_3 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ldeb %f6, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fpext_v4f32: @@ -5620,7 +5652,8 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, ceil@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -5664,8 +5697,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, ceilf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -5709,8 +5743,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, 
-184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, ceil@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -5719,9 +5753,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, ceil@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5730,12 +5764,12 @@ ; ; SZ13-LABEL: constrained_vector_ceil_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v1, 0(%r2), 4 -; SZ13-NEXT: ld %f0, 16(%r2) -; SZ13-NEXT: vfidb %v1, %v1, 4, 6 -; SZ13-NEXT: fidbra %f0, 6, %f0, 4 -; SZ13-NEXT: std %f0, 16(%r2) -; SZ13-NEXT: vst %v1, 0(%r2), 4 +; SZ13-NEXT: vl %v0, 0(%r2), 4 +; SZ13-NEXT: ld %f1, 16(%r2) +; SZ13-NEXT: vfidb %v0, %v0, 4, 6 +; SZ13-NEXT: fidbra %f1, 6, %f1, 4 +; SZ13-NEXT: vst %v0, 0(%r2), 4 +; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, <3 x double>* %a @@ -5794,7 +5828,8 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, floor@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -5838,8 +5873,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, floorf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -5883,8 +5919,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, floor@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -5893,9 +5929,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, floor@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5904,12 +5940,12 @@ ; ; SZ13-LABEL: constrained_vector_floor_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v1, 0(%r2), 4 -; SZ13-NEXT: ld %f0, 16(%r2) -; SZ13-NEXT: vfidb %v1, %v1, 4, 7 -; SZ13-NEXT: fidbra %f0, 7, %f0, 4 -; SZ13-NEXT: std %f0, 16(%r2) -; SZ13-NEXT: vst %v1, 0(%r2), 4 +; SZ13-NEXT: vl %v0, 0(%r2), 4 +; SZ13-NEXT: ld %f1, 16(%r2) +; SZ13-NEXT: vfidb %v0, %v0, 4, 7 +; SZ13-NEXT: fidbra %f1, 7, %f1, 4 +; SZ13-NEXT: vst %v0, 0(%r2), 4 +; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, <3 x double>* %a @@ -5967,7 +6003,8 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, round@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 
280(%r15) ; S390X-NEXT: br %r14 @@ -6011,8 +6048,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, roundf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -6057,8 +6095,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, round@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -6067,9 +6105,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, round@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -6078,12 +6116,12 @@ ; ; SZ13-LABEL: constrained_vector_round_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v1, 0(%r2), 4 -; SZ13-NEXT: ld %f0, 16(%r2) -; SZ13-NEXT: vfidb %v1, %v1, 4, 1 -; SZ13-NEXT: fidbra %f0, 1, %f0, 4 -; SZ13-NEXT: std %f0, 16(%r2) -; SZ13-NEXT: vst %v1, 0(%r2), 4 +; SZ13-NEXT: vl %v0, 0(%r2), 4 +; SZ13-NEXT: ld %f1, 16(%r2) +; SZ13-NEXT: vfidb %v0, %v0, 4, 1 +; SZ13-NEXT: fidbra %f1, 1, %f1, 4 +; SZ13-NEXT: vst %v0, 0(%r2), 4 +; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, <3 x double>* %a @@ -6141,7 +6179,8 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, trunc@PLT -; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -6185,8 +6224,9 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, truncf@PLT +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 -; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -6230,8 +6270,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 16(%r2) -; S390X-NEXT: ld %f0, 0(%r2) +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, trunc@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -6240,9 +6280,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, trunc@PLT -; S390X-NEXT: std %f0, 16(%r13) +; S390X-NEXT: std %f0, 0(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 0(%r13) +; S390X-NEXT: std %f10, 16(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -6251,12 +6291,12 @@ ; ; SZ13-LABEL: constrained_vector_trunc_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v1, 0(%r2), 4 -; SZ13-NEXT: ld %f0, 16(%r2) -; SZ13-NEXT: vfidb %v1, %v1, 4, 5 -; SZ13-NEXT: fidbra %f0, 5, %f0, 4 -; SZ13-NEXT: std %f0, 16(%r2) -; SZ13-NEXT: vst %v1, 0(%r2), 4 +; SZ13-NEXT: vl %v0, 0(%r2), 4 +; SZ13-NEXT: 
ld %f1, 16(%r2) +; SZ13-NEXT: vfidb %v0, %v0, 4, 5 +; SZ13-NEXT: fidbra %f1, 5, %f1, 4 +; SZ13-NEXT: vst %v0, 0(%r2), 4 +; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, <3 x double>* %a diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1104,10 +1104,10 @@ ; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edi -; X87-NEXT: movl %edi, 8(%esi) -; X87-NEXT: movl %edx, 12(%esi) -; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %edi, 12(%esi) +; X87-NEXT: movl %edx, 8(%esi) ; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %eax, (%esi) ; X87-NEXT: movl %esi, %eax ; X87-NEXT: addl $36, %esp ; X87-NEXT: popl %esi @@ -1130,10 +1130,10 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %edi, 12(%esi) +; X86-SSE-NEXT: movl %edx, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %eax, (%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $36, %esp ; X86-SSE-NEXT: popl %esi @@ -1444,10 +1444,10 @@ ; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edi -; X87-NEXT: movl %edi, 8(%esi) -; X87-NEXT: movl %edx, 12(%esi) -; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %edi, 12(%esi) +; X87-NEXT: movl %edx, 8(%esi) ; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %eax, (%esi) ; X87-NEXT: movl %esi, %eax ; X87-NEXT: addl $36, %esp ; X87-NEXT: popl %esi @@ -1470,10 +1470,10 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %edi, 12(%esi) +; X86-SSE-NEXT: movl %edx, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %eax, (%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $36, %esp ; X86-SSE-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -396,10 +396,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -535,10 +535,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ 
b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -42,10 +42,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -87,10 +87,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -132,10 +132,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -177,10 +177,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -226,10 +226,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -271,10 +271,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -312,10 +312,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -353,10 +353,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, 
%esp ; X86-NEXT: popl %esi @@ -394,10 +394,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -435,10 +435,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -476,10 +476,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -517,10 +517,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -558,10 +558,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -599,10 +599,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -644,10 +644,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -689,10 +689,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: 
popl %esi @@ -730,10 +730,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -775,10 +775,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -817,10 +817,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -858,10 +858,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -899,10 +899,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -940,10 +940,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -981,10 +981,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -1022,10 +1022,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi diff 
--git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -292,9 +292,9 @@ ; CHECK-NEXT: callq fmod ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1077,9 +1077,9 @@ ; CHECK-NEXT: callq pow ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1333,9 +1333,9 @@ ; CHECK-NEXT: callq __powidf2 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1570,9 +1570,9 @@ ; CHECK-NEXT: callq sin ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1794,9 +1794,9 @@ ; CHECK-NEXT: callq cos ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2018,9 +2018,9 @@ ; CHECK-NEXT: callq exp ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2242,9 +2242,9 @@ ; CHECK-NEXT: callq exp2 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; 
CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2466,9 +2466,9 @@ ; CHECK-NEXT: callq log ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2690,9 +2690,9 @@ ; CHECK-NEXT: callq log10 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2914,9 +2914,9 @@ ; CHECK-NEXT: callq log2 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -3116,9 +3116,9 @@ ; CHECK-NEXT: callq rint ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -3286,9 +3286,9 @@ ; CHECK-NEXT: callq nearbyint ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -3495,9 +3495,9 @@ ; CHECK-NEXT: callq fmax ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -3750,9 +3750,9 @@ ; CHECK-NEXT: callq fmin ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -5237,9 +5237,9 @@ ; CHECK-NEXT: callq ceil ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -5369,9 +5369,9 @@ ; CHECK-NEXT: callq floor ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -5523,9 +5523,9 @@ ; CHECK-NEXT: callq round ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -5667,9 +5667,9 @@ ; CHECK-NEXT: callq trunc ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8
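; ---------------------------------------------------------------------------
; Illustrative sketch only, not part of the patch: the regenerated checks
; above are produced from strict-FP tests whose IR calls the constrained
; intrinsics directly. A minimal example of that shape (hypothetical function
; name; the intrinsic signature and metadata operands are the standard ones
; for llvm.experimental.constrained.rint) looks like this:
declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata)

define <2 x double> @sample_rint_v2f64(<2 x double> %x) #0 {
entry:
  ; The rounding mode is taken from the dynamic FP environment and FP
  ; exceptions are strict, so the call is lowered with a chain in the
  ; SelectionDAG rather than as a plain floating-point node.
  %r = call <2 x double> @llvm.experimental.constrained.rint.v2f64(
               <2 x double> %x,
               metadata !"round.dynamic",
               metadata !"fpexcept.strict") #0
  ret <2 x double> %r
}

attributes #0 = { strictfp }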