diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -143,6 +143,17 @@ /// tokenfactor for them just before terminator instructions. SmallVector PendingExports; + /// Similar to loads, nodes corresponding to constrained FP intrinsics are + /// bunched up and emitted when necessary. These can be moved across each + /// other and any (normal) memory operation (load or store), but not across + /// calls or instructions having unspecified side effects. As a special + /// case, constrained FP intrinsics using fpexcept.strict may not be deleted + /// even if otherwise unused, so they need to be chained before any + /// terminator instruction (like PendingExports). We track the latter + /// set of nodes in a separate list. + SmallVector PendingConstrainedFP; + SmallVector PendingConstrainedFPStrict; + /// A unique monotonically increasing number used to order the SDNodes we /// create. unsigned SDNodeOrder; @@ -447,12 +458,18 @@ /// Return the current virtual root of the Selection DAG, flushing any /// PendingLoad items. This must be done before emitting a store or any other - /// node that may need to be ordered after any prior load instructions. + /// memory node that may need to be ordered after any prior load instructions. + SDValue getMemoryRoot(); + + /// Similar to getMemoryRoot, but also flushes PendingConstrainedFP(Strict) + /// items. This must be done before emitting any call or any other node + /// that may need to be ordered after FP instructions due to other side + /// effects. SDValue getRoot(); /// Similar to getRoot, but instead of flushing all the PendingLoad items, - /// flush all the PendingExports items. It is necessary to do this before - /// emitting a terminator instruction. + /// flush all the PendingExports (and PendingConstrainedFPStrict) items. 
+ /// It is necessary to do this before emitting a terminator instruction. SDValue getControlRoot(); SDLoc getCurSDLoc() const { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1025,6 +1025,8 @@ UnusedArgNodeMap.clear(); PendingLoads.clear(); PendingExports.clear(); + PendingConstrainedFP.clear(); + PendingConstrainedFPStrict.clear(); CurInst = nullptr; HasTailCall = false; SDNodeOrder = LowestSDNodeOrder; @@ -1035,7 +1037,7 @@ DanglingDebugInfoMap.clear(); } -SDValue SelectionDAGBuilder::getRoot() { +SDValue SelectionDAGBuilder::getMemoryRoot() { if (PendingLoads.empty()) return DAG.getRoot(); @@ -1053,9 +1055,31 @@ return Root; } +SDValue SelectionDAGBuilder::getRoot() { + // Chain up all pending constrained intrinsics together with all + // pending loads, by simply appending them to PendingLoads and + // then calling getMemoryRoot(). + PendingLoads.reserve(PendingLoads.size() + + PendingConstrainedFP.size() + + PendingConstrainedFPStrict.size()); + PendingLoads.append(PendingConstrainedFP.begin(), + PendingConstrainedFP.end()); + PendingLoads.append(PendingConstrainedFPStrict.begin(), + PendingConstrainedFPStrict.end()); + PendingConstrainedFP.clear(); + PendingConstrainedFPStrict.clear(); + return getMemoryRoot(); +} + SDValue SelectionDAGBuilder::getControlRoot() { SDValue Root = DAG.getRoot(); + // We need to emit pending fpexcept.strict constrained intrinsics, + // so append them to the PendingExports list. 
+ PendingExports.append(PendingConstrainedFPStrict.begin(), + PendingConstrainedFPStrict.end()); + PendingConstrainedFPStrict.clear(); + if (PendingExports.empty()) return Root; @@ -4060,9 +4084,11 @@ SDValue Root; bool ConstantMemory = false; - if (isVolatile || NumValues > MaxParallelChains) + if (isVolatile) // Serialize volatile loads with other side effects. Root = getRoot(); + else if (NumValues > MaxParallelChains) + Root = getMemoryRoot(); else if (AA && AA->pointsToConstantMemory(MemoryLocation( SV, @@ -4237,7 +4263,7 @@ SDValue Src = getValue(SrcV); SDValue Ptr = getValue(PtrV); - SDValue Root = getRoot(); + SDValue Root = I.isVolatile() ? getRoot() : getMemoryRoot(); SmallVector Chains(std::min(MaxParallelChains, NumValues)); SDLoc dl = getCurSDLoc(); unsigned Alignment = I.getAlignment(); @@ -4329,7 +4355,7 @@ VT.getStoreSize().getKnownMinSize(), Alignment, AAInfo); SDValue StoreNode = - DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO, + DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO, ISD::UNINDEXED, false /* Truncating */, IsCompressing); DAG.setRoot(StoreNode); setValue(&I, StoreNode); @@ -4463,7 +4489,7 @@ IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } - SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index, Scale }; + SDValue Ops[] = { getMemoryRoot(), Src0, Mask, Base, Index, Scale }; SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl, Ops, MMO, IndexType); DAG.setRoot(Scatter); @@ -5850,7 +5876,8 @@ bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget()); // FIXME: Support passing different dest/src alignments to the memcpy DAG // node. - SDValue MC = DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, + SDValue Root = isVol ? 
getRoot() : getMemoryRoot(); + SDValue MC = DAG.getMemcpy(Root, sdl, Op1, Op2, Op3, Align, isVol, false, isTC, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1))); @@ -5866,7 +5893,8 @@ unsigned Align = std::max(MSI.getDestAlignment(), 1); bool isVol = MSI.isVolatile(); bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget()); - SDValue MS = DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, + SDValue Root = isVol ? getRoot() : getMemoryRoot(); + SDValue MS = DAG.getMemset(Root, sdl, Op1, Op2, Op3, Align, isVol, isTC, MachinePointerInfo(I.getArgOperand(0))); updateDAGForMaybeTailCall(MS); return; @@ -5884,7 +5912,8 @@ bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget()); // FIXME: Support passing different dest/src alignments to the memmove DAG // node. - SDValue MM = DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, + SDValue Root = isVol ? getRoot() : getMemoryRoot(); + SDValue MM = DAG.getMemmove(Root, sdl, Op1, Op2, Op3, Align, isVol, isTC, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1))); updateDAGForMaybeTailCall(MM); @@ -7039,9 +7068,29 @@ SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers); assert(Result.getNode()->getNumValues() == 2); - // See above -- chain is handled like for loads here. + + // Push node to the appropriate list so that future instructions can be + // chained up correctly. SDValue OutChain = Result.getValue(1); - PendingLoads.push_back(OutChain); + switch (FPI.getExceptionBehavior().getValue()) { + case fp::ExceptionBehavior::ebIgnore: + // The only reason why ebIgnore nodes still need to be chained is that + // they might depend on the current rounding mode, and therefore must + // not be moved across instruction that may change that mode. + LLVM_FALLTHROUGH; + case fp::ExceptionBehavior::ebMayTrap: + // These must not be moved across calls or instructions that may change + // floating-point exception masks. 
+ PendingConstrainedFP.push_back(OutChain); + break; + case fp::ExceptionBehavior::ebStrict: + // These must not be moved across calls or instructions that may change + // floating-point exception masks or read floating-point exception flags. + // In addition, they cannot be optimized out even if unused. + PendingConstrainedFPStrict.push_back(OutChain); + break; + } + SDValue FPResult = Result.getValue(0); setValue(&FPI, FPResult); } @@ -7424,7 +7473,8 @@ // In the mempcpy context we need to pass in a false value for isTailCall // because the return pointer needs to be adjusted by the size of // the copied memory. - SDValue MC = DAG.getMemcpy(getRoot(), sdl, Dst, Src, Size, Align, isVol, + SDValue Root = isVol ? getRoot() : getMemoryRoot(); + SDValue MC = DAG.getMemcpy(Root, sdl, Dst, Src, Size, Align, isVol, false, /*isTailCall=*/false, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1))); diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll @@ -1358,21 +1358,21 @@ ; PC64LE-NEXT: fmr 4, 2 ; PC64LE-NEXT: fmr 30, 1 ; PC64LE-NEXT: fmr 29, 2 -; PC64LE-NEXT: stfd 1, 16(30) ; PC64LE-NEXT: stfd 2, 24(30) +; PC64LE-NEXT: stfd 1, 16(30) ; PC64LE-NEXT: bl __gcc_qmul ; PC64LE-NEXT: nop ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: xxlxor 2, 2, 2 ; PC64LE-NEXT: li 5, 2 -; PC64LE-NEXT: stfd 30, 32(30) ; PC64LE-NEXT: stfd 29, 40(30) +; PC64LE-NEXT: stfd 30, 32(30) ; PC64LE-NEXT: bl __powitf2 ; PC64LE-NEXT: nop ; PC64LE-NEXT: frsp 0, 1 ; PC64LE-NEXT: stfsx 0, 0, 29 -; PC64LE-NEXT: stfd 2, -8(30) ; PC64LE-NEXT: stfd 1, -16(30) +; PC64LE-NEXT: stfd 2, -8(30) ; PC64LE-NEXT: addi 1, 1, 80 ; PC64LE-NEXT: ld 0, 16(1) ; PC64LE-NEXT: lfd 31, -8(1) # 8-byte Folded Reload @@ -1409,21 +1409,21 @@ ; PC64LE9-NEXT: fmr 4, 2 ; PC64LE9-NEXT: fmr 30, 2 
; PC64LE9-NEXT: fmr 29, 1 -; PC64LE9-NEXT: stfd 1, 16(30) ; PC64LE9-NEXT: stfd 2, 24(30) +; PC64LE9-NEXT: stfd 1, 16(30) ; PC64LE9-NEXT: bl __gcc_qmul ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: xxlxor 2, 2, 2 ; PC64LE9-NEXT: li 5, 2 -; PC64LE9-NEXT: stfd 29, 32(30) ; PC64LE9-NEXT: stfd 30, 40(30) +; PC64LE9-NEXT: stfd 29, 32(30) ; PC64LE9-NEXT: bl __powitf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: frsp 0, 1 ; PC64LE9-NEXT: stfs 0, 0(29) -; PC64LE9-NEXT: stfd 2, -8(30) ; PC64LE9-NEXT: stfd 1, -16(30) +; PC64LE9-NEXT: stfd 2, -8(30) ; PC64LE9-NEXT: addi 1, 1, 80 ; PC64LE9-NEXT: ld 0, 16(1) ; PC64LE9-NEXT: lfd 31, -8(1) # 8-byte Folded Reload @@ -1463,15 +1463,15 @@ ; PC64-NEXT: fmr 4, 2 ; PC64-NEXT: fmr 29, 1 ; PC64-NEXT: fmr 28, 2 -; PC64-NEXT: stfd 1, 16(30) ; PC64-NEXT: stfd 2, 24(30) +; PC64-NEXT: stfd 1, 16(30) ; PC64-NEXT: bl __gcc_qmul ; PC64-NEXT: nop ; PC64-NEXT: fmr 1, 31 ; PC64-NEXT: fmr 2, 30 ; PC64-NEXT: li 5, 2 -; PC64-NEXT: stfd 29, 32(30) ; PC64-NEXT: stfd 28, 40(30) +; PC64-NEXT: stfd 29, 32(30) ; PC64-NEXT: bl __powitf2 ; PC64-NEXT: nop ; PC64-NEXT: frsp 0, 1 @@ -1481,8 +1481,8 @@ ; PC64-NEXT: lfd 29, 152(1) # 8-byte Folded Reload ; PC64-NEXT: lfd 28, 144(1) # 8-byte Folded Reload ; PC64-NEXT: ld 29, 120(1) # 8-byte Folded Reload -; PC64-NEXT: stfd 2, -8(30) ; PC64-NEXT: stfd 1, -16(30) +; PC64-NEXT: stfd 2, -8(30) ; PC64-NEXT: ld 30, 128(1) # 8-byte Folded Reload ; PC64-NEXT: addi 1, 1, 176 ; PC64-NEXT: ld 0, 16(1) diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll b/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll --- a/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll @@ -287,4 +287,56 @@ ret void } +; If the result of any FP operation is unused, it can be removed +; -- except for fpexcept.strict operations. 
+ +define void @f13(float %f1) { +; CHECK-LABEL: f13: +; CHECK-NOT: sqeb +; CHECK: br %r14 + + %sqrt = call float @llvm.sqrt.f32(float %f1) + + ret void +} + +define void @f14(float %f1) { +; CHECK-LABEL: f14: +; CHECK-NOT: sqeb +; CHECK: br %r14 + + %sqrt = call float @llvm.experimental.constrained.sqrt.f32( + float %f1, + metadata !"round.dynamic", + metadata !"fpexcept.ignore") #0 + + ret void +} + +define void @f15(float %f1) { +; CHECK-LABEL: f15: +; CHECK-NOT: sqeb +; CHECK: br %r14 + + %sqrt = call float @llvm.experimental.constrained.sqrt.f32( + float %f1, + metadata !"round.dynamic", + metadata !"fpexcept.maytrap") #0 + + ret void +} + +define void @f16(float %f1) { +; CHECK-LABEL: f16: +; CHECK: sqebr +; CHECK: br %r14 + + %sqrt = call float @llvm.experimental.constrained.sqrt.f32( + float %f1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + + ret void +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll @@ -33,11 +33,11 @@ ; S390X-NEXT: larl %r1, .LCPI1_0 ; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI1_1 -; S390X-NEXT: ldeb %f0, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI1_2 ; S390X-NEXT: ldeb %f2, 0(%r1) -; S390X-NEXT: ddbr %f0, %f1 +; S390X-NEXT: larl %r1, .LCPI1_2 +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: ddbr %f2, %f1 +; S390X-NEXT: ddbr %f0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fdiv_v2f64: @@ -63,14 +63,14 @@ ; S390X-NEXT: larl %r1, .LCPI2_0 ; S390X-NEXT: le %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI2_1 -; S390X-NEXT: le %f0, 0(%r1) +; S390X-NEXT: le %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI2_2 ; S390X-NEXT: le %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI2_3 -; S390X-NEXT: le %f4, 0(%r1) -; S390X-NEXT: debr %f0, %f1 -; S390X-NEXT: debr %f2, %f1 +; 
S390X-NEXT: le %f0, 0(%r1) ; S390X-NEXT: debr %f4, %f1 +; S390X-NEXT: debr %f2, %f1 +; S390X-NEXT: debr %f0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fdiv_v3f32: @@ -143,17 +143,17 @@ ; S390X-NEXT: larl %r1, .LCPI4_0 ; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI4_1 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ldeb %f6, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI4_2 -; S390X-NEXT: ldeb %f2, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI4_3 ; S390X-NEXT: ldeb %f4, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI4_3 +; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI4_4 -; S390X-NEXT: ldeb %f6, 0(%r1) -; S390X-NEXT: ddbr %f0, %f1 -; S390X-NEXT: ddbr %f2, %f1 -; S390X-NEXT: ddbr %f4, %f1 +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: ddbr %f6, %f1 +; S390X-NEXT: ddbr %f4, %f1 +; S390X-NEXT: ddbr %f2, %f1 +; S390X-NEXT: ddbr %f0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fdiv_v4f64: @@ -162,10 +162,10 @@ ; SZ13-NEXT: vl %v0, 0(%r1), 3 ; SZ13-NEXT: larl %r1, .LCPI4_1 ; SZ13-NEXT: vl %v1, 0(%r1), 3 -; SZ13-NEXT: vfddb %v24, %v1, %v0 +; SZ13-NEXT: vfddb %v26, %v1, %v0 ; SZ13-NEXT: larl %r1, .LCPI4_2 ; SZ13-NEXT: vl %v1, 0(%r1), 3 -; SZ13-NEXT: vfddb %v26, %v1, %v0 +; SZ13-NEXT: vfddb %v24, %v1, %v0 ; SZ13-NEXT: br %r14 entry: %div = call <4 x double> @llvm.experimental.constrained.fdiv.v4f64( @@ -242,8 +242,7 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: brasl %r14, fmod@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f9 +; S390X-NEXT: ldr %f2, %f9 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -315,9 +314,8 @@ ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: ler %f2, %f8 ; S390X-NEXT: brasl %r14, fmodf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f9 ; S390X-NEXT: ler %f2, %f10 +; S390X-NEXT: ler %f4, %f9 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 
8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -499,10 +497,9 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: brasl %r14, fmod@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f9 -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f11 +; S390X-NEXT: ldr %f2, %f11 +; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f6, %f9 ; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload @@ -589,13 +586,13 @@ ; S390X-LABEL: constrained_vector_fmul_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI11_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI11_1 ; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI11_2 -; S390X-NEXT: ldeb %f2, 0(%r1) -; S390X-NEXT: mdbr %f0, %f1 +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: mdbr %f2, %f1 +; S390X-NEXT: mdbr %f0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fmul_v2f64: @@ -619,15 +616,15 @@ ; S390X-LABEL: constrained_vector_fmul_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI12_0 -; S390X-NEXT: le %f4, 0(%r1) +; S390X-NEXT: le %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI12_1 -; S390X-NEXT: ler %f0, %f4 -; S390X-NEXT: meeb %f0, 0(%r1) +; S390X-NEXT: ler %f4, %f0 +; S390X-NEXT: meeb %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI12_2 -; S390X-NEXT: ler %f2, %f4 +; S390X-NEXT: ler %f2, %f0 ; S390X-NEXT: meeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI12_3 -; S390X-NEXT: meeb %f4, 0(%r1) +; S390X-NEXT: meeb %f0, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fmul_v3f32: @@ -697,19 +694,19 @@ ; S390X-LABEL: constrained_vector_fmul_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI14_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ldeb %f6, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI14_1 ; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI14_2 -; S390X-NEXT: 
ldeb %f2, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI14_3 ; S390X-NEXT: ldeb %f4, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI14_3 +; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI14_4 -; S390X-NEXT: ldeb %f6, 0(%r1) -; S390X-NEXT: mdbr %f0, %f1 -; S390X-NEXT: mdbr %f2, %f1 -; S390X-NEXT: mdbr %f4, %f1 +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: mdbr %f6, %f1 +; S390X-NEXT: mdbr %f4, %f1 +; S390X-NEXT: mdbr %f2, %f1 +; S390X-NEXT: mdbr %f0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fmul_v4f64: @@ -719,9 +716,9 @@ ; SZ13-NEXT: larl %r1, .LCPI14_1 ; SZ13-NEXT: vl %v1, 0(%r1), 3 ; SZ13-NEXT: larl %r1, .LCPI14_2 -; SZ13-NEXT: vfmdb %v24, %v1, %v0 -; SZ13-NEXT: vl %v0, 0(%r1), 3 ; SZ13-NEXT: vfmdb %v26, %v1, %v0 +; SZ13-NEXT: vl %v0, 0(%r1), 3 +; SZ13-NEXT: vfmdb %v24, %v1, %v0 ; SZ13-NEXT: br %r14 entry: %mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64( @@ -763,12 +760,13 @@ ; S390X-LABEL: constrained_vector_fadd_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI16_0 +; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI16_2 ; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI16_1 -; S390X-NEXT: ld %f2, 0(%r1) -; S390X-NEXT: adbr %f0, %f2 -; S390X-NEXT: larl %r1, .LCPI16_2 +; S390X-NEXT: ldr %f2, %f1 ; S390X-NEXT: adb %f2, 0(%r1) +; S390X-NEXT: adbr %f0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fadd_v2f64: @@ -792,15 +790,14 @@ ; S390X-LABEL: constrained_vector_fadd_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI17_0 -; S390X-NEXT: le %f1, 0(%r1) +; S390X-NEXT: le %f0, 0(%r1) +; S390X-NEXT: lzer %f4 +; S390X-NEXT: aebr %f4, %f0 ; S390X-NEXT: larl %r1, .LCPI17_1 -; S390X-NEXT: ler %f2, %f1 -; S390X-NEXT: ler %f0, %f1 -; S390X-NEXT: aeb %f0, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI17_2 +; S390X-NEXT: ler %f2, %f0 ; S390X-NEXT: aeb %f2, 0(%r1) -; S390X-NEXT: lzer %f4 -; S390X-NEXT: aebr %f4, %f1 +; S390X-NEXT: larl %r1, .LCPI17_2 +; S390X-NEXT: aeb %f0, 0(%r1) ; S390X-NEXT: 
br %r14 ; ; SZ13-LABEL: constrained_vector_fadd_v3f32: @@ -869,18 +866,19 @@ ; S390X-LABEL: constrained_vector_fadd_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI19_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI19_1 -; S390X-NEXT: ld %f6, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI19_3 -; S390X-NEXT: ldeb %f4, 0(%r1) -; S390X-NEXT: adbr %f0, %f6 +; S390X-NEXT: ldr %f2, %f1 +; S390X-NEXT: ldr %f6, %f1 +; S390X-NEXT: adb %f6, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI19_2 -; S390X-NEXT: ldr %f2, %f6 -; S390X-NEXT: adb %f2, 0(%r1) -; S390X-NEXT: adbr %f4, %f6 +; S390X-NEXT: ldeb %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI19_4 -; S390X-NEXT: adb %f6, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI19_3 +; S390X-NEXT: adb %f2, 0(%r1) +; S390X-NEXT: adbr %f4, %f1 +; S390X-NEXT: adbr %f0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fadd_v4f64: @@ -890,9 +888,9 @@ ; SZ13-NEXT: larl %r1, .LCPI19_1 ; SZ13-NEXT: vl %v1, 0(%r1), 3 ; SZ13-NEXT: larl %r1, .LCPI19_2 -; SZ13-NEXT: vfadb %v24, %v1, %v0 -; SZ13-NEXT: vl %v0, 0(%r1), 3 ; SZ13-NEXT: vfadb %v26, %v1, %v0 +; SZ13-NEXT: vl %v0, 0(%r1), 3 +; SZ13-NEXT: vfadb %v24, %v1, %v0 ; SZ13-NEXT: br %r14 entry: %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64( @@ -933,12 +931,12 @@ define <2 x double> @constrained_vector_fsub_v2f64() #0 { ; S390X-LABEL: constrained_vector_fsub_v2f64: ; S390X: # %bb.0: # %entry -; S390X-NEXT: larl %r1, .LCPI21_1 -; S390X-NEXT: ld %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI21_0 -; S390X-NEXT: ldeb %f1, 0(%r1) -; S390X-NEXT: ldr %f0, %f2 +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI21_2 +; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI21_1 +; S390X-NEXT: ldr %f2, %f0 ; S390X-NEXT: sdb %f2, 0(%r1) ; S390X-NEXT: sdbr %f0, %f1 ; S390X-NEXT: br %r14 @@ -963,13 +961,13 @@ ; S390X-LABEL: constrained_vector_fsub_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI22_0 
-; S390X-NEXT: le %f4, 0(%r1) +; S390X-NEXT: le %f0, 0(%r1) +; S390X-NEXT: ler %f4, %f0 ; S390X-NEXT: larl %r1, .LCPI22_1 -; S390X-NEXT: ler %f0, %f4 -; S390X-NEXT: seb %f0, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI22_2 -; S390X-NEXT: ler %f2, %f4 +; S390X-NEXT: ler %f2, %f0 ; S390X-NEXT: seb %f2, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI22_2 +; S390X-NEXT: seb %f0, 0(%r1) ; S390X-NEXT: lzer %f1 ; S390X-NEXT: sebr %f4, %f1 ; S390X-NEXT: br %r14 @@ -1039,21 +1037,21 @@ define <4 x double> @constrained_vector_fsub_v4f64() #0 { ; S390X-LABEL: constrained_vector_fsub_v4f64: ; S390X: # %bb.0: # %entry -; S390X-NEXT: larl %r1, .LCPI24_1 -; S390X-NEXT: ld %f6, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI24_0 -; S390X-NEXT: ldeb %f1, 0(%r1) -; S390X-NEXT: ldr %f0, %f6 +; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI24_1 +; S390X-NEXT: ldr %f6, %f0 +; S390X-NEXT: sdb %f6, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI24_2 -; S390X-NEXT: ldr %f2, %f6 -; S390X-NEXT: sdb %f2, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI24_3 -; S390X-NEXT: ldeb %f3, 0(%r1) -; S390X-NEXT: ldr %f4, %f6 +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI24_4 -; S390X-NEXT: sdb %f6, 0(%r1) -; S390X-NEXT: sdbr %f0, %f1 -; S390X-NEXT: sdbr %f4, %f3 +; S390X-NEXT: ldeb %f3, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI24_3 +; S390X-NEXT: ldr %f2, %f0 +; S390X-NEXT: sdb %f2, 0(%r1) +; S390X-NEXT: ldr %f4, %f0 +; S390X-NEXT: sdbr %f4, %f1 +; S390X-NEXT: sdbr %f0, %f3 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fsub_v4f64: @@ -1062,9 +1060,9 @@ ; SZ13-NEXT: vl %v0, 0(%r1), 3 ; SZ13-NEXT: vgmg %v1, 12, 10 ; SZ13-NEXT: larl %r1, .LCPI24_1 -; SZ13-NEXT: vfsdb %v24, %v1, %v0 -; SZ13-NEXT: vl %v0, 0(%r1), 3 ; SZ13-NEXT: vfsdb %v26, %v1, %v0 +; SZ13-NEXT: vl %v0, 0(%r1), 3 +; SZ13-NEXT: vfsdb %v24, %v1, %v0 ; SZ13-NEXT: br %r14 entry: %sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64( @@ -1126,11 +1124,11 @@ ; S390X-LABEL: constrained_vector_sqrt_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl 
%r1, .LCPI27_0 -; S390X-NEXT: sqeb %f0, 0(%r1) +; S390X-NEXT: sqeb %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI27_1 ; S390X-NEXT: sqeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI27_2 -; S390X-NEXT: sqeb %f4, 0(%r1) +; S390X-NEXT: sqeb %f0, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_sqrt_v3f32: @@ -1186,13 +1184,13 @@ ; S390X-LABEL: constrained_vector_sqrt_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI29_0 -; S390X-NEXT: sqdb %f2, 0(%r1) +; S390X-NEXT: sqdb %f6, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI29_1 ; S390X-NEXT: sqdb %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI29_3 ; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI29_2 -; S390X-NEXT: sqdb %f6, 0(%r1) +; S390X-NEXT: sqdb %f2, 0(%r1) ; S390X-NEXT: sqdbr %f0, %f0 ; S390X-NEXT: br %r14 ; @@ -1200,10 +1198,10 @@ ; SZ13: # %bb.0: # %entry ; SZ13-NEXT: larl %r1, .LCPI29_0 ; SZ13-NEXT: vl %v0, 0(%r1), 3 -; SZ13-NEXT: vfsqdb %v24, %v0 +; SZ13-NEXT: vfsqdb %v26, %v0 ; SZ13-NEXT: larl %r1, .LCPI29_1 ; SZ13-NEXT: vl %v0, 0(%r1), 3 -; SZ13-NEXT: vfsqdb %v26, %v0 +; SZ13-NEXT: vfsqdb %v24, %v0 ; SZ13-NEXT: br %r14 entry: %sqrt = call <4 x double> @llvm.experimental.constrained.sqrt.v4f64( @@ -1279,8 +1277,7 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: brasl %r14, pow@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f9 +; S390X-NEXT: ldr %f2, %f9 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -1354,9 +1351,8 @@ ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: ler %f2, %f8 ; S390X-NEXT: brasl %r14, powf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f9 ; S390X-NEXT: ler %f2, %f10 +; S390X-NEXT: ler %f4, %f9 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -1544,10 +1540,9 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: ldr %f2, %f8 ; 
S390X-NEXT: brasl %r14, pow@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f9 -; S390X-NEXT: ldr %f2, %f10 -; S390X-NEXT: ldr %f4, %f11 +; S390X-NEXT: ldr %f2, %f11 +; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f6, %f9 ; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload @@ -1667,8 +1662,7 @@ ; S390X-NEXT: lghi %r2, 3 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, __powidf2@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -1732,9 +1726,8 @@ ; S390X-NEXT: lghi %r2, 3 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, __powisf2@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -1897,10 +1890,9 @@ ; S390X-NEXT: lghi %r2, 3 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, __powidf2@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2001,15 +1993,14 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI41_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, sin@PLT ; S390X-NEXT: larl %r1, .LCPI41_1 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, sin@PLT 
-; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -2067,9 +2058,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, sinf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -2205,7 +2195,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI44_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, sin@PLT ; S390X-NEXT: larl %r1, .LCPI44_1 ; S390X-NEXT: ld %f1, 0(%r1) @@ -2218,14 +2208,13 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, sin@PLT ; S390X-NEXT: larl %r1, .LCPI44_3 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, sin@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2321,15 +2310,14 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI46_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, cos@PLT ; S390X-NEXT: larl %r1, .LCPI46_1 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, cos@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr 
%f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -2387,9 +2375,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, cosf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -2525,7 +2512,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI49_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, cos@PLT ; S390X-NEXT: larl %r1, .LCPI49_1 ; S390X-NEXT: ld %f1, 0(%r1) @@ -2538,14 +2525,13 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, cos@PLT ; S390X-NEXT: larl %r1, .LCPI49_3 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, cos@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2641,15 +2627,14 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI51_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, exp@PLT ; S390X-NEXT: larl %r1, .LCPI51_1 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, exp@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; 
S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -2707,9 +2692,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, expf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -2845,7 +2829,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI54_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, exp@PLT ; S390X-NEXT: larl %r1, .LCPI54_1 ; S390X-NEXT: ld %f1, 0(%r1) @@ -2858,14 +2842,13 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, exp@PLT ; S390X-NEXT: larl %r1, .LCPI54_3 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, exp@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2961,15 +2944,14 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI56_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, exp2@PLT ; S390X-NEXT: larl %r1, .LCPI56_1 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, exp2@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -3027,9 
+3009,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, exp2f@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -3182,10 +3163,9 @@ ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, exp2@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3281,15 +3261,14 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI61_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log@PLT ; S390X-NEXT: larl %r1, .LCPI61_1 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -3347,9 +3326,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, logf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -3485,7 +3463,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI64_0 -; S390X-NEXT: ldeb %f0, 
0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log@PLT ; S390X-NEXT: larl %r1, .LCPI64_1 ; S390X-NEXT: ld %f1, 0(%r1) @@ -3498,14 +3476,13 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log@PLT ; S390X-NEXT: larl %r1, .LCPI64_3 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3601,15 +3578,14 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI66_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log10@PLT ; S390X-NEXT: larl %r1, .LCPI66_1 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log10@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -3667,9 +3643,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, log10f@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -3805,7 +3780,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI69_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, 
log10@PLT ; S390X-NEXT: larl %r1, .LCPI69_1 ; S390X-NEXT: ld %f1, 0(%r1) @@ -3818,14 +3793,13 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log10@PLT ; S390X-NEXT: larl %r1, .LCPI69_3 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log10@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3921,15 +3895,14 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI71_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log2@PLT ; S390X-NEXT: larl %r1, .LCPI71_1 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log2@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -3987,9 +3960,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, log2f@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -4125,7 +4097,7 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI74_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: brasl %r14, log2@PLT ; S390X-NEXT: larl %r1, .LCPI74_1 ; S390X-NEXT: ld %f1, 
0(%r1) @@ -4138,14 +4110,13 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log2@PLT ; S390X-NEXT: larl %r1, .LCPI74_3 -; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldeb %f1, 0(%r1) ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, log2@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4221,11 +4192,11 @@ ; S390X-LABEL: constrained_vector_rint_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI76_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI76_1 -; S390X-NEXT: ldeb %f1, 0(%r1) -; S390X-NEXT: fidbr %f0, 0, %f0 -; S390X-NEXT: fidbr %f2, 0, %f1 +; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: fidbr %f2, 0, %f0 +; S390X-NEXT: fidbr %f0, 0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_rint_v2f64: @@ -4251,9 +4222,9 @@ ; S390X-NEXT: le %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI77_2 ; S390X-NEXT: le %f3, 0(%r1) -; S390X-NEXT: fiebr %f0, 0, %f0 +; S390X-NEXT: fiebr %f4, 0, %f0 ; S390X-NEXT: fiebr %f2, 0, %f1 -; S390X-NEXT: fiebr %f4, 0, %f3 +; S390X-NEXT: fiebr %f0, 0, %f3 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_rint_v3f32: @@ -4320,13 +4291,13 @@ ; S390X-NEXT: larl %r1, .LCPI79_1 ; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI79_2 -; S390X-NEXT: ld %f3, 0(%r1) +; S390X-NEXT: ld %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI79_3 -; S390X-NEXT: ld %f5, 0(%r1) -; S390X-NEXT: fidbr %f0, 0, %f0 -; S390X-NEXT: fidbr %f2, 0, %f1 -; S390X-NEXT: fidbr %f4, 0, %f3 -; S390X-NEXT: fidbr %f6, 0, %f5 +; S390X-NEXT: ld %f3, 0(%r1) +; S390X-NEXT: fidbr %f6, 0, %f0 +; S390X-NEXT: fidbr %f4, 0, %f1 +; S390X-NEXT: fidbr %f2, 0, %f2 +; 
S390X-NEXT: fidbr %f0, 0, %f3 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_rint_v4f64: @@ -4387,15 +4358,14 @@ ; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill ; S390X-NEXT: .cfi_offset %f8, -168 ; S390X-NEXT: larl %r1, .LCPI81_0 -; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: brasl %r14, nearbyint@PLT ; S390X-NEXT: larl %r1, .LCPI81_1 -; S390X-NEXT: ldeb %f1, 0(%r1) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, nearbyint@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -4439,9 +4409,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, nearbyintf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -4556,10 +4525,9 @@ ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, nearbyint@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4645,8 +4613,7 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, fmax@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -4698,10 +4665,10 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset 
%f10, -184 ; S390X-NEXT: larl %r1, .LCPI87_0 -; S390X-NEXT: le %f8, 0(%r1) +; S390X-NEXT: le %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI87_1 -; S390X-NEXT: le %f2, 0(%r1) -; S390X-NEXT: ler %f0, %f8 +; S390X-NEXT: le %f8, 0(%r1) +; S390X-NEXT: ler %f2, %f8 ; S390X-NEXT: brasl %r14, fmaxf@PLT ; S390X-NEXT: larl %r1, .LCPI87_2 ; S390X-NEXT: le %f1, 0(%r1) @@ -4711,14 +4678,12 @@ ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, fmaxf@PLT ; S390X-NEXT: larl %r1, .LCPI87_4 -; S390X-NEXT: le %f1, 0(%r1) +; S390X-NEXT: le %f2, 0(%r1) ; S390X-NEXT: ler %f10, %f0 -; S390X-NEXT: ler %f0, %f1 -; S390X-NEXT: ler %f2, %f8 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: brasl %r14, fmaxf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f9 ; S390X-NEXT: ler %f2, %f10 +; S390X-NEXT: ler %f4, %f9 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4901,10 +4866,9 @@ ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, fmax@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5023,8 +4987,7 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, fmin@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -5076,10 +5039,10 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: larl %r1, .LCPI92_0 -; S390X-NEXT: le %f8, 0(%r1) +; S390X-NEXT: le %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI92_1 -; S390X-NEXT: le %f2, 
0(%r1) -; S390X-NEXT: ler %f0, %f8 +; S390X-NEXT: le %f8, 0(%r1) +; S390X-NEXT: ler %f2, %f8 ; S390X-NEXT: brasl %r14, fminf@PLT ; S390X-NEXT: larl %r1, .LCPI92_2 ; S390X-NEXT: le %f1, 0(%r1) @@ -5089,14 +5052,12 @@ ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, fminf@PLT ; S390X-NEXT: larl %r1, .LCPI92_4 -; S390X-NEXT: le %f1, 0(%r1) +; S390X-NEXT: le %f2, 0(%r1) ; S390X-NEXT: ler %f10, %f0 -; S390X-NEXT: ler %f0, %f1 -; S390X-NEXT: ler %f2, %f8 +; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: brasl %r14, fminf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f9 ; S390X-NEXT: ler %f2, %f10 +; S390X-NEXT: ler %f4, %f9 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5283,10 +5244,9 @@ ; S390X-NEXT: ldr %f10, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, fmin@PLT -; S390X-NEXT: ldr %f6, %f0 -; S390X-NEXT: ldr %f0, %f8 -; S390X-NEXT: ldr %f2, %f9 -; S390X-NEXT: ldr %f4, %f10 +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5373,8 +5333,8 @@ ; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI96_1 ; S390X-NEXT: ld %f1, 0(%r1) -; S390X-NEXT: ledbr %f0, %f0 -; S390X-NEXT: ledbr %f2, %f1 +; S390X-NEXT: ledbr %f2, %f0 +; S390X-NEXT: ledbr %f0, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fptrunc_v2f64: @@ -5444,13 +5404,13 @@ ; S390X-NEXT: larl %r1, .LCPI98_1 ; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI98_2 -; S390X-NEXT: ld %f3, 0(%r1) +; S390X-NEXT: ld %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI98_3 -; S390X-NEXT: ld %f5, 0(%r1) -; S390X-NEXT: ledbr %f0, %f0 -; S390X-NEXT: ledbr %f2, %f1 -; S390X-NEXT: ledbr %f4, %f3 -; S390X-NEXT: ledbr %f6, %f5 +; S390X-NEXT: ld %f3, 0(%r1) +; S390X-NEXT: ledbr 
%f6, %f0 +; S390X-NEXT: ledbr %f4, %f1 +; S390X-NEXT: ledbr %f2, %f2 +; S390X-NEXT: ledbr %f0, %f3 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fptrunc_v4f64: @@ -5504,9 +5464,9 @@ ; S390X-LABEL: constrained_vector_fpext_v2f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI100_0 -; S390X-NEXT: ldeb %f0, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI100_1 ; S390X-NEXT: ldeb %f2, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI100_1 +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fpext_v2f32: @@ -5531,13 +5491,13 @@ ; S390X-NEXT: sllg %r1, %r0, 32 ; S390X-NEXT: ldgr %f0, %r1 ; S390X-NEXT: nilf %r0, 0 -; S390X-NEXT: ldgr %f1, %r0 -; S390X-NEXT: ldeb %f2, 8(%r2) -; S390X-NEXT: ldebr %f1, %f1 +; S390X-NEXT: ldeb %f1, 8(%r2) +; S390X-NEXT: ldgr %f2, %r0 +; S390X-NEXT: ldebr %f2, %f2 ; S390X-NEXT: ldebr %f0, %f0 +; S390X-NEXT: std %f1, 16(%r3) ; S390X-NEXT: std %f0, 8(%r3) -; S390X-NEXT: std %f2, 16(%r3) -; S390X-NEXT: std %f1, 0(%r3) +; S390X-NEXT: std %f2, 0(%r3) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fpext_v3f64: @@ -5563,13 +5523,13 @@ ; S390X-LABEL: constrained_vector_fpext_v4f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI102_0 -; S390X-NEXT: ldeb %f0, 0(%r1) +; S390X-NEXT: ldeb %f6, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI102_1 -; S390X-NEXT: ldeb %f2, 0(%r1) -; S390X-NEXT: larl %r1, .LCPI102_2 ; S390X-NEXT: ldeb %f4, 0(%r1) +; S390X-NEXT: larl %r1, .LCPI102_2 +; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI102_3 -; S390X-NEXT: ldeb %f6, 0(%r1) +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fpext_v4f32: @@ -5638,8 +5598,7 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, ceil@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -5682,9 +5641,8 @@ ; 
S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, ceilf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -5810,8 +5768,7 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, floor@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -5854,9 +5811,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, floorf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -5981,8 +5937,7 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, round@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -6025,9 +5980,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, roundf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) @@ -6153,8 +6107,7 @@ ; S390X-NEXT: ldr %f8, %f0 ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: brasl %r14, trunc@PLT -; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, 
%r15, 280(%r15) ; S390X-NEXT: br %r14 @@ -6197,9 +6150,8 @@ ; S390X-NEXT: ler %f9, %f0 ; S390X-NEXT: ler %f0, %f1 ; S390X-NEXT: brasl %r14, truncf@PLT -; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: ler %f0, %f8 ; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 ; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload ; S390X-NEXT: lmg %r14, %r15, 288(%r15) diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1104,10 +1104,10 @@ ; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edi -; X87-NEXT: movl %edi, 12(%esi) -; X87-NEXT: movl %edx, 8(%esi) -; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %edi, 8(%esi) +; X87-NEXT: movl %edx, 12(%esi) ; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %ecx, 4(%esi) ; X87-NEXT: movl %esi, %eax ; X87-NEXT: addl $36, %esp ; X87-NEXT: popl %esi @@ -1130,10 +1130,10 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 12(%esi) -; X86-SSE-NEXT: movl %edx, 8(%esi) -; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %edi, 8(%esi) +; X86-SSE-NEXT: movl %edx, 12(%esi) ; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %ecx, 4(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $36, %esp ; X86-SSE-NEXT: popl %esi @@ -1443,10 +1443,10 @@ ; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edi -; X87-NEXT: movl %edi, 12(%esi) -; X87-NEXT: movl %edx, 8(%esi) -; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %edi, 8(%esi) +; X87-NEXT: movl %edx, 12(%esi) ; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %ecx, 4(%esi) ; X87-NEXT: movl %esi, %eax ; X87-NEXT: addl $36, %esp ; X87-NEXT: popl %esi @@ -1469,10 
+1469,10 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 12(%esi) -; X86-SSE-NEXT: movl %edx, 8(%esi) -; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %edi, 8(%esi) +; X86-SSE-NEXT: movl %edx, 12(%esi) ; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %ecx, 4(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $36, %esp ; X86-SSE-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -47,10 +47,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, vf128+8 -; X86-NEXT: movl %edx, vf128+12 -; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %esi, vf128+12 +; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: movl %eax, vf128 ; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -94,10 +94,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, vf128+8 -; X86-NEXT: movl %edx, vf128+12 -; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %esi, vf128+12 +; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: movl %eax, vf128 ; X86-NEXT: addl $40, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -143,10 +143,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, vf128+8 -; X86-NEXT: movl %edx, vf128+12 -; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %esi, vf128+12 +; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: movl %eax, vf128 ; X86-NEXT: addl $40, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -396,10 +396,10 @@ ; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -535,10 +535,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -42,10 +42,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -87,10 +87,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -132,10 +132,10 @@ ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -177,10 +177,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -226,10 +226,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -271,10 +271,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -312,10 +312,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 
8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -353,10 +353,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -394,10 +394,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -435,10 +435,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -476,10 +476,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -517,10 +517,10 @@ ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -558,10 +558,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -599,10 +599,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -644,10 +644,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -689,10 +689,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 
8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -730,10 +730,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -775,10 +775,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -817,10 +817,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -858,10 +858,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -899,10 +899,10 @@ ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -940,10 +940,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -981,10 +981,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -1022,10 +1022,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) ; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll +++ 
b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll @@ -40,8 +40,8 @@ ; CHECK: [[MOVAPDrm:%[0-9]+]]:vr128 = MOVAPDrm $rip, 1, $noreg, %const.0, $noreg :: (load 16 from constant-pool) ; CHECK: [[ADDPDrm:%[0-9]+]]:vr128 = ADDPDrm [[MOVAPDrm]], $rip, 1, $noreg, %const.1, $noreg, implicit $mxcsr :: (load 16 from constant-pool) ; CHECK: [[ADDPDrm1:%[0-9]+]]:vr128 = ADDPDrm [[MOVAPDrm]], $rip, 1, $noreg, %const.2, $noreg, implicit $mxcsr :: (load 16 from constant-pool) -; CHECK: $xmm0 = COPY [[ADDPDrm]] -; CHECK: $xmm1 = COPY [[ADDPDrm1]] +; CHECK: $xmm0 = COPY [[ADDPDrm1]] +; CHECK: $xmm1 = COPY [[ADDPDrm]] ; CHECK: RET 0, $xmm0, $xmm1 entry: %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64( diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -115,10 +115,10 @@ ; CHECK-LABEL: constrained_vector_fdiv_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm2 = [1.0E+1,1.0E+1] -; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0] -; CHECK-NEXT: divpd %xmm2, %xmm0 ; CHECK-NEXT: movapd {{.*#+}} xmm1 = [3.0E+0,4.0E+0] ; CHECK-NEXT: divpd %xmm2, %xmm1 +; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0] +; CHECK-NEXT: divpd %xmm2, %xmm0 ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fdiv_v4f64: @@ -292,9 +292,9 @@ ; CHECK-NEXT: callq fmod ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -507,10 +507,10 @@ define <4 x 
double> @constrained_vector_fmul_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fmul_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] -; CHECK-NEXT: movapd {{.*#+}} xmm0 = [2.0E+0,3.0E+0] -; CHECK-NEXT: mulpd %xmm1, %xmm0 -; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] +; CHECK-NEXT: movapd {{.*#+}} xmm1 = [4.0E+0,5.0E+0] +; CHECK-NEXT: mulpd %xmm0, %xmm1 +; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fmul_v4f64: @@ -644,10 +644,10 @@ define <4 x double> @constrained_vector_fadd_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fadd_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] -; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0000000000000001E-1] -; CHECK-NEXT: addpd %xmm1, %xmm0 -; CHECK-NEXT: addpd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] +; CHECK-NEXT: movapd {{.*#+}} xmm1 = [2.0E+0,2.0000000000000001E-1] +; CHECK-NEXT: addpd %xmm0, %xmm1 +; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fadd_v4f64: @@ -784,10 +784,10 @@ define <4 x double> @constrained_vector_fsub_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fsub_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308] -; CHECK-NEXT: movapd %xmm1, %xmm0 -; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308] +; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fsub_v4f64: @@ -912,8 +912,8 @@ define <4 x double> @constrained_vector_sqrt_v4f64() #0 { ; CHECK-LABEL: 
constrained_vector_sqrt_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_sqrt_v4f64: @@ -1077,9 +1077,9 @@ ; CHECK-NEXT: callq pow ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1333,9 +1333,9 @@ ; CHECK-NEXT: callq __powidf2 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1570,9 +1570,9 @@ ; CHECK-NEXT: callq sin ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1794,9 +1794,9 @@ ; CHECK-NEXT: callq cos ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte 
Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2018,9 +2018,9 @@ ; CHECK-NEXT: callq exp ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2242,9 +2242,9 @@ ; CHECK-NEXT: callq exp2 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2466,9 +2466,9 @@ ; CHECK-NEXT: callq log ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2690,9 +2690,9 @@ ; CHECK-NEXT: callq log10 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 
8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2914,9 +2914,9 @@ ; CHECK-NEXT: callq log2 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -3116,9 +3116,9 @@ ; CHECK-NEXT: callq rint ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -3286,9 +3286,9 @@ ; CHECK-NEXT: callq nearbyint ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -3492,9 +3492,9 @@ ; CHECK-NEXT: callq fmax ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd 
(%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -3742,9 +3742,9 @@ ; CHECK-NEXT: callq fmin ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -3975,9 +3975,9 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i64_v3f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rdx ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rcx +; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rdx +; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f32: @@ -4217,9 +4217,9 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i64_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rdx ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rcx +; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rdx +; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f64: @@ -5542,9 +5542,9 @@ ; CHECK-LABEL: constrained_vector_fpext_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: cvtss2sd %xmm0, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: cvtss2sd %xmm0, %xmm0 -; 
CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: cvtss2sd %xmm1, %xmm1 ; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: cvtss2sd %xmm2, %xmm2 ; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp) @@ -5573,8 +5573,8 @@ define <4 x double> @constrained_vector_fpext_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fpext_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtps2pd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: cvtps2pd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: cvtps2pd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fpext_v4f32: @@ -5694,9 +5694,9 @@ ; CHECK-NEXT: callq ceil ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -5822,9 +5822,9 @@ ; CHECK-NEXT: callq floor ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -5972,9 +5972,9 @@ ; CHECK-NEXT: callq round ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; 
CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -6112,9 +6112,9 @@ ; CHECK-NEXT: callq trunc ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -6396,8 +6396,8 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 ; CHECK-NEXT: cvtsi2sd %rsi, %xmm1 +; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 ; CHECK-NEXT: cvtsi2sd %rdx, %xmm2 ; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) @@ -7255,15 +7255,15 @@ define <4 x double> @constrained_vector_uitofp_v4f64_v4i32(<4 x i32> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: xorpd %xmm2, %xmm2 -; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15] -; CHECK-NEXT: orpd %xmm3, %xmm0 -; CHECK-NEXT: subpd %xmm3, %xmm0 +; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15] ; CHECK-NEXT: orpd %xmm3, %xmm1 ; CHECK-NEXT: subpd %xmm3, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: orpd %xmm3, %xmm0 +; CHECK-NEXT: subpd %xmm3, %xmm0 ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i32: @@ -7331,22 +7331,22 @@ ; CHECK-LABEL: 
constrained_vector_uitofp_v4f64_v4i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: movdqa %xmm1, %xmm3 ; CHECK-NEXT: pand %xmm2, %xmm3 ; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; CHECK-NEXT: por %xmm4, %xmm3 -; CHECK-NEXT: psrlq $32, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] -; CHECK-NEXT: por %xmm5, %xmm0 -; CHECK-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] -; CHECK-NEXT: subpd %xmm6, %xmm0 -; CHECK-NEXT: addpd %xmm3, %xmm0 -; CHECK-NEXT: pand %xmm1, %xmm2 -; CHECK-NEXT: por %xmm4, %xmm2 ; CHECK-NEXT: psrlq $32, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; CHECK-NEXT: subpd %xmm6, %xmm1 -; CHECK-NEXT: addpd %xmm2, %xmm1 +; CHECK-NEXT: addpd %xmm3, %xmm1 +; CHECK-NEXT: pand %xmm0, %xmm2 +; CHECK-NEXT: por %xmm4, %xmm2 +; CHECK-NEXT: psrlq $32, %xmm0 +; CHECK-NEXT: por %xmm5, %xmm0 +; CHECK-NEXT: subpd %xmm6, %xmm0 +; CHECK-NEXT: addpd %xmm2, %xmm0 ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i64: