diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -533,7 +533,8 @@
     SDValue foldSelectOfConstants(SDNode *N);
     SDValue foldVSelectOfConstants(SDNode *N);
-    SDValue foldBinOpIntoSelect(SDNode *BO);
+    SDValue foldBinOpIntoSelect(SDNode *BO, APInt *IdentityInt = nullptr,
+                                bool RHSIdentityOnly = false);
     bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
     SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N);
     SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
@@ -2022,13 +2023,79 @@
   return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
 }
 
-SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
+/// Return true if 'Use' is a load or a store that uses N as its base pointer
+/// and that N may be folded in the load / store addressing mode.
+static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG,
+                                    const TargetLowering &TLI) {
+  EVT VT;
+  unsigned AS;
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
+    if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
+      return false;
+    VT = LD->getMemoryVT();
+    AS = LD->getAddressSpace();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
+    if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
+      return false;
+    VT = ST->getMemoryVT();
+    AS = ST->getAddressSpace();
+  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) {
+    if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
+      return false;
+    VT = LD->getMemoryVT();
+    AS = LD->getAddressSpace();
+  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) {
+    if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
+      return false;
+    VT = ST->getMemoryVT();
+    AS = ST->getAddressSpace();
+  } else
+    return false;
+
+  TargetLowering::AddrMode AM;
+  if (N->getOpcode() == ISD::ADD) {
+    AM.HasBaseReg = true;
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (Offset)
+      // [reg +/- imm]
+      AM.BaseOffs = Offset->getSExtValue();
+    else
+      // [reg +/- reg]
+      AM.Scale = 1;
+  } else if (N->getOpcode() == ISD::SUB) {
+    AM.HasBaseReg = true;
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (Offset)
+      // [reg +/- imm]
+      AM.BaseOffs = -Offset->getSExtValue();
+    else
+      // [reg +/- reg]
+      AM.Scale = 1;
+  } else
+    return false;
+
+  return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
+                                   VT.getTypeForEVT(*DAG.getContext()), AS);
+}
+
+// Fold a binop with a constant into a select with constant operands:
+//   (add x, (select cc, ct, cf)) -> (select cc, x + ct, x + cf)
+//   (mul x, (select cc, ct, cf)) -> (select cc, x * ct, x * cf)
+//   etc.
+//
+// Also fold the binop into the select when one select operand is the binop's
+// identity value, so that side of the select is left unmodified:
+//   (add x, (select cc, 0, cf)) -> (select cc, x, (add x, cf))
+//   (mul x, (select cc, 1, cf)) -> (select cc, x, (mul x, cf))
+//   etc.
+SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO, APInt *IdentityInt,
+                                         bool RHSIdentityOnly) {
   assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 &&
          "Unexpected binary operator");
 
-  // Don't do this unless the old select is going away. We want to eliminate the
-  // binary operator, not replace a binop with a select.
-  // TODO: Handle ISD::SELECT_CC.
+  // Don't do this unless the select has exactly one use, so that the select
+  // itself goes away.
+  // TODO: Handle ISD::SELECT_CC and ISD::VSELECT.
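+  // Summary of the new parameters (derived from the call sites below):
+  // IdentityInt, when non-null, is the identity value for BO's opcode (0 for
+  // add/or/xor, 1 for mul/div, all-ones for and, 0 for shift amounts); if one
+  // select arm equals it, that arm can simply reuse the other operand.
+  // RHSIdentityOnly restricts the identity fold to a select in operand 1, for
+  // non-commutative ops (sub, div, shifts) where only the RHS has an identity.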
   unsigned SelOpNo = 0;
   SDValue Sel = BO->getOperand(0);
   if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
@@ -2039,12 +2106,43 @@
   if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
     return SDValue();
 
+  SDValue CC = Sel.getOperand(0);
   SDValue CT = Sel.getOperand(1);
+  SDValue CF = Sel.getOperand(2);
+
+  auto BinOpcode = BO->getOpcode();
+  SDValue CBO = BO->getOperand(SelOpNo ^ 1);
+
+  if (IdentityInt && (!RHSIdentityOnly || SelOpNo == 1)) {
+    // We don't want to create unnecessary instructions if this operator can
+    // already be folded into an address calculation in the target.
+    bool CanFoldInAddr = false;
+    for (SDNode *Use : BO->uses())
+      if (canFoldInAddressingMode(BO, Use, DAG, TLI)) {
+        CanFoldInAddr = true;
+        break;
+      }
+
+    for (char Swap = 0; !CanFoldInAddr && Swap <= 1; Swap++) {
+      ConstantSDNode *IdentOp = isConstOrConstSplat(Swap ? CF : CT);
+      if (!IdentOp || IdentOp->isOpaque() ||
+          IdentOp->getAPIntValue() != *IdentityInt)
+        continue;
+
+      const SDLoc DL = SDLoc(BO);
+      EVT BOT = BO->getValueType(0);
+
+      SDValue NewBO = DAG.getNode(BinOpcode, DL, BOT, CBO, Swap ? CT : CF);
+
+      return Swap ? DAG.getSelect(DL, BOT, CC, NewBO, CBO)
+                  : DAG.getSelect(DL, BOT, CC, CBO, NewBO);
+    }
+  }
+
   if (!isConstantOrConstantVector(CT, true) &&
       !isConstantFPBuildVectorOrConstantFP(CT))
     return SDValue();
 
-  SDValue CF = Sel.getOperand(2);
   if (!isConstantOrConstantVector(CF, true) &&
       !isConstantFPBuildVectorOrConstantFP(CF))
     return SDValue();
@@ -2054,13 +2152,11 @@
   // propagate non constant operands into select. I.e.:
   // and (select Cond, 0, -1), X --> select Cond, 0, X
   // or X, (select Cond, -1, 0) --> select Cond, -1, X
-  auto BinOpcode = BO->getOpcode();
   bool CanFoldNonConst =
       (BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
       (isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) &&
       (isNullOrNullSplat(CF) || isAllOnesOrAllOnesSplat(CF));
 
-  SDValue CBO = BO->getOperand(SelOpNo ^ 1);
   if (!CanFoldNonConst &&
       !isConstantOrConstantVector(CBO, true) &&
       !isConstantFPBuildVectorOrConstantFP(CBO))
@@ -2265,7 +2361,8 @@
     }
   }
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
+  APInt IdentityInt = APInt::getNullValue(VT.getScalarSizeInBits());
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt))
     return NewSel;
 
   // reassociate add
@@ -3114,7 +3211,8 @@
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1}))
     return C;
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
+  APInt IdentityInt = APInt::getNullValue(VT.getScalarSizeInBits());
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt, true))
     return NewSel;
 
   ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
@@ -3629,7 +3727,8 @@
   if (N1IsConst && ConstValue1.isOneValue())
     return N0;
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
+  APInt IdentityInt = APInt(VT.getScalarSizeInBits(), 1);
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt))
     return NewSel;
 
   // fold (mul x, -1) -> 0-x
@@ -3967,7 +4066,8 @@
   if (SDValue V = simplifyDivRem(N, DAG))
     return V;
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
+  APInt IdentityInt = APInt(VT.getScalarSizeInBits(), 1);
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt, true))
     return NewSel;
 
   // If we know the sign bits of both operands are zero, strength reduce to a
@@ -4108,7 +4208,8 @@
   if (SDValue V = simplifyDivRem(N, DAG))
     return V;
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
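+  // Only the divisor (the RHS) has an identity value, namely 1:
+  //   (udiv x, (select cc, 1, cf)) -> (select cc, x, (udiv x, cf))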
+  APInt IdentityInt = APInt(VT.getScalarSizeInBits(), 1);
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt, true))
     return NewSel;
 
   if (SDValue V = visitUDIVLike(N0, N1, N)) {
@@ -5432,7 +5533,8 @@
                               APInt::getAllOnesValue(BitWidth)))
     return DAG.getConstant(0, SDLoc(N), VT);
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
+  APInt IdentityInt = APInt::getAllOnesValue(VT.getScalarSizeInBits());
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt))
     return NewSel;
 
   // reassociate and
@@ -6185,7 +6287,8 @@
   if (isAllOnesConstant(N1))
     return N1;
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
+  APInt IdentityInt = APInt::getNullValue(VT.getScalarSizeInBits());
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt))
     return NewSel;
 
   // fold (or x, c) -> c iff (x & ~c) == 0
@@ -7474,7 +7577,8 @@
   if (isNullConstant(N1))
     return N0;
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
+  APInt IdentityInt = APInt::getNullValue(VT.getScalarSizeInBits());
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt))
     return NewSel;
 
   // reassociate xor
@@ -7906,7 +8010,6 @@
     return V;
 
   EVT VT = N0.getValueType();
-  EVT ShiftVT = N1.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
   // fold vector ops
@@ -7940,7 +8043,9 @@
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1}))
     return C;
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
+  EVT ShiftVT = N1.getValueType();
+  APInt IdentityInt = APInt::getNullValue(ShiftVT.getScalarSizeInBits());
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt, true))
     return NewSel;
 
   // if (shl x, c) is known to be zero, return 0
@@ -8251,7 +8356,9 @@
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1}))
     return C;
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
+  EVT ShiftVT = N1.getValueType();
+  APInt IdentityInt = APInt::getNullValue(ShiftVT.getScalarSizeInBits());
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt, true))
     return NewSel;
 
   // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
@@ -8273,7 +8380,6 @@
   // clamp (add c1, c2) to max shift.
   if (N0.getOpcode() == ISD::SRA) {
     SDLoc DL(N);
-    EVT ShiftVT = N1.getValueType();
     EVT ShiftSVT = ShiftVT.getScalarType();
     SmallVector<SDValue, 16> ShiftValues;
@@ -8440,7 +8546,9 @@
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1}))
     return C;
 
-  if (SDValue NewSel = foldBinOpIntoSelect(N))
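+  // A shift amount of 0 is the identity, and only the RHS (the amount) can
+  // be a select; IdentityInt is sized for the shift-amount type ShiftVT,
+  // which may differ from the result type VT.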
+  EVT ShiftVT = N1.getValueType();
+  APInt IdentityInt = APInt::getNullValue(ShiftVT.getScalarSizeInBits());
+  if (SDValue NewSel = foldBinOpIntoSelect(N, &IdentityInt, true))
     return NewSel;
 
   // if (srl x, c) is known to be zero, return 0
@@ -8469,7 +8577,6 @@
   };
   if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
     SDLoc DL(N);
-    EVT ShiftVT = N1.getValueType();
     SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
     return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
   }
@@ -14372,63 +14479,6 @@
   return SDValue();
 }
 
-/// Return true if 'Use' is a load or a store that uses N as its base pointer
-/// and that N may be folded in the load / store addressing mode.
-static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
-                                    SelectionDAG &DAG,
-                                    const TargetLowering &TLI) {
-  EVT VT;
-  unsigned AS;
-
-  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
-    if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
-      return false;
-    VT = LD->getMemoryVT();
-    AS = LD->getAddressSpace();
-  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
-    if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
-      return false;
-    VT = ST->getMemoryVT();
-    AS = ST->getAddressSpace();
-  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) {
-    if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
-      return false;
-    VT = LD->getMemoryVT();
-    AS = LD->getAddressSpace();
-  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) {
-    if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
-      return false;
-    VT = ST->getMemoryVT();
-    AS = ST->getAddressSpace();
-  } else
-    return false;
-
-  TargetLowering::AddrMode AM;
-  if (N->getOpcode() == ISD::ADD) {
-    AM.HasBaseReg = true;
-    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
-    if (Offset)
-      // [reg +/- imm]
-      AM.BaseOffs = Offset->getSExtValue();
-    else
-      // [reg +/- reg]
-      AM.Scale = 1;
-  } else if (N->getOpcode() == ISD::SUB) {
-    AM.HasBaseReg = true;
-    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
-    if (Offset)
-      // [reg +/- imm]
-      AM.BaseOffs = -Offset->getSExtValue();
-    else
-      // [reg +/- reg]
-      AM.Scale = 1;
-  } else
-    return false;
-
-  return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
-                                   VT.getTypeForEVT(*DAG.getContext()), AS);
-}
-
 static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec,
                                      bool &IsLoad, bool &IsMasked, SDValue &Ptr,
                                      const TargetLowering &TLI) {
diff --git a/llvm/test/CodeGen/AArch64/half.ll b/llvm/test/CodeGen/AArch64/half.ll
--- a/llvm/test/CodeGen/AArch64/half.ll
+++ b/llvm/test/CodeGen/AArch64/half.ll
@@ -107,12 +107,12 @@
 ; CHECK-NEXT:    movk w9, #15428, lsl #16
 ; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    cset w8, pl
-; CHECK-NEXT:    fccmp s0, s1, #8, pl
-; CHECK-NEXT:    mov w9, #4
-; CHECK-NEXT:    csinc w9, w9, wzr, mi
-; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    mov w10, #4
+; CHECK-NEXT:    fccmp s0, s2, #8, pl
+; CHECK-NEXT:    csinc w8, w10, wzr, mi
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cinc w0, w8, pl
 ; CHECK-NEXT:    ret
   %f16 = bitcast i16 %in to half
   %cmp0 = fcmp ogt half 0xH3333, %f16
diff --git a/llvm/test/CodeGen/AArch64/midpoint-int.ll b/llvm/test/CodeGen/AArch64/midpoint-int.ll
--- a/llvm/test/CodeGen/AArch64/midpoint-int.ll
+++ b/llvm/test/CodeGen/AArch64/midpoint-int.ll
@@ -14,13 +14,13 @@
 ; CHECK-LABEL: scalar_i32_signed_reg_reg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp w0, w1
-; CHECK-NEXT:    csel w9, w1, w0, gt
-; CHECK-NEXT:    csel w10, w0, w1, gt
-; CHECK-NEXT:    mov w8, #-1
-; CHECK-NEXT:    sub w9, w10, w9
-; CHECK-NEXT:    cneg w8, w8, le
-; CHECK-NEXT:    lsr w9, w9, #1
-; CHECK-NEXT:    madd w0, w9, w8, w0
+; CHECK-NEXT:    csel w8, w1, w0, gt
+; CHECK-NEXT:    csel w9, w0, w1, gt
+; CHECK-NEXT:    sub w8, w9, w8
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    lsr w8, w8, #1
+; CHECK-NEXT:    cneg w8, w8, gt
+; CHECK-NEXT:    add w0, w8, w0
 ; CHECK-NEXT:    ret
   %t3 = icmp sgt i32 %a1, %a2 ; signed
   %t4 = select i1 %t3, i32 -1, i32 1
@@ -37,13 +37,13 @@
 ; CHECK-LABEL: scalar_i32_unsigned_reg_reg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp w0, w1
-; CHECK-NEXT:    csel w9, w1, w0, hi
-; CHECK-NEXT:    csel w10, w0, w1, hi
-; CHECK-NEXT:    mov w8, #-1
-; CHECK-NEXT:    sub w9, w10, w9
-; CHECK-NEXT:    cneg w8, w8, ls
-; CHECK-NEXT:    lsr w9, w9, #1
-; CHECK-NEXT:    madd w0, w9,
w8, w0 +; CHECK-NEXT: csel w8, w1, w0, hi +; CHECK-NEXT: csel w9, w0, w1, hi +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: lsr w8, w8, #1 +; CHECK-NEXT: cneg w8, w8, hi +; CHECK-NEXT: add w0, w8, w0 ; CHECK-NEXT: ret %t3 = icmp ugt i32 %a1, %a2 %t4 = select i1 %t3, i32 -1, i32 1 @@ -62,14 +62,14 @@ ; CHECK-LABEL: scalar_i32_signed_mem_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: mov w9, #-1 ; CHECK-NEXT: cmp w8, w1 -; CHECK-NEXT: csel w10, w1, w8, gt -; CHECK-NEXT: csel w11, w8, w1, gt -; CHECK-NEXT: sub w10, w11, w10 -; CHECK-NEXT: cneg w9, w9, le -; CHECK-NEXT: lsr w10, w10, #1 -; CHECK-NEXT: madd w0, w10, w9, w8 +; CHECK-NEXT: csel w9, w1, w8, gt +; CHECK-NEXT: csel w10, w8, w1, gt +; CHECK-NEXT: sub w9, w10, w9 +; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: lsr w9, w9, #1 +; CHECK-NEXT: cneg w9, w9, gt +; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret %a1 = load i32, i32* %a1_addr %t3 = icmp sgt i32 %a1, %a2 ; signed @@ -87,14 +87,14 @@ ; CHECK-LABEL: scalar_i32_signed_reg_mem: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x1] -; CHECK-NEXT: mov w9, #-1 ; CHECK-NEXT: cmp w0, w8 -; CHECK-NEXT: csel w10, w8, w0, gt -; CHECK-NEXT: csel w8, w0, w8, gt -; CHECK-NEXT: sub w8, w8, w10 -; CHECK-NEXT: cneg w9, w9, le -; CHECK-NEXT: lsr w8, w8, #1 -; CHECK-NEXT: madd w0, w8, w9, w0 +; CHECK-NEXT: csel w9, w8, w0, gt +; CHECK-NEXT: csel w10, w0, w8, gt +; CHECK-NEXT: sub w9, w10, w9 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: lsr w8, w9, #1 +; CHECK-NEXT: cneg w8, w8, gt +; CHECK-NEXT: add w0, w8, w0 ; CHECK-NEXT: ret %a2 = load i32, i32* %a2_addr %t3 = icmp sgt i32 %a1, %a2 ; signed @@ -113,14 +113,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: ldr w9, [x1] -; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: csel w11, w9, w8, gt -; CHECK-NEXT: csel w9, w8, w9, gt -; CHECK-NEXT: sub w9, w9, w11 -; CHECK-NEXT: cneg w10, w10, le -; CHECK-NEXT: lsr w9, w9, #1 -; CHECK-NEXT: madd w0, w9, w10, w8 +; CHECK-NEXT: csel w10, w9, w8, gt +; CHECK-NEXT: csel w11, w8, w9, gt +; CHECK-NEXT: sub w10, w11, w10 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: lsr w9, w10, #1 +; CHECK-NEXT: cneg w9, w9, gt +; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret %a1 = load i32, i32* %a1_addr %a2 = load i32, i32* %a2_addr @@ -145,13 +145,13 @@ ; CHECK-LABEL: scalar_i64_signed_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: cmp x0, x1 -; CHECK-NEXT: csel x9, x1, x0, gt -; CHECK-NEXT: csel x10, x0, x1, gt -; CHECK-NEXT: mov x8, #-1 -; CHECK-NEXT: sub x9, x10, x9 -; CHECK-NEXT: cneg x8, x8, le -; CHECK-NEXT: lsr x9, x9, #1 -; CHECK-NEXT: madd x0, x9, x8, x0 +; CHECK-NEXT: csel x8, x1, x0, gt +; CHECK-NEXT: csel x9, x0, x1, gt +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: cmp x0, x1 +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: cneg x8, x8, gt +; CHECK-NEXT: add x0, x8, x0 ; CHECK-NEXT: ret %t3 = icmp sgt i64 %a1, %a2 ; signed %t4 = select i1 %t3, i64 -1, i64 1 @@ -168,13 +168,13 @@ ; CHECK-LABEL: scalar_i64_unsigned_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: cmp x0, x1 -; CHECK-NEXT: csel x9, x1, x0, hi -; CHECK-NEXT: csel x10, x0, x1, hi -; CHECK-NEXT: mov x8, #-1 -; CHECK-NEXT: sub x9, x10, x9 -; CHECK-NEXT: cneg x8, x8, ls -; CHECK-NEXT: lsr x9, x9, #1 -; CHECK-NEXT: madd x0, x9, x8, x0 +; CHECK-NEXT: csel x8, x1, x0, hi +; CHECK-NEXT: csel x9, x0, x1, hi +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: cmp x0, x1 +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: cneg x8, x8, hi +; CHECK-NEXT: add x0, x8, x0 ; CHECK-NEXT: ret %t3 = icmp ugt i64 %a1, %a2 %t4 = select i1 %t3, i64 -1, 
i64 1 @@ -193,14 +193,14 @@ ; CHECK-LABEL: scalar_i64_signed_mem_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov x9, #-1 ; CHECK-NEXT: cmp x8, x1 -; CHECK-NEXT: csel x10, x1, x8, gt -; CHECK-NEXT: csel x11, x8, x1, gt -; CHECK-NEXT: sub x10, x11, x10 -; CHECK-NEXT: cneg x9, x9, le -; CHECK-NEXT: lsr x10, x10, #1 -; CHECK-NEXT: madd x0, x10, x9, x8 +; CHECK-NEXT: csel x9, x1, x8, gt +; CHECK-NEXT: csel x10, x8, x1, gt +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: cmp x8, x1 +; CHECK-NEXT: lsr x9, x9, #1 +; CHECK-NEXT: cneg x9, x9, gt +; CHECK-NEXT: add x0, x9, x8 ; CHECK-NEXT: ret %a1 = load i64, i64* %a1_addr %t3 = icmp sgt i64 %a1, %a2 ; signed @@ -218,14 +218,14 @@ ; CHECK-LABEL: scalar_i64_signed_reg_mem: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x1] -; CHECK-NEXT: mov x9, #-1 ; CHECK-NEXT: cmp x0, x8 -; CHECK-NEXT: csel x10, x8, x0, gt -; CHECK-NEXT: csel x8, x0, x8, gt -; CHECK-NEXT: sub x8, x8, x10 -; CHECK-NEXT: cneg x9, x9, le -; CHECK-NEXT: lsr x8, x8, #1 -; CHECK-NEXT: madd x0, x8, x9, x0 +; CHECK-NEXT: csel x9, x8, x0, gt +; CHECK-NEXT: csel x10, x0, x8, gt +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: cmp x0, x8 +; CHECK-NEXT: lsr x8, x9, #1 +; CHECK-NEXT: cneg x8, x8, gt +; CHECK-NEXT: add x0, x8, x0 ; CHECK-NEXT: ret %a2 = load i64, i64* %a2_addr %t3 = icmp sgt i64 %a1, %a2 ; signed @@ -244,14 +244,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: ldr x9, [x1] -; CHECK-NEXT: mov x10, #-1 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: csel x11, x9, x8, gt -; CHECK-NEXT: csel x9, x8, x9, gt -; CHECK-NEXT: sub x9, x9, x11 -; CHECK-NEXT: cneg x10, x10, le -; CHECK-NEXT: lsr x9, x9, #1 -; CHECK-NEXT: madd x0, x9, x10, x8 +; CHECK-NEXT: csel x10, x9, x8, gt +; CHECK-NEXT: csel x11, x8, x9, gt +; CHECK-NEXT: sub x10, x11, x10 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: lsr x9, x10, #1 +; CHECK-NEXT: cneg x9, x9, gt +; CHECK-NEXT: add x0, x9, x8 ; CHECK-NEXT: ret %a1 = load i64, i64* %a1_addr %a2 = load i64, i64* %a2_addr @@ -276,14 +276,14 @@ ; CHECK-LABEL: scalar_i16_signed_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: mov w9, #-1 ; CHECK-NEXT: cmp w8, w1, sxth -; CHECK-NEXT: cneg w8, w9, le ; CHECK-NEXT: csel w9, w1, w0, gt ; CHECK-NEXT: csel w10, w0, w1, gt ; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: ubfx w9, w9, #1, #15 -; CHECK-NEXT: madd w0, w9, w8, w0 +; CHECK-NEXT: cmp w8, w1, sxth +; CHECK-NEXT: ubfx w8, w9, #1, #15 +; CHECK-NEXT: cneg w8, w8, gt +; CHECK-NEXT: add w0, w8, w0 ; CHECK-NEXT: ret %t3 = icmp sgt i16 %a1, %a2 ; signed %t4 = select i1 %t3, i16 -1, i16 1 @@ -300,14 +300,14 @@ ; CHECK-LABEL: scalar_i16_unsigned_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: mov w9, #-1 ; CHECK-NEXT: cmp w8, w1, uxth -; CHECK-NEXT: cneg w8, w9, ls ; CHECK-NEXT: csel w9, w1, w0, hi ; CHECK-NEXT: csel w10, w0, w1, hi ; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: ubfx w9, w9, #1, #15 -; CHECK-NEXT: madd w0, w9, w8, w0 +; CHECK-NEXT: cmp w8, w1, uxth +; CHECK-NEXT: ubfx w8, w9, #1, #15 +; CHECK-NEXT: cneg w8, w8, hi +; CHECK-NEXT: add w0, w8, w0 ; CHECK-NEXT: ret %t3 = icmp ugt i16 %a1, %a2 %t4 = select i1 %t3, i16 -1, i16 1 @@ -326,14 +326,14 @@ ; CHECK-LABEL: scalar_i16_signed_mem_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: ldrsh w8, [x0] -; CHECK-NEXT: mov w9, #-1 ; CHECK-NEXT: cmp w8, w1, sxth -; CHECK-NEXT: csel w10, w1, w8, gt -; CHECK-NEXT: csel w11, w8, w1, gt -; CHECK-NEXT: sub w10, w11, w10 -; CHECK-NEXT: cneg w9, w9, le -; CHECK-NEXT: ubfx w10, w10, #1, #15 -; CHECK-NEXT: madd w0, w10, w9, w8 +; 
CHECK-NEXT: csel w9, w1, w8, gt +; CHECK-NEXT: csel w10, w8, w1, gt +; CHECK-NEXT: sub w9, w10, w9 +; CHECK-NEXT: cmp w8, w1, sxth +; CHECK-NEXT: ubfx w9, w9, #1, #15 +; CHECK-NEXT: cneg w9, w9, gt +; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret %a1 = load i16, i16* %a1_addr %t3 = icmp sgt i16 %a1, %a2 ; signed @@ -352,14 +352,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldrsh w8, [x1] ; CHECK-NEXT: sxth w9, w0 -; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: cneg w9, w10, le ; CHECK-NEXT: csel w10, w8, w0, gt -; CHECK-NEXT: csel w8, w0, w8, gt -; CHECK-NEXT: sub w8, w8, w10 -; CHECK-NEXT: ubfx w8, w8, #1, #15 -; CHECK-NEXT: madd w0, w8, w9, w0 +; CHECK-NEXT: csel w11, w0, w8, gt +; CHECK-NEXT: sub w10, w11, w10 +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: ubfx w8, w10, #1, #15 +; CHECK-NEXT: cneg w8, w8, gt +; CHECK-NEXT: add w0, w8, w0 ; CHECK-NEXT: ret %a2 = load i16, i16* %a2_addr %t3 = icmp sgt i16 %a1, %a2 ; signed @@ -378,14 +378,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldrsh w8, [x0] ; CHECK-NEXT: ldrsh w9, [x1] -; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: csel w11, w9, w8, gt -; CHECK-NEXT: csel w9, w8, w9, gt -; CHECK-NEXT: sub w9, w9, w11 -; CHECK-NEXT: cneg w10, w10, le -; CHECK-NEXT: ubfx w9, w9, #1, #15 -; CHECK-NEXT: madd w0, w9, w10, w8 +; CHECK-NEXT: csel w10, w9, w8, gt +; CHECK-NEXT: csel w11, w8, w9, gt +; CHECK-NEXT: sub w10, w11, w10 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: ubfx w9, w10, #1, #15 +; CHECK-NEXT: cneg w9, w9, gt +; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret %a1 = load i16, i16* %a1_addr %a2 = load i16, i16* %a2_addr @@ -410,14 +410,14 @@ ; CHECK-LABEL: scalar_i8_signed_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: mov w9, #-1 ; CHECK-NEXT: cmp w8, w1, sxtb -; CHECK-NEXT: cneg w8, w9, le ; CHECK-NEXT: csel w9, w1, w0, gt ; CHECK-NEXT: csel w10, w0, w1, gt ; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: ubfx w9, w9, #1, #7 -; CHECK-NEXT: madd w0, w9, w8, w0 +; CHECK-NEXT: cmp w8, w1, sxtb +; CHECK-NEXT: ubfx w8, w9, #1, #7 +; CHECK-NEXT: cneg w8, w8, gt +; CHECK-NEXT: add w0, w8, w0 ; CHECK-NEXT: ret %t3 = icmp sgt i8 %a1, %a2 ; signed %t4 = select i1 %t3, i8 -1, i8 1 @@ -434,14 +434,14 @@ ; CHECK-LABEL: scalar_i8_unsigned_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: mov w9, #-1 ; CHECK-NEXT: cmp w8, w1, uxtb -; CHECK-NEXT: cneg w8, w9, ls ; CHECK-NEXT: csel w9, w1, w0, hi ; CHECK-NEXT: csel w10, w0, w1, hi ; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: ubfx w9, w9, #1, #7 -; CHECK-NEXT: madd w0, w9, w8, w0 +; CHECK-NEXT: cmp w8, w1, uxtb +; CHECK-NEXT: ubfx w8, w9, #1, #7 +; CHECK-NEXT: cneg w8, w8, hi +; CHECK-NEXT: add w0, w8, w0 ; CHECK-NEXT: ret %t3 = icmp ugt i8 %a1, %a2 %t4 = select i1 %t3, i8 -1, i8 1 @@ -460,14 +460,14 @@ ; CHECK-LABEL: scalar_i8_signed_mem_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: ldrsb w8, [x0] -; CHECK-NEXT: mov w9, #-1 ; CHECK-NEXT: cmp w8, w1, sxtb -; CHECK-NEXT: csel w10, w1, w8, gt -; CHECK-NEXT: csel w11, w8, w1, gt -; CHECK-NEXT: sub w10, w11, w10 -; CHECK-NEXT: cneg w9, w9, le -; CHECK-NEXT: ubfx w10, w10, #1, #7 -; CHECK-NEXT: madd w0, w10, w9, w8 +; CHECK-NEXT: csel w9, w1, w8, gt +; CHECK-NEXT: csel w10, w8, w1, gt +; CHECK-NEXT: sub w9, w10, w9 +; CHECK-NEXT: cmp w8, w1, sxtb +; CHECK-NEXT: ubfx w9, w9, #1, #7 +; CHECK-NEXT: cneg w9, w9, gt +; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret %a1 = load i8, i8* %a1_addr %t3 = icmp sgt i8 %a1, %a2 ; signed @@ -486,14 +486,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldrsb w8, [x1] ; CHECK-NEXT: sxtb w9, 
w0 -; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: cneg w9, w10, le ; CHECK-NEXT: csel w10, w8, w0, gt -; CHECK-NEXT: csel w8, w0, w8, gt -; CHECK-NEXT: sub w8, w8, w10 -; CHECK-NEXT: ubfx w8, w8, #1, #7 -; CHECK-NEXT: madd w0, w8, w9, w0 +; CHECK-NEXT: csel w11, w0, w8, gt +; CHECK-NEXT: sub w10, w11, w10 +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: ubfx w8, w10, #1, #7 +; CHECK-NEXT: cneg w8, w8, gt +; CHECK-NEXT: add w0, w8, w0 ; CHECK-NEXT: ret %a2 = load i8, i8* %a2_addr %t3 = icmp sgt i8 %a1, %a2 ; signed @@ -512,14 +512,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldrsb w8, [x0] ; CHECK-NEXT: ldrsb w9, [x1] -; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: csel w11, w9, w8, gt -; CHECK-NEXT: csel w9, w8, w9, gt -; CHECK-NEXT: sub w9, w9, w11 -; CHECK-NEXT: cneg w10, w10, le -; CHECK-NEXT: ubfx w9, w9, #1, #7 -; CHECK-NEXT: madd w0, w9, w10, w8 +; CHECK-NEXT: csel w10, w9, w8, gt +; CHECK-NEXT: csel w11, w8, w9, gt +; CHECK-NEXT: sub w10, w11, w10 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: ubfx w9, w10, #1, #7 +; CHECK-NEXT: cneg w9, w9, gt +; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret %a1 = load i8, i8* %a1_addr %a2 = load i8, i8* %a2_addr diff --git a/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll b/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll --- a/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll +++ b/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll @@ -24,8 +24,8 @@ ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_2: // %if.end -; CHECK-NEXT: csinc x8, x1, xzr, ge -; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: mul x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lt ; CHECK-NEXT: ret entry: %shl = shl i64 %a, %b @@ -51,9 +51,9 @@ ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_2: // %if.end +; CHECK-NEXT: mul x9, x0, x1 ; CHECK-NEXT: cmn x8, #1 // =1 -; CHECK-NEXT: csinc x8, x1, xzr, ge -; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: csel x0, x0, x9, lt ; CHECK-NEXT: ret entry: %shl = shl i64 %a, %b @@ -80,8 +80,8 @@ ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_2: // %if.end -; CHECK-NEXT: csinc x8, x1, xzr, ge -; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: mul x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lt ; CHECK-NEXT: ret entry: %shl = shl i64 %a, %b @@ -108,8 +108,8 @@ ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB3_2: // %if.end -; CHECK-NEXT: csinc x8, x1, xzr, ge -; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: mul x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lt ; CHECK-NEXT: ret entry: %shl = shl i64 %a, %b @@ -136,8 +136,8 @@ ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB4_2: // %if.end -; CHECK-NEXT: csinc x8, x1, xzr, ge -; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: mul x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lt ; CHECK-NEXT: ret entry: %shl = shl i64 %a, %b @@ -163,8 +163,8 @@ ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB5_2: // %if.end -; CHECK-NEXT: csinc x8, x1, xzr, ge -; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: mul x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lt ; CHECK-NEXT: ret entry: %cmp = icmp sgt i64 %a, -2 @@ -188,9 +188,9 @@ ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB6_2: // %if.end +; CHECK-NEXT: mul x8, x0, x1 ; CHECK-NEXT: cmn x0, #1 // =1 -; CHECK-NEXT: csinc x8, x1, xzr, ge -; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: csel x0, x0, x8, lt ; CHECK-NEXT: ret entry: %cmp = icmp sgt i64 %a, -1 @@ -215,8 +215,8 @@ ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB7_2: // %if.end -; 
CHECK-NEXT: csinc x8, x1, xzr, ge -; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: mul x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lt ; CHECK-NEXT: ret entry: %cmp = icmp sgt i64 %a, 0 @@ -241,8 +241,8 @@ ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB8_2: // %if.end -; CHECK-NEXT: csinc x8, x1, xzr, ge -; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: mul x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lt ; CHECK-NEXT: ret entry: %cmp = icmp sgt i64 %a, 1 @@ -267,8 +267,8 @@ ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB9_2: // %if.end -; CHECK-NEXT: csinc x8, x1, xzr, ge -; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: mul x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lt ; CHECK-NEXT: ret entry: %cmp = icmp sgt i64 %a, 2 @@ -288,9 +288,9 @@ ; CHECK-LABEL: i_a_op_b__2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: lsl w8, w0, w1 +; CHECK-NEXT: mul w9, w0, w1 ; CHECK-NEXT: cmn w8, #2 // =2 -; CHECK-NEXT: csinc w8, w1, wzr, eq -; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w9, w0, eq ; CHECK-NEXT: csel w8, w1, w8, gt ; CHECK-NEXT: sxtw x0, w8 ; CHECK-NEXT: ret @@ -315,9 +315,9 @@ ; CHECK-LABEL: i_a_op_b__1: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: lsl w8, w0, w1 +; CHECK-NEXT: mul w9, w0, w1 ; CHECK-NEXT: cmn w8, #1 // =1 -; CHECK-NEXT: csinc w9, w1, wzr, eq -; CHECK-NEXT: mul w9, w9, w0 +; CHECK-NEXT: csel w9, w9, w0, eq ; CHECK-NEXT: cmp w8, #0 // =0 ; CHECK-NEXT: csel w8, w1, w9, ge ; CHECK-NEXT: sxtw x0, w8 @@ -343,9 +343,9 @@ ; CHECK-LABEL: i_a_op_b_0: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: lsl w8, w0, w1 +; CHECK-NEXT: mul w9, w0, w1 ; CHECK-NEXT: cmp w8, #0 // =0 -; CHECK-NEXT: csinc w8, w1, wzr, eq -; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w9, w0, eq ; CHECK-NEXT: csel w8, w1, w8, gt ; CHECK-NEXT: sxtw x0, w8 ; CHECK-NEXT: ret @@ -370,9 +370,9 @@ ; CHECK-LABEL: i_a_op_b_1: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: lsl w8, w0, w1 +; CHECK-NEXT: mul w9, w0, w1 ; CHECK-NEXT: cmp w8, #1 // =1 -; CHECK-NEXT: csinc w8, w1, wzr, eq -; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w9, w0, eq ; CHECK-NEXT: csel w8, w1, w8, gt ; CHECK-NEXT: sxtw x0, w8 ; CHECK-NEXT: ret @@ -397,9 +397,9 @@ ; CHECK-LABEL: i_a_op_b_2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: lsl w8, w0, w1 +; CHECK-NEXT: mul w9, w0, w1 ; CHECK-NEXT: cmp w8, #2 // =2 -; CHECK-NEXT: csinc w8, w1, wzr, eq -; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w9, w0, eq ; CHECK-NEXT: csel w8, w1, w8, gt ; CHECK-NEXT: sxtw x0, w8 ; CHECK-NEXT: ret @@ -423,9 +423,9 @@ define i64 @i_a__2(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: i_a__2: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul w8, w0, w1 ; CHECK-NEXT: cmn w0, #2 // =2 -; CHECK-NEXT: csinc w8, w1, wzr, eq -; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w8, w0, eq ; CHECK-NEXT: csel w8, w1, w8, gt ; CHECK-NEXT: sxtw x0, w8 ; CHECK-NEXT: ret @@ -448,9 +448,9 @@ define i64 @i_a__1(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: i_a__1: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul w8, w0, w1 ; CHECK-NEXT: cmn w0, #1 // =1 -; CHECK-NEXT: csinc w8, w1, wzr, eq -; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w8, w0, eq ; CHECK-NEXT: cmp w0, #0 // =0 ; CHECK-NEXT: csel w8, w1, w8, ge ; CHECK-NEXT: sxtw x0, w8 @@ -474,9 +474,9 @@ define i64 @i_a_0(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: i_a_0: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul w8, w0, w1 ; CHECK-NEXT: cmp w0, #0 // =0 -; CHECK-NEXT: csinc w8, w1, wzr, eq -; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w8, w0, eq ; CHECK-NEXT: csel w8, w1, w8, gt ; 
CHECK-NEXT: sxtw x0, w8 ; CHECK-NEXT: ret @@ -499,9 +499,9 @@ define i64 @i_a_1(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: i_a_1: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul w8, w0, w1 ; CHECK-NEXT: cmp w0, #1 // =1 -; CHECK-NEXT: csinc w8, w1, wzr, eq -; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w8, w0, eq ; CHECK-NEXT: csel w8, w1, w8, gt ; CHECK-NEXT: sxtw x0, w8 ; CHECK-NEXT: ret @@ -524,9 +524,9 @@ define i64 @i_a_2(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: i_a_2: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul w8, w0, w1 ; CHECK-NEXT: cmp w0, #2 // =2 -; CHECK-NEXT: csinc w8, w1, wzr, eq -; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w8, w0, eq ; CHECK-NEXT: csel w8, w1, w8, gt ; CHECK-NEXT: sxtw x0, w8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -336,8 +336,9 @@ ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = udiv i16 %x, %y @@ -384,8 +385,9 @@ ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -437,14 +439,13 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s0, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = sdiv i16 %x, %y @@ -493,16 +494,15 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s3, s3, 30 ; GCN-NEXT: s_or_b32 s3, s3, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s3, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -536,21 +536,22 @@ ; ; GCN-LABEL: udiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; 
GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, s0 +; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = udiv i8 %x, %y store i8 %r, i8 addrspace(1)* %out @@ -595,8 +596,9 @@ ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -648,14 +650,13 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s0, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = sdiv i8 %x, %y @@ -693,31 +694,30 @@ ; ; GCN-LABEL: srem_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_i32 s1, s0, 0x80008 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GCN-NEXT: s_sext_i32_i8 s3, s0 +; GCN-NEXT: s_bfe_i32 s2, s4, 0x80008 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_sext_i32_i8 s3, s4 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GCN-NEXT: s_xor_b32 s1, s3, s1 +; GCN-NEXT: s_xor_b32 s2, s3, s2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: s_ashr_i32 s2, s2, 30 +; GCN-NEXT: s_or_b32 s2, s2, 1 +; GCN-NEXT: s_lshr_b32 s5, s4, 8 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s2, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_lshr_b32 s2, s0, 8 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; 
GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = srem i8 %x, %y store i8 %r, i8 addrspace(1)* %out @@ -1821,50 +1821,54 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_lshr_b32 s9, s0, 16 ; GCN-NEXT: s_and_b32 s0, s0, s8 -; GCN-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v4, s9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GCN-NEXT: s_and_b32 s2, s3, s8 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GCN-NEXT: s_and_b32 s2, s3, s8 +; GCN-NEXT: s_lshr_b32 s10, s3, 16 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GCN-NEXT: s_lshr_b32 s0, s1, 16 ; GCN-NEXT: s_and_b32 s1, s1, s8 -; GCN-NEXT: s_lshr_b32 s10, s3, 16 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc -; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_cndmask_b32_e32 v2, v1, v7, vcc +; GCN-NEXT: v_cvt_f32_u32_e32 v7, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v3 ; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 -; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v5, -v1, v4, v5 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v4, v6, v7 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mad_f32 v4, -v4, v3, v6 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v1 +; GCN-NEXT: v_mul_f32_e32 v8, v7, v8 +; GCN-NEXT: v_trunc_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GCN-NEXT: v_mad_f32 v1, -v1, v4, v5 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v6 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN-NEXT: v_mad_f32 v4, -v8, v3, v7 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v9 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_and_b32_e32 v0, s8, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v9, v5, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_and_b32_e32 v1, s8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v0, s8, v0 ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1978,58 +1982,62 @@ ; GCN-NEXT: s_and_b32 s9, s2, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_and_b32 s10, s0, s8 -; GCN-NEXT: s_lshr_b32 s11, s2, 16 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 +; GCN-NEXT: s_lshr_b32 s11, s2, 16 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, s11 ; GCN-NEXT: s_lshr_b32 s9, s0, 16 ; GCN-NEXT: v_cvt_f32_u32_e32 v4, s9 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 -; GCN-NEXT: v_rcp_iflag_f32_e32 
v5, v3 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GCN-NEXT: s_lshr_b32 s12, s3, 16 +; GCN-NEXT: s_lshr_b32 s10, s1, 16 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v3, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 ; GCN-NEXT: s_and_b32 s2, s3, s8 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GCN-NEXT: s_and_b32 s2, s1, s8 ; GCN-NEXT: v_mul_lo_u32 v1, v1, s11 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_lshr_b32 s12, s3, 16 +; GCN-NEXT: v_cvt_f32_u32_e32 v6, s10 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 -; GCN-NEXT: s_lshr_b32 s10, s1, 16 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 ; GCN-NEXT: v_cvt_f32_u32_e32 v4, s12 -; GCN-NEXT: v_cvt_f32_u32_e32 v6, s10 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GCN-NEXT: v_and_b32_e32 v0, s8, v0 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GCN-NEXT: v_mul_f32_e32 v2, v6, v7 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GCN-NEXT: v_mad_f32 v2, -v2, v4, v6 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v2, s12 -; GCN-NEXT: v_and_b32_e32 v0, s8, v0 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 +; GCN-NEXT: v_and_b32_e32 v1, s8, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v1, s8, v1 ; GCN-NEXT: v_or_b32_e32 v1, v1, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2148,76 +2156,72 @@ ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sext_i32_i16 s8, s2 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_sext_i32_i16 s9, s0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GCN-NEXT: s_xor_b32 s8, s9, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_sext_i32_i16 s9, s2 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s9 +; GCN-NEXT: s_sext_i32_i16 s8, s0 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 ; GCN-NEXT: s_ashr_i32 s2, s2, 16 -; GCN-NEXT: s_ashr_i32 s8, s8, 30 -; GCN-NEXT: s_or_b32 s8, s8, 1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GCN-NEXT: s_ashr_i32 s0, s0, 16 +; GCN-NEXT: s_xor_b32 s8, s8, s9 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, 
v3 +; GCN-NEXT: s_ashr_i32 s8, s8, 30 +; GCN-NEXT: s_or_b32 s8, s8, 1 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s8 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_ashr_i32 s0, s0, 16 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v1 ; GCN-NEXT: s_xor_b32 s0, s0, s2 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s0, v2 ; GCN-NEXT: s_sext_i32_i16 s0, s3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN-NEXT: v_mad_f32 v1, -v1, v3, v5 +; GCN-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GCN-NEXT: s_sext_i32_i16 s2, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v5 ; GCN-NEXT: s_xor_b32 s0, s2, s0 +; GCN-NEXT: s_ashr_i32 s2, s3, 16 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_mul_f32_e32 v3, v1, v3 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_mad_f32 v1, -v3, v5, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v1, -v4, v2, v1 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: s_ashr_i32 s0, s3, 16 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GCN-NEXT: s_ashr_i32 s1, s1, 16 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GCN-NEXT: s_xor_b32 s0, s1, s0 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s0, v3 +; GCN-NEXT: s_ashr_i32 s0, s1, 16 +; GCN-NEXT: v_cvt_f32_i32_e32 v7, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v5| +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; GCN-NEXT: s_xor_b32 s0, s0, s2 +; GCN-NEXT: v_mul_f32_e32 v3, v7, v8 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v5, v3 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 -; GCN-NEXT: v_trunc_f32_e32 v5, v5 -; GCN-NEXT: v_mad_f32 v4, -v5, v2, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mad_f32 v3, -v3, v4, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s0, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| +; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GCN-NEXT: s_mov_b32 s0, 0xffff -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_and_b32_e32 v1, s0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2349,68 +2353,64 @@ ; GCN-NEXT: s_xor_b32 s8, s9, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s8, s8, 30 +; GCN-NEXT: s_ashr_i32 s10, s2, 16 ; GCN-NEXT: s_or_b32 s8, s8, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s10 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 -; GCN-NEXT: s_ashr_i32 s2, s2, 16 +; GCN-NEXT: s_ashr_i32 s2, s0, 16 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v3 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: s_ashr_i32 s0, s0, 16 +; GCN-NEXT: s_xor_b32 s0, s2, s10 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GCN-NEXT: s_or_b32 s0, s0, 1 +; GCN-NEXT: v_mad_f32 v1, -v2, v3, v1 +; GCN-NEXT: s_sext_i32_i16 s8, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| +; GCN-NEXT: s_sext_i32_i16 s0, s3 +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GCN-NEXT: s_xor_b32 s8, s0, s2 -; GCN-NEXT: s_ashr_i32 s8, s8, 30 -; GCN-NEXT: s_or_b32 s8, s8, 1 -; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v1, v1, s2 -; GCN-NEXT: s_sext_i32_i16 s2, s3 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 -; GCN-NEXT: s_sext_i32_i16 s0, s1 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GCN-NEXT: v_mul_lo_u32 v1, v1, s10 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s8 +; GCN-NEXT: s_xor_b32 s0, s8, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_xor_b32 s0, s0, s2 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, s2, v1 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v1, -v4, v2, v1 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v1 +; GCN-NEXT: v_mad_f32 v1, -v1, v2, v3 +; GCN-NEXT: s_ashr_i32 s2, s1, 16 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v4 ; GCN-NEXT: s_ashr_i32 s0, s3, 16 +; GCN-NEXT: v_cvt_f32_i32_e32 v6, s0 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GCN-NEXT: s_ashr_i32 s2, s1, 16 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GCN-NEXT: 
v_rcp_iflag_f32_e32 v3, v6 ; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 ; GCN-NEXT: s_xor_b32 s3, s2, s0 ; GCN-NEXT: s_ashr_i32 s3, s3, 30 -; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 -; GCN-NEXT: v_trunc_f32_e32 v5, v5 -; GCN-NEXT: v_mad_f32 v4, -v5, v2, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v3 ; GCN-NEXT: s_or_b32 s3, s3, 1 -; GCN-NEXT: v_mov_b32_e32 v6, s3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mad_f32 v2, -v3, v6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s3, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v6| +; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 ; GCN-NEXT: s_mov_b32 s0, 0xffff ; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 @@ -2418,7 +2418,7 @@ ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -2466,8 +2466,9 @@ ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 7, v0 ; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2502,27 +2503,28 @@ ; ; GCN-LABEL: urem_i3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s1, s0, 0x30008 -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 +; GCN-NEXT: s_bfe_u32 s2, s4, 0x30008 +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: s_and_b32 s2, s0, 7 +; GCN-NEXT: s_and_b32 s2, s4, 7 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 -; GCN-NEXT: s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_lshr_b32 s2, s4, 8 ; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v0, v0, s1 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_and_b32_e32 v0, 7, v0 -; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = urem i3 %x, %y store i3 %r, i3 addrspace(1)* %out @@ -2570,14 +2572,13 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s0, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, 
|v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 7, v0 ; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2616,32 +2617,31 @@ ; ; GCN-LABEL: srem_i3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_i32 s1, s0, 0x30008 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GCN-NEXT: s_bfe_i32 s3, s0, 0x30000 +; GCN-NEXT: s_bfe_i32 s2, s4, 0x30008 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_bfe_i32 s3, s4, 0x30000 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GCN-NEXT: s_xor_b32 s1, s3, s1 +; GCN-NEXT: s_xor_b32 s2, s3, s2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: s_ashr_i32 s2, s2, 30 +; GCN-NEXT: s_or_b32 s2, s2, 1 +; GCN-NEXT: s_lshr_b32 s5, s4, 8 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s2, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_lshr_b32 s2, s0, 8 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_and_b32_e32 v0, 7, v0 -; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = srem i3 %x, %y store i3 %r, i3 addrspace(1)* %out @@ -2724,39 +2724,42 @@ ; GCN-NEXT: s_and_b32 s6, s0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: s_and_b32 s6, s2, s8 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GCN-NEXT: s_lshr_b32 s0, s2, 16 ; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: s_and_b32 s0, s1, s8 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GCN-NEXT: s_and_b32 s0, s3, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v2 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: s_and_b32 s0, s1, s8 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GCN-NEXT: s_and_b32 s0, s3, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; 
GCN-NEXT: v_mul_f32_e32 v2, v5, v6 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GCN-NEXT: v_mad_f32 v2, -v2, v4, v5 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v1 +; GCN-NEXT: v_mad_f32 v1, -v1, v3, v4 +; GCN-NEXT: v_mul_f32_e32 v7, v6, v7 +; GCN-NEXT: v_trunc_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_u32_f32_e32 v8, v7 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GCN-NEXT: v_mad_f32 v3, -v7, v2, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GCN-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_and_b32_e32 v0, s8, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2845,53 +2848,56 @@ ; GCN-NEXT: s_mov_b32 s8, 0xffff ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: s_and_b32 s6, s0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: s_and_b32 s6, s2, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_alignbit_b32 v4, s1, v4, 16 -; GCN-NEXT: v_and_b32_e32 v5, s8, v4 -; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v5 -; GCN-NEXT: v_and_b32_e32 v3, s8, v1 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GCN-NEXT: v_and_b32_e32 v6, s8, v4 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN-NEXT: v_alignbit_b32 v2, s3, v3, 16 +; GCN-NEXT: v_and_b32_e32 v3, s8, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-NEXT: s_and_b32 s0, s1, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GCN-NEXT: s_and_b32 s0, s3, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v7, s0 ; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GCN-NEXT: v_mad_f32 v3, -v5, v2, v3 +; GCN-NEXT: v_mad_f32 v3, -v5, v1, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-NEXT: s_and_b32 s0, s3, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v7, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GCN-NEXT: v_mul_f32_e32 v3, v7, v8 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v3 +; GCN-NEXT: v_mul_lo_u32 v1, v1, v4 ; GCN-NEXT: v_mad_f32 v3, -v3, v6, v7 
+; GCN-NEXT: v_and_b32_e32 v0, s8, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_and_b32_e32 v0, s8, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 ; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2993,48 +2999,45 @@ ; GCN-NEXT: s_ashr_i32 s0, s0, 16 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s8, s8, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GCN-NEXT: s_or_b32 s8, s8, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: s_ashr_i32 s2, s2, 16 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GCN-NEXT: s_sext_i32_i16 s1, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v3 ; GCN-NEXT: s_xor_b32 s0, s2, s0 +; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v1, -v2, v3, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: s_sext_i32_i16 s0, s1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GCN-NEXT: s_sext_i32_i16 s1, s3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_xor_b32 s0, s1, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s0, v2 +; GCN-NEXT: s_sext_i32_i16 s0, s3 +; GCN-NEXT: v_cvt_f32_i32_e32 v6, s0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: v_mul_f32_e32 v2, v6, v7 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_mad_f32 v2, -v2, v4, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s0, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v3, 
v5, vcc ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -3133,67 +3136,65 @@ ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sext_i32_i16 s8, s2 -; GCN-NEXT: s_sext_i32_i16 s6, s0 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-NEXT: s_xor_b32 s6, s8, s6 -; GCN-NEXT: s_ashr_i32 s6, s6, 30 +; GCN-NEXT: s_sext_i32_i16 s9, s2 +; GCN-NEXT: s_sext_i32_i16 s8, s0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 +; GCN-NEXT: s_xor_b32 s8, s9, s8 +; GCN-NEXT: s_ashr_i32 s8, s8, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s6, s6, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_or_b32 s8, s8, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_alignbit_b32 v3, s3, v3, 16 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 16 -; GCN-NEXT: v_bfe_i32 v3, v2, 0, 16 -; GCN-NEXT: v_cvt_f32_i32_e32 v4, v3 -; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GCN-NEXT: v_bfe_i32 v5, v1, 0, 16 +; GCN-NEXT: v_bfe_i32 v5, v3, 0, 16 ; GCN-NEXT: v_cvt_f32_i32_e32 v6, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 16 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-NEXT: v_cvt_f32_i32_e32 v4, v2 +; GCN-NEXT: v_xor_b32_e32 v2, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: v_xor_b32_e32 v3, v5, v3 ; GCN-NEXT: s_sext_i32_i16 s0, s1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GCN-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_mul_f32_e32 v5, v6, v7 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_mad_f32 v6, -v5, v4, v6 -; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| -; GCN-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GCN-NEXT: v_or_b32_e32 v3, 1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_cvt_i32_f32_e32 v7, v5 +; GCN-NEXT: v_mad_f32 v5, -v5, v4, v6 +; GCN-NEXT: v_cvt_f32_i32_e32 v6, s0 ; GCN-NEXT: s_sext_i32_i16 s2, s3 -; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| +; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v6 +; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v1, v2, v1 ; GCN-NEXT: s_xor_b32 s0, s2, s0 +; GCN-NEXT: v_mul_f32_e32 v2, v4, v5 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v5, v2 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 -; 
GCN-NEXT: v_trunc_f32_e32 v5, v5 -; GCN-NEXT: v_mad_f32 v3, -v5, v4, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v6, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s0, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v6| +; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, s1 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v2 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -3281,40 +3282,43 @@ ; GCN-NEXT: s_movk_i32 s3, 0x7fff ; GCN-NEXT: s_and_b32 s9, s0, s3 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_and_b32 s8, s2, s3 -; GCN-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, s8 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GCN-NEXT: s_bfe_u32 s2, s2, 0xf000f +; GCN-NEXT: s_bfe_u32 s0, s0, 0xf000f +; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 -; GCN-NEXT: v_cvt_f32_u32_e32 v6, s2 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GCN-NEXT: v_and_b32_e32 v2, s3, v2 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-NEXT: v_and_b32_e32 v2, s3, v2 +; GCN-NEXT: s_bfe_u32 s2, s2, 0xf000f ; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_u32_e32 v6, s2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GCN-NEXT: v_and_b32_e32 v0, s3, v0 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GCN-NEXT: v_mul_f32_e32 v1, v6, v7 -; GCN-NEXT: v_and_b32_e32 v0, s3, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mad_f32 v4, -v1, v5, v6 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v1, v0, v6 +; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v2 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v1 -; GCN-NEXT: v_mad_f32 v0, -v1, v2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v1 +; GCN-NEXT: v_mul_f32_e32 v7, v0, v7 +; GCN-NEXT: v_trunc_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_u32_f32_e32 v8, v7 +; GCN-NEXT: v_mad_f32 v1, -v1, v5, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_mad_f32 v0, -v7, v2, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v8 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GCN-NEXT: v_and_b32_e32 v2, s3, v3 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc ; GCN-NEXT: v_and_b32_e32 v3, s3, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 @@ -3412,53 +3416,56 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 ; GCN-NEXT: s_movk_i32 s3, 0x7fff -; GCN-NEXT: s_and_b32 s10, s0, s3 
-; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 +; GCN-NEXT: s_and_b32 s9, s0, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GCN-NEXT: s_and_b32 s9, s2, s3 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GCN-NEXT: v_alignbit_b32 v4, s1, v4, 30 ; GCN-NEXT: s_bfe_u32 s1, s0, 0xf000f -; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GCN-NEXT: v_cvt_f32_u32_e32 v6, s1 +; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v3 +; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GCN-NEXT: s_bfe_u32 s10, s2, 0xf000f -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; GCN-NEXT: v_and_b32_e32 v4, s3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GCN-NEXT: v_and_b32_e32 v2, s3, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v6 ; GCN-NEXT: v_and_b32_e32 v0, s3, v0 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 -; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, s2, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GCN-NEXT: v_mul_f32_e32 v1, v2, v3 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v3, -v1, v5, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GCN-NEXT: v_mad_f32 v2, -v1, v6, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v3 ; GCN-NEXT: s_lshr_b32 s0, s0, 15 -; GCN-NEXT: v_mul_f32_e32 v3, v7, v8 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mad_f32 v3, -v3, v4, v7 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 -; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 ; GCN-NEXT: s_lshr_b32 s8, s2, 15 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v6 +; GCN-NEXT: v_mul_f32_e32 v2, v7, v8 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GCN-NEXT: v_mad_f32 v2, -v2, v3, v7 +; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v6 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_and_b32_e32 v3, s3, v3 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GCN-NEXT: v_and_b32_e32 v2, s3, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_and_b32_e32 v2, s3, v5 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -3559,59 +3566,57 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 ; GCN-NEXT: s_bfe_i32 s3, s0, 0xf0000 -; 
GCN-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GCN-NEXT: s_bfe_i32 s1, s2, 0xf0000 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_xor_b32 s1, s1, s3 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GCN-NEXT: s_bfe_i32 s8, s2, 0xf0000 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s8 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GCN-NEXT: s_xor_b32 s1, s8, s3 ; GCN-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GCN-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 +; GCN-NEXT: v_cvt_i32_f32_e32 v5, v4 +; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v6, s0 ; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s1, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: s_bfe_i32 s1, s2, 0xf000f -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v6 +; GCN-NEXT: v_bfe_i32 v2, v2, 0, 15 +; GCN-NEXT: v_cvt_f32_i32_e32 v5, v2 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 15 +; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 +; GCN-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-NEXT: v_mad_f32 v1, -v4, v6, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GCN-NEXT: v_cvt_f32_i32_e32 v8, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v5 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 -; GCN-NEXT: v_trunc_f32_e32 v5, v5 -; GCN-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| -; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_i32_e32 v4, v1 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_cvt_f32_i32_e32 v5, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s0, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v6| +; GCN-NEXT: v_mul_f32_e32 v1, v8, v9 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v5, -v1, v4, v5 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-NEXT: v_mad_f32 v1, -v1, v5, v8 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v5| ; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_and_b32_e32 v3, s0, v3 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GCN-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN-NEXT: v_and_b32_e32 v2, s0, v3 +; GCN-NEXT: v_and_b32_e32 v3, s0, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; 
GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -3716,72 +3721,70 @@ ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 ; GCN-NEXT: s_movk_i32 s3, 0x7fff -; GCN-NEXT: s_and_b32 s11, s0, s3 -; GCN-NEXT: s_bfe_i32 s11, s11, 0xf0000 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s11 +; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 +; GCN-NEXT: s_and_b32 s1, s0, s3 +; GCN-NEXT: s_bfe_i32 s1, s1, 0xf0000 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, s1 ; GCN-NEXT: s_and_b32 s9, s2, s3 ; GCN-NEXT: s_bfe_i32 s9, s9, 0xf0000 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_xor_b32 s9, s9, s11 -; GCN-NEXT: s_ashr_i32 s9, s9, 30 -; GCN-NEXT: s_or_b32 s9, s9, 1 +; GCN-NEXT: s_xor_b32 s1, s9, s1 +; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: s_bfe_u32 s12, s0, 0xf000f ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_i32_f32_e32 v5, v4 +; GCN-NEXT: s_or_b32 s1, s1, 1 ; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: s_bfe_u32 s10, s2, 0xf000f +; GCN-NEXT: v_add_i32_e32 v4, vcc, s1, v5 +; GCN-NEXT: s_bfe_i32 s1, s12, 0xf0000 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: s_bfe_u32 s12, s0, 0xf000f -; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 -; GCN-NEXT: s_lshr_b32 s1, s0, 15 -; GCN-NEXT: s_bfe_i32 s0, s12, 0xf0000 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GCN-NEXT: s_bfe_u32 s10, s2, 0xf000f -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 -; GCN-NEXT: s_lshr_b32 s8, s2, 15 -; GCN-NEXT: s_bfe_i32 s2, s10, 0xf0000 -; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 +; GCN-NEXT: s_lshr_b32 s11, s0, 15 +; GCN-NEXT: s_bfe_i32 s0, s10, 0xf0000 +; GCN-NEXT: v_cvt_f32_i32_e32 v4, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GCN-NEXT: s_xor_b32 s0, s2, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 +; GCN-NEXT: v_and_b32_e32 v1, s3, v1 +; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: v_bfe_i32 v7, v1, 0, 15 ; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_i32_f32_e32 v6, v5 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v8, v7 +; GCN-NEXT: s_or_b32 s0, s0, 1 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GCN-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_and_b32_e32 v1, s3, v1 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s0, v6 +; GCN-NEXT: v_and_b32_e32 v0, s3, v0 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc -; GCN-NEXT: v_bfe_i32 v4, v1, 0, 15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_bfe_i32 v4, v0, 0, 15 +; GCN-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc ; GCN-NEXT: v_cvt_f32_i32_e32 v5, v4 -; GCN-NEXT: v_and_b32_e32 v0, s3, v0 -; GCN-NEXT: v_bfe_i32 v6, v0, 0, 15 -; GCN-NEXT: v_cvt_f32_i32_e32 v7, v6 -; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v5 -; GCN-NEXT: v_xor_b32_e32 v4, v6, v4 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v8 +; GCN-NEXT: v_xor_b32_e32 v4, v4, v7 ; GCN-NEXT: v_ashrrev_i32_e32 v4, 30, v4 ; GCN-NEXT: v_or_b32_e32 v4, 
1, v4 -; GCN-NEXT: v_mul_f32_e32 v6, v7, v8 +; GCN-NEXT: v_mul_f32_e32 v6, v5, v6 ; GCN-NEXT: v_trunc_f32_e32 v6, v6 -; GCN-NEXT: v_mad_f32 v7, -v6, v5, v7 -; GCN-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GCN-NEXT: v_cvt_i32_f32_e32 v7, v6 +; GCN-NEXT: v_mad_f32 v5, -v6, v8, v5 +; GCN-NEXT: v_mul_lo_u32 v3, v3, s11 +; GCN-NEXT: s_lshr_b32 s8, s2, 15 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v8| +; GCN-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v1, v4, v1 -; GCN-NEXT: v_and_b32_e32 v2, s3, v2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 ; GCN-NEXT: v_and_b32_e32 v3, s3, v3 +; GCN-NEXT: v_and_b32_e32 v2, s3, v2 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -1125,7 +1125,8 @@ ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.ext = ashr i64 %a, 32 @@ -1147,7 +1148,8 @@ ; GFX9-NEXT: v_mad_f32 v0, -v2, v1, v0 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.mask = and i64 %a, 4294967295 %b.mask = and i64 %b, 4294967295 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -259,11 +259,12 @@ ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0 +; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB4_1 @@ -312,7 +313,8 @@ ; GFX9-NEXT: v_mad_f32 v8, -v9, v0, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v0 ; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v9, 1, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v8, v8, s5 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc @@ -365,13 +367,13 @@ ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v8 ; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v9 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v0| +; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 +; GFX9-NEXT: v_mad_f32 v7, -v7, v0, 
v9 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v2, v8, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v0| +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB6_1 @@ -417,11 +419,11 @@ ; GFX9-NEXT: v_mul_f32_e32 v9, v10, v1 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9 ; GFX9-NEXT: v_cvt_i32_f32_e32 v11, v9 -; GFX9-NEXT: v_mad_f32 v9, -v9, v0, v10 ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_mad_f32 v9, -v9, v0, v10 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v9|, |v0| -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[2:3] ; GFX9-NEXT: v_add_u32_e32 v2, v11, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v11, v2, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -1475,25 +1475,25 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s8, s6 ; GCN-NEXT: s_mov_b32 s9, s7 -; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 -; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 +; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 +; GCN-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f32_i32_e32 v3, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 +; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v3, v4 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v1, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -1508,25 +1508,25 @@ ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_mov_b32 s8, s6 ; TONGA-NEXT: s_mov_b32 s9, s7 -; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 -; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 +; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 +; TONGA-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0 +; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v0 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v1 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 +; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v2 +; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4 -; TONGA-NEXT: v_trunc_f32_e32 v1, v1 
-; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3 -; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 -; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4 +; TONGA-NEXT: v_trunc_f32_e32 v2, v2 +; TONGA-NEXT: v_cvt_i32_f32_e32 v4, v2 +; TONGA-NEXT: v_mad_f32 v2, -v2, v1, v3 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; TONGA-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm @@ -1541,25 +1541,25 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s6 ; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 -; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 +; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 +; GFX9-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v2 +; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1 -; GFX9-NEXT: v_mad_f32 v1, -v1, v2, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v4 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GFX9-NEXT: v_mad_f32 v2, -v2, v1, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -1642,11 +1642,11 @@ ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v4 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v3, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -1683,11 +1683,11 @@ ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 ; TONGA-NEXT: v_mul_f32_e32 v2, v1, v4 ; TONGA-NEXT: v_trunc_f32_e32 v2, v2 +; TONGA-NEXT: v_cvt_i32_f32_e32 v4, v2 ; TONGA-NEXT: v_mad_f32 v1, -v2, v3, v1 -; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| -; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm @@ -1727,8 +1727,8 @@ ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v3, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -1822,11 +1822,11 @@ ; GCN-NEXT: v_or_b32_e32 v1, 1, v1 ; GCN-NEXT: v_mul_f32_e32 v3, v0, v4 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v3 ; GCN-NEXT: v_mad_f32 v0, -v3, v2, v0 -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -1860,11 +1860,11 @@ ; TONGA-NEXT: v_or_b32_e32 v1, 1, v1 ; TONGA-NEXT: v_mul_f32_e32 v3, v0, v4 ; TONGA-NEXT: v_trunc_f32_e32 v3, v3 +; TONGA-NEXT: v_cvt_i32_f32_e32 v4, v3 ; TONGA-NEXT: v_mad_f32 v0, -v3, v2, v0 -; TONGA-NEXT: v_cvt_i32_f32_e32 v3, v3 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v4, v1 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2| -; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm @@ -1874,17 +1874,17 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2 +; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 +; GFX9-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX9-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2 -; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 -; GFX9-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v4 @@ -1901,8 +1901,8 @@ ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v3 ; GFX9-NEXT: v_mad_f32 v0, -v3, v2, v0 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2| -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -503,24 +503,23 @@ ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_xor_b32 s4, s6, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s4, s4, s8 ; 
GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -533,24 +532,23 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_xor_b32 s4, s6, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -575,8 +573,9 @@ ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -593,8 +592,9 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -620,15 +620,14 @@ ; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_or_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -648,15 +647,14 @@ ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_or_b32 s4, s4, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -675,24 +673,23 @@ ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 33 ; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_xor_b32 s4, s6, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s4, s4, s8 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -705,24 +702,23 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 33 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_xor_b32 s4, s6, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -742,24 +738,23 @@ ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 41 ; GCN-NEXT: 
s_ashr_i64 s[8:9], s[0:1], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_xor_b32 s4, s6, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s4, s4, s8 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -772,24 +767,23 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 41 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_xor_b32 s4, s6, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -809,24 +803,23 @@ ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 39 ; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_xor_b32 s4, s6, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s4, s4, s8 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 
v[0:1], off, s[0:3], 0 @@ -839,24 +832,23 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 39 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_xor_b32 s4, s6, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -882,35 +874,33 @@ ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 ; GCN-NEXT: s_xor_b32 s0, s8, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s10 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s0, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-NEXT: v_cvt_f32_i32_e32 v2, s10 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v4 ; GCN-NEXT: s_xor_b32 s0, s10, s2 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v5, v3 +; GCN-NEXT: v_mad_f32 v2, -v3, v4, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| +; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -929,35 +919,33 @@ ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 ; GCN-IR-NEXT: s_xor_b32 s0, s8, s0 -; GCN-IR-NEXT: s_ashr_i32 
s0, s0, 30 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v4, s2 ; GCN-IR-NEXT: s_or_b32 s0, s0, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s10 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s0, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s10 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v4 ; GCN-IR-NEXT: s_xor_b32 s0, s10, s2 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GCN-IR-NEXT: s_or_b32 s0, s0, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, s0 +; GCN-IR-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v5, v3 +; GCN-IR-NEXT: v_mad_f32 v2, -v3, v4, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-IR-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s0, v5 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GCN-IR-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -975,13 +963,13 @@ ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN-NEXT: s_load_dword s6, s[0:1], 0xd ; GCN-NEXT: s_load_dword s0, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_sext_i32_i16 s1, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_sext_i32_i16 s0, s0 ; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 @@ -994,11 +982,11 @@ ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mul_f32_e32 v2, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v1, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v1, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -1886,55 +1874,49 @@ ; GCN-LABEL: s_test_sdiv24_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], 
s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-NEXT: s_mov_b32 s3, 0x41c00000 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-NEXT: s_ashr_i64 s[4:5], s[2:3], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 s5, 0x41c00000 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: s_or_b32 s4, s4, 1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mul_f32_e32 v1, s3, v1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mul_f32_e32 v1, s5, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v2, -v1, v0, s3 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GCN-NEXT: v_mad_f32 v1, -v1, v0, s5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s4, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-IR-NEXT: s_mov_b32 s3, 0x41c00000 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[2:3], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: s_mov_b32 s5, 0x41c00000 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: s_or_b32 s0, s0, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s3, v1 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s5, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s3 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s5 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s4, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = sdiv i64 24, %x.shr @@ -1945,54 +1927,52 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: s_mov_b32 s0, s4 -; 
GCN-NEXT: s_ashr_i32 s4, s6, 30 -; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v0, -v1, s8, v0 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i32 s4, s6, 30 -; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-IR-NEXT: s_or_b32 s0, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v0, -v1, s8, v0 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s4 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = sdiv i64 %x.shr, 23423 @@ -2012,11 +1992,11 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2032,11 +2012,11 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v0, 
vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 +; GCN-IR-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -2057,11 +2037,11 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2077,11 +2057,11 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 +; GCN-IR-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -2111,11 +2091,11 @@ ; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38000000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, s4, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, s4 -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -492,17 +492,16 @@ ; GCN-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -524,17 +523,16 @@ ; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: 
v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -563,17 +561,16 @@ ; GCN-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -595,17 +592,16 @@ ; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -632,11 +628,11 @@ ; GCN-NEXT: v_or_b32_e32 v5, 1, v5 ; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_i32_f32_e32 v6, v4 ; GCN-NEXT: v_mad_f32 v1, -v4, v3, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v5 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -656,11 +652,11 @@ ; GCN-IR-NEXT: v_or_b32_e32 v5, 1, v5 ; GCN-IR-NEXT: v_mul_f32_e32 v4, v1, v4 ; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v6, v4 ; GCN-IR-NEXT: v_mad_f32 v1, -v4, v3, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v6, v5 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 ; GCN-IR-NEXT: 
v_sub_i32_e32 v0, vcc, v0, v1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -688,17 +684,16 @@ ; GCN-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -720,17 +715,16 @@ ; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -759,17 +753,16 @@ ; GCN-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -791,17 +784,16 @@ ; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -818,60 +810,58 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 
%y) { ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s7 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: s_xor_b32 s1, s7, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GCN-NEXT: s_xor_b32 s2, s3, s4 +; GCN-NEXT: s_ashr_i32 s2, s2, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_or_b32 s2, s2, 1 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s2, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s3, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s7 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: s_xor_b32 s1, s7, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GCN-IR-NEXT: s_xor_b32 s2, s3, s4 +; GCN-IR-NEXT: s_ashr_i32 s2, s2, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_or_b32 s2, s2, 1 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s2, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s7, v0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s3, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 32 %2 = ashr i64 %y, 32 @@ 
-1150,39 +1140,39 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_srem24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xd +; GCN-NEXT: s_load_dword s5, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: s_sext_i32_i16 s1, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_sext_i32_i16 s0, s0 -; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 24 +; GCN-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_sext_i32_i16 s5, s5 +; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 24 +; GCN-NEXT: v_alignbit_b32 v2, s3, v2, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 ; GCN-NEXT: v_xor_b32_e32 v5, v2, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 30, v5 ; GCN-NEXT: v_or_b32_e32 v5, 1, v5 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_i32_f32_e32 v6, v4 ; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_48: @@ -2078,17 +2068,16 @@ ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 ; GCN-NEXT: s_ashr_i32 s5, s4, 30 ; GCN-NEXT: s_or_b32 s5, s5, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v1, s6, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v2, -v1, v0, s6 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GCN-NEXT: v_mad_f32 v1, -v1, v0, s6 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s5, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2105,17 +2094,16 @@ ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 ; GCN-IR-NEXT: s_ashr_i32 s5, s4, 30 ; GCN-IR-NEXT: s_or_b32 s5, s5, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, s6, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s6 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; 
GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s6 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s5, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2131,60 +2119,58 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s1, 0x46b6fe00 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s4, 0x46b6fe00 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: s_ashr_i32 s0, s6, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x38331158, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v0, -v2, s1, v0 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s1 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s0, 0x5b7f -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_ashr_i32 s3, s2, 30 +; GCN-NEXT: s_or_b32 s3, s3, 1 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GCN-NEXT: v_mad_f32 v0, -v1, s4, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s3, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: s_movk_i32 s3, 0x5b7f +; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s1, 0x46b6fe00 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s4, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-IR-NEXT: s_ashr_i32 s0, s6, 30 -; GCN-IR-NEXT: s_or_b32 s0, s0, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38331158, v0 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v0, -v2, s1, v0 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s1 -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-IR-NEXT: s_movk_i32 s0, 0x5b7f -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; 
GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_ashr_i32 s3, s2, 30 +; GCN-IR-NEXT: s_or_b32 s3, s3, 1 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s3, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-IR-NEXT: s_movk_i32 s3, 0x5b7f +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s3 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = srem i64 %x.shr, 23423 @@ -2204,11 +2190,11 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v4, -v2, v1, s4 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2226,11 +2212,11 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v4, -v2, v1, s4 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GCN-IR-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2253,11 +2239,11 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v4, -v2, v1, s4 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2275,11 +2261,11 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v4, -v2, v1, s4 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GCN-IR-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-IR-NEXT: v_add_i32_e32 v3, 
vcc, v4, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2313,11 +2299,11 @@ ; GCN-IR-NEXT: v_or_b32_e32 v2, 1, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x38000000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v3 ; GCN-IR-NEXT: v_mad_f32 v1, -v3, s4, v1 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, s4 -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GCN-IR-NEXT: v_lshlrev_b32_e32 v1, 15, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -435,51 +435,51 @@ define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_lshr_b32 s0, s0, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_lshr_b32 s0, s7, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s2, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_lshr_b32 s0, s0, 8 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_lshr_b32 s0, s7, 8 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_lshr_b32 s2, s2, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v3 
; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 40 @@ -502,10 +502,11 @@ ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IR-LABEL: v_test_udiv24_i64: @@ -520,10 +521,11 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %1 = lshr i64 %x, 40 %2 = lshr i64 %y, 40 @@ -535,47 +537,45 @@ ; GCN-LABEL: s_test_udiv32_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv32_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; 
GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 32 %2 = lshr i64 %y, 32 @@ -587,51 +587,51 @@ define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_lshr_b32 s0, s0, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_lshr_b32 s0, s7, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s2, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s3, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_lshr_b32 s0, s0, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_lshr_b32 s0, s7, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_lshr_b32 s2, s2, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 33 @@ -644,51 +644,51 @@ define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv23_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_lshr_b32 s0, s0, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_lshr_b32 s0, s7, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; 
GCN-NEXT: s_lshr_b32 s2, s2, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s3, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv23_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_lshr_b32 s0, s0, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_lshr_b32 s0, s7, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_lshr_b32 s2, s2, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_lshr_b32 s2, s3, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 41 @@ -1811,10 +1811,11 @@ ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; @@ -1832,10 +1833,11 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s4 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 @@ -1860,10 +1862,11 @@ ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v2 ; GCN-NEXT: 
v_cmp_ge_f32_e64 vcc, |v0|, s2 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -1882,10 +1885,11 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, 1, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 @@ -1906,10 +1910,11 @@ ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IR-LABEL: v_test_udiv24_k_num_i64: @@ -1923,10 +1928,11 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s4 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = udiv i64 24, %x.shr @@ -1945,10 +1951,11 @@ ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IR-LABEL: v_test_udiv24_pow2_k_num_i64: @@ -1962,10 +1969,11 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s4 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = udiv i64 32768, %x.shr @@ -1990,10 +1998,11 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, 1, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = udiv i64 %x.shr, 32768 diff --git 
a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -465,10 +465,11 @@ ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -491,10 +492,11 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 ; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -524,27 +526,29 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_lshr_b32 s1, s11, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-NEXT: v_mul_f32_e32 v1, v3, v6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: v_mul_f32_e32 v2, v3, v2 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, s3 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GCN-NEXT: v_mad_f32 v1, -v1, v4, v3 ; GCN-NEXT: s_brev_b32 s0, -2 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 +; GCN-NEXT: v_and_b32_e32 v2, s0, v1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -566,27 +570,29 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_lshr_b32 s1, s11, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v4 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v5 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_mul_f32_e32 v1, v3, v6 +; 
GCN-IR-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v2 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mad_f32 v1, -v1, v4, v3 ; GCN-IR-NEXT: s_brev_b32 s0, -2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 1, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v4 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, s3 ; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 +; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm @@ -615,10 +621,11 @@ ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -641,10 +648,11 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -674,27 +682,29 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_lshr_b32 s1, s11, 9 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-NEXT: v_mul_f32_e32 v1, v3, v6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: v_mul_f32_e32 v2, v3, v2 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, s3 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GCN-NEXT: v_mad_f32 v1, -v1, v4, v3 ; GCN-NEXT: s_brev_b32 s0, -2 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_add_i32_e32 
v3, vcc, 1, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 +; GCN-NEXT: v_and_b32_e32 v2, s0, v1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -716,27 +726,29 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_lshr_b32 s1, s11, 9 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v4 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v5 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_mul_f32_e32 v1, v3, v6 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v2 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mad_f32 v1, -v1, v4, v3 ; GCN-IR-NEXT: s_brev_b32 s0, -2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 1, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v4 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, s3 ; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 +; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm @@ -1462,10 +1474,11 @@ ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1485,10 +1498,11 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s5 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1503,51 +1517,49 @@ ; GCN-LABEL: s_test_urem24_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x9 -; GCN-NEXT: s_mov_b32 s4, 0x46b6fe00 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s6, 0x46b6fe00 +; GCN-NEXT: s_movk_i32 s5, 0x5b7f ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s3, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: s_movk_i32 s3, 0x5b7f -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_lshr_b32 s4, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GCN-NEXT: v_mad_f32 v0, -v1, s4, v0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 +; GCN-NEXT: v_mad_f32 v0, -v1, s6, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem24_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s4, 0x46b6fe00 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_mov_b32 s6, 0x46b6fe00 +; GCN-IR-NEXT: s_movk_i32 s5, 0x5b7f ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-IR-NEXT: s_movk_i32 s3, 0x5b7f -; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_lshr_b32 s4, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s3 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s6, v0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, 1, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s6 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s5 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = urem i64 %x.shr, 23423 @@ -1567,8 +1579,9 @@ ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 @@ -1586,8 +1599,9 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 -; GCN-IR-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 @@ -1610,8 +1624,9 @@ ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 @@ -1629,8 +1644,9 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v2, -v2, v1, s4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 @@ -1659,8 +1675,9 @@ ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, s4, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, s4 -; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GCN-IR-NEXT: v_lshlrev_b32_e32 v1, 15, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 diff --git a/llvm/test/CodeGen/Mips/funnel-shift-rot.ll b/llvm/test/CodeGen/Mips/funnel-shift-rot.ll --- a/llvm/test/CodeGen/Mips/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/Mips/funnel-shift-rot.ll @@ -77,58 +77,56 @@ ; CHECK-BE-LABEL: rotl_i64: ; CHECK-BE: # %bb.0: ; CHECK-BE-NEXT: negu $1, $7 -; CHECK-BE-NEXT: andi $3, $1, 63 -; CHECK-BE-NEXT: srlv $6, $4, $3 +; CHECK-BE-NEXT: andi $2, $1, 63 +; CHECK-BE-NEXT: srlv $3, $5, $2 +; CHECK-BE-NEXT: not $6, $2 +; CHECK-BE-NEXT: sll $8, $4, 1 +; CHECK-BE-NEXT: sllv $6, $8, $6 +; CHECK-BE-NEXT: or $6, $6, $3 +; CHECK-BE-NEXT: srlv $2, $4, $2 ; CHECK-BE-NEXT: andi $1, $1, 32 -; CHECK-BE-NEXT: andi $2, $7, 63 -; CHECK-BE-NEXT: move $8, $6 -; CHECK-BE-NEXT: movn $8, $zero, $1 -; CHECK-BE-NEXT: sllv $9, $4, $2 -; CHECK-BE-NEXT: srl $10, $5, 1 -; CHECK-BE-NEXT: not $11, $2 -; CHECK-BE-NEXT: srlv $10, $10, $11 -; CHECK-BE-NEXT: or $9, $9, $10 -; CHECK-BE-NEXT: sllv $10, $5, $2 +; CHECK-BE-NEXT: movn $6, $2, $1 +; CHECK-BE-NEXT: andi $8, $7, 63 +; CHECK-BE-NEXT: sllv $9, $5, $8 +; CHECK-BE-NEXT: or $3, $6, $9 ; CHECK-BE-NEXT: andi $7, $7, 32 -; CHECK-BE-NEXT: movn $9, $10, $7 -; CHECK-BE-NEXT: or $2, $9, $8 -; CHECK-BE-NEXT: srlv $5, $5, $3 -; CHECK-BE-NEXT: not $3, $3 -; CHECK-BE-NEXT: sll $4, $4, 1 -; CHECK-BE-NEXT: sllv $3, $4, $3 -; CHECK-BE-NEXT: or $3, $3, $5 -; CHECK-BE-NEXT: movn $3, $6, $1 -; CHECK-BE-NEXT: movn $10, $zero, $7 +; CHECK-BE-NEXT: movn $3, $6, $7 +; CHECK-BE-NEXT: movn $2, $zero, $1 +; CHECK-BE-NEXT: sllv $1, $4, $8 +; CHECK-BE-NEXT: srl $4, $5, 1 +; CHECK-BE-NEXT: not $5, $8 +; CHECK-BE-NEXT: srlv $4, $4, $5 +; CHECK-BE-NEXT: or $1, $1, $4 +; CHECK-BE-NEXT: movn $1, $9, $7 ; CHECK-BE-NEXT: jr $ra -; CHECK-BE-NEXT: or $3, $10, $3 +; CHECK-BE-NEXT: or $2, $1, $2 ; ; CHECK-LE-LABEL: rotl_i64: ; CHECK-LE: # %bb.0: ; CHECK-LE-NEXT: negu $1, $6 ; CHECK-LE-NEXT: andi $2, $1, 63 +; CHECK-LE-NEXT: srlv $3, $4, $2 +; CHECK-LE-NEXT: not $7, $2 +; CHECK-LE-NEXT: sll $8, $5, 1 +; CHECK-LE-NEXT: sllv $7, $8, $7 +; 
CHECK-LE-NEXT: or $3, $7, $3 ; CHECK-LE-NEXT: srlv $7, $5, $2 ; CHECK-LE-NEXT: andi $1, $1, 32 -; CHECK-LE-NEXT: andi $3, $6, 63 -; CHECK-LE-NEXT: move $8, $7 -; CHECK-LE-NEXT: movn $8, $zero, $1 -; CHECK-LE-NEXT: sllv $9, $5, $3 -; CHECK-LE-NEXT: srl $10, $4, 1 -; CHECK-LE-NEXT: not $11, $3 -; CHECK-LE-NEXT: srlv $10, $10, $11 -; CHECK-LE-NEXT: or $9, $9, $10 -; CHECK-LE-NEXT: sllv $10, $4, $3 +; CHECK-LE-NEXT: movn $3, $7, $1 +; CHECK-LE-NEXT: andi $8, $6, 63 +; CHECK-LE-NEXT: sllv $9, $4, $8 +; CHECK-LE-NEXT: or $2, $3, $9 ; CHECK-LE-NEXT: andi $6, $6, 32 -; CHECK-LE-NEXT: movn $9, $10, $6 -; CHECK-LE-NEXT: or $3, $9, $8 -; CHECK-LE-NEXT: srlv $4, $4, $2 -; CHECK-LE-NEXT: not $2, $2 -; CHECK-LE-NEXT: sll $5, $5, 1 -; CHECK-LE-NEXT: sllv $2, $5, $2 -; CHECK-LE-NEXT: or $2, $2, $4 -; CHECK-LE-NEXT: movn $2, $7, $1 -; CHECK-LE-NEXT: movn $10, $zero, $6 +; CHECK-LE-NEXT: movn $2, $3, $6 +; CHECK-LE-NEXT: movn $7, $zero, $1 +; CHECK-LE-NEXT: sllv $1, $5, $8 +; CHECK-LE-NEXT: srl $3, $4, 1 +; CHECK-LE-NEXT: not $4, $8 +; CHECK-LE-NEXT: srlv $3, $3, $4 +; CHECK-LE-NEXT: or $1, $1, $3 +; CHECK-LE-NEXT: movn $1, $9, $6 ; CHECK-LE-NEXT: jr $ra -; CHECK-LE-NEXT: or $2, $10, $2 +; CHECK-LE-NEXT: or $3, $1, $7 %f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %z) ret i64 %f } @@ -256,57 +254,55 @@ ; CHECK-BE: # %bb.0: ; CHECK-BE-NEXT: negu $1, $7 ; CHECK-BE-NEXT: andi $2, $1, 63 +; CHECK-BE-NEXT: sllv $3, $4, $2 +; CHECK-BE-NEXT: not $6, $2 +; CHECK-BE-NEXT: srl $8, $5, 1 +; CHECK-BE-NEXT: srlv $6, $8, $6 +; CHECK-BE-NEXT: or $3, $3, $6 ; CHECK-BE-NEXT: sllv $6, $5, $2 ; CHECK-BE-NEXT: andi $1, $1, 32 -; CHECK-BE-NEXT: andi $3, $7, 63 -; CHECK-BE-NEXT: move $8, $6 -; CHECK-BE-NEXT: movn $8, $zero, $1 -; CHECK-BE-NEXT: srlv $9, $5, $3 -; CHECK-BE-NEXT: sll $10, $4, 1 -; CHECK-BE-NEXT: not $11, $3 -; CHECK-BE-NEXT: sllv $10, $10, $11 -; CHECK-BE-NEXT: or $9, $10, $9 -; CHECK-BE-NEXT: srlv $10, $4, $3 +; CHECK-BE-NEXT: movn $3, $6, $1 +; CHECK-BE-NEXT: andi $8, $7, 63 +; CHECK-BE-NEXT: srlv $9, $4, $8 +; CHECK-BE-NEXT: or $2, $3, $9 ; CHECK-BE-NEXT: andi $7, $7, 32 -; CHECK-BE-NEXT: movn $9, $10, $7 -; CHECK-BE-NEXT: or $3, $9, $8 -; CHECK-BE-NEXT: sllv $4, $4, $2 -; CHECK-BE-NEXT: not $2, $2 -; CHECK-BE-NEXT: srl $5, $5, 1 -; CHECK-BE-NEXT: srlv $2, $5, $2 -; CHECK-BE-NEXT: or $2, $4, $2 -; CHECK-BE-NEXT: movn $2, $6, $1 -; CHECK-BE-NEXT: movn $10, $zero, $7 +; CHECK-BE-NEXT: movn $2, $3, $7 +; CHECK-BE-NEXT: movn $6, $zero, $1 +; CHECK-BE-NEXT: srlv $1, $5, $8 +; CHECK-BE-NEXT: sll $3, $4, 1 +; CHECK-BE-NEXT: not $4, $8 +; CHECK-BE-NEXT: sllv $3, $3, $4 +; CHECK-BE-NEXT: or $1, $3, $1 +; CHECK-BE-NEXT: movn $1, $9, $7 ; CHECK-BE-NEXT: jr $ra -; CHECK-BE-NEXT: or $2, $10, $2 +; CHECK-BE-NEXT: or $3, $1, $6 ; ; CHECK-LE-LABEL: rotr_i64: ; CHECK-LE: # %bb.0: ; CHECK-LE-NEXT: negu $1, $6 -; CHECK-LE-NEXT: andi $3, $1, 63 -; CHECK-LE-NEXT: sllv $7, $4, $3 +; CHECK-LE-NEXT: andi $2, $1, 63 +; CHECK-LE-NEXT: sllv $3, $5, $2 +; CHECK-LE-NEXT: not $7, $2 +; CHECK-LE-NEXT: srl $8, $4, 1 +; CHECK-LE-NEXT: srlv $7, $8, $7 +; CHECK-LE-NEXT: or $7, $3, $7 +; CHECK-LE-NEXT: sllv $2, $4, $2 ; CHECK-LE-NEXT: andi $1, $1, 32 -; CHECK-LE-NEXT: andi $2, $6, 63 -; CHECK-LE-NEXT: move $8, $7 -; CHECK-LE-NEXT: movn $8, $zero, $1 -; CHECK-LE-NEXT: srlv $9, $4, $2 -; CHECK-LE-NEXT: sll $10, $5, 1 -; CHECK-LE-NEXT: not $11, $2 -; CHECK-LE-NEXT: sllv $10, $10, $11 -; CHECK-LE-NEXT: or $9, $10, $9 -; CHECK-LE-NEXT: srlv $10, $5, $2 +; CHECK-LE-NEXT: movn $7, $2, $1 +; CHECK-LE-NEXT: andi $8, $6, 63 +; CHECK-LE-NEXT: srlv $9, $5, 
$8 +; CHECK-LE-NEXT: or $3, $7, $9 ; CHECK-LE-NEXT: andi $6, $6, 32 -; CHECK-LE-NEXT: movn $9, $10, $6 -; CHECK-LE-NEXT: or $2, $9, $8 -; CHECK-LE-NEXT: sllv $5, $5, $3 -; CHECK-LE-NEXT: not $3, $3 -; CHECK-LE-NEXT: srl $4, $4, 1 -; CHECK-LE-NEXT: srlv $3, $4, $3 -; CHECK-LE-NEXT: or $3, $5, $3 -; CHECK-LE-NEXT: movn $3, $7, $1 -; CHECK-LE-NEXT: movn $10, $zero, $6 +; CHECK-LE-NEXT: movn $3, $7, $6 +; CHECK-LE-NEXT: movn $2, $zero, $1 +; CHECK-LE-NEXT: srlv $1, $4, $8 +; CHECK-LE-NEXT: sll $4, $5, 1 +; CHECK-LE-NEXT: not $5, $8 +; CHECK-LE-NEXT: sllv $4, $4, $5 +; CHECK-LE-NEXT: or $1, $4, $1 +; CHECK-LE-NEXT: movn $1, $9, $6 ; CHECK-LE-NEXT: jr $ra -; CHECK-LE-NEXT: or $3, $10, $3 +; CHECK-LE-NEXT: or $2, $1, $2 %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z) ret i64 %f } diff --git a/llvm/test/CodeGen/Mips/funnel-shift.ll b/llvm/test/CodeGen/Mips/funnel-shift.ll --- a/llvm/test/CodeGen/Mips/funnel-shift.ll +++ b/llvm/test/CodeGen/Mips/funnel-shift.ll @@ -72,37 +72,37 @@ ; CHECK-BE-NEXT: jal __umoddi3 ; CHECK-BE-NEXT: addiu $7, $zero, 37 ; CHECK-BE-NEXT: not $1, $3 -; CHECK-BE-NEXT: andi $2, $3, 63 -; CHECK-BE-NEXT: not $4, $2 -; CHECK-BE-NEXT: srl $5, $18, 1 -; CHECK-BE-NEXT: sllv $6, $19, $2 -; CHECK-BE-NEXT: srlv $4, $5, $4 -; CHECK-BE-NEXT: andi $5, $1, 63 -; CHECK-BE-NEXT: srl $7, $16, 5 -; CHECK-BE-NEXT: sll $8, $17, 27 -; CHECK-BE-NEXT: or $7, $8, $7 -; CHECK-BE-NEXT: srl $8, $7, 1 -; CHECK-BE-NEXT: srlv $9, $8, $5 -; CHECK-BE-NEXT: andi $1, $1, 32 -; CHECK-BE-NEXT: move $10, $9 -; CHECK-BE-NEXT: movn $10, $zero, $1 -; CHECK-BE-NEXT: or $4, $6, $4 -; CHECK-BE-NEXT: sllv $6, $18, $2 +; CHECK-BE-NEXT: andi $2, $1, 63 +; CHECK-BE-NEXT: srl $4, $16, 5 +; CHECK-BE-NEXT: sll $5, $17, 27 +; CHECK-BE-NEXT: or $4, $5, $4 +; CHECK-BE-NEXT: sll $5, $4, 31 +; CHECK-BE-NEXT: sll $6, $16, 27 +; CHECK-BE-NEXT: srl $6, $6, 1 +; CHECK-BE-NEXT: or $5, $6, $5 +; CHECK-BE-NEXT: srlv $5, $5, $2 +; CHECK-BE-NEXT: not $6, $2 +; CHECK-BE-NEXT: srl $4, $4, 1 +; CHECK-BE-NEXT: sll $7, $4, 1 +; CHECK-BE-NEXT: sllv $6, $7, $6 +; CHECK-BE-NEXT: or $5, $6, $5 +; CHECK-BE-NEXT: srlv $2, $4, $2 +; CHECK-BE-NEXT: andi $4, $1, 32 +; CHECK-BE-NEXT: movn $5, $2, $4 +; CHECK-BE-NEXT: andi $6, $3, 63 +; CHECK-BE-NEXT: sllv $7, $18, $6 +; CHECK-BE-NEXT: or $1, $5, $7 ; CHECK-BE-NEXT: andi $3, $3, 32 -; CHECK-BE-NEXT: movn $4, $6, $3 -; CHECK-BE-NEXT: sll $7, $7, 31 -; CHECK-BE-NEXT: sll $2, $16, 27 -; CHECK-BE-NEXT: srl $11, $2, 1 -; CHECK-BE-NEXT: or $2, $4, $10 -; CHECK-BE-NEXT: movn $6, $zero, $3 -; CHECK-BE-NEXT: or $3, $11, $7 -; CHECK-BE-NEXT: srlv $3, $3, $5 -; CHECK-BE-NEXT: not $4, $5 -; CHECK-BE-NEXT: sll $5, $8, 1 -; CHECK-BE-NEXT: sllv $4, $5, $4 -; CHECK-BE-NEXT: or $3, $4, $3 -; CHECK-BE-NEXT: movn $3, $9, $1 -; CHECK-BE-NEXT: or $3, $6, $3 +; CHECK-BE-NEXT: movn $1, $5, $3 +; CHECK-BE-NEXT: movn $2, $zero, $4 +; CHECK-BE-NEXT: sllv $4, $19, $6 +; CHECK-BE-NEXT: not $5, $6 +; CHECK-BE-NEXT: srl $6, $18, 1 +; CHECK-BE-NEXT: srlv $5, $6, $5 +; CHECK-BE-NEXT: or $4, $4, $5 +; CHECK-BE-NEXT: movn $4, $7, $3 +; CHECK-BE-NEXT: or $2, $4, $2 +; CHECK-BE-NEXT: move $3, $1 ; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload ; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload @@ -135,37 +135,37 @@ ; CHECK-LE-NEXT: jal __umoddi3 ; CHECK-LE-NEXT: addiu $7, $zero, 0 ; CHECK-LE-NEXT: not $1, $2 -; CHECK-LE-NEXT: andi $3, $2, 63 -; CHECK-LE-NEXT: not $4, $3 -; CHECK-LE-NEXT: srl $5, $19, 1 -; CHECK-LE-NEXT: sllv $6, $18, $3 -; CHECK-LE-NEXT: srlv $4, 
$5, $4 -; CHECK-LE-NEXT: andi $5, $1, 63 -; CHECK-LE-NEXT: srl $7, $17, 5 -; CHECK-LE-NEXT: sll $8, $16, 27 -; CHECK-LE-NEXT: or $7, $8, $7 -; CHECK-LE-NEXT: srl $8, $7, 1 -; CHECK-LE-NEXT: srlv $9, $8, $5 -; CHECK-LE-NEXT: andi $1, $1, 32 -; CHECK-LE-NEXT: move $10, $9 -; CHECK-LE-NEXT: movn $10, $zero, $1 -; CHECK-LE-NEXT: or $4, $6, $4 -; CHECK-LE-NEXT: sllv $6, $19, $3 +; CHECK-LE-NEXT: andi $3, $1, 63 +; CHECK-LE-NEXT: srl $4, $17, 5 +; CHECK-LE-NEXT: sll $5, $16, 27 +; CHECK-LE-NEXT: or $4, $5, $4 +; CHECK-LE-NEXT: sll $5, $4, 31 +; CHECK-LE-NEXT: sll $6, $17, 27 +; CHECK-LE-NEXT: srl $6, $6, 1 +; CHECK-LE-NEXT: or $5, $6, $5 +; CHECK-LE-NEXT: srlv $5, $5, $3 +; CHECK-LE-NEXT: not $6, $3 +; CHECK-LE-NEXT: srl $4, $4, 1 +; CHECK-LE-NEXT: sll $7, $4, 1 +; CHECK-LE-NEXT: sllv $6, $7, $6 +; CHECK-LE-NEXT: or $5, $6, $5 +; CHECK-LE-NEXT: srlv $3, $4, $3 +; CHECK-LE-NEXT: andi $4, $1, 32 +; CHECK-LE-NEXT: movn $5, $3, $4 +; CHECK-LE-NEXT: andi $6, $2, 63 +; CHECK-LE-NEXT: sllv $7, $19, $6 +; CHECK-LE-NEXT: or $1, $5, $7 ; CHECK-LE-NEXT: andi $2, $2, 32 -; CHECK-LE-NEXT: movn $4, $6, $2 -; CHECK-LE-NEXT: sll $7, $7, 31 -; CHECK-LE-NEXT: sll $3, $17, 27 -; CHECK-LE-NEXT: srl $11, $3, 1 -; CHECK-LE-NEXT: or $3, $4, $10 -; CHECK-LE-NEXT: movn $6, $zero, $2 -; CHECK-LE-NEXT: or $2, $11, $7 -; CHECK-LE-NEXT: srlv $2, $2, $5 -; CHECK-LE-NEXT: not $4, $5 -; CHECK-LE-NEXT: sll $5, $8, 1 -; CHECK-LE-NEXT: sllv $4, $5, $4 -; CHECK-LE-NEXT: or $2, $4, $2 -; CHECK-LE-NEXT: movn $2, $9, $1 -; CHECK-LE-NEXT: or $2, $6, $2 +; CHECK-LE-NEXT: movn $1, $5, $2 +; CHECK-LE-NEXT: movn $3, $zero, $4 +; CHECK-LE-NEXT: sllv $4, $18, $6 +; CHECK-LE-NEXT: not $5, $6 +; CHECK-LE-NEXT: srl $6, $19, 1 +; CHECK-LE-NEXT: srlv $5, $6, $5 +; CHECK-LE-NEXT: or $4, $4, $5 +; CHECK-LE-NEXT: movn $4, $7, $2 +; CHECK-LE-NEXT: or $3, $4, $3 +; CHECK-LE-NEXT: move $2, $1 ; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload ; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload @@ -339,39 +339,38 @@ ; CHECK-BE-NEXT: addiu $7, $zero, 37 ; CHECK-BE-NEXT: addiu $1, $3, 27 ; CHECK-BE-NEXT: andi $2, $1, 63 -; CHECK-BE-NEXT: not $3, $2 -; CHECK-BE-NEXT: srl $4, $16, 5 -; CHECK-BE-NEXT: sll $5, $17, 27 -; CHECK-BE-NEXT: or $4, $5, $4 -; CHECK-BE-NEXT: sll $5, $4, 1 -; CHECK-BE-NEXT: sll $6, $16, 27 -; CHECK-BE-NEXT: srlv $6, $6, $2 -; CHECK-BE-NEXT: sllv $3, $5, $3 -; CHECK-BE-NEXT: not $5, $1 -; CHECK-BE-NEXT: andi $7, $5, 63 -; CHECK-BE-NEXT: sll $8, $18, 1 -; CHECK-BE-NEXT: sllv $8, $8, $7 -; CHECK-BE-NEXT: andi $5, $5, 32 -; CHECK-BE-NEXT: move $9, $8 -; CHECK-BE-NEXT: movn $9, $zero, $5 -; CHECK-BE-NEXT: or $3, $3, $6 -; CHECK-BE-NEXT: srlv $2, $4, $2 +; CHECK-BE-NEXT: sll $3, $16, 27 +; CHECK-BE-NEXT: srlv $3, $3, $2 +; CHECK-BE-NEXT: not $4, $2 +; CHECK-BE-NEXT: srl $5, $16, 5 +; CHECK-BE-NEXT: sll $6, $17, 27 +; CHECK-BE-NEXT: or $5, $6, $5 +; CHECK-BE-NEXT: sll $6, $5, 1 +; CHECK-BE-NEXT: sllv $4, $6, $4 +; CHECK-BE-NEXT: or $4, $4, $3 +; CHECK-BE-NEXT: srlv $2, $5, $2 +; CHECK-BE-NEXT: andi $5, $1, 32 +; CHECK-BE-NEXT: movn $4, $2, $5 +; CHECK-BE-NEXT: not $1, $1 +; CHECK-BE-NEXT: andi $6, $1, 63 +; CHECK-BE-NEXT: sll $3, $18, 1 +; CHECK-BE-NEXT: sllv $7, $3, $6 +; CHECK-BE-NEXT: or $3, $4, $7 ; CHECK-BE-NEXT: andi $1, $1, 32 -; CHECK-BE-NEXT: movn $3, $2, $1 -; CHECK-BE-NEXT: srl $4, $18, 31 -; CHECK-BE-NEXT: sll $6, $19, 1 -; CHECK-BE-NEXT: or $4, $6, $4 -; CHECK-BE-NEXT: or $3, $9, $3 -; CHECK-BE-NEXT: movn $2, $zero, $1 -; CHECK-BE-NEXT: sllv $1, $4, $7 -; 
CHECK-BE-NEXT: not $4, $7 +; CHECK-BE-NEXT: srl $8, $18, 31 +; CHECK-BE-NEXT: sll $9, $19, 1 +; CHECK-BE-NEXT: or $8, $9, $8 +; CHECK-BE-NEXT: movn $3, $4, $1 +; CHECK-BE-NEXT: movn $2, $zero, $5 +; CHECK-BE-NEXT: sllv $4, $8, $6 +; CHECK-BE-NEXT: not $5, $6 ; CHECK-BE-NEXT: lui $6, 32767 ; CHECK-BE-NEXT: ori $6, $6, 65535 ; CHECK-BE-NEXT: and $6, $18, $6 -; CHECK-BE-NEXT: srlv $4, $6, $4 -; CHECK-BE-NEXT: or $1, $1, $4 -; CHECK-BE-NEXT: movn $1, $8, $5 -; CHECK-BE-NEXT: or $2, $1, $2 +; CHECK-BE-NEXT: srlv $5, $6, $5 +; CHECK-BE-NEXT: or $4, $4, $5 +; CHECK-BE-NEXT: movn $4, $7, $1 +; CHECK-BE-NEXT: or $2, $4, $2 ; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload ; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload @@ -405,39 +404,38 @@ ; CHECK-LE-NEXT: addiu $7, $zero, 0 ; CHECK-LE-NEXT: addiu $1, $2, 27 ; CHECK-LE-NEXT: andi $2, $1, 63 -; CHECK-LE-NEXT: not $3, $2 -; CHECK-LE-NEXT: srl $4, $17, 5 -; CHECK-LE-NEXT: sll $5, $16, 27 -; CHECK-LE-NEXT: or $4, $5, $4 -; CHECK-LE-NEXT: sll $5, $4, 1 -; CHECK-LE-NEXT: sll $6, $17, 27 -; CHECK-LE-NEXT: srlv $6, $6, $2 -; CHECK-LE-NEXT: sllv $3, $5, $3 -; CHECK-LE-NEXT: not $5, $1 -; CHECK-LE-NEXT: andi $7, $5, 63 -; CHECK-LE-NEXT: sll $8, $19, 1 -; CHECK-LE-NEXT: sllv $8, $8, $7 -; CHECK-LE-NEXT: andi $5, $5, 32 -; CHECK-LE-NEXT: move $9, $8 -; CHECK-LE-NEXT: movn $9, $zero, $5 -; CHECK-LE-NEXT: or $3, $3, $6 -; CHECK-LE-NEXT: srlv $4, $4, $2 +; CHECK-LE-NEXT: sll $3, $17, 27 +; CHECK-LE-NEXT: srlv $3, $3, $2 +; CHECK-LE-NEXT: not $4, $2 +; CHECK-LE-NEXT: srl $5, $17, 5 +; CHECK-LE-NEXT: sll $6, $16, 27 +; CHECK-LE-NEXT: or $5, $6, $5 +; CHECK-LE-NEXT: sll $6, $5, 1 +; CHECK-LE-NEXT: sllv $4, $6, $4 +; CHECK-LE-NEXT: or $3, $4, $3 +; CHECK-LE-NEXT: srlv $4, $5, $2 +; CHECK-LE-NEXT: andi $5, $1, 32 +; CHECK-LE-NEXT: movn $3, $4, $5 +; CHECK-LE-NEXT: not $1, $1 +; CHECK-LE-NEXT: andi $6, $1, 63 +; CHECK-LE-NEXT: sll $2, $19, 1 +; CHECK-LE-NEXT: sllv $7, $2, $6 +; CHECK-LE-NEXT: or $2, $3, $7 ; CHECK-LE-NEXT: andi $1, $1, 32 -; CHECK-LE-NEXT: movn $3, $4, $1 -; CHECK-LE-NEXT: srl $2, $19, 31 -; CHECK-LE-NEXT: sll $6, $18, 1 -; CHECK-LE-NEXT: or $6, $6, $2 -; CHECK-LE-NEXT: or $2, $9, $3 -; CHECK-LE-NEXT: movn $4, $zero, $1 -; CHECK-LE-NEXT: sllv $1, $6, $7 -; CHECK-LE-NEXT: not $3, $7 +; CHECK-LE-NEXT: srl $8, $19, 31 +; CHECK-LE-NEXT: sll $9, $18, 1 +; CHECK-LE-NEXT: or $8, $9, $8 +; CHECK-LE-NEXT: movn $2, $3, $1 +; CHECK-LE-NEXT: movn $4, $zero, $5 +; CHECK-LE-NEXT: sllv $3, $8, $6 +; CHECK-LE-NEXT: not $5, $6 ; CHECK-LE-NEXT: lui $6, 32767 ; CHECK-LE-NEXT: ori $6, $6, 65535 ; CHECK-LE-NEXT: and $6, $19, $6 -; CHECK-LE-NEXT: srlv $3, $6, $3 -; CHECK-LE-NEXT: or $1, $1, $3 -; CHECK-LE-NEXT: movn $1, $8, $5 -; CHECK-LE-NEXT: or $3, $1, $4 +; CHECK-LE-NEXT: srlv $5, $6, $5 +; CHECK-LE-NEXT: or $3, $3, $5 +; CHECK-LE-NEXT: movn $3, $7, $1 +; CHECK-LE-NEXT: or $3, $3, $4 ; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload ; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll --- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll @@ -274,21 +274,21 @@ define signext i64 @ashr_i64(i64 signext %a, i64 signext %b) { ; MIPS-LABEL: ashr_i64: -; MIPS: # %bb.0: -; MIPS-NEXT: andi $1, $7, 32 -; MIPS-NEXT: bnez $1, $BB4_2 -; MIPS-NEXT: srav $3, $4, $7 -; MIPS-NEXT: # %bb.1: -; MIPS-NEXT: srlv $1, $5, $7 +; 
MIPS: # %bb.0: # %entry +; MIPS-NEXT: andi $1, $7, 32 +; MIPS-NEXT: bnez $1, $BB4_2 +; MIPS-NEXT: srav $3, $4, $7 +; MIPS-NEXT: # %bb.1: # %entry +; MIPS-NEXT: srlv $1, $5, $7 ; MIPS-NEXT: not $2, $7 ; MIPS-NEXT: sll $4, $4, 1 -; MIPS-NEXT: sllv $2, $4, $2 -; MIPS-NEXT: or $1, $2, $1 -; MIPS-NEXT: move $2, $3 -; MIPS-NEXT: jr $ra -; MIPS-NEXT: move $3, $1 +; MIPS-NEXT: sllv $2, $4, $2 +; MIPS-NEXT: or $1, $2, $1 +; MIPS-NEXT: move $2, $3 +; MIPS-NEXT: jr $ra +; MIPS-NEXT: move $3, $1 ; MIPS-NEXT: $BB4_2: -; MIPS-NEXT: jr $ra +; MIPS-NEXT: jr $ra ; MIPS-NEXT: sra $2, $4, 31 ; ; MIPS32-LABEL: ashr_i64: @@ -395,221 +395,221 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; MIPS-LABEL: ashr_i128: -; MIPS: # %bb.0: -; MIPS-NEXT: lw $2, 28($sp) +; MIPS: # %bb.0: # %entry +; MIPS-NEXT: lw $2, 28($sp) ; MIPS-NEXT: addiu $1, $zero, 64 -; MIPS-NEXT: subu $9, $1, $2 -; MIPS-NEXT: sllv $10, $5, $9 -; MIPS-NEXT: andi $13, $9, 32 -; MIPS-NEXT: andi $3, $2, 32 -; MIPS-NEXT: addiu $11, $zero, 0 -; MIPS-NEXT: bnez $13, $BB5_2 -; MIPS-NEXT: addiu $12, $zero, 0 -; MIPS-NEXT: # %bb.1: -; MIPS-NEXT: move $12, $10 -; MIPS-NEXT: $BB5_2: -; MIPS-NEXT: not $8, $2 -; MIPS-NEXT: bnez $3, $BB5_5 -; MIPS-NEXT: srlv $14, $6, $2 -; MIPS-NEXT: # %bb.3: +; MIPS-NEXT: subu $3, $1, $2 +; MIPS-NEXT: sllv $9, $5, $3 +; MIPS-NEXT: andi $12, $3, 32 +; MIPS-NEXT: bnez $12, $BB5_2 +; MIPS-NEXT: move $11, $9 +; MIPS-NEXT: # %bb.1: # %entry +; MIPS-NEXT: sllv $1, $4, $3 +; MIPS-NEXT: not $3, $3 +; MIPS-NEXT: srl $8, $5, 1 +; MIPS-NEXT: srlv $3, $8, $3 +; MIPS-NEXT: or $11, $1, $3 +; MIPS-NEXT: $BB5_2: # %entry +; MIPS-NEXT: not $3, $2 +; MIPS-NEXT: srlv $13, $6, $2 +; MIPS-NEXT: andi $8, $2, 32 +; MIPS-NEXT: bnez $8, $BB5_5 +; MIPS-NEXT: addiu $14, $zero, 0 +; MIPS-NEXT: # %bb.3: # %entry ; MIPS-NEXT: sll $1, $6, 1 -; MIPS-NEXT: srlv $11, $7, $2 -; MIPS-NEXT: sllv $1, $1, $8 -; MIPS-NEXT: or $15, $1, $11 -; MIPS-NEXT: bnez $13, $BB5_7 -; MIPS-NEXT: move $11, $14 -; MIPS-NEXT: # %bb.4: +; MIPS-NEXT: srlv $10, $7, $2 +; MIPS-NEXT: sllv $1, $1, $3 +; MIPS-NEXT: or $10, $1, $10 +; MIPS-NEXT: move $14, $13 +; MIPS-NEXT: bnez $12, $BB5_7 +; MIPS-NEXT: addiu $13, $2, -64 +; MIPS-NEXT: # %bb.4: # %entry ; MIPS-NEXT: b $BB5_6 ; MIPS-NEXT: nop ; MIPS-NEXT: $BB5_5: -; MIPS-NEXT: bnez $13, $BB5_7 -; MIPS-NEXT: move $15, $14 -; MIPS-NEXT: $BB5_6: -; MIPS-NEXT: sllv $1, $4, $9 -; MIPS-NEXT: not $9, $9 -; MIPS-NEXT: srl $10, $5, 1 -; MIPS-NEXT: srlv $9, $10, $9 -; MIPS-NEXT: or $10, $1, $9 -; MIPS-NEXT: $BB5_7: -; MIPS-NEXT: addiu $24, $2, -64 -; MIPS-NEXT: sll $13, $4, 1 -; MIPS-NEXT: srav $14, $4, $24 -; MIPS-NEXT: andi $1, $24, 32 -; MIPS-NEXT: bnez $1, $BB5_10 +; MIPS-NEXT: move $10, $13 +; MIPS-NEXT: bnez $12, $BB5_7 +; MIPS-NEXT: addiu $13, $2, -64 +; MIPS-NEXT: $BB5_6: # %entry +; MIPS-NEXT: or $10, $10, $9 +; MIPS-NEXT: $BB5_7: # %entry +; MIPS-NEXT: sll $12, $4, 1 +; MIPS-NEXT: srav $25, $4, $13 +; MIPS-NEXT: andi $1, $13, 32 +; MIPS-NEXT: bnez $1, $BB5_10 ; MIPS-NEXT: sra $9, $4, 31 -; MIPS-NEXT: # %bb.8: -; MIPS-NEXT: srlv $1, $5, $24 -; MIPS-NEXT: not $24, $24 -; MIPS-NEXT: sllv $24, $13, $24 -; MIPS-NEXT: or $25, $24, $1 -; MIPS-NEXT: move $24, $14 -; MIPS-NEXT: sltiu $14, $2, 64 -; MIPS-NEXT: beqz $14, $BB5_12 -; MIPS-NEXT: nop -; MIPS-NEXT: # %bb.9: +; MIPS-NEXT: # %bb.8: # %entry +; MIPS-NEXT: srlv $1, $5, $13 +; MIPS-NEXT: not $13, $13 +; MIPS-NEXT: sllv $13, $12, $13 +; MIPS-NEXT: or $15, $13, $1 +; MIPS-NEXT: sltiu $13, $2, 64 +; MIPS-NEXT: beqz $13, $BB5_12 +; MIPS-NEXT: move $24, $25 +; MIPS-NEXT: # 
%bb.9: # %entry ; MIPS-NEXT: b $BB5_11 ; MIPS-NEXT: nop ; MIPS-NEXT: $BB5_10: -; MIPS-NEXT: move $25, $14 -; MIPS-NEXT: sltiu $14, $2, 64 -; MIPS-NEXT: beqz $14, $BB5_12 -; MIPS-NEXT: move $24, $9 +; MIPS-NEXT: move $24, $9 +; MIPS-NEXT: sltiu $13, $2, 64 +; MIPS-NEXT: beqz $13, $BB5_12 +; MIPS-NEXT: move $15, $25 ; MIPS-NEXT: $BB5_11: -; MIPS-NEXT: or $25, $15, $12 -; MIPS-NEXT: $BB5_12: -; MIPS-NEXT: sltiu $12, $2, 1 -; MIPS-NEXT: beqz $12, $BB5_18 +; MIPS-NEXT: or $24, $14, $11 +; MIPS-NEXT: $BB5_12: # %entry +; MIPS-NEXT: sltiu $11, $2, 1 +; MIPS-NEXT: beqz $11, $BB5_18 ; MIPS-NEXT: nop -; MIPS-NEXT: # %bb.13: -; MIPS-NEXT: bnez $14, $BB5_19 +; MIPS-NEXT: # %bb.13: # %entry +; MIPS-NEXT: beqz $13, $BB5_19 ; MIPS-NEXT: nop -; MIPS-NEXT: $BB5_14: -; MIPS-NEXT: beqz $12, $BB5_20 +; MIPS-NEXT: $BB5_14: # %entry +; MIPS-NEXT: beqz $11, $BB5_20 ; MIPS-NEXT: nop -; MIPS-NEXT: $BB5_15: -; MIPS-NEXT: bnez $3, $BB5_21 -; MIPS-NEXT: srav $4, $4, $2 -; MIPS-NEXT: $BB5_16: -; MIPS-NEXT: srlv $1, $5, $2 -; MIPS-NEXT: sllv $2, $13, $8 -; MIPS-NEXT: or $3, $2, $1 -; MIPS-NEXT: bnez $14, $BB5_23 -; MIPS-NEXT: move $2, $4 -; MIPS-NEXT: # %bb.17: +; MIPS-NEXT: $BB5_15: # %entry +; MIPS-NEXT: bnez $8, $BB5_21 +; MIPS-NEXT: srav $4, $4, $2 +; MIPS-NEXT: $BB5_16: # %entry +; MIPS-NEXT: srlv $1, $5, $2 +; MIPS-NEXT: sllv $2, $12, $3 +; MIPS-NEXT: or $3, $2, $1 +; MIPS-NEXT: bnez $13, $BB5_23 +; MIPS-NEXT: move $2, $4 +; MIPS-NEXT: # %bb.17: # %entry ; MIPS-NEXT: b $BB5_22 ; MIPS-NEXT: nop -; MIPS-NEXT: $BB5_18: -; MIPS-NEXT: beqz $14, $BB5_14 -; MIPS-NEXT: move $7, $25 -; MIPS-NEXT: $BB5_19: -; MIPS-NEXT: bnez $12, $BB5_15 -; MIPS-NEXT: or $24, $11, $10 -; MIPS-NEXT: $BB5_20: -; MIPS-NEXT: move $6, $24 -; MIPS-NEXT: beqz $3, $BB5_16 -; MIPS-NEXT: srav $4, $4, $2 +; MIPS-NEXT: $BB5_18: # %entry +; MIPS-NEXT: bnez $13, $BB5_14 +; MIPS-NEXT: move $6, $24 +; MIPS-NEXT: $BB5_19: # %entry +; MIPS-NEXT: bnez $11, $BB5_15 +; MIPS-NEXT: move $10, $15 +; MIPS-NEXT: $BB5_20: # %entry +; MIPS-NEXT: move $7, $10 +; MIPS-NEXT: beqz $8, $BB5_16 +; MIPS-NEXT: srav $4, $4, $2 ; MIPS-NEXT: $BB5_21: -; MIPS-NEXT: move $2, $9 -; MIPS-NEXT: bnez $14, $BB5_23 -; MIPS-NEXT: move $3, $4 -; MIPS-NEXT: $BB5_22: -; MIPS-NEXT: move $2, $9 -; MIPS-NEXT: $BB5_23: -; MIPS-NEXT: bnez $14, $BB5_25 +; MIPS-NEXT: move $2, $9 +; MIPS-NEXT: bnez $13, $BB5_23 +; MIPS-NEXT: move $3, $4 +; MIPS-NEXT: $BB5_22: # %entry +; MIPS-NEXT: move $2, $9 +; MIPS-NEXT: $BB5_23: # %entry +; MIPS-NEXT: bnez $13, $BB5_25 ; MIPS-NEXT: nop -; MIPS-NEXT: # %bb.24: -; MIPS-NEXT: move $3, $9 -; MIPS-NEXT: $BB5_25: -; MIPS-NEXT: move $4, $6 -; MIPS-NEXT: jr $ra -; MIPS-NEXT: move $5, $7 +; MIPS-NEXT: # %bb.24: # %entry +; MIPS-NEXT: move $3, $9 +; MIPS-NEXT: $BB5_25: # %entry +; MIPS-NEXT: move $4, $6 +; MIPS-NEXT: jr $ra +; MIPS-NEXT: move $5, $7 ; ; MIPS32-LABEL: ashr_i128: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: lw $9, 28($sp) -; MIPS32-NEXT: srlv $1, $7, $9 -; MIPS32-NEXT: not $2, $9 -; MIPS32-NEXT: sll $3, $6, 1 -; MIPS32-NEXT: sllv $3, $3, $2 +; MIPS32-NEXT: lw $3, 28($sp) +; MIPS32-NEXT: srlv $1, $7, $3 +; MIPS32-NEXT: not $9, $3 +; MIPS32-NEXT: sll $2, $6, 1 +; MIPS32-NEXT: sllv $2, $2, $9 ; MIPS32-NEXT: addiu $8, $zero, 64 -; MIPS32-NEXT: or $1, $3, $1 -; MIPS32-NEXT: srlv $10, $6, $9 -; MIPS32-NEXT: subu $3, $8, $9 -; MIPS32-NEXT: sllv $11, $5, $3 +; MIPS32-NEXT: or $1, $2, $1 +; MIPS32-NEXT: srlv $2, $6, $3 +; MIPS32-NEXT: subu $8, $8, $3 +; MIPS32-NEXT: sllv $10, $5, $8 +; MIPS32-NEXT: andi $11, $8, 32 ; MIPS32-NEXT: andi $12, $3, 32 -; MIPS32-NEXT: 
andi $13, $9, 32 -; MIPS32-NEXT: move $8, $11 -; MIPS32-NEXT: movn $8, $zero, $12 -; MIPS32-NEXT: movn $1, $10, $13 -; MIPS32-NEXT: addiu $14, $9, -64 -; MIPS32-NEXT: srlv $15, $5, $14 -; MIPS32-NEXT: sll $24, $4, 1 -; MIPS32-NEXT: not $25, $14 -; MIPS32-NEXT: sllv $25, $24, $25 -; MIPS32-NEXT: or $gp, $1, $8 -; MIPS32-NEXT: or $1, $25, $15 -; MIPS32-NEXT: srav $8, $4, $14 -; MIPS32-NEXT: andi $14, $14, 32 +; MIPS32-NEXT: move $13, $10 +; MIPS32-NEXT: movn $13, $zero, $11 +; MIPS32-NEXT: movn $1, $2, $12 +; MIPS32-NEXT: sllv $14, $4, $8 +; MIPS32-NEXT: not $8, $8 +; MIPS32-NEXT: srl $15, $5, 1 +; MIPS32-NEXT: srlv $8, $15, $8 +; MIPS32-NEXT: or $8, $14, $8 +; MIPS32-NEXT: movn $8, $10, $11 +; MIPS32-NEXT: or $10, $8, $2 +; MIPS32-NEXT: addiu $2, $3, -64 +; MIPS32-NEXT: movn $10, $8, $12 +; MIPS32-NEXT: or $11, $1, $13 +; MIPS32-NEXT: srlv $1, $5, $2 +; MIPS32-NEXT: sll $13, $4, 1 +; MIPS32-NEXT: not $8, $2 +; MIPS32-NEXT: sllv $8, $13, $8 +; MIPS32-NEXT: or $1, $8, $1 +; MIPS32-NEXT: srav $8, $4, $2 +; MIPS32-NEXT: andi $14, $2, 32 ; MIPS32-NEXT: movn $1, $8, $14 -; MIPS32-NEXT: sllv $15, $4, $3 -; MIPS32-NEXT: not $3, $3 -; MIPS32-NEXT: srl $25, $5, 1 -; MIPS32-NEXT: srlv $3, $25, $3 -; MIPS32-NEXT: sltiu $25, $9, 64 -; MIPS32-NEXT: movn $1, $gp, $25 -; MIPS32-NEXT: or $15, $15, $3 -; MIPS32-NEXT: srlv $3, $5, $9 -; MIPS32-NEXT: sllv $2, $24, $2 -; MIPS32-NEXT: or $5, $2, $3 -; MIPS32-NEXT: srav $24, $4, $9 -; MIPS32-NEXT: movn $5, $24, $13 +; MIPS32-NEXT: sltiu $15, $3, 64 +; MIPS32-NEXT: movn $1, $11, $15 +; MIPS32-NEXT: movz $1, $7, $3 +; MIPS32-NEXT: srav $7, $4, $3 +; MIPS32-NEXT: srlv $5, $5, $3 ; MIPS32-NEXT: sra $2, $4, 31 -; MIPS32-NEXT: movz $1, $7, $9 -; MIPS32-NEXT: move $3, $2 -; MIPS32-NEXT: movn $3, $5, $25 -; MIPS32-NEXT: movn $15, $11, $12 -; MIPS32-NEXT: movn $10, $zero, $13 -; MIPS32-NEXT: or $4, $10, $15 ; MIPS32-NEXT: movn $8, $2, $14 -; MIPS32-NEXT: movn $8, $4, $25 -; MIPS32-NEXT: movz $8, $6, $9 -; MIPS32-NEXT: movn $24, $2, $13 -; MIPS32-NEXT: movn $2, $24, $25 +; MIPS32-NEXT: movn $8, $10, $15 +; MIPS32-NEXT: sllv $4, $13, $9 +; MIPS32-NEXT: movz $8, $6, $3 +; MIPS32-NEXT: or $4, $4, $5 +; MIPS32-NEXT: movn $4, $7, $12 +; MIPS32-NEXT: move $3, $2 +; MIPS32-NEXT: movn $3, $4, $15 +; MIPS32-NEXT: movn $7, $2, $12 +; MIPS32-NEXT: movn $2, $7, $15 ; MIPS32-NEXT: move $4, $8 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: move $5, $1 ; ; 32R2-LABEL: ashr_i128: ; 32R2: # %bb.0: # %entry -; 32R2-NEXT: lw $9, 28($sp) -; 32R2-NEXT: srlv $1, $7, $9 -; 32R2-NEXT: not $2, $9 -; 32R2-NEXT: sll $3, $6, 1 -; 32R2-NEXT: sllv $3, $3, $2 +; 32R2-NEXT: lw $3, 28($sp) +; 32R2-NEXT: srlv $1, $7, $3 +; 32R2-NEXT: not $9, $3 +; 32R2-NEXT: sll $2, $6, 1 +; 32R2-NEXT: sllv $2, $2, $9 ; 32R2-NEXT: addiu $8, $zero, 64 -; 32R2-NEXT: or $1, $3, $1 -; 32R2-NEXT: srlv $10, $6, $9 -; 32R2-NEXT: subu $3, $8, $9 -; 32R2-NEXT: sllv $11, $5, $3 +; 32R2-NEXT: or $1, $2, $1 +; 32R2-NEXT: srlv $2, $6, $3 +; 32R2-NEXT: subu $8, $8, $3 +; 32R2-NEXT: sllv $10, $5, $8 +; 32R2-NEXT: andi $11, $8, 32 ; 32R2-NEXT: andi $12, $3, 32 -; 32R2-NEXT: andi $13, $9, 32 -; 32R2-NEXT: move $8, $11 -; 32R2-NEXT: movn $8, $zero, $12 -; 32R2-NEXT: movn $1, $10, $13 -; 32R2-NEXT: addiu $14, $9, -64 -; 32R2-NEXT: srlv $15, $5, $14 -; 32R2-NEXT: sll $24, $4, 1 -; 32R2-NEXT: not $25, $14 -; 32R2-NEXT: sllv $25, $24, $25 -; 32R2-NEXT: or $gp, $1, $8 -; 32R2-NEXT: or $1, $25, $15 -; 32R2-NEXT: srav $8, $4, $14 -; 32R2-NEXT: andi $14, $14, 32 +; 32R2-NEXT: move $13, $10 +; 32R2-NEXT: movn $13, $zero, $11 +; 32R2-NEXT: movn $1, $2, $12 +; 
32R2-NEXT: sllv $14, $4, $8 +; 32R2-NEXT: not $8, $8 +; 32R2-NEXT: srl $15, $5, 1 +; 32R2-NEXT: srlv $8, $15, $8 +; 32R2-NEXT: or $8, $14, $8 +; 32R2-NEXT: movn $8, $10, $11 +; 32R2-NEXT: or $10, $8, $2 +; 32R2-NEXT: addiu $2, $3, -64 +; 32R2-NEXT: movn $10, $8, $12 +; 32R2-NEXT: or $11, $1, $13 +; 32R2-NEXT: srlv $1, $5, $2 +; 32R2-NEXT: sll $13, $4, 1 +; 32R2-NEXT: not $8, $2 +; 32R2-NEXT: sllv $8, $13, $8 +; 32R2-NEXT: or $1, $8, $1 +; 32R2-NEXT: srav $8, $4, $2 +; 32R2-NEXT: andi $14, $2, 32 ; 32R2-NEXT: movn $1, $8, $14 -; 32R2-NEXT: sllv $15, $4, $3 -; 32R2-NEXT: not $3, $3 -; 32R2-NEXT: srl $25, $5, 1 -; 32R2-NEXT: srlv $3, $25, $3 -; 32R2-NEXT: sltiu $25, $9, 64 -; 32R2-NEXT: movn $1, $gp, $25 -; 32R2-NEXT: or $15, $15, $3 -; 32R2-NEXT: srlv $3, $5, $9 -; 32R2-NEXT: sllv $2, $24, $2 -; 32R2-NEXT: or $5, $2, $3 -; 32R2-NEXT: srav $24, $4, $9 -; 32R2-NEXT: movn $5, $24, $13 +; 32R2-NEXT: sltiu $15, $3, 64 +; 32R2-NEXT: movn $1, $11, $15 +; 32R2-NEXT: movz $1, $7, $3 +; 32R2-NEXT: srav $7, $4, $3 +; 32R2-NEXT: srlv $5, $5, $3 ; 32R2-NEXT: sra $2, $4, 31 -; 32R2-NEXT: movz $1, $7, $9 -; 32R2-NEXT: move $3, $2 -; 32R2-NEXT: movn $3, $5, $25 -; 32R2-NEXT: movn $15, $11, $12 -; 32R2-NEXT: movn $10, $zero, $13 -; 32R2-NEXT: or $4, $10, $15 ; 32R2-NEXT: movn $8, $2, $14 -; 32R2-NEXT: movn $8, $4, $25 -; 32R2-NEXT: movz $8, $6, $9 -; 32R2-NEXT: movn $24, $2, $13 -; 32R2-NEXT: movn $2, $24, $25 +; 32R2-NEXT: movn $8, $10, $15 +; 32R2-NEXT: sllv $4, $13, $9 +; 32R2-NEXT: movz $8, $6, $3 +; 32R2-NEXT: or $4, $4, $5 +; 32R2-NEXT: movn $4, $7, $12 +; 32R2-NEXT: move $3, $2 +; 32R2-NEXT: movn $3, $4, $15 +; 32R2-NEXT: movn $7, $2, $12 +; 32R2-NEXT: movn $2, $7, $15 ; 32R2-NEXT: move $4, $8 ; 32R2-NEXT: jr $ra ; 32R2-NEXT: move $5, $1 @@ -621,93 +621,94 @@ ; 32R6-NEXT: subu $1, $1, $3 ; 32R6-NEXT: sllv $2, $5, $1 ; 32R6-NEXT: andi $8, $1, 32 -; 32R6-NEXT: selnez $9, $2, $8 -; 32R6-NEXT: sllv $10, $4, $1 +; 32R6-NEXT: sllv $9, $4, $1 ; 32R6-NEXT: not $1, $1 -; 32R6-NEXT: srl $11, $5, 1 -; 32R6-NEXT: srlv $1, $11, $1 -; 32R6-NEXT: or $1, $10, $1 -; 32R6-NEXT: seleqz $1, $1, $8 +; 32R6-NEXT: srl $10, $5, 1 +; 32R6-NEXT: srlv $1, $10, $1 ; 32R6-NEXT: or $1, $9, $1 -; 32R6-NEXT: srlv $9, $7, $3 -; 32R6-NEXT: not $10, $3 -; 32R6-NEXT: sll $11, $6, 1 -; 32R6-NEXT: sllv $11, $11, $10 -; 32R6-NEXT: or $9, $11, $9 -; 32R6-NEXT: andi $11, $3, 32 -; 32R6-NEXT: seleqz $9, $9, $11 -; 32R6-NEXT: srlv $12, $6, $3 -; 32R6-NEXT: selnez $13, $12, $11 -; 32R6-NEXT: seleqz $12, $12, $11 -; 32R6-NEXT: or $1, $12, $1 -; 32R6-NEXT: seleqz $2, $2, $8 -; 32R6-NEXT: or $8, $13, $9 -; 32R6-NEXT: addiu $9, $3, -64 -; 32R6-NEXT: srlv $12, $5, $9 -; 32R6-NEXT: sll $13, $4, 1 -; 32R6-NEXT: not $14, $9 -; 32R6-NEXT: sllv $14, $13, $14 -; 32R6-NEXT: sltiu $15, $3, 64 -; 32R6-NEXT: or $2, $8, $2 -; 32R6-NEXT: selnez $1, $1, $15 -; 32R6-NEXT: or $8, $14, $12 -; 32R6-NEXT: srav $12, $4, $9 -; 32R6-NEXT: andi $9, $9, 32 -; 32R6-NEXT: seleqz $14, $12, $9 -; 32R6-NEXT: sra $24, $4, 31 -; 32R6-NEXT: selnez $25, $24, $9 -; 32R6-NEXT: seleqz $8, $8, $9 -; 32R6-NEXT: or $14, $25, $14 -; 32R6-NEXT: seleqz $14, $14, $15 -; 32R6-NEXT: selnez $9, $12, $9 -; 32R6-NEXT: seleqz $12, $24, $15 -; 32R6-NEXT: or $1, $1, $14 -; 32R6-NEXT: selnez $14, $1, $3 -; 32R6-NEXT: selnez $1, $2, $15 -; 32R6-NEXT: or $2, $9, $8 -; 32R6-NEXT: srav $8, $4, $3 -; 32R6-NEXT: seleqz $4, $8, $11 -; 32R6-NEXT: selnez $9, $24, $11 -; 32R6-NEXT: or $4, $9, $4 -; 32R6-NEXT: selnez $9, $4, $15 -; 32R6-NEXT: seleqz $2, $2, $15 -; 32R6-NEXT: seleqz $4, $6, $3 -; 32R6-NEXT: 
seleqz $6, $7, $3 -; 32R6-NEXT: or $1, $1, $2 +; 32R6-NEXT: seleqz $9, $2, $8 +; 32R6-NEXT: srlv $10, $7, $3 +; 32R6-NEXT: not $11, $3 +; 32R6-NEXT: sll $12, $6, 1 +; 32R6-NEXT: sllv $12, $12, $11 +; 32R6-NEXT: or $10, $12, $10 +; 32R6-NEXT: andi $12, $3, 32 +; 32R6-NEXT: seleqz $10, $10, $12 +; 32R6-NEXT: srlv $13, $6, $3 +; 32R6-NEXT: selnez $14, $13, $12 +; 32R6-NEXT: or $10, $14, $10 +; 32R6-NEXT: or $9, $10, $9 +; 32R6-NEXT: selnez $2, $2, $8 +; 32R6-NEXT: seleqz $1, $1, $8 +; 32R6-NEXT: addiu $8, $3, -64 +; 32R6-NEXT: srlv $10, $5, $8 +; 32R6-NEXT: sll $14, $4, 1 +; 32R6-NEXT: not $15, $8 +; 32R6-NEXT: sllv $15, $14, $15 +; 32R6-NEXT: sltiu $24, $3, 64 +; 32R6-NEXT: selnez $9, $9, $24 +; 32R6-NEXT: or $1, $2, $1 +; 32R6-NEXT: or $2, $15, $10 +; 32R6-NEXT: andi $10, $8, 32 +; 32R6-NEXT: seleqz $2, $2, $10 +; 32R6-NEXT: srav $8, $4, $8 +; 32R6-NEXT: selnez $15, $8, $10 +; 32R6-NEXT: or $2, $15, $2 +; 32R6-NEXT: seleqz $2, $2, $24 +; 32R6-NEXT: srav $15, $4, $3 +; 32R6-NEXT: selnez $25, $1, $12 +; 32R6-NEXT: or $1, $1, $13 +; 32R6-NEXT: or $2, $9, $2 +; 32R6-NEXT: seleqz $9, $15, $12 +; 32R6-NEXT: sra $4, $4, 31 +; 32R6-NEXT: selnez $13, $4, $12 +; 32R6-NEXT: or $9, $13, $9 +; 32R6-NEXT: seleqz $13, $4, $24 +; 32R6-NEXT: selnez $2, $2, $3 +; 32R6-NEXT: selnez $9, $9, $24 +; 32R6-NEXT: seleqz $7, $7, $3 +; 32R6-NEXT: seleqz $6, $6, $3 +; 32R6-NEXT: seleqz $1, $1, $12 +; 32R6-NEXT: or $1, $25, $1 +; 32R6-NEXT: seleqz $8, $8, $10 +; 32R6-NEXT: selnez $4, $4, $10 +; 32R6-NEXT: selnez $1, $1, $24 +; 32R6-NEXT: or $4, $4, $8 +; 32R6-NEXT: seleqz $4, $4, $24 +; 32R6-NEXT: or $1, $1, $4 ; 32R6-NEXT: selnez $1, $1, $3 -; 32R6-NEXT: or $1, $6, $1 -; 32R6-NEXT: or $4, $4, $14 -; 32R6-NEXT: or $2, $9, $12 +; 32R6-NEXT: or $4, $6, $1 +; 32R6-NEXT: or $1, $7, $2 +; 32R6-NEXT: or $2, $9, $13 ; 32R6-NEXT: srlv $3, $5, $3 -; 32R6-NEXT: sllv $5, $13, $10 +; 32R6-NEXT: sllv $5, $14, $11 ; 32R6-NEXT: or $3, $5, $3 -; 32R6-NEXT: seleqz $3, $3, $11 -; 32R6-NEXT: selnez $5, $8, $11 +; 32R6-NEXT: seleqz $3, $3, $12 +; 32R6-NEXT: selnez $5, $15, $12 ; 32R6-NEXT: or $3, $5, $3 -; 32R6-NEXT: selnez $3, $3, $15 -; 32R6-NEXT: or $3, $3, $12 +; 32R6-NEXT: selnez $3, $3, $24 +; 32R6-NEXT: or $3, $3, $13 ; 32R6-NEXT: jr $ra ; 32R6-NEXT: move $5, $1 ; ; MIPS3-LABEL: ashr_i128: ; MIPS3: # %bb.0: # %entry ; MIPS3-NEXT: sll $2, $7, 0 -; MIPS3-NEXT: andi $1, $2, 64 -; MIPS3-NEXT: bnez $1, .LBB5_2 +; MIPS3-NEXT: andi $1, $2, 64 +; MIPS3-NEXT: bnez $1, .LBB5_2 ; MIPS3-NEXT: dsrav $3, $4, $7 -; MIPS3-NEXT: # %bb.1: +; MIPS3-NEXT: # %bb.1: # %entry ; MIPS3-NEXT: dsrlv $1, $5, $7 -; MIPS3-NEXT: dsll $4, $4, 1 +; MIPS3-NEXT: dsll $4, $4, 1 ; MIPS3-NEXT: not $2, $2 ; MIPS3-NEXT: dsllv $2, $4, $2 -; MIPS3-NEXT: or $1, $2, $1 -; MIPS3-NEXT: move $2, $3 -; MIPS3-NEXT: jr $ra -; MIPS3-NEXT: move $3, $1 +; MIPS3-NEXT: or $1, $2, $1 +; MIPS3-NEXT: move $2, $3 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: move $3, $1 ; MIPS3-NEXT: .LBB5_2: -; MIPS3-NEXT: jr $ra -; MIPS3-NEXT: dsra $2, $4, 63 - +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: dsra $2, $4, 63 ; ; MIPS64-LABEL: ashr_i128: ; MIPS64: # %bb.0: # %entry @@ -761,88 +762,86 @@ ; ; MMR3-LABEL: ashr_i128: ; MMR3: # %bb.0: # %entry -; MMR3-NEXT: addiusp -48 -; MMR3-NEXT: .cfi_def_cfa_offset 48 -; MMR3-NEXT: swp $16, 40($sp) +; MMR3-NEXT: addiusp -32 +; MMR3-NEXT: .cfi_def_cfa_offset 32 +; MMR3-NEXT: swp $16, 24($sp) ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 ; MMR3-NEXT: move $8, $7 -; MMR3-NEXT: sw $6, 32($sp) # 4-byte Folded Spill -; MMR3-NEXT: sw $5, 36($sp) # 4-byte Folded 
Spill -; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: lw $16, 76($sp) -; MMR3-NEXT: srlv $4, $7, $16 -; MMR3-NEXT: not16 $3, $16 -; MMR3-NEXT: sw $3, 24($sp) # 4-byte Folded Spill -; MMR3-NEXT: sll16 $2, $6, 1 -; MMR3-NEXT: sllv $3, $2, $3 -; MMR3-NEXT: li16 $2, 64 -; MMR3-NEXT: or16 $3, $4 -; MMR3-NEXT: srlv $6, $6, $16 -; MMR3-NEXT: sw $6, 12($sp) # 4-byte Folded Spill -; MMR3-NEXT: subu16 $7, $2, $16 -; MMR3-NEXT: sllv $9, $5, $7 -; MMR3-NEXT: andi16 $2, $7, 32 -; MMR3-NEXT: sw $2, 28($sp) # 4-byte Folded Spill -; MMR3-NEXT: andi16 $5, $16, 32 -; MMR3-NEXT: sw $5, 16($sp) # 4-byte Folded Spill -; MMR3-NEXT: move $4, $9 -; MMR3-NEXT: li16 $17, 0 -; MMR3-NEXT: movn $4, $17, $2 -; MMR3-NEXT: movn $3, $6, $5 -; MMR3-NEXT: addiu $2, $16, -64 -; MMR3-NEXT: lw $5, 36($sp) # 4-byte Folded Reload -; MMR3-NEXT: srlv $5, $5, $2 -; MMR3-NEXT: sw $5, 20($sp) # 4-byte Folded Spill -; MMR3-NEXT: lw $17, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: sll16 $6, $17, 1 ; MMR3-NEXT: sw $6, 4($sp) # 4-byte Folded Spill -; MMR3-NEXT: not16 $5, $2 -; MMR3-NEXT: sllv $5, $6, $5 -; MMR3-NEXT: or16 $3, $4 -; MMR3-NEXT: lw $4, 20($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 $5, $4 -; MMR3-NEXT: srav $1, $17, $2 -; MMR3-NEXT: andi16 $2, $2, 32 -; MMR3-NEXT: sw $2, 20($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $5, $1, $2 -; MMR3-NEXT: sllv $2, $17, $7 -; MMR3-NEXT: not16 $4, $7 -; MMR3-NEXT: lw $7, 36($sp) # 4-byte Folded Reload -; MMR3-NEXT: srl16 $6, $7, 1 -; MMR3-NEXT: srlv $6, $6, $4 -; MMR3-NEXT: sltiu $10, $16, 64 -; MMR3-NEXT: movn $5, $3, $10 -; MMR3-NEXT: or16 $6, $2 -; MMR3-NEXT: srlv $2, $7, $16 -; MMR3-NEXT: lw $3, 24($sp) # 4-byte Folded Reload -; MMR3-NEXT: lw $4, 4($sp) # 4-byte Folded Reload -; MMR3-NEXT: sllv $3, $4, $3 -; MMR3-NEXT: or16 $3, $2 -; MMR3-NEXT: srav $11, $17, $16 -; MMR3-NEXT: lw $4, 16($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $3, $11, $4 -; MMR3-NEXT: sra $2, $17, 31 -; MMR3-NEXT: movz $5, $8, $16 -; MMR3-NEXT: move $8, $2 -; MMR3-NEXT: movn $8, $3, $10 -; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $6, $9, $3 -; MMR3-NEXT: li16 $3, 0 +; MMR3-NEXT: move $17, $5 +; MMR3-NEXT: sw $5, 12($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $4, 16($sp) # 4-byte Folded Spill +; MMR3-NEXT: lw $3, 60($sp) +; MMR3-NEXT: srlv $4, $7, $3 +; MMR3-NEXT: not16 $7, $3 +; MMR3-NEXT: sw $7, 8($sp) # 4-byte Folded Spill +; MMR3-NEXT: sll16 $2, $6, 1 +; MMR3-NEXT: sllv $2, $2, $7 +; MMR3-NEXT: li16 $7, 64 +; MMR3-NEXT: or16 $2, $4 +; MMR3-NEXT: srlv $5, $6, $3 +; MMR3-NEXT: subu16 $16, $7, $3 +; MMR3-NEXT: sllv $1, $17, $16 +; MMR3-NEXT: andi16 $4, $16, 32 +; MMR3-NEXT: andi16 $17, $3, 32 +; MMR3-NEXT: sw $17, 20($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $7, $1 +; MMR3-NEXT: li16 $6, 0 +; MMR3-NEXT: movn $7, $6, $4 +; MMR3-NEXT: move $6, $4 +; MMR3-NEXT: movn $2, $5, $17 +; MMR3-NEXT: lw $17, 16($sp) # 4-byte Folded Reload +; MMR3-NEXT: sllv $17, $17, $16 +; MMR3-NEXT: sw $17, 0($sp) # 4-byte Folded Spill +; MMR3-NEXT: not16 $16, $16 +; MMR3-NEXT: lw $4, 12($sp) # 4-byte Folded Reload +; MMR3-NEXT: srl16 $17, $4, 1 +; MMR3-NEXT: srlv $16, $17, $16 +; MMR3-NEXT: lw $17, 0($sp) # 4-byte Folded Reload +; MMR3-NEXT: or16 $16, $17 +; MMR3-NEXT: movn $16, $1, $6 +; MMR3-NEXT: or16 $5, $16 +; MMR3-NEXT: addiu $6, $3, -64 +; MMR3-NEXT: lw $17, 20($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $5, $16, $17 +; MMR3-NEXT: or16 $2, $7 +; MMR3-NEXT: srlv $7, $4, $6 +; MMR3-NEXT: lw $17, 16($sp) # 4-byte Folded Reload +; MMR3-NEXT: sll16 $4, $17, 1 +; MMR3-NEXT: sw $4, 
0($sp) # 4-byte Folded Spill +; MMR3-NEXT: not16 $16, $6 +; MMR3-NEXT: sllv $16, $4, $16 +; MMR3-NEXT: or16 $16, $7 +; MMR3-NEXT: srav $1, $17, $6 +; MMR3-NEXT: andi16 $6, $6, 32 +; MMR3-NEXT: movn $16, $1, $6 +; MMR3-NEXT: sltiu $9, $3, 64 +; MMR3-NEXT: movn $16, $2, $9 +; MMR3-NEXT: movz $16, $8, $3 +; MMR3-NEXT: move $2, $17 +; MMR3-NEXT: srav $8, $17, $3 ; MMR3-NEXT: lw $7, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $7, $3, $4 -; MMR3-NEXT: or16 $7, $6 -; MMR3-NEXT: lw $3, 20($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $1, $2, $3 -; MMR3-NEXT: movn $1, $7, $10 -; MMR3-NEXT: lw $3, 32($sp) # 4-byte Folded Reload -; MMR3-NEXT: movz $1, $3, $16 -; MMR3-NEXT: movn $11, $2, $4 -; MMR3-NEXT: movn $2, $11, $10 -; MMR3-NEXT: move $3, $8 +; MMR3-NEXT: srlv $17, $7, $3 +; MMR3-NEXT: sra $2, $2, 31 +; MMR3-NEXT: movn $1, $2, $6 +; MMR3-NEXT: movn $1, $5, $9 +; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $5, 0($sp) # 4-byte Folded Reload +; MMR3-NEXT: sllv $4, $5, $4 +; MMR3-NEXT: lw $5, 4($sp) # 4-byte Folded Reload +; MMR3-NEXT: movz $1, $5, $3 +; MMR3-NEXT: or16 $4, $17 +; MMR3-NEXT: lw $5, 20($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $4, $8, $5 +; MMR3-NEXT: move $3, $2 +; MMR3-NEXT: movn $3, $4, $9 +; MMR3-NEXT: movn $8, $2, $5 +; MMR3-NEXT: movn $2, $8, $9 ; MMR3-NEXT: move $4, $1 -; MMR3-NEXT: lwp $16, 40($sp) -; MMR3-NEXT: addiusp 48 +; MMR3-NEXT: move $5, $16 +; MMR3-NEXT: lwp $16, 24($sp) +; MMR3-NEXT: addiusp 32 ; MMR3-NEXT: jrc $ra ; ; MMR6-LABEL: ashr_i128: @@ -854,75 +853,83 @@ ; MMR6-NEXT: .cfi_offset 17, -4 ; MMR6-NEXT: .cfi_offset 16, -8 ; MMR6-NEXT: move $1, $7 +; MMR6-NEXT: move $17, $6 +; MMR6-NEXT: move $6, $5 +; MMR6-NEXT: move $5, $4 ; MMR6-NEXT: lw $3, 44($sp) ; MMR6-NEXT: li16 $2, 64 ; MMR6-NEXT: subu16 $7, $2, $3 -; MMR6-NEXT: sllv $8, $5, $7 +; MMR6-NEXT: sllv $8, $6, $7 ; MMR6-NEXT: andi16 $2, $7, 32 -; MMR6-NEXT: selnez $9, $8, $2 -; MMR6-NEXT: sllv $10, $4, $7 +; MMR6-NEXT: sllv $9, $4, $7 ; MMR6-NEXT: not16 $7, $7 -; MMR6-NEXT: srl16 $16, $5, 1 +; MMR6-NEXT: srl16 $16, $6, 1 ; MMR6-NEXT: srlv $7, $16, $7 -; MMR6-NEXT: or $7, $10, $7 -; MMR6-NEXT: seleqz $7, $7, $2 ; MMR6-NEXT: or $7, $9, $7 -; MMR6-NEXT: srlv $9, $1, $3 +; MMR6-NEXT: seleqz $9, $8, $2 +; MMR6-NEXT: srlv $10, $1, $3 ; MMR6-NEXT: not16 $16, $3 ; MMR6-NEXT: sw $16, 4($sp) # 4-byte Folded Spill -; MMR6-NEXT: sll16 $17, $6, 1 -; MMR6-NEXT: sllv $10, $17, $16 -; MMR6-NEXT: or $9, $10, $9 +; MMR6-NEXT: move $4, $17 +; MMR6-NEXT: sw $17, 0($sp) # 4-byte Folded Spill +; MMR6-NEXT: sll16 $17, $17, 1 +; MMR6-NEXT: sllv $11, $17, $16 +; MMR6-NEXT: or $10, $11, $10 ; MMR6-NEXT: andi16 $17, $3, 32 -; MMR6-NEXT: seleqz $9, $9, $17 -; MMR6-NEXT: srlv $10, $6, $3 -; MMR6-NEXT: selnez $11, $10, $17 ; MMR6-NEXT: seleqz $10, $10, $17 -; MMR6-NEXT: or $10, $10, $7 -; MMR6-NEXT: seleqz $12, $8, $2 -; MMR6-NEXT: or $8, $11, $9 +; MMR6-NEXT: srlv $11, $4, $3 +; MMR6-NEXT: selnez $12, $11, $17 +; MMR6-NEXT: or $10, $12, $10 +; MMR6-NEXT: or $9, $10, $9 +; MMR6-NEXT: selnez $8, $8, $2 +; MMR6-NEXT: seleqz $4, $7, $2 ; MMR6-NEXT: addiu $2, $3, -64 -; MMR6-NEXT: srlv $9, $5, $2 -; MMR6-NEXT: sll16 $7, $4, 1 +; MMR6-NEXT: srlv $10, $6, $2 +; MMR6-NEXT: sll16 $7, $5, 1 ; MMR6-NEXT: not16 $16, $2 -; MMR6-NEXT: sllv $11, $7, $16 +; MMR6-NEXT: sllv $12, $7, $16 ; MMR6-NEXT: sltiu $13, $3, 64 -; MMR6-NEXT: or $8, $8, $12 +; MMR6-NEXT: selnez $9, $9, $13 +; MMR6-NEXT: or $8, $8, $4 +; MMR6-NEXT: or $10, $12, $10 +; MMR6-NEXT: andi16 $4, $2, 32 +; MMR6-NEXT: seleqz $10, $10, $4 +; MMR6-NEXT: srav $2, 
$5, $2 +; MMR6-NEXT: selnez $12, $2, $4 +; MMR6-NEXT: or $10, $12, $10 +; MMR6-NEXT: seleqz $10, $10, $13 +; MMR6-NEXT: srav $12, $5, $3 +; MMR6-NEXT: selnez $14, $8, $17 +; MMR6-NEXT: or $8, $8, $11 +; MMR6-NEXT: or $9, $9, $10 +; MMR6-NEXT: seleqz $10, $12, $17 +; MMR6-NEXT: sra $5, $5, 31 +; MMR6-NEXT: selnez $11, $5, $17 +; MMR6-NEXT: or $10, $11, $10 +; MMR6-NEXT: seleqz $11, $5, $13 +; MMR6-NEXT: selnez $9, $9, $3 ; MMR6-NEXT: selnez $10, $10, $13 -; MMR6-NEXT: or $9, $11, $9 -; MMR6-NEXT: srav $11, $4, $2 -; MMR6-NEXT: andi16 $2, $2, 32 -; MMR6-NEXT: seleqz $12, $11, $2 -; MMR6-NEXT: sra $14, $4, 31 -; MMR6-NEXT: selnez $15, $14, $2 -; MMR6-NEXT: seleqz $9, $9, $2 -; MMR6-NEXT: or $12, $15, $12 -; MMR6-NEXT: seleqz $12, $12, $13 -; MMR6-NEXT: selnez $2, $11, $2 -; MMR6-NEXT: seleqz $11, $14, $13 -; MMR6-NEXT: or $10, $10, $12 -; MMR6-NEXT: selnez $10, $10, $3 -; MMR6-NEXT: selnez $8, $8, $13 -; MMR6-NEXT: or $2, $2, $9 -; MMR6-NEXT: srav $9, $4, $3 -; MMR6-NEXT: seleqz $4, $9, $17 -; MMR6-NEXT: selnez $12, $14, $17 -; MMR6-NEXT: or $4, $12, $4 -; MMR6-NEXT: selnez $12, $4, $13 -; MMR6-NEXT: seleqz $2, $2, $13 -; MMR6-NEXT: seleqz $4, $6, $3 ; MMR6-NEXT: seleqz $1, $1, $3 -; MMR6-NEXT: or $2, $8, $2 +; MMR6-NEXT: lw $16, 0($sp) # 4-byte Folded Reload +; MMR6-NEXT: seleqz $15, $16, $3 +; MMR6-NEXT: seleqz $8, $8, $17 +; MMR6-NEXT: or $8, $14, $8 +; MMR6-NEXT: seleqz $2, $2, $4 +; MMR6-NEXT: selnez $4, $5, $4 +; MMR6-NEXT: selnez $5, $8, $13 +; MMR6-NEXT: or $2, $4, $2 +; MMR6-NEXT: seleqz $2, $2, $13 +; MMR6-NEXT: or $2, $5, $2 ; MMR6-NEXT: selnez $2, $2, $3 -; MMR6-NEXT: or $1, $1, $2 -; MMR6-NEXT: or $4, $4, $10 -; MMR6-NEXT: or $2, $12, $11 -; MMR6-NEXT: srlv $3, $5, $3 +; MMR6-NEXT: or $4, $15, $2 +; MMR6-NEXT: or $1, $1, $9 +; MMR6-NEXT: or $2, $10, $11 +; MMR6-NEXT: srlv $3, $6, $3 ; MMR6-NEXT: lw $5, 4($sp) # 4-byte Folded Reload ; MMR6-NEXT: sllv $5, $7, $5 ; MMR6-NEXT: or $3, $5, $3 ; MMR6-NEXT: seleqz $3, $3, $17 -; MMR6-NEXT: selnez $5, $9, $17 +; MMR6-NEXT: selnez $5, $12, $17 ; MMR6-NEXT: or $3, $5, $3 ; MMR6-NEXT: selnez $3, $3, $13 ; MMR6-NEXT: or $3, $3, $11 @@ -931,6 +938,7 @@ ; MMR6-NEXT: lw $17, 12($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 16 ; MMR6-NEXT: jrc $ra + entry: ; o32 shouldn't use TImode helpers. 
; GP32-NOT: lw $25, %call16(__ashrti3)($gp) diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll --- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll @@ -398,83 +398,83 @@ ; MIPS2: # %bb.0: # %entry ; MIPS2-NEXT: lw $2, 28($sp) ; MIPS2-NEXT: addiu $1, $zero, 64 -; MIPS2-NEXT: subu $12, $1, $2 -; MIPS2-NEXT: sllv $10, $5, $12 -; MIPS2-NEXT: andi $15, $12, 32 -; MIPS2-NEXT: andi $8, $2, 32 -; MIPS2-NEXT: addiu $3, $zero, 0 -; MIPS2-NEXT: bnez $15, $BB5_2 -; MIPS2-NEXT: addiu $13, $zero, 0 +; MIPS2-NEXT: subu $3, $1, $2 +; MIPS2-NEXT: sllv $12, $5, $3 +; MIPS2-NEXT: andi $14, $3, 32 +; MIPS2-NEXT: bnez $14, $BB5_2 +; MIPS2-NEXT: move $11, $12 ; MIPS2-NEXT: # %bb.1: # %entry -; MIPS2-NEXT: move $13, $10 +; MIPS2-NEXT: sllv $1, $4, $3 +; MIPS2-NEXT: not $3, $3 +; MIPS2-NEXT: srl $8, $5, 1 +; MIPS2-NEXT: srlv $3, $8, $3 +; MIPS2-NEXT: or $11, $1, $3 ; MIPS2-NEXT: $BB5_2: # %entry -; MIPS2-NEXT: not $9, $2 -; MIPS2-NEXT: bnez $8, $BB5_5 -; MIPS2-NEXT: srlv $24, $6, $2 +; MIPS2-NEXT: not $8, $2 +; MIPS2-NEXT: srlv $15, $6, $2 +; MIPS2-NEXT: andi $9, $2, 32 +; MIPS2-NEXT: bnez $9, $BB5_5 +; MIPS2-NEXT: addiu $3, $zero, 0 ; MIPS2-NEXT: # %bb.3: # %entry ; MIPS2-NEXT: sll $1, $6, 1 -; MIPS2-NEXT: srlv $11, $7, $2 -; MIPS2-NEXT: sllv $1, $1, $9 -; MIPS2-NEXT: or $14, $1, $11 -; MIPS2-NEXT: bnez $15, $BB5_7 -; MIPS2-NEXT: move $11, $24 +; MIPS2-NEXT: srlv $10, $7, $2 +; MIPS2-NEXT: sllv $1, $1, $8 +; MIPS2-NEXT: or $10, $1, $10 +; MIPS2-NEXT: move $13, $15 +; MIPS2-NEXT: bnez $14, $BB5_7 +; MIPS2-NEXT: addiu $15, $2, -64 ; MIPS2-NEXT: # %bb.4: # %entry ; MIPS2-NEXT: b $BB5_6 ; MIPS2-NEXT: nop ; MIPS2-NEXT: $BB5_5: -; MIPS2-NEXT: addiu $11, $zero, 0 -; MIPS2-NEXT: bnez $15, $BB5_7 -; MIPS2-NEXT: move $14, $24 +; MIPS2-NEXT: addiu $13, $zero, 0 +; MIPS2-NEXT: move $10, $15 +; MIPS2-NEXT: bnez $14, $BB5_7 +; MIPS2-NEXT: addiu $15, $2, -64 ; MIPS2-NEXT: $BB5_6: # %entry -; MIPS2-NEXT: sllv $1, $4, $12 -; MIPS2-NEXT: not $10, $12 -; MIPS2-NEXT: srl $12, $5, 1 -; MIPS2-NEXT: srlv $10, $12, $10 -; MIPS2-NEXT: or $10, $1, $10 +; MIPS2-NEXT: or $10, $10, $12 ; MIPS2-NEXT: $BB5_7: # %entry -; MIPS2-NEXT: addiu $15, $2, -64 ; MIPS2-NEXT: sll $12, $4, 1 ; MIPS2-NEXT: andi $1, $15, 32 ; MIPS2-NEXT: bnez $1, $BB5_10 -; MIPS2-NEXT: srlv $25, $4, $15 +; MIPS2-NEXT: srlv $24, $4, $15 ; MIPS2-NEXT: # %bb.8: # %entry ; MIPS2-NEXT: srlv $1, $5, $15 -; MIPS2-NEXT: not $15, $15 -; MIPS2-NEXT: sllv $15, $12, $15 -; MIPS2-NEXT: or $24, $15, $1 -; MIPS2-NEXT: move $15, $25 -; MIPS2-NEXT: sltiu $25, $2, 64 -; MIPS2-NEXT: beqz $25, $BB5_12 -; MIPS2-NEXT: nop +; MIPS2-NEXT: not $14, $15 +; MIPS2-NEXT: sllv $14, $12, $14 +; MIPS2-NEXT: move $15, $24 +; MIPS2-NEXT: sltiu $24, $2, 64 +; MIPS2-NEXT: beqz $24, $BB5_12 +; MIPS2-NEXT: or $14, $14, $1 ; MIPS2-NEXT: # %bb.9: # %entry ; MIPS2-NEXT: b $BB5_11 ; MIPS2-NEXT: nop ; MIPS2-NEXT: $BB5_10: -; MIPS2-NEXT: move $24, $25 -; MIPS2-NEXT: sltiu $25, $2, 64 -; MIPS2-NEXT: beqz $25, $BB5_12 +; MIPS2-NEXT: move $14, $24 +; MIPS2-NEXT: sltiu $24, $2, 64 +; MIPS2-NEXT: beqz $24, $BB5_12 ; MIPS2-NEXT: addiu $15, $zero, 0 ; MIPS2-NEXT: $BB5_11: -; MIPS2-NEXT: or $24, $14, $13 +; MIPS2-NEXT: or $15, $13, $11 ; MIPS2-NEXT: $BB5_12: # %entry -; MIPS2-NEXT: sltiu $13, $2, 1 -; MIPS2-NEXT: beqz $13, $BB5_19 +; MIPS2-NEXT: sltiu $11, $2, 1 +; MIPS2-NEXT: beqz $11, $BB5_19 ; MIPS2-NEXT: nop ; MIPS2-NEXT: # %bb.13: # %entry -; MIPS2-NEXT: bnez $25, $BB5_20 +; MIPS2-NEXT: beqz $24, $BB5_20 ; MIPS2-NEXT: nop ; MIPS2-NEXT: 
$BB5_14: # %entry -; MIPS2-NEXT: bnez $13, $BB5_16 -; MIPS2-NEXT: addiu $10, $zero, 63 +; MIPS2-NEXT: bnez $11, $BB5_16 +; MIPS2-NEXT: addiu $13, $zero, 63 ; MIPS2-NEXT: $BB5_15: # %entry -; MIPS2-NEXT: move $6, $15 +; MIPS2-NEXT: move $7, $10 ; MIPS2-NEXT: $BB5_16: # %entry -; MIPS2-NEXT: sltu $10, $10, $2 -; MIPS2-NEXT: bnez $8, $BB5_22 +; MIPS2-NEXT: sltu $10, $13, $2 +; MIPS2-NEXT: bnez $9, $BB5_22 ; MIPS2-NEXT: srlv $11, $4, $2 ; MIPS2-NEXT: # %bb.17: # %entry ; MIPS2-NEXT: srlv $1, $5, $2 -; MIPS2-NEXT: sllv $2, $12, $9 +; MIPS2-NEXT: sllv $2, $12, $8 ; MIPS2-NEXT: or $4, $2, $1 ; MIPS2-NEXT: move $5, $11 ; MIPS2-NEXT: bnez $10, $BB5_24 @@ -483,13 +483,13 @@ ; MIPS2-NEXT: b $BB5_23 ; MIPS2-NEXT: nop ; MIPS2-NEXT: $BB5_19: # %entry -; MIPS2-NEXT: beqz $25, $BB5_14 -; MIPS2-NEXT: move $7, $24 -; MIPS2-NEXT: $BB5_20: -; MIPS2-NEXT: or $15, $11, $10 -; MIPS2-NEXT: bnez $13, $BB5_16 -; MIPS2-NEXT: addiu $10, $zero, 63 -; MIPS2-NEXT: # %bb.21: +; MIPS2-NEXT: bnez $24, $BB5_14 +; MIPS2-NEXT: move $6, $15 +; MIPS2-NEXT: $BB5_20: # %entry +; MIPS2-NEXT: move $10, $14 +; MIPS2-NEXT: bnez $11, $BB5_16 +; MIPS2-NEXT: addiu $13, $zero, 63 +; MIPS2-NEXT: # %bb.21: # %entry ; MIPS2-NEXT: b $BB5_15 ; MIPS2-NEXT: nop ; MIPS2-NEXT: $BB5_22: @@ -511,184 +511,181 @@ ; ; MIPS32-LABEL: lshr_i128: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: lw $9, 28($sp) -; MIPS32-NEXT: addiu $1, $zero, 64 -; MIPS32-NEXT: subu $2, $1, $9 -; MIPS32-NEXT: sllv $10, $5, $2 -; MIPS32-NEXT: andi $11, $2, 32 -; MIPS32-NEXT: move $1, $10 -; MIPS32-NEXT: movn $1, $zero, $11 -; MIPS32-NEXT: srlv $3, $7, $9 -; MIPS32-NEXT: not $12, $9 -; MIPS32-NEXT: sll $8, $6, 1 -; MIPS32-NEXT: sllv $8, $8, $12 -; MIPS32-NEXT: or $3, $8, $3 -; MIPS32-NEXT: srlv $13, $6, $9 -; MIPS32-NEXT: andi $14, $9, 32 -; MIPS32-NEXT: movn $3, $13, $14 -; MIPS32-NEXT: addiu $15, $9, -64 -; MIPS32-NEXT: or $3, $3, $1 -; MIPS32-NEXT: srlv $1, $5, $15 -; MIPS32-NEXT: sll $24, $4, 1 -; MIPS32-NEXT: not $8, $15 -; MIPS32-NEXT: sllv $8, $24, $8 +; MIPS32-NEXT: lw $3, 28($sp) +; MIPS32-NEXT: srlv $1, $7, $3 +; MIPS32-NEXT: not $9, $3 +; MIPS32-NEXT: sll $2, $6, 1 +; MIPS32-NEXT: sllv $2, $2, $9 +; MIPS32-NEXT: addiu $8, $zero, 64 +; MIPS32-NEXT: or $1, $2, $1 +; MIPS32-NEXT: srlv $2, $6, $3 +; MIPS32-NEXT: subu $8, $8, $3 +; MIPS32-NEXT: sllv $10, $5, $8 +; MIPS32-NEXT: andi $11, $8, 32 +; MIPS32-NEXT: andi $12, $3, 32 +; MIPS32-NEXT: move $13, $10 +; MIPS32-NEXT: movn $13, $zero, $11 +; MIPS32-NEXT: movn $1, $2, $12 +; MIPS32-NEXT: sllv $14, $4, $8 +; MIPS32-NEXT: not $8, $8 +; MIPS32-NEXT: srl $15, $5, 1 +; MIPS32-NEXT: srlv $8, $15, $8 +; MIPS32-NEXT: or $8, $14, $8 +; MIPS32-NEXT: movn $8, $10, $11 +; MIPS32-NEXT: or $10, $8, $2 +; MIPS32-NEXT: addiu $2, $3, -64 +; MIPS32-NEXT: movn $10, $8, $12 +; MIPS32-NEXT: or $11, $1, $13 +; MIPS32-NEXT: srlv $1, $5, $2 +; MIPS32-NEXT: sll $13, $4, 1 +; MIPS32-NEXT: not $8, $2 +; MIPS32-NEXT: sllv $8, $13, $8 ; MIPS32-NEXT: or $1, $8, $1 -; MIPS32-NEXT: srlv $8, $4, $15 -; MIPS32-NEXT: andi $15, $15, 32 -; MIPS32-NEXT: movn $1, $8, $15 -; MIPS32-NEXT: sltiu $25, $9, 64 -; MIPS32-NEXT: movn $1, $3, $25 -; MIPS32-NEXT: sllv $3, $4, $2 -; MIPS32-NEXT: not $2, $2 -; MIPS32-NEXT: srl $gp, $5, 1 -; MIPS32-NEXT: srlv $2, $gp, $2 -; MIPS32-NEXT: or $gp, $3, $2 -; MIPS32-NEXT: srlv $2, $5, $9 -; MIPS32-NEXT: sllv $3, $24, $12 -; MIPS32-NEXT: or $3, $3, $2 -; MIPS32-NEXT: srlv $2, $4, $9 -; MIPS32-NEXT: movn $3, $2, $14 -; MIPS32-NEXT: movz $1, $7, $9 -; MIPS32-NEXT: movz $3, $zero, $25 -; MIPS32-NEXT: movn $gp, $10, $11 -; 
MIPS32-NEXT: movn $13, $zero, $14 -; MIPS32-NEXT: or $4, $13, $gp -; MIPS32-NEXT: movn $8, $zero, $15 -; MIPS32-NEXT: movn $8, $4, $25 -; MIPS32-NEXT: movz $8, $6, $9 -; MIPS32-NEXT: movn $2, $zero, $14 -; MIPS32-NEXT: movz $2, $zero, $25 +; MIPS32-NEXT: srlv $8, $4, $2 +; MIPS32-NEXT: andi $14, $2, 32 +; MIPS32-NEXT: movn $1, $8, $14 +; MIPS32-NEXT: sltiu $15, $3, 64 +; MIPS32-NEXT: movn $1, $11, $15 +; MIPS32-NEXT: movz $1, $7, $3 +; MIPS32-NEXT: srlv $2, $4, $3 +; MIPS32-NEXT: srlv $4, $5, $3 +; MIPS32-NEXT: movn $8, $zero, $14 +; MIPS32-NEXT: movn $8, $10, $15 +; MIPS32-NEXT: sllv $5, $13, $9 +; MIPS32-NEXT: movz $8, $6, $3 +; MIPS32-NEXT: or $3, $5, $4 +; MIPS32-NEXT: movn $3, $2, $12 +; MIPS32-NEXT: movz $3, $zero, $15 +; MIPS32-NEXT: movn $2, $zero, $12 +; MIPS32-NEXT: movz $2, $zero, $15 ; MIPS32-NEXT: move $4, $8 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: move $5, $1 ; ; MIPS32R2-LABEL: lshr_i128: ; MIPS32R2: # %bb.0: # %entry -; MIPS32R2-NEXT: lw $9, 28($sp) -; MIPS32R2-NEXT: addiu $1, $zero, 64 -; MIPS32R2-NEXT: subu $2, $1, $9 -; MIPS32R2-NEXT: sllv $10, $5, $2 -; MIPS32R2-NEXT: andi $11, $2, 32 -; MIPS32R2-NEXT: move $1, $10 -; MIPS32R2-NEXT: movn $1, $zero, $11 -; MIPS32R2-NEXT: srlv $3, $7, $9 -; MIPS32R2-NEXT: not $12, $9 -; MIPS32R2-NEXT: sll $8, $6, 1 -; MIPS32R2-NEXT: sllv $8, $8, $12 -; MIPS32R2-NEXT: or $3, $8, $3 -; MIPS32R2-NEXT: srlv $13, $6, $9 -; MIPS32R2-NEXT: andi $14, $9, 32 -; MIPS32R2-NEXT: movn $3, $13, $14 -; MIPS32R2-NEXT: addiu $15, $9, -64 -; MIPS32R2-NEXT: or $3, $3, $1 -; MIPS32R2-NEXT: srlv $1, $5, $15 -; MIPS32R2-NEXT: sll $24, $4, 1 -; MIPS32R2-NEXT: not $8, $15 -; MIPS32R2-NEXT: sllv $8, $24, $8 +; MIPS32R2-NEXT: lw $3, 28($sp) +; MIPS32R2-NEXT: srlv $1, $7, $3 +; MIPS32R2-NEXT: not $9, $3 +; MIPS32R2-NEXT: sll $2, $6, 1 +; MIPS32R2-NEXT: sllv $2, $2, $9 +; MIPS32R2-NEXT: addiu $8, $zero, 64 +; MIPS32R2-NEXT: or $1, $2, $1 +; MIPS32R2-NEXT: srlv $2, $6, $3 +; MIPS32R2-NEXT: subu $8, $8, $3 +; MIPS32R2-NEXT: sllv $10, $5, $8 +; MIPS32R2-NEXT: andi $11, $8, 32 +; MIPS32R2-NEXT: andi $12, $3, 32 +; MIPS32R2-NEXT: move $13, $10 +; MIPS32R2-NEXT: movn $13, $zero, $11 +; MIPS32R2-NEXT: movn $1, $2, $12 +; MIPS32R2-NEXT: sllv $14, $4, $8 +; MIPS32R2-NEXT: not $8, $8 +; MIPS32R2-NEXT: srl $15, $5, 1 +; MIPS32R2-NEXT: srlv $8, $15, $8 +; MIPS32R2-NEXT: or $8, $14, $8 +; MIPS32R2-NEXT: movn $8, $10, $11 +; MIPS32R2-NEXT: or $10, $8, $2 +; MIPS32R2-NEXT: addiu $2, $3, -64 +; MIPS32R2-NEXT: movn $10, $8, $12 +; MIPS32R2-NEXT: or $11, $1, $13 +; MIPS32R2-NEXT: srlv $1, $5, $2 +; MIPS32R2-NEXT: sll $13, $4, 1 +; MIPS32R2-NEXT: not $8, $2 +; MIPS32R2-NEXT: sllv $8, $13, $8 ; MIPS32R2-NEXT: or $1, $8, $1 -; MIPS32R2-NEXT: srlv $8, $4, $15 -; MIPS32R2-NEXT: andi $15, $15, 32 -; MIPS32R2-NEXT: movn $1, $8, $15 -; MIPS32R2-NEXT: sltiu $25, $9, 64 -; MIPS32R2-NEXT: movn $1, $3, $25 -; MIPS32R2-NEXT: sllv $3, $4, $2 -; MIPS32R2-NEXT: not $2, $2 -; MIPS32R2-NEXT: srl $gp, $5, 1 -; MIPS32R2-NEXT: srlv $2, $gp, $2 -; MIPS32R2-NEXT: or $gp, $3, $2 -; MIPS32R2-NEXT: srlv $2, $5, $9 -; MIPS32R2-NEXT: sllv $3, $24, $12 -; MIPS32R2-NEXT: or $3, $3, $2 -; MIPS32R2-NEXT: srlv $2, $4, $9 -; MIPS32R2-NEXT: movn $3, $2, $14 -; MIPS32R2-NEXT: movz $1, $7, $9 -; MIPS32R2-NEXT: movz $3, $zero, $25 -; MIPS32R2-NEXT: movn $gp, $10, $11 -; MIPS32R2-NEXT: movn $13, $zero, $14 -; MIPS32R2-NEXT: or $4, $13, $gp -; MIPS32R2-NEXT: movn $8, $zero, $15 -; MIPS32R2-NEXT: movn $8, $4, $25 -; MIPS32R2-NEXT: movz $8, $6, $9 -; MIPS32R2-NEXT: movn $2, $zero, $14 -; MIPS32R2-NEXT: movz $2, $zero, $25 +; 
MIPS32R2-NEXT: srlv $8, $4, $2 +; MIPS32R2-NEXT: andi $14, $2, 32 +; MIPS32R2-NEXT: movn $1, $8, $14 +; MIPS32R2-NEXT: sltiu $15, $3, 64 +; MIPS32R2-NEXT: movn $1, $11, $15 +; MIPS32R2-NEXT: movz $1, $7, $3 +; MIPS32R2-NEXT: srlv $2, $4, $3 +; MIPS32R2-NEXT: srlv $4, $5, $3 +; MIPS32R2-NEXT: movn $8, $zero, $14 +; MIPS32R2-NEXT: movn $8, $10, $15 +; MIPS32R2-NEXT: sllv $5, $13, $9 +; MIPS32R2-NEXT: movz $8, $6, $3 +; MIPS32R2-NEXT: or $3, $5, $4 +; MIPS32R2-NEXT: movn $3, $2, $12 +; MIPS32R2-NEXT: movz $3, $zero, $15 +; MIPS32R2-NEXT: movn $2, $zero, $12 +; MIPS32R2-NEXT: movz $2, $zero, $15 ; MIPS32R2-NEXT: move $4, $8 ; MIPS32R2-NEXT: jr $ra ; MIPS32R2-NEXT: move $5, $1 ; ; MIPS32R6-LABEL: lshr_i128: ; MIPS32R6: # %bb.0: # %entry -; MIPS32R6-NEXT: addiu $sp, $sp, -8 -; MIPS32R6-NEXT: .cfi_def_cfa_offset 8 -; MIPS32R6-NEXT: sw $16, 4($sp) # 4-byte Folded Spill -; MIPS32R6-NEXT: .cfi_offset 16, -4 -; MIPS32R6-NEXT: lw $1, 36($sp) -; MIPS32R6-NEXT: srlv $2, $7, $1 -; MIPS32R6-NEXT: not $3, $1 -; MIPS32R6-NEXT: sll $8, $6, 1 -; MIPS32R6-NEXT: sllv $8, $8, $3 -; MIPS32R6-NEXT: or $2, $8, $2 -; MIPS32R6-NEXT: addiu $8, $1, -64 -; MIPS32R6-NEXT: srlv $9, $5, $8 -; MIPS32R6-NEXT: sll $10, $4, 1 -; MIPS32R6-NEXT: not $11, $8 -; MIPS32R6-NEXT: sllv $11, $10, $11 -; MIPS32R6-NEXT: andi $12, $1, 32 -; MIPS32R6-NEXT: seleqz $2, $2, $12 -; MIPS32R6-NEXT: or $9, $11, $9 -; MIPS32R6-NEXT: srlv $11, $6, $1 -; MIPS32R6-NEXT: selnez $13, $11, $12 -; MIPS32R6-NEXT: addiu $14, $zero, 64 -; MIPS32R6-NEXT: subu $14, $14, $1 -; MIPS32R6-NEXT: sllv $15, $5, $14 -; MIPS32R6-NEXT: andi $24, $14, 32 -; MIPS32R6-NEXT: andi $25, $8, 32 -; MIPS32R6-NEXT: seleqz $9, $9, $25 -; MIPS32R6-NEXT: seleqz $gp, $15, $24 -; MIPS32R6-NEXT: or $2, $13, $2 -; MIPS32R6-NEXT: selnez $13, $15, $24 -; MIPS32R6-NEXT: sllv $15, $4, $14 -; MIPS32R6-NEXT: not $14, $14 -; MIPS32R6-NEXT: srl $16, $5, 1 -; MIPS32R6-NEXT: srlv $14, $16, $14 -; MIPS32R6-NEXT: or $14, $15, $14 -; MIPS32R6-NEXT: seleqz $14, $14, $24 +; MIPS32R6-NEXT: lw $3, 28($sp) +; MIPS32R6-NEXT: addiu $1, $zero, 64 +; MIPS32R6-NEXT: subu $1, $1, $3 +; MIPS32R6-NEXT: sllv $2, $5, $1 +; MIPS32R6-NEXT: andi $8, $1, 32 +; MIPS32R6-NEXT: sllv $9, $4, $1 +; MIPS32R6-NEXT: not $1, $1 +; MIPS32R6-NEXT: srl $10, $5, 1 +; MIPS32R6-NEXT: srlv $1, $10, $1 +; MIPS32R6-NEXT: or $1, $9, $1 +; MIPS32R6-NEXT: seleqz $9, $2, $8 +; MIPS32R6-NEXT: srlv $10, $7, $3 +; MIPS32R6-NEXT: not $11, $3 +; MIPS32R6-NEXT: sll $12, $6, 1 +; MIPS32R6-NEXT: sllv $12, $12, $11 +; MIPS32R6-NEXT: or $10, $12, $10 +; MIPS32R6-NEXT: andi $12, $3, 32 +; MIPS32R6-NEXT: seleqz $10, $10, $12 +; MIPS32R6-NEXT: srlv $13, $6, $3 +; MIPS32R6-NEXT: selnez $14, $13, $12 +; MIPS32R6-NEXT: or $10, $14, $10 +; MIPS32R6-NEXT: or $9, $10, $9 +; MIPS32R6-NEXT: selnez $2, $2, $8 +; MIPS32R6-NEXT: seleqz $1, $1, $8 +; MIPS32R6-NEXT: addiu $8, $3, -64 +; MIPS32R6-NEXT: srlv $10, $5, $8 +; MIPS32R6-NEXT: sll $14, $4, 1 +; MIPS32R6-NEXT: not $15, $8 +; MIPS32R6-NEXT: sllv $15, $14, $15 +; MIPS32R6-NEXT: sltiu $24, $3, 64 +; MIPS32R6-NEXT: selnez $9, $9, $24 +; MIPS32R6-NEXT: or $1, $2, $1 +; MIPS32R6-NEXT: or $2, $15, $10 +; MIPS32R6-NEXT: andi $10, $8, 32 +; MIPS32R6-NEXT: seleqz $2, $2, $10 ; MIPS32R6-NEXT: srlv $8, $4, $8 -; MIPS32R6-NEXT: or $13, $13, $14 -; MIPS32R6-NEXT: or $2, $2, $gp -; MIPS32R6-NEXT: srlv $5, $5, $1 -; MIPS32R6-NEXT: selnez $14, $8, $25 -; MIPS32R6-NEXT: sltiu $15, $1, 64 -; MIPS32R6-NEXT: selnez $2, $2, $15 -; MIPS32R6-NEXT: or $9, $14, $9 -; MIPS32R6-NEXT: sllv $3, $10, $3 -; MIPS32R6-NEXT: seleqz $10, 
$11, $12 -; MIPS32R6-NEXT: or $10, $10, $13 -; MIPS32R6-NEXT: or $3, $3, $5 -; MIPS32R6-NEXT: seleqz $5, $9, $15 -; MIPS32R6-NEXT: seleqz $9, $zero, $15 -; MIPS32R6-NEXT: srlv $4, $4, $1 -; MIPS32R6-NEXT: seleqz $11, $4, $12 -; MIPS32R6-NEXT: selnez $11, $11, $15 -; MIPS32R6-NEXT: seleqz $7, $7, $1 -; MIPS32R6-NEXT: or $2, $2, $5 -; MIPS32R6-NEXT: selnez $2, $2, $1 -; MIPS32R6-NEXT: or $5, $7, $2 -; MIPS32R6-NEXT: or $2, $9, $11 +; MIPS32R6-NEXT: selnez $15, $8, $10 +; MIPS32R6-NEXT: or $2, $15, $2 +; MIPS32R6-NEXT: seleqz $2, $2, $24 +; MIPS32R6-NEXT: srlv $15, $4, $3 +; MIPS32R6-NEXT: selnez $4, $1, $12 +; MIPS32R6-NEXT: or $1, $1, $13 +; MIPS32R6-NEXT: or $2, $9, $2 +; MIPS32R6-NEXT: seleqz $9, $15, $12 +; MIPS32R6-NEXT: seleqz $13, $zero, $24 +; MIPS32R6-NEXT: selnez $2, $2, $3 +; MIPS32R6-NEXT: selnez $9, $9, $24 +; MIPS32R6-NEXT: seleqz $7, $7, $3 +; MIPS32R6-NEXT: seleqz $6, $6, $3 +; MIPS32R6-NEXT: seleqz $1, $1, $12 +; MIPS32R6-NEXT: or $1, $4, $1 +; MIPS32R6-NEXT: seleqz $4, $8, $10 +; MIPS32R6-NEXT: selnez $1, $1, $24 +; MIPS32R6-NEXT: seleqz $4, $4, $24 +; MIPS32R6-NEXT: or $1, $1, $4 +; MIPS32R6-NEXT: selnez $1, $1, $3 +; MIPS32R6-NEXT: or $4, $6, $1 +; MIPS32R6-NEXT: or $1, $7, $2 +; MIPS32R6-NEXT: or $2, $13, $9 +; MIPS32R6-NEXT: srlv $3, $5, $3 +; MIPS32R6-NEXT: sllv $5, $14, $11 +; MIPS32R6-NEXT: or $3, $5, $3 ; MIPS32R6-NEXT: seleqz $3, $3, $12 -; MIPS32R6-NEXT: selnez $7, $4, $12 -; MIPS32R6-NEXT: seleqz $4, $6, $1 -; MIPS32R6-NEXT: selnez $6, $10, $15 -; MIPS32R6-NEXT: seleqz $8, $8, $25 -; MIPS32R6-NEXT: seleqz $8, $8, $15 -; MIPS32R6-NEXT: or $6, $6, $8 -; MIPS32R6-NEXT: selnez $1, $6, $1 -; MIPS32R6-NEXT: or $4, $4, $1 -; MIPS32R6-NEXT: or $1, $7, $3 -; MIPS32R6-NEXT: selnez $1, $1, $15 -; MIPS32R6-NEXT: or $3, $9, $1 -; MIPS32R6-NEXT: lw $16, 4($sp) # 4-byte Folded Reload +; MIPS32R6-NEXT: selnez $5, $15, $12 +; MIPS32R6-NEXT: or $3, $5, $3 +; MIPS32R6-NEXT: selnez $3, $3, $24 +; MIPS32R6-NEXT: or $3, $13, $3 ; MIPS32R6-NEXT: jr $ra -; MIPS32R6-NEXT: addiu $sp, $sp, 8 +; MIPS32R6-NEXT: move $5, $1 ; ; MIPS3-LABEL: lshr_i128: ; MIPS3: # %bb.0: # %entry @@ -770,183 +767,178 @@ ; ; MMR3-LABEL: lshr_i128: ; MMR3: # %bb.0: # %entry -; MMR3-NEXT: addiusp -40 -; MMR3-NEXT: .cfi_def_cfa_offset 40 -; MMR3-NEXT: swp $16, 32($sp) +; MMR3-NEXT: addiusp -32 +; MMR3-NEXT: .cfi_def_cfa_offset 32 +; MMR3-NEXT: swp $16, 24($sp) ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 ; MMR3-NEXT: move $8, $7 -; MMR3-NEXT: sw $6, 24($sp) # 4-byte Folded Spill -; MMR3-NEXT: sw $4, 28($sp) # 4-byte Folded Spill -; MMR3-NEXT: lw $16, 68($sp) -; MMR3-NEXT: li16 $2, 64 -; MMR3-NEXT: subu16 $7, $2, $16 -; MMR3-NEXT: sllv $9, $5, $7 -; MMR3-NEXT: move $17, $5 -; MMR3-NEXT: sw $5, 0($sp) # 4-byte Folded Spill -; MMR3-NEXT: andi16 $3, $7, 32 +; MMR3-NEXT: move $2, $6 +; MMR3-NEXT: move $6, $5 +; MMR3-NEXT: sw $5, 12($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $4, 16($sp) # 4-byte Folded Spill +; MMR3-NEXT: lw $17, 60($sp) +; MMR3-NEXT: srlv $3, $7, $17 +; MMR3-NEXT: not16 $7, $17 +; MMR3-NEXT: sw $7, 8($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $4, $2 +; MMR3-NEXT: sw $2, 4($sp) # 4-byte Folded Spill +; MMR3-NEXT: sll16 $2, $2, 1 +; MMR3-NEXT: sllv $2, $2, $7 +; MMR3-NEXT: li16 $7, 64 +; MMR3-NEXT: or16 $2, $3 +; MMR3-NEXT: srlv $5, $4, $17 +; MMR3-NEXT: subu16 $16, $7, $17 +; MMR3-NEXT: sllv $1, $6, $16 +; MMR3-NEXT: andi16 $4, $16, 32 +; MMR3-NEXT: andi16 $3, $17, 32 ; MMR3-NEXT: sw $3, 20($sp) # 4-byte Folded Spill -; MMR3-NEXT: li16 $2, 0 -; MMR3-NEXT: move $4, $9 -; MMR3-NEXT: 
movn $4, $2, $3 -; MMR3-NEXT: srlv $5, $8, $16 -; MMR3-NEXT: not16 $3, $16 -; MMR3-NEXT: sw $3, 16($sp) # 4-byte Folded Spill -; MMR3-NEXT: sll16 $2, $6, 1 -; MMR3-NEXT: sllv $2, $2, $3 -; MMR3-NEXT: or16 $2, $5 -; MMR3-NEXT: srlv $5, $6, $16 -; MMR3-NEXT: sw $5, 4($sp) # 4-byte Folded Spill -; MMR3-NEXT: andi16 $3, $16, 32 -; MMR3-NEXT: sw $3, 12($sp) # 4-byte Folded Spill +; MMR3-NEXT: li16 $7, 0 +; MMR3-NEXT: move $6, $1 +; MMR3-NEXT: movn $6, $7, $4 ; MMR3-NEXT: movn $2, $5, $3 -; MMR3-NEXT: addiu $3, $16, -64 -; MMR3-NEXT: or16 $2, $4 -; MMR3-NEXT: srlv $4, $17, $3 -; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: lw $4, 28($sp) # 4-byte Folded Reload -; MMR3-NEXT: sll16 $6, $4, 1 -; MMR3-NEXT: not16 $5, $3 -; MMR3-NEXT: sllv $5, $6, $5 -; MMR3-NEXT: lw $17, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 $5, $17 -; MMR3-NEXT: srlv $1, $4, $3 -; MMR3-NEXT: andi16 $3, $3, 32 -; MMR3-NEXT: sw $3, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $5, $1, $3 -; MMR3-NEXT: sltiu $10, $16, 64 -; MMR3-NEXT: movn $5, $2, $10 -; MMR3-NEXT: sllv $2, $4, $7 -; MMR3-NEXT: not16 $3, $7 -; MMR3-NEXT: lw $7, 0($sp) # 4-byte Folded Reload -; MMR3-NEXT: srl16 $4, $7, 1 -; MMR3-NEXT: srlv $4, $4, $3 -; MMR3-NEXT: or16 $4, $2 -; MMR3-NEXT: srlv $2, $7, $16 ; MMR3-NEXT: lw $3, 16($sp) # 4-byte Folded Reload -; MMR3-NEXT: sllv $3, $6, $3 -; MMR3-NEXT: or16 $3, $2 -; MMR3-NEXT: lw $2, 28($sp) # 4-byte Folded Reload -; MMR3-NEXT: srlv $2, $2, $16 -; MMR3-NEXT: lw $17, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $3, $2, $17 -; MMR3-NEXT: movz $5, $8, $16 -; MMR3-NEXT: li16 $6, 0 -; MMR3-NEXT: movz $3, $6, $10 -; MMR3-NEXT: lw $7, 20($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $4, $9, $7 -; MMR3-NEXT: lw $6, 4($sp) # 4-byte Folded Reload -; MMR3-NEXT: li16 $7, 0 -; MMR3-NEXT: movn $6, $7, $17 -; MMR3-NEXT: or16 $6, $4 -; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $1, $7, $4 -; MMR3-NEXT: li16 $7, 0 -; MMR3-NEXT: movn $1, $6, $10 -; MMR3-NEXT: lw $4, 24($sp) # 4-byte Folded Reload -; MMR3-NEXT: movz $1, $4, $16 -; MMR3-NEXT: movn $2, $7, $17 +; MMR3-NEXT: sllv $3, $3, $16 +; MMR3-NEXT: sw $3, 0($sp) # 4-byte Folded Spill +; MMR3-NEXT: not16 $16, $16 +; MMR3-NEXT: lw $3, 12($sp) # 4-byte Folded Reload +; MMR3-NEXT: srl16 $7, $3, 1 +; MMR3-NEXT: srlv $7, $7, $16 +; MMR3-NEXT: lw $16, 0($sp) # 4-byte Folded Reload +; MMR3-NEXT: or16 $7, $16 +; MMR3-NEXT: movn $7, $1, $4 +; MMR3-NEXT: or16 $5, $7 +; MMR3-NEXT: addiu $4, $17, -64 +; MMR3-NEXT: lw $16, 20($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $5, $7, $16 +; MMR3-NEXT: or16 $2, $6 +; MMR3-NEXT: srlv $7, $3, $4 +; MMR3-NEXT: lw $6, 16($sp) # 4-byte Folded Reload +; MMR3-NEXT: sll16 $3, $6, 1 +; MMR3-NEXT: sw $3, 0($sp) # 4-byte Folded Spill +; MMR3-NEXT: not16 $16, $4 +; MMR3-NEXT: sllv $16, $3, $16 +; MMR3-NEXT: or16 $16, $7 +; MMR3-NEXT: srlv $1, $6, $4 +; MMR3-NEXT: andi16 $4, $4, 32 +; MMR3-NEXT: movn $16, $1, $4 +; MMR3-NEXT: sltiu $7, $17, 64 +; MMR3-NEXT: movn $16, $2, $7 +; MMR3-NEXT: movz $16, $8, $17 +; MMR3-NEXT: srlv $2, $6, $17 +; MMR3-NEXT: lw $3, 12($sp) # 4-byte Folded Reload +; MMR3-NEXT: srlv $6, $3, $17 +; MMR3-NEXT: li16 $3, 0 +; MMR3-NEXT: movn $1, $3, $4 +; MMR3-NEXT: movn $1, $5, $7 +; MMR3-NEXT: lw $3, 8($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $4, 0($sp) # 4-byte Folded Reload +; MMR3-NEXT: sllv $3, $4, $3 +; MMR3-NEXT: lw $4, 4($sp) # 4-byte Folded Reload +; MMR3-NEXT: movz $1, $4, $17 +; MMR3-NEXT: or16 $3, $6 +; MMR3-NEXT: lw $5, 20($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $3, 
$2, $5 +; MMR3-NEXT: li16 $4, 0 +; MMR3-NEXT: movz $3, $4, $7 +; MMR3-NEXT: movn $2, $4, $5 ; MMR3-NEXT: li16 $4, 0 -; MMR3-NEXT: movz $2, $4, $10 +; MMR3-NEXT: movz $2, $4, $7 ; MMR3-NEXT: move $4, $1 -; MMR3-NEXT: lwp $16, 32($sp) -; MMR3-NEXT: addiusp 40 +; MMR3-NEXT: move $5, $16 +; MMR3-NEXT: lwp $16, 24($sp) +; MMR3-NEXT: addiusp 32 ; MMR3-NEXT: jrc $ra ; ; MMR6-LABEL: lshr_i128: ; MMR6: # %bb.0: # %entry -; MMR6-NEXT: addiu $sp, $sp, -32 -; MMR6-NEXT: .cfi_def_cfa_offset 32 -; MMR6-NEXT: sw $17, 28($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 24($sp) # 4-byte Folded Spill +; MMR6-NEXT: addiu $sp, $sp, -16 +; MMR6-NEXT: .cfi_def_cfa_offset 16 +; MMR6-NEXT: sw $17, 12($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $16, 8($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 17, -4 ; MMR6-NEXT: .cfi_offset 16, -8 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: move $7, $5 -; MMR6-NEXT: lw $3, 60($sp) -; MMR6-NEXT: srlv $2, $1, $3 -; MMR6-NEXT: not16 $5, $3 -; MMR6-NEXT: sw $5, 12($sp) # 4-byte Folded Spill ; MMR6-NEXT: move $17, $6 -; MMR6-NEXT: sw $6, 16($sp) # 4-byte Folded Spill -; MMR6-NEXT: sll16 $6, $6, 1 -; MMR6-NEXT: sllv $6, $6, $5 -; MMR6-NEXT: or $8, $6, $2 -; MMR6-NEXT: addiu $5, $3, -64 -; MMR6-NEXT: srlv $9, $7, $5 -; MMR6-NEXT: move $6, $4 -; MMR6-NEXT: sll16 $2, $4, 1 -; MMR6-NEXT: sw $2, 8($sp) # 4-byte Folded Spill -; MMR6-NEXT: not16 $16, $5 -; MMR6-NEXT: sllv $10, $2, $16 -; MMR6-NEXT: andi16 $16, $3, 32 -; MMR6-NEXT: seleqz $8, $8, $16 +; MMR6-NEXT: move $6, $5 +; MMR6-NEXT: move $5, $4 +; MMR6-NEXT: lw $3, 44($sp) +; MMR6-NEXT: li16 $2, 64 +; MMR6-NEXT: subu16 $7, $2, $3 +; MMR6-NEXT: sllv $8, $6, $7 +; MMR6-NEXT: andi16 $2, $7, 32 +; MMR6-NEXT: sllv $9, $4, $7 +; MMR6-NEXT: not16 $7, $7 +; MMR6-NEXT: srl16 $16, $6, 1 +; MMR6-NEXT: srlv $7, $16, $7 +; MMR6-NEXT: or $7, $9, $7 +; MMR6-NEXT: seleqz $9, $8, $2 +; MMR6-NEXT: srlv $10, $1, $3 +; MMR6-NEXT: not16 $16, $3 +; MMR6-NEXT: sw $16, 4($sp) # 4-byte Folded Spill +; MMR6-NEXT: move $4, $17 +; MMR6-NEXT: sw $17, 0($sp) # 4-byte Folded Spill +; MMR6-NEXT: sll16 $17, $17, 1 +; MMR6-NEXT: sllv $11, $17, $16 +; MMR6-NEXT: or $10, $11, $10 +; MMR6-NEXT: andi16 $17, $3, 32 +; MMR6-NEXT: seleqz $10, $10, $17 +; MMR6-NEXT: srlv $11, $4, $3 +; MMR6-NEXT: selnez $12, $11, $17 +; MMR6-NEXT: or $10, $12, $10 ; MMR6-NEXT: or $9, $10, $9 -; MMR6-NEXT: srlv $10, $17, $3 -; MMR6-NEXT: selnez $11, $10, $16 -; MMR6-NEXT: li16 $17, 64 -; MMR6-NEXT: subu16 $2, $17, $3 -; MMR6-NEXT: sllv $12, $7, $2 -; MMR6-NEXT: move $17, $7 +; MMR6-NEXT: selnez $8, $8, $2 +; MMR6-NEXT: seleqz $4, $7, $2 +; MMR6-NEXT: addiu $2, $3, -64 +; MMR6-NEXT: srlv $10, $6, $2 +; MMR6-NEXT: sll16 $7, $5, 1 +; MMR6-NEXT: not16 $16, $2 +; MMR6-NEXT: sllv $12, $7, $16 +; MMR6-NEXT: sltiu $13, $3, 64 +; MMR6-NEXT: selnez $9, $9, $13 +; MMR6-NEXT: or $8, $8, $4 +; MMR6-NEXT: or $10, $12, $10 ; MMR6-NEXT: andi16 $4, $2, 32 -; MMR6-NEXT: andi16 $7, $5, 32 -; MMR6-NEXT: sw $7, 20($sp) # 4-byte Folded Spill -; MMR6-NEXT: seleqz $9, $9, $7 -; MMR6-NEXT: seleqz $13, $12, $4 -; MMR6-NEXT: or $8, $11, $8 -; MMR6-NEXT: selnez $11, $12, $4 -; MMR6-NEXT: sllv $12, $6, $2 -; MMR6-NEXT: move $7, $6 -; MMR6-NEXT: sw $6, 4($sp) # 4-byte Folded Spill -; MMR6-NEXT: not16 $2, $2 -; MMR6-NEXT: srl16 $6, $17, 1 -; MMR6-NEXT: srlv $2, $6, $2 -; MMR6-NEXT: or $2, $12, $2 -; MMR6-NEXT: seleqz $2, $2, $4 -; MMR6-NEXT: srlv $4, $7, $5 -; MMR6-NEXT: or $11, $11, $2 -; MMR6-NEXT: or $5, $8, $13 -; MMR6-NEXT: srlv $6, $17, $3 -; MMR6-NEXT: lw $2, 20($sp) # 4-byte Folded Reload -; MMR6-NEXT: selnez $7, 
$4, $2 -; MMR6-NEXT: sltiu $8, $3, 64 -; MMR6-NEXT: selnez $12, $5, $8 -; MMR6-NEXT: or $7, $7, $9 -; MMR6-NEXT: lw $5, 12($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $2, 8($sp) # 4-byte Folded Reload -; MMR6-NEXT: sllv $9, $2, $5 -; MMR6-NEXT: seleqz $10, $10, $16 -; MMR6-NEXT: li16 $5, 0 -; MMR6-NEXT: or $10, $10, $11 -; MMR6-NEXT: or $6, $9, $6 -; MMR6-NEXT: seleqz $2, $7, $8 -; MMR6-NEXT: seleqz $7, $5, $8 -; MMR6-NEXT: lw $5, 4($sp) # 4-byte Folded Reload -; MMR6-NEXT: srlv $9, $5, $3 -; MMR6-NEXT: seleqz $11, $9, $16 -; MMR6-NEXT: selnez $11, $11, $8 +; MMR6-NEXT: seleqz $10, $10, $4 +; MMR6-NEXT: srlv $2, $5, $2 +; MMR6-NEXT: selnez $12, $2, $4 +; MMR6-NEXT: or $10, $12, $10 +; MMR6-NEXT: seleqz $10, $10, $13 +; MMR6-NEXT: srlv $5, $5, $3 +; MMR6-NEXT: selnez $12, $8, $17 +; MMR6-NEXT: or $8, $8, $11 +; MMR6-NEXT: or $9, $9, $10 +; MMR6-NEXT: seleqz $10, $5, $17 +; MMR6-NEXT: li16 $16, 0 +; MMR6-NEXT: seleqz $11, $16, $13 +; MMR6-NEXT: selnez $9, $9, $3 +; MMR6-NEXT: selnez $10, $10, $13 ; MMR6-NEXT: seleqz $1, $1, $3 -; MMR6-NEXT: or $2, $12, $2 +; MMR6-NEXT: lw $16, 0($sp) # 4-byte Folded Reload +; MMR6-NEXT: seleqz $14, $16, $3 +; MMR6-NEXT: seleqz $8, $8, $17 +; MMR6-NEXT: or $8, $12, $8 +; MMR6-NEXT: seleqz $2, $2, $4 +; MMR6-NEXT: selnez $4, $8, $13 +; MMR6-NEXT: seleqz $2, $2, $13 +; MMR6-NEXT: or $2, $4, $2 ; MMR6-NEXT: selnez $2, $2, $3 -; MMR6-NEXT: or $5, $1, $2 -; MMR6-NEXT: or $2, $7, $11 -; MMR6-NEXT: seleqz $1, $6, $16 -; MMR6-NEXT: selnez $6, $9, $16 -; MMR6-NEXT: lw $16, 16($sp) # 4-byte Folded Reload -; MMR6-NEXT: seleqz $9, $16, $3 -; MMR6-NEXT: selnez $10, $10, $8 -; MMR6-NEXT: lw $16, 20($sp) # 4-byte Folded Reload -; MMR6-NEXT: seleqz $4, $4, $16 -; MMR6-NEXT: seleqz $4, $4, $8 -; MMR6-NEXT: or $4, $10, $4 -; MMR6-NEXT: selnez $3, $4, $3 -; MMR6-NEXT: or $4, $9, $3 -; MMR6-NEXT: or $1, $6, $1 -; MMR6-NEXT: selnez $1, $1, $8 -; MMR6-NEXT: or $3, $7, $1 -; MMR6-NEXT: lw $16, 24($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 28($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 32 +; MMR6-NEXT: or $4, $14, $2 +; MMR6-NEXT: or $1, $1, $9 +; MMR6-NEXT: or $2, $11, $10 +; MMR6-NEXT: srlv $3, $6, $3 +; MMR6-NEXT: lw $6, 4($sp) # 4-byte Folded Reload +; MMR6-NEXT: sllv $6, $7, $6 +; MMR6-NEXT: or $3, $6, $3 +; MMR6-NEXT: seleqz $3, $3, $17 +; MMR6-NEXT: selnez $5, $5, $17 +; MMR6-NEXT: or $3, $5, $3 +; MMR6-NEXT: selnez $3, $3, $13 +; MMR6-NEXT: or $3, $11, $3 +; MMR6-NEXT: move $5, $1 +; MMR6-NEXT: lw $16, 8($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $17, 12($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 16 ; MMR6-NEXT: jrc $ra entry: diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll --- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll @@ -456,237 +456,233 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) { ; MIPS2-LABEL: shl_i128: ; MIPS2: # %bb.0: # %entry -; MIPS2-NEXT: addiu $sp, $sp, -8 -; MIPS2-NEXT: .cfi_def_cfa_offset 8 -; MIPS2-NEXT: sw $17, 4($sp) # 4-byte Folded Spill -; MIPS2-NEXT: sw $16, 0($sp) # 4-byte Folded Spill -; MIPS2-NEXT: .cfi_offset 17, -4 -; MIPS2-NEXT: .cfi_offset 16, -8 -; MIPS2-NEXT: lw $8, 36($sp) +; MIPS2-NEXT: lw $2, 28($sp) ; MIPS2-NEXT: addiu $1, $zero, 64 -; MIPS2-NEXT: subu $3, $1, $8 -; MIPS2-NEXT: srlv $9, $6, $3 -; MIPS2-NEXT: andi $1, $3, 32 +; MIPS2-NEXT: subu $9, $1, $2 +; MIPS2-NEXT: srlv $3, $6, $9 +; MIPS2-NEXT: andi $1, $9, 32 ; MIPS2-NEXT: bnez $1, $BB5_2 -; MIPS2-NEXT: addiu $2, $zero, 0 +; MIPS2-NEXT: 
addiu $8, $zero, 0 ; MIPS2-NEXT: # %bb.1: # %entry -; MIPS2-NEXT: srlv $1, $7, $3 -; MIPS2-NEXT: not $3, $3 +; MIPS2-NEXT: srlv $1, $7, $9 +; MIPS2-NEXT: not $9, $9 ; MIPS2-NEXT: sll $10, $6, 1 -; MIPS2-NEXT: sllv $3, $10, $3 -; MIPS2-NEXT: or $3, $3, $1 +; MIPS2-NEXT: sllv $9, $10, $9 +; MIPS2-NEXT: or $10, $9, $1 ; MIPS2-NEXT: b $BB5_3 -; MIPS2-NEXT: move $15, $9 +; MIPS2-NEXT: move $13, $3 ; MIPS2-NEXT: $BB5_2: -; MIPS2-NEXT: addiu $15, $zero, 0 -; MIPS2-NEXT: move $3, $9 +; MIPS2-NEXT: addiu $13, $zero, 0 +; MIPS2-NEXT: move $10, $3 ; MIPS2-NEXT: $BB5_3: # %entry -; MIPS2-NEXT: not $13, $8 -; MIPS2-NEXT: sllv $9, $5, $8 -; MIPS2-NEXT: andi $10, $8, 32 -; MIPS2-NEXT: bnez $10, $BB5_5 -; MIPS2-NEXT: move $25, $9 +; MIPS2-NEXT: not $3, $2 +; MIPS2-NEXT: sllv $25, $5, $2 +; MIPS2-NEXT: andi $9, $2, 32 +; MIPS2-NEXT: bnez $9, $BB5_5 +; MIPS2-NEXT: move $24, $25 ; MIPS2-NEXT: # %bb.4: # %entry -; MIPS2-NEXT: sllv $1, $4, $8 -; MIPS2-NEXT: srl $11, $5, 1 -; MIPS2-NEXT: srlv $11, $11, $13 -; MIPS2-NEXT: or $25, $1, $11 +; MIPS2-NEXT: srl $1, $5, 1 +; MIPS2-NEXT: sllv $11, $4, $2 +; MIPS2-NEXT: srlv $1, $1, $3 +; MIPS2-NEXT: or $24, $11, $1 ; MIPS2-NEXT: $BB5_5: # %entry -; MIPS2-NEXT: addiu $14, $8, -64 -; MIPS2-NEXT: srl $24, $7, 1 -; MIPS2-NEXT: sllv $11, $7, $14 -; MIPS2-NEXT: andi $12, $14, 32 -; MIPS2-NEXT: bnez $12, $BB5_7 -; MIPS2-NEXT: move $gp, $11 +; MIPS2-NEXT: addiu $15, $2, -64 +; MIPS2-NEXT: srl $11, $7, 1 +; MIPS2-NEXT: sllv $12, $7, $15 +; MIPS2-NEXT: andi $14, $15, 32 +; MIPS2-NEXT: beqz $14, $BB5_19 +; MIPS2-NEXT: move $gp, $12 ; MIPS2-NEXT: # %bb.6: # %entry -; MIPS2-NEXT: sllv $1, $6, $14 -; MIPS2-NEXT: not $14, $14 -; MIPS2-NEXT: srlv $14, $24, $14 -; MIPS2-NEXT: or $gp, $1, $14 +; MIPS2-NEXT: beqz $9, $BB5_20 +; MIPS2-NEXT: sltiu $15, $2, 64 ; MIPS2-NEXT: $BB5_7: # %entry -; MIPS2-NEXT: sltiu $14, $8, 64 -; MIPS2-NEXT: beqz $14, $BB5_9 +; MIPS2-NEXT: bnez $15, $BB5_21 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB5_8: # %entry +; MIPS2-NEXT: sltiu $24, $2, 1 +; MIPS2-NEXT: bnez $24, $BB5_10 ; MIPS2-NEXT: nop -; MIPS2-NEXT: # %bb.8: -; MIPS2-NEXT: or $gp, $25, $15 ; MIPS2-NEXT: $BB5_9: # %entry -; MIPS2-NEXT: sllv $25, $7, $8 -; MIPS2-NEXT: bnez $10, $BB5_11 -; MIPS2-NEXT: addiu $17, $zero, 0 -; MIPS2-NEXT: # %bb.10: # %entry -; MIPS2-NEXT: move $17, $25 -; MIPS2-NEXT: $BB5_11: # %entry -; MIPS2-NEXT: addiu $1, $zero, 63 -; MIPS2-NEXT: sltiu $15, $8, 1 -; MIPS2-NEXT: beqz $15, $BB5_21 -; MIPS2-NEXT: sltu $16, $1, $8 -; MIPS2-NEXT: # %bb.12: # %entry -; MIPS2-NEXT: beqz $16, $BB5_22 +; MIPS2-NEXT: move $4, $gp +; MIPS2-NEXT: $BB5_10: # %entry +; MIPS2-NEXT: sllv $13, $7, $2 +; MIPS2-NEXT: beqz $9, $BB5_23 +; MIPS2-NEXT: addiu $25, $zero, 0 +; MIPS2-NEXT: # %bb.11: # %entry +; MIPS2-NEXT: beqz $14, $BB5_24 ; MIPS2-NEXT: addiu $7, $zero, 0 +; MIPS2-NEXT: $BB5_12: # %entry +; MIPS2-NEXT: beqz $15, $BB5_25 +; MIPS2-NEXT: addiu $12, $zero, 63 ; MIPS2-NEXT: $BB5_13: # %entry -; MIPS2-NEXT: beqz $10, $BB5_23 -; MIPS2-NEXT: nop +; MIPS2-NEXT: beqz $24, $BB5_26 +; MIPS2-NEXT: sltu $12, $12, $2 ; MIPS2-NEXT: $BB5_14: # %entry -; MIPS2-NEXT: beqz $16, $BB5_24 -; MIPS2-NEXT: addiu $6, $zero, 0 +; MIPS2-NEXT: beqz $12, $BB5_27 +; MIPS2-NEXT: addiu $7, $zero, 0 ; MIPS2-NEXT: $BB5_15: # %entry -; MIPS2-NEXT: beqz $10, $BB5_25 -; MIPS2-NEXT: addiu $8, $zero, 0 +; MIPS2-NEXT: beqz $9, $BB5_28 +; MIPS2-NEXT: nop ; MIPS2-NEXT: $BB5_16: # %entry -; MIPS2-NEXT: beqz $12, $BB5_26 +; MIPS2-NEXT: bnez $12, $BB5_18 ; MIPS2-NEXT: nop ; MIPS2-NEXT: $BB5_17: # %entry -; MIPS2-NEXT: bnez $14, $BB5_27 -; 
MIPS2-NEXT: nop +; MIPS2-NEXT: move $8, $13 ; MIPS2-NEXT: $BB5_18: # %entry -; MIPS2-NEXT: bnez $15, $BB5_20 -; MIPS2-NEXT: nop -; MIPS2-NEXT: $BB5_19: # %entry -; MIPS2-NEXT: move $5, $2 -; MIPS2-NEXT: $BB5_20: # %entry ; MIPS2-NEXT: move $2, $4 ; MIPS2-NEXT: move $3, $5 -; MIPS2-NEXT: move $4, $6 -; MIPS2-NEXT: move $5, $7 -; MIPS2-NEXT: lw $16, 0($sp) # 4-byte Folded Reload -; MIPS2-NEXT: lw $17, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: move $4, $8 ; MIPS2-NEXT: jr $ra -; MIPS2-NEXT: addiu $sp, $sp, 8 -; MIPS2-NEXT: $BB5_21: # %entry -; MIPS2-NEXT: move $4, $gp -; MIPS2-NEXT: bnez $16, $BB5_13 -; MIPS2-NEXT: addiu $7, $zero, 0 -; MIPS2-NEXT: $BB5_22: # %entry -; MIPS2-NEXT: bnez $10, $BB5_14 -; MIPS2-NEXT: move $7, $17 +; MIPS2-NEXT: move $5, $7 +; MIPS2-NEXT: $BB5_19: # %entry +; MIPS2-NEXT: sllv $1, $6, $15 +; MIPS2-NEXT: not $15, $15 +; MIPS2-NEXT: srlv $15, $11, $15 +; MIPS2-NEXT: or $gp, $1, $15 +; MIPS2-NEXT: bnez $9, $BB5_7 +; MIPS2-NEXT: sltiu $15, $2, 64 +; MIPS2-NEXT: $BB5_20: # %entry +; MIPS2-NEXT: beqz $15, $BB5_8 +; MIPS2-NEXT: or $10, $10, $25 +; MIPS2-NEXT: $BB5_21: +; MIPS2-NEXT: or $gp, $24, $13 +; MIPS2-NEXT: sltiu $24, $2, 1 +; MIPS2-NEXT: bnez $24, $BB5_10 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.22: +; MIPS2-NEXT: b $BB5_9 +; MIPS2-NEXT: nop ; MIPS2-NEXT: $BB5_23: # %entry -; MIPS2-NEXT: sllv $1, $6, $8 -; MIPS2-NEXT: srlv $6, $24, $13 -; MIPS2-NEXT: or $25, $1, $6 -; MIPS2-NEXT: bnez $16, $BB5_15 -; MIPS2-NEXT: addiu $6, $zero, 0 +; MIPS2-NEXT: move $25, $13 +; MIPS2-NEXT: bnez $14, $BB5_12 +; MIPS2-NEXT: addiu $7, $zero, 0 ; MIPS2-NEXT: $BB5_24: # %entry -; MIPS2-NEXT: move $6, $25 -; MIPS2-NEXT: bnez $10, $BB5_16 -; MIPS2-NEXT: addiu $8, $zero, 0 +; MIPS2-NEXT: move $7, $12 +; MIPS2-NEXT: bnez $15, $BB5_13 +; MIPS2-NEXT: addiu $12, $zero, 63 ; MIPS2-NEXT: $BB5_25: # %entry -; MIPS2-NEXT: bnez $12, $BB5_17 -; MIPS2-NEXT: move $8, $9 +; MIPS2-NEXT: move $10, $7 +; MIPS2-NEXT: bnez $24, $BB5_14 +; MIPS2-NEXT: sltu $12, $12, $2 ; MIPS2-NEXT: $BB5_26: # %entry -; MIPS2-NEXT: beqz $14, $BB5_18 -; MIPS2-NEXT: move $2, $11 -; MIPS2-NEXT: $BB5_27: -; MIPS2-NEXT: bnez $15, $BB5_20 -; MIPS2-NEXT: or $2, $8, $3 -; MIPS2-NEXT: # %bb.28: -; MIPS2-NEXT: b $BB5_19 +; MIPS2-NEXT: move $5, $10 +; MIPS2-NEXT: bnez $12, $BB5_15 +; MIPS2-NEXT: addiu $7, $zero, 0 +; MIPS2-NEXT: $BB5_27: # %entry +; MIPS2-NEXT: bnez $9, $BB5_16 +; MIPS2-NEXT: move $7, $25 +; MIPS2-NEXT: $BB5_28: # %entry +; MIPS2-NEXT: sllv $1, $6, $2 +; MIPS2-NEXT: srlv $2, $11, $3 +; MIPS2-NEXT: bnez $12, $BB5_18 +; MIPS2-NEXT: or $13, $1, $2 +; MIPS2-NEXT: # %bb.29: # %entry +; MIPS2-NEXT: b $BB5_17 ; MIPS2-NEXT: nop ; ; MIPS32-LABEL: shl_i128: ; MIPS32: # %bb.0: # %entry ; MIPS32-NEXT: lw $8, 28($sp) -; MIPS32-NEXT: addiu $1, $zero, 64 -; MIPS32-NEXT: subu $1, $1, $8 -; MIPS32-NEXT: srlv $9, $6, $1 -; MIPS32-NEXT: andi $10, $1, 32 -; MIPS32-NEXT: move $2, $9 -; MIPS32-NEXT: movn $2, $zero, $10 -; MIPS32-NEXT: sllv $3, $4, $8 -; MIPS32-NEXT: not $11, $8 -; MIPS32-NEXT: srl $12, $5, 1 -; MIPS32-NEXT: srlv $12, $12, $11 -; MIPS32-NEXT: or $3, $3, $12 -; MIPS32-NEXT: sllv $12, $5, $8 -; MIPS32-NEXT: andi $13, $8, 32 -; MIPS32-NEXT: movn $3, $12, $13 -; MIPS32-NEXT: addiu $14, $8, -64 -; MIPS32-NEXT: or $15, $3, $2 -; MIPS32-NEXT: sllv $2, $6, $14 -; MIPS32-NEXT: srl $24, $7, 1 -; MIPS32-NEXT: not $3, $14 -; MIPS32-NEXT: srlv $3, $24, $3 +; MIPS32-NEXT: sllv $1, $4, $8 +; MIPS32-NEXT: not $9, $8 +; MIPS32-NEXT: srl $2, $5, 1 +; MIPS32-NEXT: srlv $2, $2, $9 +; MIPS32-NEXT: addiu $3, $zero, 64 +; MIPS32-NEXT: or $1, 
$1, $2 +; MIPS32-NEXT: sllv $2, $5, $8 +; MIPS32-NEXT: subu $3, $3, $8 +; MIPS32-NEXT: srlv $10, $6, $3 +; MIPS32-NEXT: andi $11, $3, 32 +; MIPS32-NEXT: andi $12, $8, 32 +; MIPS32-NEXT: move $13, $10 +; MIPS32-NEXT: movn $13, $zero, $11 +; MIPS32-NEXT: movn $1, $2, $12 +; MIPS32-NEXT: srlv $14, $7, $3 +; MIPS32-NEXT: not $3, $3 +; MIPS32-NEXT: sll $15, $6, 1 +; MIPS32-NEXT: sllv $3, $15, $3 +; MIPS32-NEXT: or $3, $3, $14 +; MIPS32-NEXT: movn $3, $10, $11 +; MIPS32-NEXT: or $10, $3, $2 +; MIPS32-NEXT: addiu $11, $8, -64 +; MIPS32-NEXT: movn $10, $3, $12 +; MIPS32-NEXT: or $1, $1, $13 +; MIPS32-NEXT: sllv $2, $6, $11 +; MIPS32-NEXT: srl $13, $7, 1 +; MIPS32-NEXT: not $3, $11 +; MIPS32-NEXT: srlv $3, $13, $3 ; MIPS32-NEXT: or $2, $2, $3 -; MIPS32-NEXT: sllv $3, $7, $14 -; MIPS32-NEXT: andi $14, $14, 32 -; MIPS32-NEXT: movn $2, $3, $14 -; MIPS32-NEXT: sltiu $25, $8, 64 -; MIPS32-NEXT: movn $2, $15, $25 -; MIPS32-NEXT: srlv $15, $7, $1 -; MIPS32-NEXT: not $1, $1 -; MIPS32-NEXT: sll $gp, $6, 1 -; MIPS32-NEXT: sllv $1, $gp, $1 -; MIPS32-NEXT: or $15, $1, $15 -; MIPS32-NEXT: sllv $1, $6, $8 -; MIPS32-NEXT: srlv $6, $24, $11 -; MIPS32-NEXT: or $1, $1, $6 -; MIPS32-NEXT: sllv $6, $7, $8 -; MIPS32-NEXT: movn $1, $6, $13 +; MIPS32-NEXT: sllv $3, $7, $11 +; MIPS32-NEXT: andi $11, $11, 32 +; MIPS32-NEXT: movn $2, $3, $11 +; MIPS32-NEXT: sltiu $14, $8, 64 +; MIPS32-NEXT: movn $2, $1, $14 ; MIPS32-NEXT: movz $2, $4, $8 -; MIPS32-NEXT: movz $1, $zero, $25 -; MIPS32-NEXT: movn $15, $9, $10 -; MIPS32-NEXT: movn $12, $zero, $13 -; MIPS32-NEXT: or $4, $12, $15 -; MIPS32-NEXT: movn $3, $zero, $14 -; MIPS32-NEXT: movn $3, $4, $25 +; MIPS32-NEXT: sllv $1, $7, $8 +; MIPS32-NEXT: sllv $4, $6, $8 +; MIPS32-NEXT: movn $3, $zero, $11 +; MIPS32-NEXT: movn $3, $10, $14 +; MIPS32-NEXT: srlv $6, $13, $9 ; MIPS32-NEXT: movz $3, $5, $8 -; MIPS32-NEXT: movn $6, $zero, $13 -; MIPS32-NEXT: movz $6, $zero, $25 -; MIPS32-NEXT: move $4, $1 +; MIPS32-NEXT: or $4, $4, $6 +; MIPS32-NEXT: movn $4, $1, $12 +; MIPS32-NEXT: movz $4, $zero, $14 +; MIPS32-NEXT: movn $1, $zero, $12 +; MIPS32-NEXT: movz $1, $zero, $14 ; MIPS32-NEXT: jr $ra -; MIPS32-NEXT: move $5, $6 +; MIPS32-NEXT: move $5, $1 ; ; MIPS32R2-LABEL: shl_i128: ; MIPS32R2: # %bb.0: # %entry ; MIPS32R2-NEXT: lw $8, 28($sp) -; MIPS32R2-NEXT: addiu $1, $zero, 64 -; MIPS32R2-NEXT: subu $1, $1, $8 -; MIPS32R2-NEXT: srlv $9, $6, $1 -; MIPS32R2-NEXT: andi $10, $1, 32 -; MIPS32R2-NEXT: move $2, $9 -; MIPS32R2-NEXT: movn $2, $zero, $10 -; MIPS32R2-NEXT: sllv $3, $4, $8 -; MIPS32R2-NEXT: not $11, $8 -; MIPS32R2-NEXT: srl $12, $5, 1 -; MIPS32R2-NEXT: srlv $12, $12, $11 -; MIPS32R2-NEXT: or $3, $3, $12 -; MIPS32R2-NEXT: sllv $12, $5, $8 -; MIPS32R2-NEXT: andi $13, $8, 32 -; MIPS32R2-NEXT: movn $3, $12, $13 -; MIPS32R2-NEXT: addiu $14, $8, -64 -; MIPS32R2-NEXT: or $15, $3, $2 -; MIPS32R2-NEXT: sllv $2, $6, $14 -; MIPS32R2-NEXT: srl $24, $7, 1 -; MIPS32R2-NEXT: not $3, $14 -; MIPS32R2-NEXT: srlv $3, $24, $3 +; MIPS32R2-NEXT: sllv $1, $4, $8 +; MIPS32R2-NEXT: not $9, $8 +; MIPS32R2-NEXT: srl $2, $5, 1 +; MIPS32R2-NEXT: srlv $2, $2, $9 +; MIPS32R2-NEXT: addiu $3, $zero, 64 +; MIPS32R2-NEXT: or $1, $1, $2 +; MIPS32R2-NEXT: sllv $2, $5, $8 +; MIPS32R2-NEXT: subu $3, $3, $8 +; MIPS32R2-NEXT: srlv $10, $6, $3 +; MIPS32R2-NEXT: andi $11, $3, 32 +; MIPS32R2-NEXT: andi $12, $8, 32 +; MIPS32R2-NEXT: move $13, $10 +; MIPS32R2-NEXT: movn $13, $zero, $11 +; MIPS32R2-NEXT: movn $1, $2, $12 +; MIPS32R2-NEXT: srlv $14, $7, $3 +; MIPS32R2-NEXT: not $3, $3 +; MIPS32R2-NEXT: sll $15, $6, 1 +; MIPS32R2-NEXT: 
sllv $3, $15, $3 +; MIPS32R2-NEXT: or $3, $3, $14 +; MIPS32R2-NEXT: movn $3, $10, $11 +; MIPS32R2-NEXT: or $10, $3, $2 +; MIPS32R2-NEXT: addiu $11, $8, -64 +; MIPS32R2-NEXT: movn $10, $3, $12 +; MIPS32R2-NEXT: or $1, $1, $13 +; MIPS32R2-NEXT: sllv $2, $6, $11 +; MIPS32R2-NEXT: srl $13, $7, 1 +; MIPS32R2-NEXT: not $3, $11 +; MIPS32R2-NEXT: srlv $3, $13, $3 ; MIPS32R2-NEXT: or $2, $2, $3 -; MIPS32R2-NEXT: sllv $3, $7, $14 -; MIPS32R2-NEXT: andi $14, $14, 32 -; MIPS32R2-NEXT: movn $2, $3, $14 -; MIPS32R2-NEXT: sltiu $25, $8, 64 -; MIPS32R2-NEXT: movn $2, $15, $25 -; MIPS32R2-NEXT: srlv $15, $7, $1 -; MIPS32R2-NEXT: not $1, $1 -; MIPS32R2-NEXT: sll $gp, $6, 1 -; MIPS32R2-NEXT: sllv $1, $gp, $1 -; MIPS32R2-NEXT: or $15, $1, $15 -; MIPS32R2-NEXT: sllv $1, $6, $8 -; MIPS32R2-NEXT: srlv $6, $24, $11 -; MIPS32R2-NEXT: or $1, $1, $6 -; MIPS32R2-NEXT: sllv $6, $7, $8 -; MIPS32R2-NEXT: movn $1, $6, $13 +; MIPS32R2-NEXT: sllv $3, $7, $11 +; MIPS32R2-NEXT: andi $11, $11, 32 +; MIPS32R2-NEXT: movn $2, $3, $11 +; MIPS32R2-NEXT: sltiu $14, $8, 64 +; MIPS32R2-NEXT: movn $2, $1, $14 ; MIPS32R2-NEXT: movz $2, $4, $8 -; MIPS32R2-NEXT: movz $1, $zero, $25 -; MIPS32R2-NEXT: movn $15, $9, $10 -; MIPS32R2-NEXT: movn $12, $zero, $13 -; MIPS32R2-NEXT: or $4, $12, $15 -; MIPS32R2-NEXT: movn $3, $zero, $14 -; MIPS32R2-NEXT: movn $3, $4, $25 +; MIPS32R2-NEXT: sllv $1, $7, $8 +; MIPS32R2-NEXT: sllv $4, $6, $8 +; MIPS32R2-NEXT: movn $3, $zero, $11 +; MIPS32R2-NEXT: movn $3, $10, $14 +; MIPS32R2-NEXT: srlv $6, $13, $9 ; MIPS32R2-NEXT: movz $3, $5, $8 -; MIPS32R2-NEXT: movn $6, $zero, $13 -; MIPS32R2-NEXT: movz $6, $zero, $25 -; MIPS32R2-NEXT: move $4, $1 +; MIPS32R2-NEXT: or $4, $4, $6 +; MIPS32R2-NEXT: movn $4, $1, $12 +; MIPS32R2-NEXT: movz $4, $zero, $14 +; MIPS32R2-NEXT: movn $1, $zero, $12 +; MIPS32R2-NEXT: movz $1, $zero, $14 ; MIPS32R2-NEXT: jr $ra -; MIPS32R2-NEXT: move $5, $6 +; MIPS32R2-NEXT: move $5, $1 ; ; MIPS32R6-LABEL: shl_i128: ; MIPS32R6: # %bb.0: # %entry @@ -706,55 +702,57 @@ ; MIPS32R6-NEXT: andi $13, $11, 32 ; MIPS32R6-NEXT: seleqz $14, $12, $13 ; MIPS32R6-NEXT: or $1, $10, $1 -; MIPS32R6-NEXT: selnez $10, $12, $13 -; MIPS32R6-NEXT: srlv $12, $7, $11 +; MIPS32R6-NEXT: srlv $10, $7, $11 ; MIPS32R6-NEXT: not $11, $11 ; MIPS32R6-NEXT: sll $15, $6, 1 ; MIPS32R6-NEXT: sllv $11, $15, $11 -; MIPS32R6-NEXT: or $11, $11, $12 -; MIPS32R6-NEXT: seleqz $11, $11, $13 +; MIPS32R6-NEXT: or $10, $11, $10 +; MIPS32R6-NEXT: selnez $11, $12, $13 +; MIPS32R6-NEXT: seleqz $10, $10, $13 ; MIPS32R6-NEXT: addiu $12, $3, -64 -; MIPS32R6-NEXT: or $10, $10, $11 ; MIPS32R6-NEXT: or $1, $1, $14 -; MIPS32R6-NEXT: sllv $11, $6, $12 -; MIPS32R6-NEXT: srl $13, $7, 1 -; MIPS32R6-NEXT: not $14, $12 -; MIPS32R6-NEXT: srlv $14, $13, $14 -; MIPS32R6-NEXT: or $11, $11, $14 -; MIPS32R6-NEXT: andi $14, $12, 32 -; MIPS32R6-NEXT: seleqz $11, $11, $14 -; MIPS32R6-NEXT: sllv $12, $7, $12 -; MIPS32R6-NEXT: selnez $15, $12, $14 -; MIPS32R6-NEXT: sltiu $24, $3, 64 -; MIPS32R6-NEXT: selnez $1, $1, $24 -; MIPS32R6-NEXT: or $11, $15, $11 +; MIPS32R6-NEXT: sllv $13, $6, $12 +; MIPS32R6-NEXT: srl $14, $7, 1 +; MIPS32R6-NEXT: not $15, $12 +; MIPS32R6-NEXT: srlv $15, $14, $15 ; MIPS32R6-NEXT: sllv $6, $6, $3 -; MIPS32R6-NEXT: srlv $2, $13, $2 -; MIPS32R6-NEXT: seleqz $8, $8, $9 -; MIPS32R6-NEXT: or $8, $8, $10 -; MIPS32R6-NEXT: or $6, $6, $2 -; MIPS32R6-NEXT: seleqz $2, $11, $24 -; MIPS32R6-NEXT: seleqz $10, $zero, $24 +; MIPS32R6-NEXT: srlv $2, $14, $2 +; MIPS32R6-NEXT: or $10, $11, $10 +; MIPS32R6-NEXT: or $2, $6, $2 +; MIPS32R6-NEXT: or $6, $13, $15 +; 
MIPS32R6-NEXT: andi $11, $12, 32 +; MIPS32R6-NEXT: seleqz $6, $6, $11 +; MIPS32R6-NEXT: sllv $12, $7, $12 +; MIPS32R6-NEXT: selnez $13, $12, $11 +; MIPS32R6-NEXT: sltiu $14, $3, 64 +; MIPS32R6-NEXT: selnez $1, $1, $14 +; MIPS32R6-NEXT: or $6, $13, $6 +; MIPS32R6-NEXT: seleqz $2, $2, $9 ; MIPS32R6-NEXT: sllv $7, $7, $3 -; MIPS32R6-NEXT: seleqz $11, $7, $9 -; MIPS32R6-NEXT: selnez $11, $11, $24 +; MIPS32R6-NEXT: selnez $13, $7, $9 +; MIPS32R6-NEXT: selnez $15, $10, $9 +; MIPS32R6-NEXT: or $8, $10, $8 +; MIPS32R6-NEXT: or $10, $13, $2 +; MIPS32R6-NEXT: seleqz $2, $6, $14 +; MIPS32R6-NEXT: seleqz $6, $zero, $14 +; MIPS32R6-NEXT: seleqz $7, $7, $9 +; MIPS32R6-NEXT: selnez $7, $7, $14 ; MIPS32R6-NEXT: seleqz $4, $4, $3 ; MIPS32R6-NEXT: or $1, $1, $2 ; MIPS32R6-NEXT: selnez $1, $1, $3 ; MIPS32R6-NEXT: or $2, $4, $1 -; MIPS32R6-NEXT: or $1, $10, $11 -; MIPS32R6-NEXT: seleqz $4, $6, $9 -; MIPS32R6-NEXT: selnez $6, $7, $9 +; MIPS32R6-NEXT: or $1, $6, $7 +; MIPS32R6-NEXT: selnez $4, $10, $14 ; MIPS32R6-NEXT: seleqz $5, $5, $3 -; MIPS32R6-NEXT: selnez $7, $8, $24 -; MIPS32R6-NEXT: seleqz $8, $12, $14 -; MIPS32R6-NEXT: seleqz $8, $8, $24 +; MIPS32R6-NEXT: seleqz $7, $8, $9 +; MIPS32R6-NEXT: or $7, $15, $7 +; MIPS32R6-NEXT: seleqz $8, $12, $11 +; MIPS32R6-NEXT: selnez $7, $7, $14 +; MIPS32R6-NEXT: seleqz $8, $8, $14 ; MIPS32R6-NEXT: or $7, $7, $8 ; MIPS32R6-NEXT: selnez $3, $7, $3 ; MIPS32R6-NEXT: or $3, $5, $3 ; MIPS32R6-NEXT: or $4, $6, $4 -; MIPS32R6-NEXT: selnez $4, $4, $24 -; MIPS32R6-NEXT: or $4, $10, $4 ; MIPS32R6-NEXT: jr $ra ; MIPS32R6-NEXT: move $5, $1 ; @@ -849,81 +847,80 @@ ; MMR3-NEXT: swp $16, 32($sp) ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 -; MMR3-NEXT: move $17, $7 -; MMR3-NEXT: sw $7, 4($sp) # 4-byte Folded Spill -; MMR3-NEXT: move $7, $6 +; MMR3-NEXT: sw $7, 24($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $5, 16($sp) # 4-byte Folded Spill ; MMR3-NEXT: move $1, $4 ; MMR3-NEXT: lw $16, 68($sp) -; MMR3-NEXT: li16 $2, 64 -; MMR3-NEXT: subu16 $6, $2, $16 -; MMR3-NEXT: srlv $9, $7, $6 -; MMR3-NEXT: andi16 $4, $6, 32 -; MMR3-NEXT: sw $4, 24($sp) # 4-byte Folded Spill -; MMR3-NEXT: li16 $3, 0 -; MMR3-NEXT: move $2, $9 -; MMR3-NEXT: movn $2, $3, $4 -; MMR3-NEXT: sllv $3, $1, $16 -; MMR3-NEXT: sw $3, 16($sp) # 4-byte Folded Spill +; MMR3-NEXT: sllv $2, $4, $16 ; MMR3-NEXT: not16 $4, $16 ; MMR3-NEXT: sw $4, 20($sp) # 4-byte Folded Spill -; MMR3-NEXT: sw $5, 28($sp) # 4-byte Folded Spill ; MMR3-NEXT: srl16 $3, $5, 1 -; MMR3-NEXT: srlv $3, $3, $4 -; MMR3-NEXT: lw $4, 16($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 $3, $4 -; MMR3-NEXT: sllv $5, $5, $16 -; MMR3-NEXT: sw $5, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: andi16 $4, $16, 32 -; MMR3-NEXT: sw $4, 16($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $3, $5, $4 -; MMR3-NEXT: addiu $4, $16, -64 -; MMR3-NEXT: or16 $3, $2 -; MMR3-NEXT: sllv $2, $7, $4 +; MMR3-NEXT: srlv $4, $3, $4 +; MMR3-NEXT: li16 $3, 64 +; MMR3-NEXT: or16 $4, $2 +; MMR3-NEXT: sllv $2, $5, $16 +; MMR3-NEXT: sw $2, 4($sp) # 4-byte Folded Spill +; MMR3-NEXT: subu16 $3, $3, $16 +; MMR3-NEXT: srlv $8, $6, $3 +; MMR3-NEXT: sw $6, 8($sp) # 4-byte Folded Spill +; MMR3-NEXT: andi16 $5, $3, 32 +; MMR3-NEXT: andi16 $7, $16, 32 +; MMR3-NEXT: sw $7, 28($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $2, $8 +; MMR3-NEXT: li16 $17, 0 +; MMR3-NEXT: movn $2, $17, $5 ; MMR3-NEXT: sw $2, 12($sp) # 4-byte Folded Spill -; MMR3-NEXT: srl16 $5, $17, 1 -; MMR3-NEXT: not16 $2, $4 -; MMR3-NEXT: srlv $2, $5, $2 -; MMR3-NEXT: lw $17, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 
$2, $17 -; MMR3-NEXT: lw $17, 4($sp) # 4-byte Folded Reload -; MMR3-NEXT: sllv $8, $17, $4 -; MMR3-NEXT: andi16 $4, $4, 32 -; MMR3-NEXT: sw $4, 12($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $2, $8, $4 -; MMR3-NEXT: sltiu $10, $16, 64 -; MMR3-NEXT: movn $2, $3, $10 -; MMR3-NEXT: srlv $4, $17, $6 -; MMR3-NEXT: not16 $3, $6 -; MMR3-NEXT: sll16 $6, $7, 1 -; MMR3-NEXT: sllv $3, $6, $3 -; MMR3-NEXT: or16 $3, $4 -; MMR3-NEXT: sllv $6, $7, $16 +; MMR3-NEXT: move $2, $5 +; MMR3-NEXT: lw $5, 4($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $4, $5, $7 +; MMR3-NEXT: lw $7, 24($sp) # 4-byte Folded Reload +; MMR3-NEXT: srlv $7, $7, $3 +; MMR3-NEXT: not16 $3, $3 +; MMR3-NEXT: sll16 $17, $6, 1 +; MMR3-NEXT: sllv $3, $17, $3 +; MMR3-NEXT: or16 $3, $7 +; MMR3-NEXT: movn $3, $8, $2 +; MMR3-NEXT: or16 $5, $3 +; MMR3-NEXT: addiu $7, $16, -64 +; MMR3-NEXT: lw $6, 28($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $5, $3, $6 +; MMR3-NEXT: lw $2, 12($sp) # 4-byte Folded Reload +; MMR3-NEXT: or16 $4, $2 +; MMR3-NEXT: lw $6, 8($sp) # 4-byte Folded Reload +; MMR3-NEXT: sllv $2, $6, $7 +; MMR3-NEXT: sw $2, 4($sp) # 4-byte Folded Spill +; MMR3-NEXT: lw $17, 24($sp) # 4-byte Folded Reload +; MMR3-NEXT: srl16 $3, $17, 1 +; MMR3-NEXT: sw $3, 12($sp) # 4-byte Folded Spill +; MMR3-NEXT: not16 $2, $7 +; MMR3-NEXT: srlv $2, $3, $2 +; MMR3-NEXT: lw $3, 4($sp) # 4-byte Folded Reload +; MMR3-NEXT: or16 $2, $3 +; MMR3-NEXT: sllv $3, $17, $7 +; MMR3-NEXT: andi16 $7, $7, 32 +; MMR3-NEXT: movn $2, $3, $7 +; MMR3-NEXT: sltiu $8, $16, 64 +; MMR3-NEXT: movn $2, $4, $8 +; MMR3-NEXT: movz $2, $1, $16 +; MMR3-NEXT: sllv $1, $17, $16 +; MMR3-NEXT: sllv $17, $6, $16 +; MMR3-NEXT: li16 $4, 0 +; MMR3-NEXT: movn $3, $4, $7 +; MMR3-NEXT: movn $3, $5, $8 ; MMR3-NEXT: lw $4, 20($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $5, 12($sp) # 4-byte Folded Reload ; MMR3-NEXT: srlv $4, $5, $4 -; MMR3-NEXT: or16 $4, $6 -; MMR3-NEXT: sllv $6, $17, $16 -; MMR3-NEXT: lw $17, 16($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $4, $6, $17 -; MMR3-NEXT: movz $2, $1, $16 +; MMR3-NEXT: lw $5, 16($sp) # 4-byte Folded Reload +; MMR3-NEXT: movz $3, $5, $16 +; MMR3-NEXT: or16 $4, $17 +; MMR3-NEXT: lw $6, 28($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $4, $1, $6 ; MMR3-NEXT: li16 $5, 0 -; MMR3-NEXT: movz $4, $5, $10 -; MMR3-NEXT: lw $7, 24($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $3, $9, $7 -; MMR3-NEXT: lw $5, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: li16 $7, 0 -; MMR3-NEXT: movn $5, $7, $17 -; MMR3-NEXT: or16 $5, $3 -; MMR3-NEXT: lw $3, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $8, $7, $3 -; MMR3-NEXT: li16 $7, 0 -; MMR3-NEXT: movn $8, $5, $10 -; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload -; MMR3-NEXT: movz $8, $3, $16 -; MMR3-NEXT: movn $6, $7, $17 -; MMR3-NEXT: li16 $3, 0 -; MMR3-NEXT: movz $6, $3, $10 -; MMR3-NEXT: move $3, $8 -; MMR3-NEXT: move $5, $6 +; MMR3-NEXT: movz $4, $5, $8 +; MMR3-NEXT: movn $1, $5, $6 +; MMR3-NEXT: li16 $5, 0 +; MMR3-NEXT: movz $1, $5, $8 +; MMR3-NEXT: move $5, $1 ; MMR3-NEXT: lwp $16, 32($sp) ; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra @@ -940,7 +937,8 @@ ; MMR6-NEXT: lw $3, 44($sp) ; MMR6-NEXT: sllv $1, $4, $3 ; MMR6-NEXT: not16 $2, $3 -; MMR6-NEXT: sw $2, 4($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $2, 0($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $5, 4($sp) # 4-byte Folded Spill ; MMR6-NEXT: srl16 $16, $5, 1 ; MMR6-NEXT: srlv $8, $16, $2 ; MMR6-NEXT: or $1, $1, $8 @@ -954,57 +952,60 @@ ; MMR6-NEXT: andi16 $2, $17, 32 ; MMR6-NEXT: seleqz $12, $10, $2 ; MMR6-NEXT: or $1, $9, $1 -; MMR6-NEXT: selnez $9, 
$10, $2 -; MMR6-NEXT: srlv $10, $7, $17 +; MMR6-NEXT: srlv $9, $7, $17 ; MMR6-NEXT: not16 $17, $17 ; MMR6-NEXT: sll16 $4, $6, 1 ; MMR6-NEXT: sllv $4, $4, $17 -; MMR6-NEXT: or $4, $4, $10 -; MMR6-NEXT: seleqz $2, $4, $2 +; MMR6-NEXT: or $4, $4, $9 +; MMR6-NEXT: selnez $9, $10, $2 +; MMR6-NEXT: seleqz $13, $4, $2 ; MMR6-NEXT: addiu $4, $3, -64 -; MMR6-NEXT: or $10, $9, $2 ; MMR6-NEXT: or $1, $1, $12 -; MMR6-NEXT: sllv $9, $6, $4 -; MMR6-NEXT: srl16 $2, $7, 1 -; MMR6-NEXT: not16 $17, $4 -; MMR6-NEXT: srlv $12, $2, $17 -; MMR6-NEXT: or $9, $9, $12 -; MMR6-NEXT: andi16 $17, $4, 32 -; MMR6-NEXT: seleqz $9, $9, $17 -; MMR6-NEXT: sllv $14, $7, $4 -; MMR6-NEXT: selnez $12, $14, $17 -; MMR6-NEXT: sltiu $13, $3, 64 -; MMR6-NEXT: selnez $1, $1, $13 -; MMR6-NEXT: or $9, $12, $9 +; MMR6-NEXT: sllv $10, $6, $4 +; MMR6-NEXT: srl16 $17, $7, 1 +; MMR6-NEXT: not16 $5, $4 +; MMR6-NEXT: srlv $5, $17, $5 ; MMR6-NEXT: sllv $6, $6, $3 -; MMR6-NEXT: lw $4, 4($sp) # 4-byte Folded Reload -; MMR6-NEXT: srlv $2, $2, $4 -; MMR6-NEXT: seleqz $8, $8, $16 -; MMR6-NEXT: li16 $4, 0 -; MMR6-NEXT: or $8, $8, $10 -; MMR6-NEXT: or $6, $6, $2 -; MMR6-NEXT: seleqz $2, $9, $13 -; MMR6-NEXT: seleqz $9, $4, $13 +; MMR6-NEXT: lw $2, 0($sp) # 4-byte Folded Reload +; MMR6-NEXT: srlv $12, $17, $2 +; MMR6-NEXT: or $2, $9, $13 +; MMR6-NEXT: or $9, $6, $12 +; MMR6-NEXT: or $5, $10, $5 +; MMR6-NEXT: andi16 $6, $4, 32 +; MMR6-NEXT: seleqz $5, $5, $6 +; MMR6-NEXT: sllv $4, $7, $4 +; MMR6-NEXT: selnez $10, $4, $6 +; MMR6-NEXT: sltiu $12, $3, 64 +; MMR6-NEXT: selnez $1, $1, $12 +; MMR6-NEXT: or $5, $10, $5 +; MMR6-NEXT: seleqz $9, $9, $16 ; MMR6-NEXT: sllv $7, $7, $3 -; MMR6-NEXT: seleqz $10, $7, $16 -; MMR6-NEXT: selnez $10, $10, $13 -; MMR6-NEXT: seleqz $11, $11, $3 +; MMR6-NEXT: selnez $10, $7, $16 +; MMR6-NEXT: li16 $17, 0 +; MMR6-NEXT: selnez $13, $2, $16 +; MMR6-NEXT: or $8, $2, $8 +; MMR6-NEXT: or $9, $10, $9 +; MMR6-NEXT: seleqz $2, $5, $12 +; MMR6-NEXT: seleqz $5, $17, $12 +; MMR6-NEXT: seleqz $7, $7, $16 +; MMR6-NEXT: selnez $7, $7, $12 +; MMR6-NEXT: seleqz $10, $11, $3 ; MMR6-NEXT: or $1, $1, $2 ; MMR6-NEXT: selnez $1, $1, $3 -; MMR6-NEXT: or $2, $11, $1 -; MMR6-NEXT: or $1, $9, $10 -; MMR6-NEXT: seleqz $6, $6, $16 -; MMR6-NEXT: selnez $7, $7, $16 -; MMR6-NEXT: seleqz $5, $5, $3 -; MMR6-NEXT: selnez $8, $8, $13 -; MMR6-NEXT: seleqz $4, $14, $17 -; MMR6-NEXT: seleqz $4, $4, $13 -; MMR6-NEXT: or $4, $8, $4 +; MMR6-NEXT: or $2, $10, $1 +; MMR6-NEXT: or $1, $5, $7 +; MMR6-NEXT: selnez $7, $9, $12 +; MMR6-NEXT: lw $17, 4($sp) # 4-byte Folded Reload +; MMR6-NEXT: seleqz $9, $17, $3 +; MMR6-NEXT: seleqz $8, $8, $16 +; MMR6-NEXT: or $8, $13, $8 +; MMR6-NEXT: seleqz $4, $4, $6 +; MMR6-NEXT: selnez $6, $8, $12 +; MMR6-NEXT: seleqz $4, $4, $12 +; MMR6-NEXT: or $4, $6, $4 ; MMR6-NEXT: selnez $3, $4, $3 -; MMR6-NEXT: or $3, $5, $3 -; MMR6-NEXT: or $4, $7, $6 -; MMR6-NEXT: selnez $4, $4, $13 -; MMR6-NEXT: or $4, $9, $4 +; MMR6-NEXT: or $3, $9, $3 +; MMR6-NEXT: or $4, $5, $7 ; MMR6-NEXT: move $5, $1 ; MMR6-NEXT: lw $16, 8($sp) # 4-byte Folded Reload ; MMR6-NEXT: lw $17, 12($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/cmpb-ppc32.ll b/llvm/test/CodeGen/PowerPC/cmpb-ppc32.ll --- a/llvm/test/CodeGen/PowerPC/cmpb-ppc32.ll +++ b/llvm/test/CodeGen/PowerPC/cmpb-ppc32.ll @@ -15,10 +15,19 @@ %conv29 = trunc i32 %or to i16 ret i16 %conv29 -; CHECK-LABEL: @test16 -; CHECK: cmpb [[REG1:[0-9]+]], 4, 3 -; CHECK: clrlwi 3, [[REG1]], 16 -; CHECK: blr +; CHECK-LABEL: test16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xor 3, 4, 3 +; CHECK-NEXT: 
lis 5, 0
+; CHECK-NEXT: andi. 4, 3, 255
+; CHECK-NEXT: cmplwi 1, 3, 256
+; CHECK-NEXT: ori 3, 5, 65280
+; CHECK-NEXT: li 4, 0
+; CHECK-NEXT: isel 3, 3, 4, 4
+; CHECK-NEXT: li 4, 255
+; CHECK-NEXT: rlwimi 4, 3, 0, 0, 23
+; CHECK-NEXT: iseleq 3, 4, 3
+; CHECK-NEXT: blr
 }
 define i32 @test32(i32 %x, i32 %y) #0 {
@@ -40,10 +49,30 @@
 %or52 = or i32 %or49, %conv47
 ret i32 %or52
-; CHECK-LABEL: @test32
-; CHECK: cmpb 3, 4, 3
-; CHECK-NOT: rlwinm
-; CHECK: blr
+; CHECK-LABEL: test32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xor 3, 4, 3
+; CHECK-NEXT: lis 5, -256
+; CHECK-NEXT: lis 7, 0
+; CHECK-NEXT: andis. 4, 3, 65280
+; CHECK-NEXT: ori 7, 7, 65280
+; CHECK-NEXT: li 4, 0
+; CHECK-NEXT: clrlwi 6, 3, 24
+; CHECK-NEXT: iseleq 4, 5, 4
+; CHECK-NEXT: rlwinm 5, 3, 0, 16, 23
+; CHECK-NEXT: rlwimi 7, 4, 0, 24, 15
+; CHECK-NEXT: cmplwi 5, 0
+; CHECK-NEXT: li 5, 255
+; CHECK-NEXT: iseleq 4, 7, 4
+; CHECK-NEXT: cmplwi 6, 0
+; CHECK-NEXT: rlwimi 5, 4, 0, 0, 23
+; CHECK-NEXT: rlwinm 3, 3, 0, 8, 15
+; CHECK-NEXT: iseleq 4, 5, 4
+; CHECK-NEXT: lis 5, 255
+; CHECK-NEXT: rlwimi 5, 4, 0, 16, 7
+; CHECK-NEXT: cmplwi 3, 0
+; CHECK-NEXT: iseleq 3, 5, 4
+; CHECK-NEXT: blr
 }
 attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/PowerPC/cmpb.ll b/llvm/test/CodeGen/PowerPC/cmpb.ll
--- a/llvm/test/CodeGen/PowerPC/cmpb.ll
+++ b/llvm/test/CodeGen/PowerPC/cmpb.ll
@@ -97,10 +97,28 @@
 %or52 = or i32 %or49, %conv47
 ret i32 %or52
-; CHECK-LABEL: @test32
-; CHECK: cmpb [[REG1:[0-9]+]], 4, 3
-; CHECK: clrldi 3, [[REG1]], 32
-; CHECK: blr
+; CHECK-LABEL: test32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xor 3, 4, 3
+; CHECK-NEXT: li 5, 255
+; CHECK-NEXT: andi. 4, 3, 255
+; CHECK-NEXT: mcrf 1, 0
+; CHECK-NEXT: andi. 4, 3, 65280
+; CHECK-NEXT: mcrf 5, 0
+; CHECK-NEXT: andis. 4, 3, 255
+; CHECK-NEXT: mcrf 6, 0
+; CHECK-NEXT: andis. 3, 3, 65280
+; CHECK-NEXT: sldi 4, 5, 24
+; CHECK-NEXT: li 3, 0
+; CHECK-NEXT: iseleq 4, 4, 3
+; CHECK-NEXT: ori 6, 4, 65280
+; CHECK-NEXT: isel 5, 5, 3, 6
+; CHECK-NEXT: isel 4, 6, 4, 22
+; CHECK-NEXT: lis 6, 255
+; CHECK-NEXT: or 4, 4, 5
+; CHECK-NEXT: isel 3, 6, 3, 26
+; CHECK-NEXT: or 3, 4, 3
+; CHECK-NEXT: blr
 }
 define zeroext i32 @test32p1(i32 zeroext %x, i32 zeroext %y) #0 {
@@ -122,11 +140,28 @@
 %or55 = or i32 %or52, %conv50
 ret i32 %or55
-; CHECK-LABEL: @test32p1
-; CHECK: cmpb [[REG1:[0-9]+]], 4, 3
-; CHECK: rldicl [[REG2:[0-9]+]], [[REG1]], 40, 5
-; CHECK: rldicl 3, [[REG2]], 24, 32
-; CHECK: blr
+; CHECK-LABEL: test32p1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xor 3, 4, 3
+; CHECK-NEXT: li 5, 255
+; CHECK-NEXT: andi. 4, 3, 255
+; CHECK-NEXT: mcrf 1, 0
+; CHECK-NEXT: andi. 4, 3, 65280
+; CHECK-NEXT: mcrf 5, 0
+; CHECK-NEXT: andis. 4, 3, 255
+; CHECK-NEXT: mcrf 6, 0
+; CHECK-NEXT: andis. 3, 3, 65280
+; CHECK-NEXT: sldi 4, 5, 24
+; CHECK-NEXT: li 3, 0
+; CHECK-NEXT: iseleq 4, 4, 3
+; CHECK-NEXT: ori 6, 4, 65280
+; CHECK-NEXT: isel 5, 5, 3, 6
+; CHECK-NEXT: isel 4, 6, 4, 22
+; CHECK-NEXT: lis 6, 7
+; CHECK-NEXT: or 4, 4, 5
+; CHECK-NEXT: isel 3, 6, 3, 26
+; CHECK-NEXT: or 3, 4, 3
+; CHECK-NEXT: blr
 }
 define zeroext i32 @test32p2(i32 zeroext %x, i32 zeroext %y) #0 {
@@ -144,11 +179,23 @@
 %or37 = or i32 %or, %conv32
 ret i32 %or37
-; CHECK-LABEL: @test32p2
-; CHECK: cmpb [[REG1:[0-9]+]], 4, 3
-; CHECK: rldicl [[REG2:[0-9]+]], [[REG1]], 40, 8
-; CHECK: rldicl 3, [[REG2]], 24, 32
-; CHECK: blr
+; CHECK-LABEL: test32p2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xor 3, 4, 3
+; CHECK-NEXT: li 5, 255
+; CHECK-NEXT: andi. 4, 3, 255
+; CHECK-NEXT: mcrf 1, 0
+; CHECK-NEXT: andi. 4, 3, 65280
+; CHECK-NEXT: mcrf 5, 0
+; CHECK-NEXT: andis. 3, 3, 65280
+; CHECK-NEXT: sldi 4, 5, 24
+; CHECK-NEXT: li 3, 0
+; CHECK-NEXT: iseleq 4, 4, 3
+; CHECK-NEXT: ori 6, 4, 65280
+; CHECK-NEXT: isel 3, 5, 3, 6
+; CHECK-NEXT: isel 4, 6, 4, 22
+; CHECK-NEXT: or 3, 4, 3
+; CHECK-NEXT: blr
 }
 define i64 @test64(i64 %x, i64 %y) #0 {
@@ -190,10 +237,51 @@
 %or112 = or i64 %or109, %conv110
 ret i64 %or112
-; CHECK-LABEL: @test64
-; CHECK: cmpb 3, 3, 4
-; CHECK-NOT: rldicl
-; CHECK: blr
+; CHECK-LABEL: test64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xor 6, 4, 3
+; CHECK-NEXT: li 5, 0
+; CHECK-NEXT: andi. 7, 6, 65280
+; CHECK-NEXT: rldicl 8, 6, 32, 32
+; CHECK-NEXT: mcrf 1, 0
+; CHECK-NEXT: andis. 9, 6, 255
+; CHECK-NEXT: ori 7, 5, 65280
+; CHECK-NEXT: mcrf 5, 0
+; CHECK-NEXT: andis. 9, 6, 65280
+; CHECK-NEXT: rldicl 3, 3, 8, 56
+; CHECK-NEXT: mcrf 6, 0
+; CHECK-NEXT: rldicl. 8, 8, 32, 24
+; CHECK-NEXT: rldicl 4, 4, 8, 56
+; CHECK-NEXT: isel 5, 7, 5, 6
+; CHECK-NEXT: clrldi 7, 6, 56
+; CHECK-NEXT: mcrf 1, 0
+; CHECK-NEXT: cmpldi 7, 0
+; CHECK-NEXT: ori 7, 5, 255
+; CHECK-NEXT: rldicl 8, 6, 24, 40
+; CHECK-NEXT: iseleq 5, 7, 5
+; CHECK-NEXT: rldicl. 7, 8, 40, 16
+; CHECK-NEXT: rldicl 6, 6, 16, 48
+; CHECK-NEXT: li 8, 255
+; CHECK-NEXT: oris 7, 5, 255
+; CHECK-NEXT: mcrf 7, 0
+; CHECK-NEXT: rldicl. 6, 6, 48, 8
+; CHECK-NEXT: isel 5, 7, 5, 22
+; CHECK-NEXT: oris 7, 5, 65280
+; CHECK-NEXT: isel 5, 7, 5, 26
+; CHECK-NEXT: sldi 7, 8, 32
+; CHECK-NEXT: or 6, 5, 7
+; CHECK-NEXT: isel 5, 6, 5, 6
+; CHECK-NEXT: sldi 6, 8, 40
+; CHECK-NEXT: or 6, 5, 6
+; CHECK-NEXT: isel 5, 6, 5, 30
+; CHECK-NEXT: sldi 6, 8, 48
+; CHECK-NEXT: or 6, 5, 6
+; CHECK-NEXT: iseleq 5, 6, 5
+; CHECK-NEXT: sldi 6, 8, 56
+; CHECK-NEXT: or 6, 5, 6
+; CHECK-NEXT: cmplw 3, 4
+; CHECK-NEXT: iseleq 3, 6, 5
+; CHECK-NEXT: blr
 }
 attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll b/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll
--- a/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll
+++ b/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll
@@ -54,12 +54,10 @@
 define i32 @pattern4(i1 %cond, i32 %x) {
 ; CHECK-LABEL: pattern4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: li 5, 0
 ; CHECK-NEXT: andi. 3, 3, 1
-; CHECK-NEXT: oris 3, 5, 65535
-; CHECK-NEXT: ori 3, 3, 65535
-; CHECK-NEXT: iselgt 3, 3, 5
-; CHECK-NEXT: or 3, 4, 3
+; CHECK-NEXT: ori 3, 4, 65535
+; CHECK-NEXT: oris 3, 3, 65535
+; CHECK-NEXT: iselgt 3, 3, 4
 ; CHECK-NEXT: blr
 %sel = select i1 %cond, i32 -1, i32 0
 %res = or i32 %x, %sel
diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll b/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll
--- a/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll
@@ -521,25 +521,24 @@
 define zeroext i32 @ppcq_to_u32(ppc_fp128 %m) #0 {
 ; P8-LABEL: ppcq_to_u32:
 ; P8: # %bb.0: # %entry
+; P8-NEXT: mfcr r12
 ; P8-NEXT: mflr r0
 ; P8-NEXT: std r0, 16(r1)
-; P8-NEXT: stdu r1, -128(r1)
-; P8-NEXT: .cfi_def_cfa_offset 128
+; P8-NEXT: stw r12, 8(r1)
+; P8-NEXT: stdu r1, -112(r1)
+; P8-NEXT: .cfi_def_cfa_offset 112
 ; P8-NEXT: .cfi_offset lr, 16
-; P8-NEXT: .cfi_offset r30, -16
+; P8-NEXT: .cfi_offset cr2, 8
 ; P8-NEXT: addis r3, r2, .LCPI11_0@toc@ha
 ; P8-NEXT: xxlxor f3, f3, f3
-; P8-NEXT: std r30, 112(r1) # 8-byte Folded Spill
 ; P8-NEXT: lfs f0, .LCPI11_0@toc@l(r3)
 ; P8-NEXT: fcmpo cr0, f2, f3
-; P8-NEXT: lis r3, -32768
 ; P8-NEXT: xxlxor f3, f3, f3
 ; P8-NEXT: fcmpo cr1, f1, f0
 ; P8-NEXT: crand 4*cr5+lt, 4*cr1+eq, lt
 ; P8-NEXT: crandc 4*cr5+gt, 4*cr1+lt, 4*cr1+eq
-; P8-NEXT: cror 4*cr5+lt, 4*cr5+gt, 4*cr5+lt
-; P8-NEXT: isel r30, 0, r3, 4*cr5+lt
-; P8-NEXT: bc 12, 4*cr5+lt, .LBB11_2
+; P8-NEXT: cror 4*cr2+lt, 4*cr5+gt, 4*cr5+lt
+; P8-NEXT: bc 12, 4*cr2+lt, .LBB11_2
 ; P8-NEXT: # %bb.1: # %entry
 ; P8-NEXT: fmr f3, f0
 ; P8-NEXT: .LBB11_2: # %entry
@@ -553,35 +552,36 @@
 ; P8-NEXT: mtfsf 1, f0
 ; P8-NEXT: xscvdpsxws f0, f1
 ; P8-NEXT: mffprwz r3, f0
-; P8-NEXT: xor r3, r3, r30
-; P8-NEXT: ld r30, 112(r1) # 8-byte Folded Reload
+; P8-NEXT: xoris r4, r3, 32768
+; P8-NEXT: isel r3, r3, r4, 4*cr2+lt
 ; P8-NEXT: clrldi r3, r3, 32
-; P8-NEXT: addi r1, r1, 128
+; P8-NEXT: addi r1, r1, 112
 ; P8-NEXT: ld r0, 16(r1)
+; P8-NEXT: lwz r12, 8(r1)
+; P8-NEXT: mtocrf 32, r12
 ; P8-NEXT: mtlr r0
 ; P8-NEXT: blr
 ;
 ; P9-LABEL: ppcq_to_u32:
 ; P9: # %bb.0: # %entry
 ; P9-NEXT: mflr r0
-; P9-NEXT: .cfi_def_cfa_offset 48
-; P9-NEXT: .cfi_offset lr, 16
-; P9-NEXT: .cfi_offset r30, -16
-; P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill
+; P9-NEXT: mfocrf r12, 32
 ; P9-NEXT: std r0, 16(r1)
-; P9-NEXT: stdu r1, -48(r1)
+; P9-NEXT: stw r12, 8(r1)
+; P9-NEXT: stdu r1, -32(r1)
+; P9-NEXT: .cfi_def_cfa_offset 32
+; P9-NEXT: .cfi_offset lr, 16
+; P9-NEXT: .cfi_offset cr2, 8
 ; P9-NEXT: addis r3, r2, .LCPI11_0@toc@ha
 ; P9-NEXT: xxlxor f3, f3, f3
 ; P9-NEXT: lfs f0, .LCPI11_0@toc@l(r3)
 ; P9-NEXT: fcmpo cr1, f2, f3
-; P9-NEXT: lis r3, -32768
-; P9-NEXT: fcmpo cr0, f1, f0
 ; P9-NEXT: xxlxor f3, f3, f3
+; P9-NEXT: fcmpo cr0, f1, f0
 ; P9-NEXT: crand 4*cr5+lt, eq, 4*cr1+lt
 ; P9-NEXT: crandc 4*cr5+gt, lt, eq
-; P9-NEXT: cror 4*cr5+lt, 4*cr5+gt, 4*cr5+lt
-; P9-NEXT: isel r30, 0, r3, 4*cr5+lt
-; P9-NEXT: bc 12, 4*cr5+lt, .LBB11_2
+; P9-NEXT: cror 4*cr2+lt, 4*cr5+gt, 4*cr5+lt
+; P9-NEXT: bc 12, 4*cr2+lt, .LBB11_2
 ; P9-NEXT: # %bb.1: # %entry
 ; P9-NEXT: fmr f3, f0
 ; P9-NEXT: .LBB11_2: # %entry
@@ -595,12 +595,14 @@
 ; P9-NEXT: mtfsf 1, f0
 ; P9-NEXT: xscvdpsxws f0, f1
 ; P9-NEXT: mffprwz r3, f0
-; P9-NEXT: xor r3, r3, r30
+; P9-NEXT: xoris r4, r3, 32768
+; P9-NEXT: isel r3, r3, r4, 4*cr2+lt
 ; P9-NEXT: clrldi r3, r3, 32
-; P9-NEXT: addi r1, r1, 48
+; P9-NEXT: addi r1, r1, 32
 ; P9-NEXT: ld r0, 16(r1)
-; P9-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; P9-NEXT: lwz r12, 8(r1)
 ; P9-NEXT: mtlr r0
+; P9-NEXT:
mtocrf 32, r12 ; P9-NEXT: blr ; ; NOVSX-LABEL: ppcq_to_u32: @@ -637,10 +639,9 @@ ; NOVSX-NEXT: mtfsf 1, f0 ; NOVSX-NEXT: fctiwz f0, f1 ; NOVSX-NEXT: stfiwx f0, 0, r3 -; NOVSX-NEXT: lis r3, -32768 -; NOVSX-NEXT: lwz r4, 44(r1) -; NOVSX-NEXT: isel r3, 0, r3, 4*cr2+lt -; NOVSX-NEXT: xor r3, r4, r3 +; NOVSX-NEXT: lwz r3, 44(r1) +; NOVSX-NEXT: xoris r4, r3, 32768 +; NOVSX-NEXT: isel r3, r3, r4, 4*cr2+lt ; NOVSX-NEXT: clrldi r3, r3, 32 ; NOVSX-NEXT: addi r1, r1, 48 ; NOVSX-NEXT: ld r0, 16(r1) diff --git a/llvm/test/CodeGen/PowerPC/noPermuteFormasking.ll b/llvm/test/CodeGen/PowerPC/noPermuteFormasking.ll --- a/llvm/test/CodeGen/PowerPC/noPermuteFormasking.ll +++ b/llvm/test/CodeGen/PowerPC/noPermuteFormasking.ll @@ -10,9 +10,10 @@ ; CHECK-NEXT: ld 3, 0(3) ; CHECK-NEXT: cmpdi 1, 3, 0 ; CHECK-NEXT: andi. 4, 3, 3 -; CHECK-NEXT: crand 20, 2, 5 -; CHECK-NEXT: isel 3, 0, 3, 20 ; CHECK-NEXT: addi 3, 3, -1 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: crand 20, 2, 5 +; CHECK-NEXT: isel 3, 4, 3, 20 ; CHECK-NEXT: cmpldi 3, 3 ; CHECK-NEXT: bltlr+ 0 ; CHECK-NEXT: # %bb.1: # %for.body.i.i.i.i.i.i.i @@ -44,10 +45,9 @@ define signext i32 @andis_bot(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: andis_bot: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mullw 4, 3, 4 ; CHECK-NEXT: andis. 5, 3, 1 -; CHECK-NEXT: li 5, 1 -; CHECK-NEXT: iseleq 4, 4, 5 -; CHECK-NEXT: mullw 3, 4, 3 +; CHECK-NEXT: iseleq 3, 4, 3 ; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr entry: @@ -62,10 +62,9 @@ define signext i32 @andis_mid(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: andis_mid: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mullw 4, 3, 4 ; CHECK-NEXT: andis. 5, 3, 252 -; CHECK-NEXT: li 5, 1 -; CHECK-NEXT: iseleq 4, 4, 5 -; CHECK-NEXT: mullw 3, 4, 3 +; CHECK-NEXT: iseleq 3, 4, 3 ; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr entry: @@ -80,10 +79,9 @@ define signext i32 @andis_top(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: andis_top: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mullw 4, 3, 4 ; CHECK-NEXT: andis. 5, 3, 64512 -; CHECK-NEXT: li 5, 1 -; CHECK-NEXT: iseleq 4, 4, 5 -; CHECK-NEXT: mullw 3, 4, 3 +; CHECK-NEXT: iseleq 3, 4, 3 ; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr entry: @@ -94,6 +92,12 @@ } define i64 @andis_no_cmp(i64 %a, i64 %b) { +; CHECK-LABEL: andis_no_cmp: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mulld 4, 3, 4 +; CHECK-NEXT: andis. 
5, 3, 1 +; CHECK-NEXT: iseleq 3, 4, 3 +; CHECK-NEXT: blr entry: %and = and i64 %a, 65536 %tobool = icmp eq i64 %and, 0 diff --git a/llvm/test/CodeGen/PowerPC/nofpexcept.ll b/llvm/test/CodeGen/PowerPC/nofpexcept.ll --- a/llvm/test/CodeGen/PowerPC/nofpexcept.ll +++ b/llvm/test/CodeGen/PowerPC/nofpexcept.ll @@ -112,9 +112,6 @@ ; CHECK: [[COPY13:%[0-9]+]]:crbitrc = COPY [[FCMPOD]].sub_lt ; CHECK: [[CRANDC:%[0-9]+]]:crbitrc = CRANDC killed [[COPY13]], killed [[COPY12]] ; CHECK: [[CROR:%[0-9]+]]:crbitrc = CROR killed [[CRANDC]], killed [[CRAND]] - ; CHECK: [[LIS:%[0-9]+]]:gprc_and_gprc_nor0 = LIS 32768 - ; CHECK: [[LI:%[0-9]+]]:gprc_and_gprc_nor0 = LI 0 - ; CHECK: [[ISEL:%[0-9]+]]:gprc = ISEL [[LI]], [[LIS]], [[CROR]] ; CHECK: BC [[CROR]], %bb.2 ; CHECK: bb.1.entry: ; CHECK: successors: %bb.2(0x80000000) @@ -132,12 +129,13 @@ ; CHECK: [[MFFS1:%[0-9]+]]:f8rc = MFFS implicit $rm ; CHECK: MTFSB1 31, implicit-def $rm ; CHECK: MTFSB0 30, implicit-def $rm - ; CHECK: %37:f8rc = nofpexcept FADD [[COPY15]], [[COPY14]], implicit $rm + ; CHECK: [[FADD:%[0-9]+]]:f8rc = nofpexcept FADD [[COPY15]], [[COPY14]], implicit $rm ; CHECK: MTFSFb 1, [[MFFS1]], implicit-def $rm - ; CHECK: %38:vsfrc = nofpexcept XSCVDPSXWS killed %37, implicit $rm - ; CHECK: [[MFVSRWZ3:%[0-9]+]]:gprc = MFVSRWZ killed %38 - ; CHECK: [[XOR:%[0-9]+]]:gprc = XOR killed [[MFVSRWZ3]], killed [[ISEL]] - ; CHECK: STW killed [[XOR]], 0, [[COPY1]] :: (volatile store 4 into %ir.addr1) + ; CHECK: [[XSCVDPSXWS:%[0-9]+]]:vsfrc = nofpexcept XSCVDPSXWS killed [[FADD]], implicit $rm + ; CHECK: [[MFVSRWZ3:%[0-9]+]]:gprc_and_gprc_nor0 = MFVSRWZ killed [[XSCVDPSXWS]] + ; CHECK: [[XORIS:%[0-9]+]]:gprc_and_gprc_nor0 = XORIS [[MFVSRWZ3]], 32768 + ; CHECK: [[ISEL:%[0-9]+]]:gprc = ISEL [[MFVSRWZ3]], [[XORIS]], [[CROR]] + ; CHECK: STW killed [[ISEL]], 0, [[COPY1]] :: (volatile store 4 into %ir.addr1) ; CHECK: BLR8 implicit $lr8, implicit $rm entry: %conv1 = tail call i32 @llvm.experimental.constrained.fptosi.i32.f128(fp128 %m, metadata !"fpexcept.ignore") #0 diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll @@ -1286,22 +1286,21 @@ define i32 @test_fptoui_ppc_i32_ppc_fp128(ppc_fp128 %first) #0 { ; PC64LE-LABEL: test_fptoui_ppc_i32_ppc_fp128: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mfocrf 12, 32 ; PC64LE-NEXT: mflr 0 -; PC64LE-NEXT: std 30, -16(1) # 8-byte Folded Spill ; PC64LE-NEXT: std 0, 16(1) -; PC64LE-NEXT: stdu 1, -48(1) +; PC64LE-NEXT: stw 12, 8(1) +; PC64LE-NEXT: stdu 1, -32(1) ; PC64LE-NEXT: addis 3, 2, .LCPI31_0@toc@ha ; PC64LE-NEXT: xxlxor 3, 3, 3 ; PC64LE-NEXT: lfs 0, .LCPI31_0@toc@l(3) ; PC64LE-NEXT: fcmpo 0, 2, 3 -; PC64LE-NEXT: lis 3, -32768 ; PC64LE-NEXT: xxlxor 3, 3, 3 ; PC64LE-NEXT: fcmpo 1, 1, 0 ; PC64LE-NEXT: crand 20, 6, 0 ; PC64LE-NEXT: crandc 21, 4, 6 -; PC64LE-NEXT: cror 20, 21, 20 -; PC64LE-NEXT: isel 30, 0, 3, 20 -; PC64LE-NEXT: bc 12, 20, .LBB31_2 +; PC64LE-NEXT: cror 8, 21, 20 +; PC64LE-NEXT: bc 12, 8, .LBB31_2 ; PC64LE-NEXT: # %bb.1: # %entry ; PC64LE-NEXT: fmr 3, 0 ; PC64LE-NEXT: .LBB31_2: # %entry @@ -1315,31 +1314,32 @@ ; PC64LE-NEXT: mtfsf 1, 0 ; PC64LE-NEXT: xscvdpsxws 0, 1 ; PC64LE-NEXT: mffprwz 3, 0 -; PC64LE-NEXT: xor 3, 3, 30 -; PC64LE-NEXT: addi 1, 1, 48 +; PC64LE-NEXT: xoris 4, 3, 32768 +; PC64LE-NEXT: isel 3, 3, 4, 8 +; PC64LE-NEXT: addi 1, 1, 32 ; PC64LE-NEXT: ld 0, 16(1) -; PC64LE-NEXT: ld 
30, -16(1) # 8-byte Folded Reload +; PC64LE-NEXT: lwz 12, 8(1) +; PC64LE-NEXT: mtocrf 32, 12 ; PC64LE-NEXT: mtlr 0 ; PC64LE-NEXT: blr ; ; PC64LE9-LABEL: test_fptoui_ppc_i32_ppc_fp128: ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: mflr 0 -; PC64LE9-NEXT: std 30, -16(1) # 8-byte Folded Spill +; PC64LE9-NEXT: mfocrf 12, 32 ; PC64LE9-NEXT: std 0, 16(1) -; PC64LE9-NEXT: stdu 1, -48(1) +; PC64LE9-NEXT: stw 12, 8(1) +; PC64LE9-NEXT: stdu 1, -32(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI31_0@toc@ha ; PC64LE9-NEXT: xxlxor 3, 3, 3 ; PC64LE9-NEXT: lfs 0, .LCPI31_0@toc@l(3) ; PC64LE9-NEXT: fcmpo 1, 2, 3 -; PC64LE9-NEXT: lis 3, -32768 -; PC64LE9-NEXT: fcmpo 0, 1, 0 ; PC64LE9-NEXT: xxlxor 3, 3, 3 +; PC64LE9-NEXT: fcmpo 0, 1, 0 ; PC64LE9-NEXT: crand 20, 2, 4 ; PC64LE9-NEXT: crandc 21, 0, 2 -; PC64LE9-NEXT: cror 20, 21, 20 -; PC64LE9-NEXT: isel 30, 0, 3, 20 -; PC64LE9-NEXT: bc 12, 20, .LBB31_2 +; PC64LE9-NEXT: cror 8, 21, 20 +; PC64LE9-NEXT: bc 12, 8, .LBB31_2 ; PC64LE9-NEXT: # %bb.1: # %entry ; PC64LE9-NEXT: fmr 3, 0 ; PC64LE9-NEXT: .LBB31_2: # %entry @@ -1353,11 +1353,13 @@ ; PC64LE9-NEXT: mtfsf 1, 0 ; PC64LE9-NEXT: xscvdpsxws 0, 1 ; PC64LE9-NEXT: mffprwz 3, 0 -; PC64LE9-NEXT: xor 3, 3, 30 -; PC64LE9-NEXT: addi 1, 1, 48 +; PC64LE9-NEXT: xoris 4, 3, 32768 +; PC64LE9-NEXT: isel 3, 3, 4, 8 +; PC64LE9-NEXT: addi 1, 1, 32 ; PC64LE9-NEXT: ld 0, 16(1) -; PC64LE9-NEXT: ld 30, -16(1) # 8-byte Folded Reload +; PC64LE9-NEXT: lwz 12, 8(1) ; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: mtocrf 32, 12 ; PC64LE9-NEXT: blr ; ; PC64-LABEL: test_fptoui_ppc_i32_ppc_fp128: @@ -1385,19 +1387,18 @@ ; PC64-NEXT: nop ; PC64-NEXT: mffs 0 ; PC64-NEXT: mtfsb1 31 -; PC64-NEXT: lis 4, -32768 -; PC64-NEXT: bc 12, 8, .LBB31_3 -; PC64-NEXT: b .LBB31_4 -; PC64-NEXT: .LBB31_3: # %entry -; PC64-NEXT: li 4, 0 -; PC64-NEXT: .LBB31_4: # %entry ; PC64-NEXT: mtfsb0 30 ; PC64-NEXT: fadd 1, 2, 1 ; PC64-NEXT: mtfsf 1, 0 ; PC64-NEXT: fctiwz 0, 1 ; PC64-NEXT: stfd 0, 120(1) ; PC64-NEXT: lwz 3, 124(1) -; PC64-NEXT: xor 3, 3, 4 +; PC64-NEXT: xoris 4, 3, 32768 +; PC64-NEXT: bc 12, 8, .LBB31_4 +; PC64-NEXT: # %bb.3: # %entry +; PC64-NEXT: ori 3, 4, 0 +; PC64-NEXT: b .LBB31_4 +; PC64-NEXT: .LBB31_4: # %entry ; PC64-NEXT: addi 1, 1, 128 ; PC64-NEXT: ld 0, 16(1) ; PC64-NEXT: lwz 12, 8(1) diff --git a/llvm/test/CodeGen/PowerPC/rlwimi-and.ll b/llvm/test/CodeGen/PowerPC/rlwimi-and.ll --- a/llvm/test/CodeGen/PowerPC/rlwimi-and.ll +++ b/llvm/test/CodeGen/PowerPC/rlwimi-and.ll @@ -27,8 +27,9 @@ unreachable ; CHECK: @test -; CHECK: clrlwi [[R1:[0-9]+]], {{[0-9]+}}, 31 -; CHECK: rlwimi [[R1]], {{[0-9]+}}, 8, 23, 23 +; CHECK: rlwinm [[R1:[0-9]+]], {{[0-9]+}}, 8, 23, 23 +; CHECK: rlwimi [[R2:[0-9]+]], {{[0-9]+}}, 0, 0, 30 +; CHECK: iseleq 3, [[R1]], [[R2]] codeRepl29: ; preds = %codeRepl1 unreachable diff --git a/llvm/test/CodeGen/PowerPC/use-cr-result-of-dom-icmp-st.ll b/llvm/test/CodeGen/PowerPC/use-cr-result-of-dom-icmp-st.ll --- a/llvm/test/CodeGen/PowerPC/use-cr-result-of-dom-icmp-st.ll +++ b/llvm/test/CodeGen/PowerPC/use-cr-result-of-dom-icmp-st.ll @@ -24,9 +24,8 @@ ; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB0_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: mulld r4, r3, r4 +; CHECK-NEXT: isellt r3, r3, r4 ; CHECK-NEXT: blr entry: %shl = shl i64 %a, %b @@ -53,9 +52,8 @@ ; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB1_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: mulld r4, r3, r4 +; CHECK-NEXT: isellt r3, 
r3, r4 ; CHECK-NEXT: blr entry: %shl = shl i64 %a, %b @@ -81,9 +79,8 @@ ; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB2_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: mulld r4, r3, r4 +; CHECK-NEXT: isellt r3, r3, r4 ; CHECK-NEXT: blr entry: %shl = shl i64 %a, %b @@ -110,9 +107,8 @@ ; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB3_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: mulld r4, r3, r4 +; CHECK-NEXT: isellt r3, r3, r4 ; CHECK-NEXT: blr entry: %shl = shl i64 %a, %b @@ -139,9 +135,8 @@ ; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB4_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: mulld r4, r3, r4 +; CHECK-NEXT: isellt r3, r3, r4 ; CHECK-NEXT: blr entry: %shl = shl i64 %a, %b @@ -167,9 +162,8 @@ ; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB5_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: mulld r4, r3, r4 +; CHECK-NEXT: isellt r3, r3, r4 ; CHECK-NEXT: blr entry: %cmp = icmp sgt i64 %a, -2 @@ -194,9 +188,8 @@ ; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB6_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: mulld r4, r3, r4 +; CHECK-NEXT: isellt r3, r3, r4 ; CHECK-NEXT: blr entry: %cmp = icmp sgt i64 %a, -1 @@ -221,9 +214,8 @@ ; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB7_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: mulld r4, r3, r4 +; CHECK-NEXT: isellt r3, r3, r4 ; CHECK-NEXT: blr entry: %cmp = icmp sgt i64 %a, 0 @@ -248,9 +240,8 @@ ; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB8_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: mulld r4, r3, r4 +; CHECK-NEXT: isellt r3, r3, r4 ; CHECK-NEXT: blr entry: %cmp = icmp sgt i64 %a, 1 @@ -275,9 +266,8 @@ ; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB9_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: mulld r4, r3, r4 +; CHECK-NEXT: isellt r3, r3, r4 ; CHECK-NEXT: blr entry: %cmp = icmp sgt i64 %a, 2 @@ -298,13 +288,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: slw r5, r3, r4 ; CHECK-NEXT: cmpwi r5, -2 -; CHECK-NEXT: bgt cr0, .LBB10_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mullw r4, r4, r3 -; CHECK-NEXT: .LBB10_2: # %return -; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: mullw r5, r3, r4 +; CHECK-NEXT: isellt r3, r3, r5 +; CHECK-NEXT: iselgt r3, r4, r3 +; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr entry: %shl = shl i32 %a, %b @@ -328,15 +315,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: slw r5, r3, r4 ; CHECK-NEXT: cmpwi r5, -1 -; CHECK-NEXT: ble cr0, .LBB11_2 -; CHECK-NEXT: # %bb.1: # %return -; CHECK-NEXT: extsw r3, r4 -; CHECK-NEXT: blr -; CHECK-NEXT: .LBB11_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mullw r4, r4, r3 -; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: mullw r5, r3, r4 +; CHECK-NEXT: isellt r3, r3, r5 +; CHECK-NEXT: iselgt r3, r4, r3 +; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr entry: %shl = shl i32 %a, %b @@ -360,15 +342,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: slw r5, 
r3, r4 ; CHECK-NEXT: cmpwi r5, 0 -; CHECK-NEXT: ble cr0, .LBB12_2 -; CHECK-NEXT: # %bb.1: # %return -; CHECK-NEXT: extsw r3, r4 -; CHECK-NEXT: blr -; CHECK-NEXT: .LBB12_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mullw r4, r4, r3 -; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: mullw r5, r3, r4 +; CHECK-NEXT: isellt r3, r3, r5 +; CHECK-NEXT: iselgt r3, r4, r3 +; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr entry: %shl = shl i32 %a, %b @@ -392,13 +369,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: slw r5, r3, r4 ; CHECK-NEXT: cmpwi r5, 1 -; CHECK-NEXT: bgt cr0, .LBB13_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mullw r4, r4, r3 -; CHECK-NEXT: .LBB13_2: # %return -; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: mullw r5, r3, r4 +; CHECK-NEXT: isellt r3, r3, r5 +; CHECK-NEXT: iselgt r3, r4, r3 +; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr entry: %shl = shl i32 %a, %b @@ -422,13 +396,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: slw r5, r3, r4 ; CHECK-NEXT: cmpwi r5, 2 -; CHECK-NEXT: bgt cr0, .LBB14_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mullw r4, r4, r3 -; CHECK-NEXT: .LBB14_2: # %return -; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: mullw r5, r3, r4 +; CHECK-NEXT: isellt r3, r3, r5 +; CHECK-NEXT: iselgt r3, r4, r3 +; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr entry: %shl = shl i32 %a, %b @@ -450,14 +421,11 @@ define i64 @i_a__2(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: i_a__2: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mullw r5, r3, r4 ; CHECK-NEXT: cmpwi r3, -2 -; CHECK-NEXT: bgt cr0, .LBB15_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mullw r4, r4, r3 -; CHECK-NEXT: .LBB15_2: # %return -; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: isellt r3, r3, r5 +; CHECK-NEXT: iselgt r3, r4, r3 +; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %a, -2 @@ -478,16 +446,11 @@ define i64 @i_a__1(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: i_a__1: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mullw r5, r3, r4 ; CHECK-NEXT: cmpwi r3, -1 -; CHECK-NEXT: ble cr0, .LBB16_2 -; CHECK-NEXT: # %bb.1: # %return -; CHECK-NEXT: extsw r3, r4 -; CHECK-NEXT: blr -; CHECK-NEXT: .LBB16_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mullw r4, r4, r3 -; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: isellt r3, r3, r5 +; CHECK-NEXT: iselgt r3, r4, r3 +; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %a, -1 @@ -508,16 +471,11 @@ define i64 @i_a_0(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: i_a_0: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mullw r5, r3, r4 ; CHECK-NEXT: cmpwi r3, 0 -; CHECK-NEXT: ble cr0, .LBB17_2 -; CHECK-NEXT: # %bb.1: # %return -; CHECK-NEXT: extsw r3, r4 -; CHECK-NEXT: blr -; CHECK-NEXT: .LBB17_2: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mullw r4, r4, r3 -; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: isellt r3, r3, r5 +; CHECK-NEXT: iselgt r3, r4, r3 +; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %a, 0 @@ -538,14 +496,11 @@ define i64 @i_a_1(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: i_a_1: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mullw r5, r3, r4 ; CHECK-NEXT: cmpwi r3, 1 -; CHECK-NEXT: bgt cr0, .LBB18_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mullw r4, r4, r3 -; CHECK-NEXT: 
.LBB18_2: # %return -; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: isellt r3, r3, r5 +; CHECK-NEXT: iselgt r3, r4, r3 +; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %a, 1 @@ -566,14 +521,11 @@ define i64 @i_a_2(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: i_a_2: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mullw r5, r3, r4 ; CHECK-NEXT: cmpwi r3, 2 -; CHECK-NEXT: bgt cr0, .LBB19_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: isellt r4, r5, r4 -; CHECK-NEXT: mullw r4, r4, r3 -; CHECK-NEXT: .LBB19_2: # %return -; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: isellt r3, r3, r5 +; CHECK-NEXT: iselgt r3, r4, r3 +; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %a, 2 diff --git a/llvm/test/CodeGen/SystemZ/subregliveness-04.ll b/llvm/test/CodeGen/SystemZ/subregliveness-04.ll --- a/llvm/test/CodeGen/SystemZ/subregliveness-04.ll +++ b/llvm/test/CodeGen/SystemZ/subregliveness-04.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -disable-early-taildup -disable-cgp -systemz-subreg-liveness < %s | FileCheck %s ; Check for successful compilation. -; CHECK: lhi %r0, -5 +; CHECK: lhi %r2, -5 target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" target triple = "s390x-ibm-linux" diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll @@ -6,9 +6,9 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: subs r2, r1, #4 -; CHECK-NEXT: adr r3, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: adr r2, .LCPI0_0 +; CHECK-NEXT: subs.w lr, r1, #4 +; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -17,6 +17,13 @@ ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 0xff800000 @ float -Inf +; CHECK-NEXT: .long 0xff800000 @ float -Inf +; CHECK-NEXT: .long 0xff800000 @ float -Inf +; CHECK-NEXT: .long 0xff800000 @ float -Inf entry: br label %do.body diff --git a/llvm/test/CodeGen/X86/avx-select.ll b/llvm/test/CodeGen/X86/avx-select.ll --- a/llvm/test/CodeGen/X86/avx-select.ll +++ b/llvm/test/CodeGen/X86/avx-select.ll @@ -6,23 +6,19 @@ ; X86-LABEL: select00: ; X86: # %bb.0: ; X86-NEXT: cmpl $255, {{[0-9]+}}(%esp) -; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-NEXT: je .LBB0_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: vmovaps %ymm0, %ymm1 +; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X86-NEXT: .LBB0_2: -; X86-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: select00: ; X64: # %bb.0: ; X64-NEXT: cmpl $255, %edi -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-NEXT: je .LBB0_2 ; X64-NEXT: # %bb.1: -; X64-NEXT: vmovaps %ymm0, %ymm1 +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: .LBB0_2: -; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmpres = icmp eq i32 %a, 255 %selres = select i1 %cmpres, <8 x i32> zeroinitializer, <8 x i32> %b @@ -34,23 +30,19 @@ ; X86-LABEL: select01: ; X86: # %bb.0: ; X86-NEXT: cmpl $255, {{[0-9]+}}(%esp) -; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-NEXT: je .LBB1_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: vmovaps %ymm0, %ymm1 
+; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X86-NEXT: .LBB1_2: -; X86-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: select01: ; X64: # %bb.0: ; X64-NEXT: cmpl $255, %edi -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-NEXT: je .LBB1_2 ; X64-NEXT: # %bb.1: -; X64-NEXT: vmovaps %ymm0, %ymm1 +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: .LBB1_2: -; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmpres = icmp eq i32 %a, 255 %selres = select i1 %cmpres, <4 x i64> zeroinitializer, <4 x i64> %b diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -8,23 +8,19 @@ ; X86-LABEL: select00: ; X86: # %bb.0: ; X86-NEXT: cmpl $255, {{[0-9]+}}(%esp) -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: je .LBB0_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: vmovdqa64 %zmm0, %zmm1 +; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X86-NEXT: .LBB0_2: -; X86-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: select00: ; X64: # %bb.0: ; X64-NEXT: cmpl $255, %edi -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: je .LBB0_2 ; X64-NEXT: # %bb.1: -; X64-NEXT: vmovdqa64 %zmm0, %zmm1 +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: .LBB0_2: -; X64-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq %cmpres = icmp eq i32 %a, 255 %selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b @@ -36,23 +32,19 @@ ; X86-LABEL: select01: ; X86: # %bb.0: ; X86-NEXT: cmpl $255, {{[0-9]+}}(%esp) -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: je .LBB1_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: vmovdqa64 %zmm0, %zmm1 +; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X86-NEXT: .LBB1_2: -; X86-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: select01: ; X64: # %bb.0: ; X64-NEXT: cmpl $255, %edi -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: je .LBB1_2 ; X64-NEXT: # %bb.1: -; X64-NEXT: vmovdqa64 %zmm0, %zmm1 +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: .LBB1_2: -; X64-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq %cmpres = icmp eq i32 %a, 255 %selres = select i1 %cmpres, <8 x i64> zeroinitializer, <8 x i64> %b diff --git a/llvm/test/CodeGen/X86/binop-identity.ll b/llvm/test/CodeGen/X86/binop-identity.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/binop-identity.ll @@ -0,0 +1,317 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s + +define i32 @test_add_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_add_right: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal 7(%rsi), %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %add = add i32 %y, %sel + ret i32 %add +} + +define i32 @test_add_left(i1 %x, i32 %y) { +; CHECK-LABEL: test_add_left: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal 7(%rsi), %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %add = add i32 %sel, %y + ret i32 %add +} + +define i32 @test_or_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_or_right: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: orl $7, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %or = or i32 %y, 
%sel + ret i32 %or +} + +define i32 @test_or_left(i1 %x, i32 %y) { +; CHECK-LABEL: test_or_left: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: orl $7, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %or = or i32 %sel, %y + ret i32 %or +} + +define i32 @test_xor_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_xor_right: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: xorl $7, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %xor = xor i32 %y, %sel + ret i32 %xor +} + +define i32 @test_xor_left(i1 %x, i32 %y) { +; CHECK-LABEL: test_xor_left: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: xorl $7, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %xor = xor i32 %sel, %y + ret i32 %xor +} + +define i32 @test_sub_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_sub_right: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal -7(%rsi), %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %sub = sub i32 %y, %sel + ret i32 %sub +} + +define i32 @test_sub_left(i1 %x, i32 %y) { +; CHECK-LABEL: test_sub_left: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: movl $7, %eax +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %sub = sub i32 %sel, %y + ret i32 %sub +} + +define i32 @test_shl_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_shl_right: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: shll $7, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %shl = shl i32 %y, %sel + ret i32 %shl +} + +define i32 @test_shl_left(i1 %x, i32 %y) { +; CHECK-LABEL: test_shl_left: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: movl $7, %eax +; CHECK-NEXT: cmovel %edx, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %shl = shl i32 %sel, %y + ret i32 %shl +} + +define i32 @test_lshr_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_lshr_right: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: shrl $7, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %lshr = lshr i32 %y, %sel + ret i32 %lshr +} + +define i32 @test_lshr_left(i1 %x, i32 %y) { +; CHECK-LABEL: test_lshr_left: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: movl $7, %eax +; CHECK-NEXT: cmovel %edx, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %lshr = lshr i32 %sel, %y + ret i32 %lshr +} + +define i32 @test_ashr_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_ashr_right: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: sarl $7, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %ashr = ashr i32 %y, %sel + ret i32 %ashr +} + +define i32 @test_ashr_left(i1 
%x, i32 %y) { +; CHECK-LABEL: test_ashr_left: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: movl $7, %eax +; CHECK-NEXT: cmovel %edx, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 0 + %ashr = ashr i32 %sel, %y + ret i32 %ashr +} + +define i32 @test_mul_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_mul_right: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal (,%rsi,8), %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 1 + %mul = mul i32 %y, %sel + ret i32 %mul +} + +define i32 @test_mul_left(i1 %x, i32 %y) { +; CHECK-LABEL: test_mul_left: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal (,%rsi,8), %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 1 + %mul = mul i32 %sel, %y + ret i32 %mul +} + +define i32 @test_sdiv_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_sdiv_right: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %esi, %rcx +; CHECK-NEXT: imulq $-1840700269, %rcx, %rax # imm = 0x92492493 +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: shrl $31, %edx +; CHECK-NEXT: sarl $2, %eax +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 1 + %sdiv = sdiv i32 %y, %sel + ret i32 %sdiv +} + +define i32 @test_sdiv_left(i1 %x, i32 %y) { +; CHECK-LABEL: test_sdiv_left: +; CHECK: # %bb.0: +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: movl $7, %ecx +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: idivl %esi +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 1 + %sdiv = sdiv i32 %sel, %y + ret i32 %sdiv +} + +define i32 @test_udiv_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_udiv_right: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: shrl %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: shrl $2, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 1 + %udiv = udiv i32 %y, %sel + ret i32 %udiv +} + +define i32 @test_udiv_left(i1 %x, i32 %y) { +; CHECK-LABEL: test_udiv_left: +; CHECK: # %bb.0: +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: movl $7, %ecx +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: divl %esi +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 1 + %udiv = udiv i32 %sel, %y + ret i32 %udiv +} + +define i32 @test_and_right(i1 %x, i32 %y) { +; CHECK-LABEL: test_and_right: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: andl $7, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 4294967295 + %and = and i32 %y, %sel + ret i32 %and +} + +define i32 @test_and_left(i1 %x, i32 %y) { +; CHECK-LABEL: test_and_left: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: andl $7, %eax +; 
CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq + %sel = select i1 %x, i32 7, i32 4294967295 + %and = and i32 %sel, %y + ret i32 %and +} diff --git a/llvm/test/CodeGen/X86/bool-simplify.ll b/llvm/test/CodeGen/X86/bool-simplify.ll --- a/llvm/test/CodeGen/X86/bool-simplify.ll +++ b/llvm/test/CodeGen/X86/bool-simplify.ll @@ -51,10 +51,12 @@ define i16 @rnd16(i16 %arg) nounwind { ; CHECK-LABEL: rnd16: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: rdrandw %cx -; CHECK-NEXT: cmovbl %edi, %eax -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: setae %dl +; CHECK-NEXT: leal (%rdi,%rcx), %eax +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq %1 = tail call { i16, i32 } @llvm.x86.rdrand.16() nounwind @@ -69,10 +71,12 @@ define i32 @rnd32(i32 %arg) nounwind { ; CHECK-LABEL: rnd32: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: rdrandl %ecx -; CHECK-NEXT: cmovbl %edi, %eax -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: setae %dl +; CHECK-NEXT: leal (%rdi,%rcx), %eax +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax ; CHECK-NEXT: retq %1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind %2 = extractvalue { i32, i32 } %1, 0 @@ -86,10 +90,11 @@ define i64 @rnd64(i64 %arg) nounwind { ; CHECK-LABEL: rnd64: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: rdrandq %rcx -; CHECK-NEXT: cmovbq %rdi, %rax -; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: setae %dl +; CHECK-NEXT: leaq (%rdi,%rcx), %rax +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovneq %rcx, %rax ; CHECK-NEXT: retq %1 = tail call { i64, i32 } @llvm.x86.rdrand.64() nounwind %2 = extractvalue { i64, i32 } %1, 0 @@ -103,10 +108,12 @@ define i16 @seed16(i16 %arg) nounwind { ; CHECK-LABEL: seed16: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: rdseedw %cx -; CHECK-NEXT: cmovbl %edi, %eax -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: setae %dl +; CHECK-NEXT: leal (%rdi,%rcx), %eax +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq %1 = tail call { i16, i32 } @llvm.x86.rdseed.16() nounwind @@ -121,10 +128,12 @@ define i32 @seed32(i32 %arg) nounwind { ; CHECK-LABEL: seed32: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: rdseedl %ecx -; CHECK-NEXT: cmovbl %edi, %eax -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: setae %dl +; CHECK-NEXT: leal (%rdi,%rcx), %eax +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax ; CHECK-NEXT: retq %1 = tail call { i32, i32 } @llvm.x86.rdseed.32() nounwind %2 = extractvalue { i32, i32 } %1, 0 @@ -138,10 +147,11 @@ define i64 @seed64(i64 %arg) nounwind { ; CHECK-LABEL: seed64: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: rdseedq %rcx -; CHECK-NEXT: cmovbq %rdi, %rax -; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: setae %dl +; CHECK-NEXT: leaq (%rdi,%rcx), %rax +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovneq %rcx, %rax ; CHECK-NEXT: retq %1 = tail call { i64, i32 } @llvm.x86.rdseed.64() nounwind %2 = extractvalue { i64, i32 } %1, 0 diff --git a/llvm/test/CodeGen/X86/bool-vector.ll b/llvm/test/CodeGen/X86/bool-vector.ll --- a/llvm/test/CodeGen/X86/bool-vector.ll +++ 
b/llvm/test/CodeGen/X86/bool-vector.ll @@ -74,61 +74,122 @@ define i32 @PR15215_good(<4 x i32> %input) { ; X32-LABEL: PR15215_good: ; X32: # %bb.0: # %entry -; X32-NEXT: pushl %esi -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %esi, -8 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: andl $1, %esi -; X32-NEXT: andl $1, %edx -; X32-NEXT: andl $1, %ecx ; X32-NEXT: andl $1, %eax -; X32-NEXT: leal (%esi,%edx,2), %edx -; X32-NEXT: leal (%edx,%ecx,4), %ecx -; X32-NEXT: leal (%ecx,%eax,8), %eax -; X32-NEXT: popl %esi -; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: jne .LBB1_1 +; X32-NEXT: # %bb.2: # %entry +; X32-NEXT: addl %eax, %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: jne .LBB1_4 +; X32-NEXT: jmp .LBB1_5 +; X32-NEXT: .LBB1_1: +; X32-NEXT: leal 1(%eax,%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: je .LBB1_5 +; X32-NEXT: .LBB1_4: +; X32-NEXT: orl $4, %eax +; X32-NEXT: .LBB1_5: # %entry +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: jne .LBB1_6 +; X32-NEXT: # %bb.7: # %entry +; X32-NEXT: retl +; X32-NEXT: .LBB1_6: +; X32-NEXT: orl $8, %eax ; X32-NEXT: retl ; ; X32-SSE2-LABEL: PR15215_good: ; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: pushl %esi +; X32-SSE2-NEXT: .cfi_def_cfa_offset 8 +; X32-SSE2-NEXT: .cfi_offset %esi, -8 ; X32-SSE2-NEXT: pslld $31, %xmm0 -; X32-SSE2-NEXT: movmskps %xmm0, %eax +; X32-SSE2-NEXT: movmskps %xmm0, %ecx +; X32-SSE2-NEXT: movl %ecx, %eax +; X32-SSE2-NEXT: andl $2, %eax +; X32-SSE2-NEXT: leal 1(%eax), %edx +; X32-SSE2-NEXT: testb $1, %cl +; X32-SSE2-NEXT: cmovel %eax, %edx +; X32-SSE2-NEXT: leal 4(%edx), %esi +; X32-SSE2-NEXT: testb $4, %cl +; X32-SSE2-NEXT: cmovel %edx, %esi +; X32-SSE2-NEXT: leal 8(%esi), %eax +; X32-SSE2-NEXT: testb $8, %cl +; X32-SSE2-NEXT: cmovel %esi, %eax +; X32-SSE2-NEXT: popl %esi +; X32-SSE2-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE2-NEXT: retl ; ; X32-AVX2-LABEL: PR15215_good: ; X32-AVX2: # %bb.0: # %entry +; X32-AVX2-NEXT: pushl %esi +; X32-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X32-AVX2-NEXT: .cfi_offset %esi, -8 ; X32-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; X32-AVX2-NEXT: vmovmskps %xmm0, %eax +; X32-AVX2-NEXT: vmovmskps %xmm0, %ecx +; X32-AVX2-NEXT: movl %ecx, %eax +; X32-AVX2-NEXT: andl $2, %eax +; X32-AVX2-NEXT: leal 1(%eax), %edx +; X32-AVX2-NEXT: testb $1, %cl +; X32-AVX2-NEXT: cmovel %eax, %edx +; X32-AVX2-NEXT: leal 4(%edx), %esi +; X32-AVX2-NEXT: testb $4, %cl +; X32-AVX2-NEXT: cmovel %edx, %esi +; X32-AVX2-NEXT: leal 8(%esi), %eax +; X32-AVX2-NEXT: testb $8, %cl +; X32-AVX2-NEXT: cmovel %esi, %eax +; X32-AVX2-NEXT: popl %esi +; X32-AVX2-NEXT: .cfi_def_cfa_offset 4 ; X32-AVX2-NEXT: retl ; ; X64-LABEL: PR15215_good: ; X64: # %bb.0: # %entry -; X64-NEXT: # kill: def $ecx killed $ecx def $rcx -; X64-NEXT: # kill: def $edx killed $edx def $rdx ; X64-NEXT: # kill: def $esi killed $esi def $rsi -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andl $1, %edi -; X64-NEXT: andl $1, %esi -; X64-NEXT: andl $1, %edx -; X64-NEXT: andl $1, %ecx -; X64-NEXT: leal (%rdi,%rsi,2), %eax -; X64-NEXT: leal (%rax,%rdx,4), %eax -; X64-NEXT: leal (%rax,%rcx,8), %eax +; X64-NEXT: addl %esi, %esi +; X64-NEXT: andl $2, %esi +; X64-NEXT: leal 1(%rsi), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: leal 4(%rax), %esi +; X64-NEXT: testb $1, %dl +; X64-NEXT: cmovel %eax, %esi +; X64-NEXT: leal 8(%rsi), 
%eax +; X64-NEXT: testb $1, %cl +; X64-NEXT: cmovel %esi, %eax ; X64-NEXT: retq ; ; X64-SSE2-LABEL: PR15215_good: ; X64-SSE2: # %bb.0: # %entry ; X64-SSE2-NEXT: pslld $31, %xmm0 -; X64-SSE2-NEXT: movmskps %xmm0, %eax +; X64-SSE2-NEXT: movmskps %xmm0, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: andl $2, %eax +; X64-SSE2-NEXT: leal 1(%rax), %edx +; X64-SSE2-NEXT: testb $1, %cl +; X64-SSE2-NEXT: cmovel %eax, %edx +; X64-SSE2-NEXT: leal 4(%rdx), %esi +; X64-SSE2-NEXT: testb $4, %cl +; X64-SSE2-NEXT: cmovel %edx, %esi +; X64-SSE2-NEXT: leal 8(%rsi), %eax +; X64-SSE2-NEXT: testb $8, %cl +; X64-SSE2-NEXT: cmovel %esi, %eax ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: PR15215_good: ; X64-AVX2: # %bb.0: # %entry ; X64-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovmskps %xmm0, %eax +; X64-AVX2-NEXT: vmovmskps %xmm0, %ecx +; X64-AVX2-NEXT: movl %ecx, %eax +; X64-AVX2-NEXT: andl $2, %eax +; X64-AVX2-NEXT: leal 1(%rax), %edx +; X64-AVX2-NEXT: testb $1, %cl +; X64-AVX2-NEXT: cmovel %eax, %edx +; X64-AVX2-NEXT: leal 4(%rdx), %esi +; X64-AVX2-NEXT: testb $4, %cl +; X64-AVX2-NEXT: cmovel %edx, %esi +; X64-AVX2-NEXT: leal 8(%rsi), %eax +; X64-AVX2-NEXT: testb $8, %cl +; X64-AVX2-NEXT: cmovel %esi, %eax ; X64-AVX2-NEXT: retq entry: %0 = trunc <4 x i32> %input to <4 x i1> diff --git a/llvm/test/CodeGen/X86/fp-cvt.ll b/llvm/test/CodeGen/X86/fp-cvt.ll --- a/llvm/test/CodeGen/X86/fp-cvt.ll +++ b/llvm/test/CodeGen/X86/fp-cvt.ll @@ -446,10 +446,9 @@ ; X86-NEXT: flds {{\.LCPI.*}} ; X86-NEXT: fucom %st(1) ; X86-NEXT: fnstsw %ax -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: # kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf -; X86-NEXT: setbe %al +; X86-NEXT: seta %cl ; X86-NEXT: fldz ; X86-NEXT: ja .LBB10_2 ; X86-NEXT: # %bb.1: @@ -460,16 +459,19 @@ ; X86-NEXT: fstp %st(1) ; X86-NEXT: fsubrp %st, %st(1) ; X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl $3072, %ecx # imm = 0xC00 -; X86-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X86-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-NEXT: movb %al, %dl -; X86-NEXT: shll $31, %edx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: testb %cl, %cl +; X86-NEXT: jne .LBB10_4 +; X86-NEXT: # %bb.3: +; X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X86-NEXT: .LBB10_4: ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp ; X86-NEXT: retl @@ -529,10 +531,9 @@ ; X86-NEXT: flds {{\.LCPI.*}} ; X86-NEXT: fucom %st(1) ; X86-NEXT: fnstsw %ax -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: # kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf -; X86-NEXT: setbe %al +; X86-NEXT: seta %cl ; X86-NEXT: fldz ; X86-NEXT: ja .LBB11_2 ; X86-NEXT: # %bb.1: @@ -543,16 +544,19 @@ ; X86-NEXT: fstp %st(1) ; X86-NEXT: fsubrp %st, %st(1) ; X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl $3072, %ecx # imm = 0xC00 -; X86-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X86-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-NEXT: movb %al, %dl -; X86-NEXT: shll $31, %edx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx +; X86-NEXT: testb %cl, %cl +; X86-NEXT: jne .LBB11_4 +; X86-NEXT: # %bb.3: +; X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X86-NEXT: .LBB11_4: ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll @@ -30,12 +30,12 @@ ; CHECK: [[MOVSDrm_alt:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 8 from %fixed-stack.0, align 16) ; CHECK: [[MOVSDrm_alt1:%[0-9]+]]:fr64 = MOVSDrm_alt $noreg, 1, $noreg, %const.0, $noreg :: (load 8 from constant-pool) ; CHECK: COMISDrr [[MOVSDrm_alt1]], [[MOVSDrm_alt]], implicit-def $eflags, implicit $mxcsr +; CHECK: [[COPY2:%[0-9]+]]:gr32 = COPY $eflags ; CHECK: [[FsFLD0SD:%[0-9]+]]:fr64 = FsFLD0SD ; CHECK: JCC_1 ; CHECK: [[PHI:%[0-9]+]]:fr64 = PHI [[MOVSDrm_alt1]], {{.*}}, [[FsFLD0SD]], {{.*}} ; CHECK: [[SUBSDrr:%[0-9]+]]:fr64 = SUBSDrr [[MOVSDrm_alt]], killed [[PHI]], implicit $mxcsr ; CHECK: MOVSDmr %stack.0, 1, $noreg, 0, $noreg, killed [[SUBSDrr]] :: (store 8 into %stack.0) -; CHECK: [[SETCCr:%[0-9]+]]:gr8 = SETCCr 6, implicit $eflags ; CHECK: [[LD_Fp64m80:%[0-9]+]]:rfp80 = LD_Fp64m80 %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load 8 from %stack.0) ; CHECK: FNSTCW16m %stack.1, 1, $noreg, 0, $noreg, implicit-def $fpsw, implicit $fpcw :: (store 2 into %stack.1) ; CHECK: [[MOVZX32rm16_:%[0-9]+]]:gr32 = MOVZX32rm16 %stack.1, 1, $noreg, 0, $noreg :: (load 2 from %stack.1) @@ -45,12 +45,13 @@ ; CHECK: FLDCW16m %stack.2, 1, $noreg, 0, $noreg, implicit-def $fpsw, implicit-def $fpcw :: (load 2 from %stack.2) ; CHECK: IST_Fp64m80 %stack.0, 1, $noreg, 0, $noreg, [[LD_Fp64m80]], implicit-def $fpsw, implicit $fpcw ; CHECK: FLDCW16m %stack.1, 1, $noreg, 0, $noreg, implicit-def $fpsw, implicit-def $fpcw :: (load 2 from %stack.1) -; CHECK: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 killed [[SETCCr]] -; CHECK: [[SHL32ri:%[0-9]+]]:gr32 = SHL32ri [[MOVZX32rr8_]], 31, implicit-def dead $eflags -; CHECK: [[XOR32rm:%[0-9]+]]:gr32 = XOR32rm [[SHL32ri]], %stack.0, 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load 4 from %stack.0 + 4) -; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load 4 from %stack.0, align 8) -; CHECK: $eax = COPY [[MOV32rm]] -; CHECK: $edx = COPY [[XOR32rm]] +; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load 4 from %stack.0, align 8) +; CHECK: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm %stack.0, 1, $noreg, 4, $noreg :: (load 4 from %stack.0 + 4) +; CHECK: [[XOR32ri:%[0-9]+]]:gr32 = XOR32ri [[MOV32rm2]], -2147483648, implicit-def dead $eflags +; CHECK: $eflags = COPY [[COPY2]] +; CHECK: [[CMOV32rr:%[0-9]+]]:gr32 = CMOV32rr [[XOR32ri]], [[MOV32rm2]], 7, implicit $eflags +; CHECK: $eax = COPY [[MOV32rm1]] +; CHECK: $edx = COPY [[CMOV32rr]] ; CHECK: RET 0, $eax, $edx %result = call i64 @llvm.experimental.constrained.fptoui.i64.f64(double %x, metadata !"fpexcept.strict") #0 ret i64 %result diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1353,15 +1353,16 @@ define i64 @f20u64(double %x) #0 { ; X87-LABEL: f20u64: ; X87: # %bb.0: # %entry -; X87-NEXT: subl $20, %esp +; X87-NEXT: pushl %esi +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: subl $16, %esp ; X87-NEXT: 
.cfi_def_cfa_offset 24 +; X87-NEXT: .cfi_offset %esi, -8 ; X87-NEXT: fldl {{[0-9]+}}(%esp) ; X87-NEXT: flds {{\.LCPI.*}} -; X87-NEXT: wait -; X87-NEXT: xorl %edx, %edx ; X87-NEXT: fcomi %st(1), %st ; X87-NEXT: wait -; X87-NEXT: setbe %dl +; X87-NEXT: seta %cl ; X87-NEXT: fldz ; X87-NEXT: fxch %st(1) ; X87-NEXT: fcmovnbe %st(1), %st @@ -1375,42 +1376,54 @@ ; X87-NEXT: fldcw {{[0-9]+}}(%esp) ; X87-NEXT: fistpll {{[0-9]+}}(%esp) ; X87-NEXT: fldcw {{[0-9]+}}(%esp) -; X87-NEXT: shll $31, %edx -; X87-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %eax -; X87-NEXT: addl $20, %esp +; X87-NEXT: movl {{[0-9]+}}(%esp), %esi +; X87-NEXT: movl %esi, %edx +; X87-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X87-NEXT: testb %cl, %cl +; X87-NEXT: cmovnel %esi, %edx +; X87-NEXT: addl $16, %esp +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: popl %esi ; X87-NEXT: .cfi_def_cfa_offset 4 ; X87-NEXT: retl ; ; X86-SSE-LABEL: f20u64: ; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: subl $20, %esp +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: subl $16, %esp ; X86-SSE-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE-NEXT: .cfi_offset %esi, -8 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; X86-SSE-NEXT: comisd %xmm0, %xmm2 -; X86-SSE-NEXT: xorpd %xmm1, %xmm1 +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: comisd %xmm0, %xmm1 +; X86-SSE-NEXT: seta %cl +; X86-SSE-NEXT: xorpd %xmm2, %xmm2 ; X86-SSE-NEXT: ja .LBB25_2 ; X86-SSE-NEXT: # %bb.1: # %entry -; X86-SSE-NEXT: movapd %xmm2, %xmm1 +; X86-SSE-NEXT: movapd %xmm1, %xmm2 ; X86-SSE-NEXT: .LBB25_2: # %entry -; X86-SSE-NEXT: subsd %xmm1, %xmm0 +; X86-SSE-NEXT: subsd %xmm2, %xmm0 ; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: setbe %al ; X86-SSE-NEXT: fldl {{[0-9]+}}(%esp) ; X86-SSE-NEXT: wait ; X86-SSE-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: orl $3072, %ecx # imm = 0xC00 -; X86-SSE-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-SSE-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-SSE-NEXT: movzbl %al, %edx -; X86-SSE-NEXT: shll $31, %edx -; X86-SSE-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: addl $20, %esp +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movl %esi, %edx +; X86-SSE-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X86-SSE-NEXT: testb %cl, %cl +; X86-SSE-NEXT: cmovnel %esi, %edx +; X86-SSE-NEXT: addl $16, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll @@ -540,33 +540,38 @@ ; SSE-X86-NEXT: .cfi_offset %ebp, -8 ; SSE-X86-NEXT: movl %esp, %ebp ; SSE-X86-NEXT: .cfi_def_cfa_register %ebp +; SSE-X86-NEXT: pushl %esi ; SSE-X86-NEXT: andl $-8, %esp -; SSE-X86-NEXT: subl $16, %esp +; SSE-X86-NEXT: subl $24, %esp +; SSE-X86-NEXT: .cfi_offset %esi, -12 ; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-X86-NEXT: comiss %xmm0, 
%xmm2 -; SSE-X86-NEXT: xorps %xmm1, %xmm1 +; SSE-X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-X86-NEXT: comiss %xmm0, %xmm1 +; SSE-X86-NEXT: seta %cl +; SSE-X86-NEXT: xorps %xmm2, %xmm2 ; SSE-X86-NEXT: ja .LBB9_2 ; SSE-X86-NEXT: # %bb.1: -; SSE-X86-NEXT: movaps %xmm2, %xmm1 +; SSE-X86-NEXT: movaps %xmm1, %xmm2 ; SSE-X86-NEXT: .LBB9_2: -; SSE-X86-NEXT: subss %xmm1, %xmm0 +; SSE-X86-NEXT: subss %xmm2, %xmm0 ; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; SSE-X86-NEXT: setbe %al ; SSE-X86-NEXT: flds {{[0-9]+}}(%esp) ; SSE-X86-NEXT: wait ; SSE-X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; SSE-X86-NEXT: orl $3072, %ecx # imm = 0xC00 -; SSE-X86-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-X86-NEXT: movw %ax, {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE-X86-NEXT: movzbl %al, %edx -; SSE-X86-NEXT: shll $31, %edx -; SSE-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-X86-NEXT: movl %ebp, %esp +; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE-X86-NEXT: movl %esi, %edx +; SSE-X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE-X86-NEXT: testb %cl, %cl +; SSE-X86-NEXT: cmovnel %esi, %edx +; SSE-X86-NEXT: leal -4(%ebp), %esp +; SSE-X86-NEXT: popl %esi ; SSE-X86-NEXT: popl %ebp ; SSE-X86-NEXT: .cfi_def_cfa %esp, 4 ; SSE-X86-NEXT: retl @@ -595,11 +600,14 @@ ; AVX1-X86-NEXT: .cfi_offset %ebp, -8 ; AVX1-X86-NEXT: movl %esp, %ebp ; AVX1-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX1-X86-NEXT: pushl %esi ; AVX1-X86-NEXT: andl $-8, %esp -; AVX1-X86-NEXT: subl $8, %esp +; AVX1-X86-NEXT: subl $16, %esp +; AVX1-X86-NEXT: .cfi_offset %esi, -12 ; AVX1-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-X86-NEXT: vcomiss %xmm0, %xmm1 +; AVX1-X86-NEXT: seta %cl ; AVX1-X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-X86-NEXT: ja .LBB9_2 ; AVX1-X86-NEXT: # %bb.1: @@ -610,12 +618,14 @@ ; AVX1-X86-NEXT: flds (%esp) ; AVX1-X86-NEXT: fisttpll (%esp) ; AVX1-X86-NEXT: wait -; AVX1-X86-NEXT: setbe %al -; AVX1-X86-NEXT: movzbl %al, %edx -; AVX1-X86-NEXT: shll $31, %edx -; AVX1-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX1-X86-NEXT: movl (%esp), %eax -; AVX1-X86-NEXT: movl %ebp, %esp +; AVX1-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX1-X86-NEXT: movl %esi, %edx +; AVX1-X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX1-X86-NEXT: testb %cl, %cl +; AVX1-X86-NEXT: cmovnel %esi, %edx +; AVX1-X86-NEXT: leal -4(%ebp), %esp +; AVX1-X86-NEXT: popl %esi ; AVX1-X86-NEXT: popl %ebp ; AVX1-X86-NEXT: .cfi_def_cfa %esp, 4 ; AVX1-X86-NEXT: retl @@ -644,12 +654,14 @@ ; AVX512-X86-NEXT: .cfi_offset %ebp, -8 ; AVX512-X86-NEXT: movl %esp, %ebp ; AVX512-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX512-X86-NEXT: pushl %esi ; AVX512-X86-NEXT: andl $-8, %esp -; AVX512-X86-NEXT: subl $8, %esp +; AVX512-X86-NEXT: subl $16, %esp +; AVX512-X86-NEXT: .cfi_offset %esi, -12 ; AVX512-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-X86-NEXT: xorl %edx, %edx ; AVX512-X86-NEXT: vcomiss %xmm0, %xmm1 +; AVX512-X86-NEXT: seta %cl ; AVX512-X86-NEXT: seta %al ; AVX512-X86-NEXT: kmovw %eax, %k1 ; AVX512-X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 @@ -659,11 +671,14 @@ ; AVX512-X86-NEXT: flds (%esp) ; AVX512-X86-NEXT: fisttpll (%esp) 
; AVX512-X86-NEXT: wait -; AVX512-X86-NEXT: setbe %dl -; AVX512-X86-NEXT: shll $31, %edx -; AVX512-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512-X86-NEXT: movl (%esp), %eax -; AVX512-X86-NEXT: movl %ebp, %esp +; AVX512-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512-X86-NEXT: movl %esi, %edx +; AVX512-X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512-X86-NEXT: testb %cl, %cl +; AVX512-X86-NEXT: cmovnel %esi, %edx +; AVX512-X86-NEXT: leal -4(%ebp), %esp +; AVX512-X86-NEXT: popl %esi ; AVX512-X86-NEXT: popl %ebp ; AVX512-X86-NEXT: .cfi_def_cfa %esp, 4 ; AVX512-X86-NEXT: retl @@ -687,10 +702,9 @@ ; CHECK-NEXT: fcom %st(1) ; CHECK-NEXT: wait ; CHECK-NEXT: fnstsw %ax -; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: # kill: def $ah killed $ah killed $ax ; CHECK-NEXT: sahf -; CHECK-NEXT: setbe %al +; CHECK-NEXT: seta %cl ; CHECK-NEXT: fldz ; CHECK-NEXT: ja .LBB9_2 ; CHECK-NEXT: # %bb.1: @@ -702,16 +716,19 @@ ; CHECK-NEXT: fsubrp %st, %st(1) ; CHECK-NEXT: wait ; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: orl $3072, %ecx # imm = 0xC00 -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) ; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) ; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) ; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) -; CHECK-NEXT: movb %al, %dl -; CHECK-NEXT: shll $31, %edx -; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: jne .LBB9_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; CHECK-NEXT: .LBB9_4: ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: .cfi_def_cfa %esp, 4 @@ -1185,33 +1202,38 @@ ; SSE-X86-NEXT: .cfi_offset %ebp, -8 ; SSE-X86-NEXT: movl %esp, %ebp ; SSE-X86-NEXT: .cfi_def_cfa_register %ebp +; SSE-X86-NEXT: pushl %esi ; SSE-X86-NEXT: andl $-8, %esp -; SSE-X86-NEXT: subl $16, %esp +; SSE-X86-NEXT: subl $24, %esp +; SSE-X86-NEXT: .cfi_offset %esi, -12 ; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-X86-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-X86-NEXT: comisd %xmm0, %xmm2 -; SSE-X86-NEXT: xorpd %xmm1, %xmm1 +; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-X86-NEXT: comisd %xmm0, %xmm1 +; SSE-X86-NEXT: seta %cl +; SSE-X86-NEXT: xorpd %xmm2, %xmm2 ; SSE-X86-NEXT: ja .LBB18_2 ; SSE-X86-NEXT: # %bb.1: -; SSE-X86-NEXT: movapd %xmm2, %xmm1 +; SSE-X86-NEXT: movapd %xmm1, %xmm2 ; SSE-X86-NEXT: .LBB18_2: -; SSE-X86-NEXT: subsd %xmm1, %xmm0 +; SSE-X86-NEXT: subsd %xmm2, %xmm0 ; SSE-X86-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) -; SSE-X86-NEXT: setbe %al ; SSE-X86-NEXT: fldl {{[0-9]+}}(%esp) ; SSE-X86-NEXT: wait ; SSE-X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; SSE-X86-NEXT: orl $3072, %ecx # imm = 0xC00 -; SSE-X86-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-X86-NEXT: movw %ax, {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE-X86-NEXT: movzbl %al, %edx -; SSE-X86-NEXT: shll $31, %edx -; SSE-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-X86-NEXT: movl %ebp, %esp +; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE-X86-NEXT: movl %esi, %edx +; SSE-X86-NEXT: xorl $-2147483648, %edx # 
imm = 0x80000000 +; SSE-X86-NEXT: testb %cl, %cl +; SSE-X86-NEXT: cmovnel %esi, %edx +; SSE-X86-NEXT: leal -4(%ebp), %esp +; SSE-X86-NEXT: popl %esi ; SSE-X86-NEXT: popl %ebp ; SSE-X86-NEXT: .cfi_def_cfa %esp, 4 ; SSE-X86-NEXT: retl @@ -1240,11 +1262,14 @@ ; AVX1-X86-NEXT: .cfi_offset %ebp, -8 ; AVX1-X86-NEXT: movl %esp, %ebp ; AVX1-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX1-X86-NEXT: pushl %esi ; AVX1-X86-NEXT: andl $-8, %esp -; AVX1-X86-NEXT: subl $8, %esp +; AVX1-X86-NEXT: subl $16, %esp +; AVX1-X86-NEXT: .cfi_offset %esi, -12 ; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX1-X86-NEXT: vcomisd %xmm0, %xmm1 +; AVX1-X86-NEXT: seta %cl ; AVX1-X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX1-X86-NEXT: ja .LBB18_2 ; AVX1-X86-NEXT: # %bb.1: @@ -1255,12 +1280,14 @@ ; AVX1-X86-NEXT: fldl (%esp) ; AVX1-X86-NEXT: fisttpll (%esp) ; AVX1-X86-NEXT: wait -; AVX1-X86-NEXT: setbe %al -; AVX1-X86-NEXT: movzbl %al, %edx -; AVX1-X86-NEXT: shll $31, %edx -; AVX1-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX1-X86-NEXT: movl (%esp), %eax -; AVX1-X86-NEXT: movl %ebp, %esp +; AVX1-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX1-X86-NEXT: movl %esi, %edx +; AVX1-X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX1-X86-NEXT: testb %cl, %cl +; AVX1-X86-NEXT: cmovnel %esi, %edx +; AVX1-X86-NEXT: leal -4(%ebp), %esp +; AVX1-X86-NEXT: popl %esi ; AVX1-X86-NEXT: popl %ebp ; AVX1-X86-NEXT: .cfi_def_cfa %esp, 4 ; AVX1-X86-NEXT: retl @@ -1289,12 +1316,14 @@ ; AVX512-X86-NEXT: .cfi_offset %ebp, -8 ; AVX512-X86-NEXT: movl %esp, %ebp ; AVX512-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX512-X86-NEXT: pushl %esi ; AVX512-X86-NEXT: andl $-8, %esp -; AVX512-X86-NEXT: subl $8, %esp +; AVX512-X86-NEXT: subl $16, %esp +; AVX512-X86-NEXT: .cfi_offset %esi, -12 ; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-X86-NEXT: xorl %edx, %edx ; AVX512-X86-NEXT: vcomisd %xmm0, %xmm1 +; AVX512-X86-NEXT: seta %cl ; AVX512-X86-NEXT: seta %al ; AVX512-X86-NEXT: kmovw %eax, %k1 ; AVX512-X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2 @@ -1304,11 +1333,14 @@ ; AVX512-X86-NEXT: fldl (%esp) ; AVX512-X86-NEXT: fisttpll (%esp) ; AVX512-X86-NEXT: wait -; AVX512-X86-NEXT: setbe %dl -; AVX512-X86-NEXT: shll $31, %edx -; AVX512-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512-X86-NEXT: movl (%esp), %eax -; AVX512-X86-NEXT: movl %ebp, %esp +; AVX512-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512-X86-NEXT: movl %esi, %edx +; AVX512-X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512-X86-NEXT: testb %cl, %cl +; AVX512-X86-NEXT: cmovnel %esi, %edx +; AVX512-X86-NEXT: leal -4(%ebp), %esp +; AVX512-X86-NEXT: popl %esi ; AVX512-X86-NEXT: popl %ebp ; AVX512-X86-NEXT: .cfi_def_cfa %esp, 4 ; AVX512-X86-NEXT: retl @@ -1332,10 +1364,9 @@ ; CHECK-NEXT: fcom %st(1) ; CHECK-NEXT: wait ; CHECK-NEXT: fnstsw %ax -; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: # kill: def $ah killed $ah killed $ax ; CHECK-NEXT: sahf -; CHECK-NEXT: setbe %al +; CHECK-NEXT: seta %cl ; CHECK-NEXT: fldz ; CHECK-NEXT: ja .LBB18_2 ; CHECK-NEXT: # %bb.1: @@ -1347,16 +1378,19 @@ ; CHECK-NEXT: fsubrp %st, %st(1) ; CHECK-NEXT: wait ; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: orl $3072, %ecx # imm = 0xC00 -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) ; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) ; 
CHECK-NEXT: fistpll {{[0-9]+}}(%esp) ; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) -; CHECK-NEXT: movb %al, %dl -; CHECK-NEXT: shll $31, %edx -; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: jne .LBB18_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; CHECK-NEXT: .LBB18_4: ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll --- a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll +++ b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll @@ -592,10 +592,9 @@ ; X86-NEXT: fcom %st(1) ; X86-NEXT: wait ; X86-NEXT: fnstsw %ax -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: # kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf -; X86-NEXT: setbe %al +; X86-NEXT: seta %cl ; X86-NEXT: fldz ; X86-NEXT: ja .LBB18_2 ; X86-NEXT: # %bb.1: @@ -607,16 +606,19 @@ ; X86-NEXT: fsubrp %st, %st(1) ; X86-NEXT: wait ; X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl $3072, %ecx # imm = 0xC00 -; X86-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X86-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-NEXT: movb %al, %dl -; X86-NEXT: shll $31, %edx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: testb %cl, %cl +; X86-NEXT: jne .LBB18_4 +; X86-NEXT: # %bb.3: +; X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X86-NEXT: .LBB18_4: ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp ; X86-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll --- a/llvm/test/CodeGen/X86/jump_sign.ll +++ b/llvm/test/CodeGen/X86/jump_sign.ll @@ -308,11 +308,11 @@ define i32 @func_q(i32 %a0, i32 %a1, i32 %a2) { ; CHECK-LABEL: func_q: ; CHECK: # %bb.0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: subl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: sbbl %ecx, %ecx -; CHECK-NEXT: negl %eax -; CHECK-NEXT: xorl %ecx, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: subl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: notl %eax +; CHECK-NEXT: cmovbel %ecx, %eax ; CHECK-NEXT: retl %t1 = icmp ult i32 %a0, %a1 %t2 = sub i32 %a1, %a0 diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -14,39 +14,41 @@ define i32 @scalar_i32_signed_reg_reg(i32 %a1, i32 %a2) nounwind { ; X64-LABEL: scalar_i32_signed_reg_reg: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax +; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: setle %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: cmovgl %esi, %ecx -; X64-NEXT: cmovgl %edi, %esi -; X64-NEXT: subl %ecx, %esi -; X64-NEXT: shrl %esi -; X64-NEXT: imull %esi, %eax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmovgl %esi, %eax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovgl %edi, %ecx +; X64-NEXT: subl %eax, %ecx +; X64-NEXT: shrl %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovlel %ecx, %eax ; X64-NEXT: addl %edi, 
%eax ; X64-NEXT: retq ; ; X32-LABEL: scalar_i32_signed_reg_reg: ; X32: # %bb.0: ; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: cmpl %eax, %ecx -; X32-NEXT: setle %dl -; X32-NEXT: leal -1(%edx,%edx), %edx -; X32-NEXT: jg .LBB0_1 -; X32-NEXT: # %bb.2: -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: jmp .LBB0_3 -; X32-NEXT: .LBB0_1: -; X32-NEXT: movl %eax, %esi +; X32-NEXT: cmpl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: .LBB0_3: +; X32-NEXT: jg .LBB0_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %edx, %eax +; X32-NEXT: .LBB0_2: ; X32-NEXT: subl %esi, %eax ; X32-NEXT: shrl %eax -; X32-NEXT: imull %edx, %eax +; X32-NEXT: cmpl %edx, %ecx +; X32-NEXT: jle .LBB0_4 +; X32-NEXT: # %bb.3: +; X32-NEXT: negl %eax +; X32-NEXT: .LBB0_4: ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: popl %esi ; X32-NEXT: retl @@ -64,39 +66,41 @@ define i32 @scalar_i32_unsigned_reg_reg(i32 %a1, i32 %a2) nounwind { ; X64-LABEL: scalar_i32_unsigned_reg_reg: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax +; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: setbe %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: cmoval %esi, %ecx -; X64-NEXT: cmoval %edi, %esi -; X64-NEXT: subl %ecx, %esi -; X64-NEXT: shrl %esi -; X64-NEXT: imull %esi, %eax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmoval %esi, %eax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmoval %edi, %ecx +; X64-NEXT: subl %eax, %ecx +; X64-NEXT: shrl %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovbel %ecx, %eax ; X64-NEXT: addl %edi, %eax ; X64-NEXT: retq ; ; X32-LABEL: scalar_i32_unsigned_reg_reg: ; X32: # %bb.0: ; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: cmpl %eax, %ecx -; X32-NEXT: setbe %dl -; X32-NEXT: leal -1(%edx,%edx), %edx -; X32-NEXT: ja .LBB1_1 -; X32-NEXT: # %bb.2: -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: jmp .LBB1_3 -; X32-NEXT: .LBB1_1: -; X32-NEXT: movl %eax, %esi +; X32-NEXT: cmpl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: .LBB1_3: +; X32-NEXT: ja .LBB1_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %edx, %eax +; X32-NEXT: .LBB1_2: ; X32-NEXT: subl %esi, %eax ; X32-NEXT: shrl %eax -; X32-NEXT: imull %edx, %eax +; X32-NEXT: cmpl %edx, %ecx +; X32-NEXT: jbe .LBB1_4 +; X32-NEXT: # %bb.3: +; X32-NEXT: negl %eax +; X32-NEXT: .LBB1_4: ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: popl %esi ; X32-NEXT: retl @@ -116,41 +120,42 @@ define i32 @scalar_i32_signed_mem_reg(i32* %a1_addr, i32 %a2) nounwind { ; X64-LABEL: scalar_i32_signed_mem_reg: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %esi, %ecx -; X64-NEXT: setle %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: movl %ecx, %edx -; X64-NEXT: cmovgl %esi, %edx -; X64-NEXT: cmovgl %ecx, %esi -; X64-NEXT: subl %edx, %esi -; X64-NEXT: shrl %esi -; X64-NEXT: imull %esi, %eax +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl %esi, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: cmovgl %esi, %ecx +; X64-NEXT: movl %esi, %edx +; X64-NEXT: cmovgl %eax, %edx +; X64-NEXT: subl %ecx, %edx +; X64-NEXT: shrl %edx +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: negl %ecx +; 
X64-NEXT: cmpl %esi, %eax +; X64-NEXT: cmovlel %edx, %ecx ; X64-NEXT: addl %ecx, %eax ; X64-NEXT: retq ; ; X32-LABEL: scalar_i32_signed_mem_reg: ; X32: # %bb.0: ; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl (%ecx), %ecx -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: cmpl %eax, %ecx -; X32-NEXT: setle %dl -; X32-NEXT: leal -1(%edx,%edx), %edx -; X32-NEXT: jg .LBB2_1 -; X32-NEXT: # %bb.2: -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: jmp .LBB2_3 -; X32-NEXT: .LBB2_1: -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: cmpl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: .LBB2_3: +; X32-NEXT: jg .LBB2_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %edx, %eax +; X32-NEXT: .LBB2_2: ; X32-NEXT: subl %esi, %eax ; X32-NEXT: shrl %eax -; X32-NEXT: imull %edx, %eax +; X32-NEXT: cmpl %edx, %ecx +; X32-NEXT: jle .LBB2_4 +; X32-NEXT: # %bb.3: +; X32-NEXT: negl %eax +; X32-NEXT: .LBB2_4: ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: popl %esi ; X32-NEXT: retl @@ -169,17 +174,18 @@ define i32 @scalar_i32_signed_reg_mem(i32 %a1, i32* %a2_addr) nounwind { ; X64-LABEL: scalar_i32_signed_reg_mem: ; X64: # %bb.0: -; X64-NEXT: movl (%rsi), %eax -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: cmpl %eax, %edi -; X64-NEXT: setle %cl -; X64-NEXT: leal -1(%rcx,%rcx), %ecx -; X64-NEXT: movl %edi, %edx -; X64-NEXT: cmovgl %eax, %edx -; X64-NEXT: cmovgl %edi, %eax -; X64-NEXT: subl %edx, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: imull %ecx, %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmovgl %ecx, %eax +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: cmovgl %edi, %edx +; X64-NEXT: subl %eax, %edx +; X64-NEXT: shrl %edx +; X64-NEXT: movl %edx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: cmovlel %edx, %eax ; X64-NEXT: addl %edi, %eax ; X64-NEXT: retq ; @@ -188,22 +194,22 @@ ; X32-NEXT: pushl %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl (%eax), %eax -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: cmpl %eax, %ecx -; X32-NEXT: setle %dl -; X32-NEXT: leal -1(%edx,%edx), %edx -; X32-NEXT: jg .LBB3_1 -; X32-NEXT: # %bb.2: -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: jmp .LBB3_3 -; X32-NEXT: .LBB3_1: -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl (%eax), %edx +; X32-NEXT: cmpl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: .LBB3_3: +; X32-NEXT: jg .LBB3_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %edx, %eax +; X32-NEXT: .LBB3_2: ; X32-NEXT: subl %esi, %eax ; X32-NEXT: shrl %eax -; X32-NEXT: imull %edx, %eax +; X32-NEXT: cmpl %edx, %ecx +; X32-NEXT: jle .LBB3_4 +; X32-NEXT: # %bb.3: +; X32-NEXT: negl %eax +; X32-NEXT: .LBB3_4: ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: popl %esi ; X32-NEXT: retl @@ -223,17 +229,18 @@ ; X64-LABEL: scalar_i32_signed_mem_mem: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: setle %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx -; X64-NEXT: movl %ecx, %esi -; X64-NEXT: cmovgl %eax, %esi -; X64-NEXT: cmovgl %ecx, %eax -; X64-NEXT: subl %esi, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: imull %edx, %eax +; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmovgl %edx, %eax +; X64-NEXT: movl %edx, %esi +; 
X64-NEXT: cmovgl %ecx, %esi +; X64-NEXT: subl %eax, %esi +; X64-NEXT: shrl %esi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmovlel %esi, %eax ; X64-NEXT: addl %ecx, %eax ; X64-NEXT: retq ; @@ -243,22 +250,22 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl (%ecx), %ecx -; X32-NEXT: movl (%eax), %eax -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: cmpl %eax, %ecx -; X32-NEXT: setle %dl -; X32-NEXT: leal -1(%edx,%edx), %edx -; X32-NEXT: jg .LBB4_1 -; X32-NEXT: # %bb.2: -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: jmp .LBB4_3 -; X32-NEXT: .LBB4_1: -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl (%eax), %edx +; X32-NEXT: cmpl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: .LBB4_3: +; X32-NEXT: jg .LBB4_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %edx, %eax +; X32-NEXT: .LBB4_2: ; X32-NEXT: subl %esi, %eax ; X32-NEXT: shrl %eax -; X32-NEXT: imull %edx, %eax +; X32-NEXT: cmpl %edx, %ecx +; X32-NEXT: jle .LBB4_4 +; X32-NEXT: # %bb.3: +; X32-NEXT: negl %eax +; X32-NEXT: .LBB4_4: ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: popl %esi ; X32-NEXT: retl @@ -284,16 +291,17 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind { ; X64-LABEL: scalar_i64_signed_reg_reg: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rsi, %rdi -; X64-NEXT: setle %al -; X64-NEXT: leaq -1(%rax,%rax), %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: cmovgq %rsi, %rcx -; X64-NEXT: cmovgq %rdi, %rsi -; X64-NEXT: subq %rcx, %rsi -; X64-NEXT: shrq %rsi -; X64-NEXT: imulq %rsi, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: cmovgq %rsi, %rax +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: cmovgq %rdi, %rcx +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: shrq %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovleq %rcx, %rax ; X64-NEXT: addq %rdi, %rax ; X64-NEXT: retq ; @@ -303,38 +311,41 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: cmpl %ecx, %eax -; X32-NEXT: movl %edi, %edx -; X32-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl $-1, %ebx +; X32-NEXT: cmpl %esi, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: sbbl %ebp, %eax +; X32-NEXT: setl %cl ; X32-NEXT: jl .LBB5_1 ; X32-NEXT: # %bb.2: -; X32-NEXT: xorl %ebp, %ebp -; X32-NEXT: movl $1, %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %esi, %edx ; X32-NEXT: jmp .LBB5_3 ; X32-NEXT: .LBB5_1: -; X32-NEXT: movl $-1, %ebp -; X32-NEXT: movl %edi, %edx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %edx +; X32-NEXT: movl %ebp, %edi +; X32-NEXT: movl %esi, %ebx ; X32-NEXT: .LBB5_3: -; X32-NEXT: subl %esi, %eax -; X32-NEXT: sbbl %edx, %edi -; X32-NEXT: shrdl $1, %edi, %eax -; X32-NEXT: imull %eax, %ebp -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %edx +; X32-NEXT: subl %edx, %ebx +; X32-NEXT: sbbl %eax, %edi +; X32-NEXT: shrdl $1, %edi, %ebx ; X32-NEXT: shrl %edi -; X32-NEXT: imull %ebx, %edi -; X32-NEXT: addl %edi, %edx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %edx, 
%edx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: negl %eax +; X32-NEXT: sbbl %edi, %edx +; X32-NEXT: testb %cl, %cl +; X32-NEXT: jne .LBB5_5 +; X32-NEXT: # %bb.4: +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edi, %edx +; X32-NEXT: .LBB5_5: +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl %ebp, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -354,16 +365,17 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind { ; X64-LABEL: scalar_i64_unsigned_reg_reg: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rsi, %rdi -; X64-NEXT: setbe %al -; X64-NEXT: leaq -1(%rax,%rax), %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: cmovaq %rsi, %rcx -; X64-NEXT: cmovaq %rdi, %rsi -; X64-NEXT: subq %rcx, %rsi -; X64-NEXT: shrq %rsi -; X64-NEXT: imulq %rsi, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: cmovaq %rsi, %rax +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: cmovaq %rdi, %rcx +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: shrq %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovbeq %rcx, %rax ; X64-NEXT: addq %rdi, %rax ; X64-NEXT: retq ; @@ -373,38 +385,41 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: cmpl %ecx, %eax -; X32-NEXT: movl %edi, %edx -; X32-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl $-1, %ebx +; X32-NEXT: cmpl %esi, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: sbbl %ebp, %eax +; X32-NEXT: setb %cl ; X32-NEXT: jb .LBB6_1 ; X32-NEXT: # %bb.2: -; X32-NEXT: xorl %ebp, %ebp -; X32-NEXT: movl $1, %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %esi, %edx ; X32-NEXT: jmp .LBB6_3 ; X32-NEXT: .LBB6_1: -; X32-NEXT: movl $-1, %ebp -; X32-NEXT: movl %edi, %edx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %edx +; X32-NEXT: movl %ebp, %edi +; X32-NEXT: movl %esi, %ebx ; X32-NEXT: .LBB6_3: -; X32-NEXT: subl %esi, %eax -; X32-NEXT: sbbl %edx, %edi -; X32-NEXT: shrdl $1, %edi, %eax -; X32-NEXT: imull %eax, %ebp -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %edx +; X32-NEXT: subl %edx, %ebx +; X32-NEXT: sbbl %eax, %edi +; X32-NEXT: shrdl $1, %edi, %ebx ; X32-NEXT: shrl %edi -; X32-NEXT: imull %ebx, %edi -; X32-NEXT: addl %edi, %edx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: negl %eax +; X32-NEXT: sbbl %edi, %edx +; X32-NEXT: testb %cl, %cl +; X32-NEXT: jne .LBB6_5 +; X32-NEXT: # %bb.4: +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edi, %edx +; X32-NEXT: .LBB6_5: +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl %ebp, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -426,66 +441,67 @@ define i64 @scalar_i64_signed_mem_reg(i64* %a1_addr, i64 %a2) nounwind { ; X64-LABEL: scalar_i64_signed_mem_reg: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rsi, %rcx -; X64-NEXT: setle %al -; X64-NEXT: leaq -1(%rax,%rax), %rax -; X64-NEXT: movq %rcx, %rdx -; X64-NEXT: cmovgq %rsi, %rdx -; X64-NEXT: cmovgq %rcx, %rsi -; X64-NEXT: subq %rdx, %rsi -; X64-NEXT: shrq %rsi -; X64-NEXT: imulq %rsi, %rax 
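For the i64 variants the win is larger on i686: the old X32 code expanded the 64-bit multiply by the +/-1 select into an imull/mull/addl chain, while the new code only has to negate a 64-bit value held in a register pair and pick a result based on the saved setl/setb flag. A small sketch of that two-register negation, assuming the value is split into 32-bit halves as in the X32 output:

    #include <cstdint>

    // Two's-complement negate of a 64-bit value in two 32-bit halves;
    // this is the "negl %eax; sbbl %edi, %edx" pattern in the hunks above.
    void neg64(uint32_t &lo, uint32_t &hi) {
      uint32_t borrow = (lo != 0) ? 1u : 0u; // the CF produced by negl
      lo = 0u - lo;                          // negl
      hi = 0u - hi - borrow;                 // sbbl against a zeroed register
    }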
+; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq %rsi, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: cmovgq %rsi, %rcx +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: cmovgq %rax, %rdx +; X64-NEXT: subq %rcx, %rdx +; X64-NEXT: shrq %rdx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: negq %rcx +; X64-NEXT: cmpq %rsi, %rax +; X64-NEXT: cmovleq %rdx, %rcx ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: retq ; ; X32-LABEL: scalar_i64_signed_mem_reg: ; X32: # %bb.0: -; X32-NEXT: pushl %ebp ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: pushl %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl (%ecx), %esi -; X32-NEXT: movl 4(%ecx), %ecx -; X32-NEXT: cmpl %esi, %eax -; X32-NEXT: movl %edi, %edx -; X32-NEXT: sbbl %ecx, %edx -; X32-NEXT: movl $-1, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %esi +; X32-NEXT: movl 4(%eax), %ecx +; X32-NEXT: cmpl %esi, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: sbbl %ecx, %eax +; X32-NEXT: setl {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: jl .LBB7_1 ; X32-NEXT: # %bb.2: -; X32-NEXT: xorl %ebp, %ebp -; X32-NEXT: movl $1, %ebx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl %esi, %edx ; X32-NEXT: jmp .LBB7_3 ; X32-NEXT: .LBB7_1: -; X32-NEXT: movl $-1, %ebp -; X32-NEXT: movl %edi, (%esp) # 4-byte Spill -; X32-NEXT: movl %eax, %edx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %edx ; X32-NEXT: movl %ecx, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, %ebx ; X32-NEXT: .LBB7_3: -; X32-NEXT: subl %edx, %eax -; X32-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload -; X32-NEXT: shrdl $1, %edi, %eax -; X32-NEXT: imull %eax, %ebp -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %edx +; X32-NEXT: subl %edx, %ebx +; X32-NEXT: sbbl %eax, %edi +; X32-NEXT: shrdl $1, %edi, %ebx ; X32-NEXT: shrl %edi -; X32-NEXT: imull %ebx, %edi -; X32-NEXT: addl %edi, %edx +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: negl %eax +; X32-NEXT: sbbl %edi, %edx +; X32-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload +; X32-NEXT: jne .LBB7_5 +; X32-NEXT: # %bb.4: +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edi, %edx +; X32-NEXT: .LBB7_5: ; X32-NEXT: addl %esi, %eax ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl $4, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx -; X32-NEXT: popl %ebp ; X32-NEXT: retl %a1 = load i64, i64* %a1_addr %t3 = icmp sgt i64 %a1, %a2 ; signed @@ -502,17 +518,18 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, i64* %a2_addr) nounwind { ; X64-LABEL: scalar_i64_signed_reg_mem: ; X64: # %bb.0: -; X64-NEXT: movq (%rsi), %rax -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: cmpq %rax, %rdi -; X64-NEXT: setle %cl -; X64-NEXT: leaq -1(%rcx,%rcx), %rcx -; X64-NEXT: movq %rdi, %rdx -; X64-NEXT: cmovgq %rax, %rdx -; X64-NEXT: cmovgq %rdi, %rax -; X64-NEXT: subq %rdx, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: cmpq %rcx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: cmovgq %rdi, %rdx +; X64-NEXT: subq %rax, %rdx +; X64-NEXT: shrq %rdx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmpq %rcx, %rdi +; X64-NEXT: cmovleq %rdx, %rax ; X64-NEXT: addq %rdi, %rax ; X64-NEXT: retq ; @@ -522,39 +539,42 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; 
X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl (%edx), %eax -; X32-NEXT: movl 4(%edx), %edi -; X32-NEXT: cmpl %ecx, %eax -; X32-NEXT: movl %edi, %edx -; X32-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl $-1, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ebx +; X32-NEXT: movl 4(%eax), %edi +; X32-NEXT: cmpl %esi, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: sbbl %ebp, %eax +; X32-NEXT: setl %cl ; X32-NEXT: jl .LBB8_1 ; X32-NEXT: # %bb.2: -; X32-NEXT: xorl %ebp, %ebp -; X32-NEXT: movl $1, %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %esi, %edx ; X32-NEXT: jmp .LBB8_3 ; X32-NEXT: .LBB8_1: -; X32-NEXT: movl $-1, %ebp -; X32-NEXT: movl %edi, %edx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %edx +; X32-NEXT: movl %ebp, %edi +; X32-NEXT: movl %esi, %ebx ; X32-NEXT: .LBB8_3: -; X32-NEXT: subl %esi, %eax -; X32-NEXT: sbbl %edx, %edi -; X32-NEXT: shrdl $1, %edi, %eax -; X32-NEXT: imull %eax, %ebp -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %edx +; X32-NEXT: subl %edx, %ebx +; X32-NEXT: sbbl %eax, %edi +; X32-NEXT: shrdl $1, %edi, %ebx ; X32-NEXT: shrl %edi -; X32-NEXT: imull %ebx, %edi -; X32-NEXT: addl %edi, %edx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: negl %eax +; X32-NEXT: sbbl %edi, %edx +; X32-NEXT: testb %cl, %cl +; X32-NEXT: jne .LBB8_5 +; X32-NEXT: # %bb.4: +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edi, %edx +; X32-NEXT: .LBB8_5: +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl %ebp, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -576,67 +596,68 @@ ; X64-LABEL: scalar_i64_signed_mem_mem: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: setle %dl -; X64-NEXT: leaq -1(%rdx,%rdx), %rdx -; X64-NEXT: movq %rcx, %rsi -; X64-NEXT: cmovgq %rax, %rsi -; X64-NEXT: cmovgq %rcx, %rax -; X64-NEXT: subq %rsi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: imulq %rdx, %rax +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: cmovgq %rdx, %rax +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: cmovgq %rcx, %rsi +; X64-NEXT: subq %rax, %rsi +; X64-NEXT: shrq %rsi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmovleq %rsi, %rax ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: retq ; ; X32-LABEL: scalar_i64_signed_mem_mem: ; X32: # %bb.0: -; X32-NEXT: pushl %ebp ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: pushl %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl (%eax), %esi -; X32-NEXT: movl 4(%eax), %ecx -; X32-NEXT: movl (%edx), %eax -; X32-NEXT: movl 4(%edx), %edi -; X32-NEXT: cmpl %esi, %eax -; X32-NEXT: movl %edi, %edx -; X32-NEXT: sbbl %ecx, %edx -; X32-NEXT: movl $-1, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %esi +; X32-NEXT: movl 4(%ecx), %ecx +; X32-NEXT: movl (%eax), %ebx +; X32-NEXT: movl 4(%eax), %edi +; X32-NEXT: cmpl %esi, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: sbbl %ecx, %eax +; X32-NEXT: setl 
{{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: jl .LBB9_1 ; X32-NEXT: # %bb.2: -; X32-NEXT: xorl %ebp, %ebp -; X32-NEXT: movl $1, %ebx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl %esi, %edx ; X32-NEXT: jmp .LBB9_3 ; X32-NEXT: .LBB9_1: -; X32-NEXT: movl $-1, %ebp -; X32-NEXT: movl %edi, (%esp) # 4-byte Spill -; X32-NEXT: movl %eax, %edx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %edx ; X32-NEXT: movl %ecx, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, %ebx ; X32-NEXT: .LBB9_3: -; X32-NEXT: subl %edx, %eax -; X32-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload -; X32-NEXT: shrdl $1, %edi, %eax -; X32-NEXT: imull %eax, %ebp -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %edx +; X32-NEXT: subl %edx, %ebx +; X32-NEXT: sbbl %eax, %edi +; X32-NEXT: shrdl $1, %edi, %ebx ; X32-NEXT: shrl %edi -; X32-NEXT: imull %ebx, %edi -; X32-NEXT: addl %edi, %edx +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: negl %eax +; X32-NEXT: sbbl %edi, %edx +; X32-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload +; X32-NEXT: jne .LBB9_5 +; X32-NEXT: # %bb.4: +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edi, %edx +; X32-NEXT: .LBB9_5: ; X32-NEXT: addl %esi, %eax ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl $4, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx -; X32-NEXT: popl %ebp ; X32-NEXT: retl %a1 = load i64, i64* %a1_addr %a2 = load i64, i64* %a2_addr @@ -660,17 +681,18 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X64-LABEL: scalar_i16_signed_reg_reg: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpw %si, %di -; X64-NEXT: setle %al -; X64-NEXT: leal -1(%rax,%rax), %ecx ; X64-NEXT: movl %edi, %eax ; X64-NEXT: cmovgl %esi, %eax -; X64-NEXT: cmovgl %edi, %esi -; X64-NEXT: subl %eax, %esi -; X64-NEXT: movzwl %si, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: imull %ecx, %eax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovgl %edi, %ecx +; X64-NEXT: subl %eax, %ecx +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: shrl %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmpw %si, %di +; X64-NEXT: cmovlel %ecx, %eax ; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -678,24 +700,26 @@ ; X32-LABEL: scalar_i16_signed_reg_reg: ; X32: # %bb.0: ; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: cmpw %ax, %cx -; X32-NEXT: setle %dl -; X32-NEXT: leal -1(%edx,%edx), %edx +; X32-NEXT: cmpw %dx, %cx ; X32-NEXT: jg .LBB10_1 ; X32-NEXT: # %bb.2: -; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %esi ; X32-NEXT: jmp .LBB10_3 ; X32-NEXT: .LBB10_1: -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %eax +; X32-NEXT: movl %ecx, %esi ; X32-NEXT: .LBB10_3: -; X32-NEXT: subl %esi, %eax -; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: subl %eax, %esi +; X32-NEXT: movzwl %si, %eax ; X32-NEXT: shrl %eax -; X32-NEXT: imull %edx, %eax +; X32-NEXT: cmpw %dx, %cx +; X32-NEXT: jle .LBB10_5 +; X32-NEXT: # %bb.4: +; X32-NEXT: negl %eax +; X32-NEXT: .LBB10_5: ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax ; X32-NEXT: popl %esi @@ -714,17 +738,18 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X64-LABEL: scalar_i16_unsigned_reg_reg: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax 
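The i16 tests below follow the same pattern with one wrinkle: the 16-bit difference is zero-extended (movzwl) so that the logical shift and the folded negate/select run on full 32-bit registers, and only the final result is truncated back to $ax. A sketch of that step, with gt standing in for the signed or unsigned compare result:

    #include <cstdint>

    // i16 midpoint step: widen, shift, then negate-or-keep via the select.
    uint16_t midpointStep16(bool gt, uint16_t hi, uint16_t lo, uint16_t a1) {
      uint32_t d = (uint32_t)(uint16_t)(hi - lo) >> 1; // subl, movzwl, shrl
      uint32_t adj = gt ? (0u - d) : d;                // negl + cmov/branch
      return (uint16_t)(a1 + adj);                     // addl, "# kill" truncation
    }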
; X64-NEXT: cmpw %si, %di -; X64-NEXT: setbe %al -; X64-NEXT: leal -1(%rax,%rax), %ecx ; X64-NEXT: movl %edi, %eax ; X64-NEXT: cmoval %esi, %eax -; X64-NEXT: cmoval %edi, %esi -; X64-NEXT: subl %eax, %esi -; X64-NEXT: movzwl %si, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: imull %ecx, %eax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmoval %edi, %ecx +; X64-NEXT: subl %eax, %ecx +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: shrl %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmpw %si, %di +; X64-NEXT: cmovbel %ecx, %eax ; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -732,24 +757,26 @@ ; X32-LABEL: scalar_i16_unsigned_reg_reg: ; X32: # %bb.0: ; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: cmpw %ax, %cx -; X32-NEXT: setbe %dl -; X32-NEXT: leal -1(%edx,%edx), %edx +; X32-NEXT: cmpw %dx, %cx ; X32-NEXT: ja .LBB11_1 ; X32-NEXT: # %bb.2: -; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %esi ; X32-NEXT: jmp .LBB11_3 ; X32-NEXT: .LBB11_1: -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %eax +; X32-NEXT: movl %ecx, %esi ; X32-NEXT: .LBB11_3: -; X32-NEXT: subl %esi, %eax -; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: subl %eax, %esi +; X32-NEXT: movzwl %si, %eax ; X32-NEXT: shrl %eax -; X32-NEXT: imull %edx, %eax +; X32-NEXT: cmpw %dx, %cx +; X32-NEXT: jbe .LBB11_5 +; X32-NEXT: # %bb.4: +; X32-NEXT: negl %eax +; X32-NEXT: .LBB11_5: ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax ; X32-NEXT: popl %esi @@ -771,17 +798,18 @@ ; X64-LABEL: scalar_i16_signed_mem_reg: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %ecx -; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpw %si, %cx -; X64-NEXT: setle %al -; X64-NEXT: leal -1(%rax,%rax), %edx ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: cmovgl %esi, %eax -; X64-NEXT: cmovgl %ecx, %esi -; X64-NEXT: subl %eax, %esi -; X64-NEXT: movzwl %si, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: imull %edx, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: cmovgl %ecx, %edx +; X64-NEXT: subl %eax, %edx +; X64-NEXT: movzwl %dx, %edx +; X64-NEXT: shrl %edx +; X64-NEXT: movl %edx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmpw %si, %cx +; X64-NEXT: cmovlel %edx, %eax ; X64-NEXT: addl %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -789,25 +817,27 @@ ; X32-LABEL: scalar_i16_signed_mem_reg: ; X32: # %bb.0: ; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movzwl (%ecx), %ecx -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: cmpw %ax, %cx -; X32-NEXT: setle %dl -; X32-NEXT: leal -1(%edx,%edx), %edx +; X32-NEXT: movzwl (%eax), %ecx +; X32-NEXT: cmpw %dx, %cx ; X32-NEXT: jg .LBB12_1 ; X32-NEXT: # %bb.2: -; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %esi ; X32-NEXT: jmp .LBB12_3 ; X32-NEXT: .LBB12_1: -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %eax +; X32-NEXT: movl %ecx, %esi ; X32-NEXT: .LBB12_3: -; X32-NEXT: subl %esi, %eax -; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: subl %eax, %esi +; X32-NEXT: movzwl %si, %eax ; X32-NEXT: shrl %eax -; X32-NEXT: imull %edx, %eax +; X32-NEXT: cmpw %dx, %cx +; X32-NEXT: jle .LBB12_5 +; X32-NEXT: # %bb.4: +; X32-NEXT: negl %eax +; X32-NEXT: .LBB12_5: ; X32-NEXT: addl %ecx, 
%eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax ; X32-NEXT: popl %esi @@ -827,18 +857,19 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, i16* %a2_addr) nounwind { ; X64-LABEL: scalar_i16_signed_reg_mem: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rsi), %eax -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: cmpw %ax, %di -; X64-NEXT: setle %cl -; X64-NEXT: leal -1(%rcx,%rcx), %ecx -; X64-NEXT: movl %edi, %edx -; X64-NEXT: cmovgl %eax, %edx -; X64-NEXT: cmovgl %edi, %eax -; X64-NEXT: subl %edx, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: imull %ecx, %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: cmpw %cx, %di +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmovgl %ecx, %eax +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: cmovgl %edi, %edx +; X64-NEXT: subl %eax, %edx +; X64-NEXT: movzwl %dx, %edx +; X64-NEXT: shrl %edx +; X64-NEXT: movl %edx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmpw %cx, %di +; X64-NEXT: cmovlel %edx, %eax ; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -848,23 +879,25 @@ ; X32-NEXT: pushl %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movzwl (%eax), %eax -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: cmpw %ax, %cx -; X32-NEXT: setle %dl -; X32-NEXT: leal -1(%edx,%edx), %edx +; X32-NEXT: movzwl (%eax), %edx +; X32-NEXT: cmpw %dx, %cx ; X32-NEXT: jg .LBB13_1 ; X32-NEXT: # %bb.2: -; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %esi ; X32-NEXT: jmp .LBB13_3 ; X32-NEXT: .LBB13_1: -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %eax +; X32-NEXT: movl %ecx, %esi ; X32-NEXT: .LBB13_3: -; X32-NEXT: subl %esi, %eax -; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: subl %eax, %esi +; X32-NEXT: movzwl %si, %eax ; X32-NEXT: shrl %eax -; X32-NEXT: imull %edx, %eax +; X32-NEXT: cmpw %dx, %cx +; X32-NEXT: jle .LBB13_5 +; X32-NEXT: # %bb.4: +; X32-NEXT: negl %eax +; X32-NEXT: .LBB13_5: ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax ; X32-NEXT: popl %esi @@ -885,18 +918,19 @@ ; X64-LABEL: scalar_i16_signed_mem_mem: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %ecx -; X64-NEXT: movzwl (%rsi), %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpw %ax, %cx -; X64-NEXT: setle %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx -; X64-NEXT: movl %ecx, %esi -; X64-NEXT: cmovgl %eax, %esi -; X64-NEXT: cmovgl %ecx, %eax -; X64-NEXT: subl %esi, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: imull %edx, %eax +; X64-NEXT: movzwl (%rsi), %edx +; X64-NEXT: cmpw %dx, %cx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmovgl %edx, %eax +; X64-NEXT: movl %edx, %esi +; X64-NEXT: cmovgl %ecx, %esi +; X64-NEXT: subl %eax, %esi +; X64-NEXT: movzwl %si, %esi +; X64-NEXT: shrl %esi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmpw %dx, %cx +; X64-NEXT: cmovlel %esi, %eax ; X64-NEXT: addl %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -907,23 +941,25 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movzwl (%ecx), %ecx -; X32-NEXT: movzwl (%eax), %eax -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: cmpw %ax, %cx -; X32-NEXT: setle %dl -; X32-NEXT: leal -1(%edx,%edx), %edx +; X32-NEXT: movzwl (%eax), %edx +; X32-NEXT: cmpw %dx, %cx ; X32-NEXT: jg .LBB14_1 ; X32-NEXT: # %bb.2: -; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %esi ; X32-NEXT: jmp .LBB14_3 ; X32-NEXT: .LBB14_1: -; 
X32-NEXT: movl %eax, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %eax +; X32-NEXT: movl %ecx, %esi ; X32-NEXT: .LBB14_3: -; X32-NEXT: subl %esi, %eax -; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: subl %eax, %esi +; X32-NEXT: movzwl %si, %eax ; X32-NEXT: shrl %eax -; X32-NEXT: imull %edx, %eax +; X32-NEXT: cmpw %dx, %cx +; X32-NEXT: jle .LBB14_5 +; X32-NEXT: # %bb.4: +; X32-NEXT: negl %eax +; X32-NEXT: .LBB14_5: ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax ; X32-NEXT: popl %esi @@ -950,40 +986,41 @@ define i8 @scalar_i8_signed_reg_reg(i8 %a1, i8 %a2) nounwind { ; X64-LABEL: scalar_i8_signed_reg_reg: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: cmpb %al, %dil -; X64-NEXT: setle %cl -; X64-NEXT: movl %edi, %edx -; X64-NEXT: cmovgl %esi, %edx -; X64-NEXT: cmovgl %edi, %eax -; X64-NEXT: addb %cl, %cl -; X64-NEXT: decb %cl -; X64-NEXT: subb %dl, %al -; X64-NEXT: shrb %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: mulb %cl +; X64-NEXT: cmpb %sil, %dil +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmovgl %esi, %eax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovgl %edi, %ecx +; X64-NEXT: subb %al, %cl +; X64-NEXT: shrb %cl +; X64-NEXT: movzbl %cl, %edx +; X64-NEXT: negb %cl +; X64-NEXT: cmpb %sil, %dil +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: cmovlel %edx, %eax ; X64-NEXT: addb %dil, %al +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: scalar_i8_signed_reg_reg: ; X32: # %bb.0: -; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: movb {{[0-9]+}}(%esp), %dl ; X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; X32-NEXT: cmpb %al, %cl -; X32-NEXT: setle %dl -; X32-NEXT: jg .LBB15_1 -; X32-NEXT: # %bb.2: -; X32-NEXT: movb %cl, %ah -; X32-NEXT: jmp .LBB15_3 -; X32-NEXT: .LBB15_1: -; X32-NEXT: movb %al, %ah +; X32-NEXT: cmpb %dl, %cl +; X32-NEXT: movb %dl, %ah ; X32-NEXT: movb %cl, %al -; X32-NEXT: .LBB15_3: +; X32-NEXT: jg .LBB15_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: movb %cl, %ah +; X32-NEXT: movb %dl, %al +; X32-NEXT: .LBB15_2: ; X32-NEXT: subb %ah, %al -; X32-NEXT: addb %dl, %dl -; X32-NEXT: decb %dl ; X32-NEXT: shrb %al -; X32-NEXT: mulb %dl +; X32-NEXT: cmpb %dl, %cl +; X32-NEXT: jle .LBB15_4 +; X32-NEXT: # %bb.3: +; X32-NEXT: negb %al +; X32-NEXT: .LBB15_4: ; X32-NEXT: addb %cl, %al ; X32-NEXT: retl %t3 = icmp sgt i8 %a1, %a2 ; signed @@ -1000,40 +1037,41 @@ define i8 @scalar_i8_unsigned_reg_reg(i8 %a1, i8 %a2) nounwind { ; X64-LABEL: scalar_i8_unsigned_reg_reg: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: cmpb %al, %dil -; X64-NEXT: setbe %cl -; X64-NEXT: movl %edi, %edx -; X64-NEXT: cmoval %esi, %edx -; X64-NEXT: cmoval %edi, %eax -; X64-NEXT: addb %cl, %cl -; X64-NEXT: decb %cl -; X64-NEXT: subb %dl, %al -; X64-NEXT: shrb %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: mulb %cl +; X64-NEXT: cmpb %sil, %dil +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmoval %esi, %eax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmoval %edi, %ecx +; X64-NEXT: subb %al, %cl +; X64-NEXT: shrb %cl +; X64-NEXT: movzbl %cl, %edx +; X64-NEXT: negb %cl +; X64-NEXT: cmpb %sil, %dil +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: cmovbel %edx, %eax ; X64-NEXT: addb %dil, %al +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: scalar_i8_unsigned_reg_reg: ; X32: # %bb.0: -; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: movb {{[0-9]+}}(%esp), %dl ; X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; X32-NEXT: cmpb %al, %cl -; X32-NEXT: setbe %dl -; X32-NEXT: ja .LBB16_1 -; 
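The i8 tests drop the mulb entirely, but x86 has no 8-bit cmov, so on X64 both candidate adjustments (d and 0 - d) are zero-extended with movzbl and the select happens on 32-bit values; on X32 a branch does the same job. A sketch of the selection step (the parameter names are illustrative):

    #include <cstdint>

    // i8 step: widen both candidates, then pick one with a 32-bit cmov.
    uint8_t pickAdjustment8(bool negate, uint8_t d) {
      uint32_t pos = d;                     // movzbl %dl, %edx
      uint32_t neg = (uint8_t)(0u - d);     // negb, then movzbl
      return (uint8_t)(negate ? neg : pos); // cmovlel / cmovbel
    }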
X32-NEXT: # %bb.2: -; X32-NEXT: movb %cl, %ah -; X32-NEXT: jmp .LBB16_3 -; X32-NEXT: .LBB16_1: -; X32-NEXT: movb %al, %ah +; X32-NEXT: cmpb %dl, %cl +; X32-NEXT: movb %dl, %ah ; X32-NEXT: movb %cl, %al -; X32-NEXT: .LBB16_3: +; X32-NEXT: ja .LBB16_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: movb %cl, %ah +; X32-NEXT: movb %dl, %al +; X32-NEXT: .LBB16_2: ; X32-NEXT: subb %ah, %al -; X32-NEXT: addb %dl, %dl -; X32-NEXT: decb %dl ; X32-NEXT: shrb %al -; X32-NEXT: mulb %dl +; X32-NEXT: cmpb %dl, %cl +; X32-NEXT: jbe .LBB16_4 +; X32-NEXT: # %bb.3: +; X32-NEXT: negb %al +; X32-NEXT: .LBB16_4: ; X32-NEXT: addb %cl, %al ; X32-NEXT: retl %t3 = icmp ugt i8 %a1, %a2 @@ -1054,40 +1092,41 @@ ; X64: # %bb.0: ; X64-NEXT: movzbl (%rdi), %ecx ; X64-NEXT: cmpb %sil, %cl -; X64-NEXT: setle %dl -; X64-NEXT: movl %ecx, %edi -; X64-NEXT: cmovgl %esi, %edi ; X64-NEXT: movl %ecx, %eax -; X64-NEXT: cmovlel %esi, %eax -; X64-NEXT: addb %dl, %dl -; X64-NEXT: decb %dl -; X64-NEXT: subb %dil, %al -; X64-NEXT: shrb %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: mulb %dl +; X64-NEXT: cmovgl %esi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: cmovgl %ecx, %edx +; X64-NEXT: subb %al, %dl +; X64-NEXT: shrb %dl +; X64-NEXT: movzbl %dl, %edi +; X64-NEXT: negb %dl +; X64-NEXT: cmpb %sil, %cl +; X64-NEXT: movzbl %dl, %eax +; X64-NEXT: cmovlel %edi, %eax ; X64-NEXT: addb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: scalar_i8_signed_mem_reg: ; X32: # %bb.0: -; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movb (%ecx), %cl -; X32-NEXT: cmpb %al, %cl -; X32-NEXT: setle %dl -; X32-NEXT: jg .LBB17_1 -; X32-NEXT: # %bb.2: -; X32-NEXT: movb %cl, %ah -; X32-NEXT: jmp .LBB17_3 -; X32-NEXT: .LBB17_1: -; X32-NEXT: movb %al, %ah +; X32-NEXT: movb {{[0-9]+}}(%esp), %dl +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movb (%eax), %cl +; X32-NEXT: cmpb %dl, %cl +; X32-NEXT: movb %dl, %ah ; X32-NEXT: movb %cl, %al -; X32-NEXT: .LBB17_3: +; X32-NEXT: jg .LBB17_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: movb %cl, %ah +; X32-NEXT: movb %dl, %al +; X32-NEXT: .LBB17_2: ; X32-NEXT: subb %ah, %al -; X32-NEXT: addb %dl, %dl -; X32-NEXT: decb %dl ; X32-NEXT: shrb %al -; X32-NEXT: mulb %dl +; X32-NEXT: cmpb %dl, %cl +; X32-NEXT: jle .LBB17_4 +; X32-NEXT: # %bb.3: +; X32-NEXT: negb %al +; X32-NEXT: .LBB17_4: ; X32-NEXT: addb %cl, %al ; X32-NEXT: retl %a1 = load i8, i8* %a1_addr @@ -1107,39 +1146,41 @@ ; X64: # %bb.0: ; X64-NEXT: movzbl (%rsi), %eax ; X64-NEXT: cmpb %al, %dil -; X64-NEXT: setle %cl -; X64-NEXT: movl %edi, %edx -; X64-NEXT: cmovgl %eax, %edx -; X64-NEXT: cmovgl %edi, %eax -; X64-NEXT: addb %cl, %cl -; X64-NEXT: decb %cl -; X64-NEXT: subb %dl, %al -; X64-NEXT: shrb %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: mulb %cl +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: cmovgl %eax, %ecx +; X64-NEXT: movl %eax, %edx +; X64-NEXT: cmovgl %edi, %edx +; X64-NEXT: subb %cl, %dl +; X64-NEXT: shrb %dl +; X64-NEXT: movzbl %dl, %ecx +; X64-NEXT: negb %dl +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: movzbl %dl, %eax +; X64-NEXT: cmovlel %ecx, %eax ; X64-NEXT: addb %dil, %al +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: scalar_i8_signed_reg_mem: ; X32: # %bb.0: ; X32-NEXT: movb {{[0-9]+}}(%esp), %cl ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movb (%eax), %al -; X32-NEXT: cmpb %al, %cl -; X32-NEXT: setle %dl -; X32-NEXT: jg .LBB18_1 -; X32-NEXT: # %bb.2: -; X32-NEXT: movb %cl, %ah -; 
X32-NEXT: jmp .LBB18_3 -; X32-NEXT: .LBB18_1: -; X32-NEXT: movb %al, %ah +; X32-NEXT: movb (%eax), %dl +; X32-NEXT: cmpb %dl, %cl +; X32-NEXT: movb %dl, %ah ; X32-NEXT: movb %cl, %al -; X32-NEXT: .LBB18_3: +; X32-NEXT: jg .LBB18_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: movb %cl, %ah +; X32-NEXT: movb %dl, %al +; X32-NEXT: .LBB18_2: ; X32-NEXT: subb %ah, %al -; X32-NEXT: addb %dl, %dl -; X32-NEXT: decb %dl ; X32-NEXT: shrb %al -; X32-NEXT: mulb %dl +; X32-NEXT: cmpb %dl, %cl +; X32-NEXT: jle .LBB18_4 +; X32-NEXT: # %bb.3: +; X32-NEXT: negb %al +; X32-NEXT: .LBB18_4: ; X32-NEXT: addb %cl, %al ; X32-NEXT: retl %a2 = load i8, i8* %a2_addr @@ -1160,17 +1201,19 @@ ; X64-NEXT: movzbl (%rdi), %ecx ; X64-NEXT: movzbl (%rsi), %eax ; X64-NEXT: cmpb %al, %cl -; X64-NEXT: setle %dl -; X64-NEXT: movl %ecx, %esi -; X64-NEXT: cmovgl %eax, %esi -; X64-NEXT: cmovgl %ecx, %eax -; X64-NEXT: addb %dl, %dl -; X64-NEXT: decb %dl -; X64-NEXT: subb %sil, %al -; X64-NEXT: shrb %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: mulb %dl +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: cmovgl %eax, %edx +; X64-NEXT: movl %eax, %esi +; X64-NEXT: cmovgl %ecx, %esi +; X64-NEXT: subb %dl, %sil +; X64-NEXT: shrb %sil +; X64-NEXT: movzbl %sil, %edx +; X64-NEXT: negb %sil +; X64-NEXT: cmpb %al, %cl +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: cmovlel %edx, %eax ; X64-NEXT: addb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: scalar_i8_signed_mem_mem: @@ -1178,22 +1221,22 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movb (%ecx), %cl -; X32-NEXT: movb (%eax), %al -; X32-NEXT: cmpb %al, %cl -; X32-NEXT: setle %dl -; X32-NEXT: jg .LBB19_1 -; X32-NEXT: # %bb.2: -; X32-NEXT: movb %cl, %ah -; X32-NEXT: jmp .LBB19_3 -; X32-NEXT: .LBB19_1: -; X32-NEXT: movb %al, %ah +; X32-NEXT: movb (%eax), %dl +; X32-NEXT: cmpb %dl, %cl +; X32-NEXT: movb %dl, %ah ; X32-NEXT: movb %cl, %al -; X32-NEXT: .LBB19_3: +; X32-NEXT: jg .LBB19_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: movb %cl, %ah +; X32-NEXT: movb %dl, %al +; X32-NEXT: .LBB19_2: ; X32-NEXT: subb %ah, %al -; X32-NEXT: addb %dl, %dl -; X32-NEXT: decb %dl ; X32-NEXT: shrb %al -; X32-NEXT: mulb %dl +; X32-NEXT: cmpb %dl, %cl +; X32-NEXT: jle .LBB19_4 +; X32-NEXT: # %bb.3: +; X32-NEXT: negb %al +; X32-NEXT: .LBB19_4: ; X32-NEXT: addb %cl, %al ; X32-NEXT: retl %a1 = load i8, i8* %a1_addr diff --git a/llvm/test/CodeGen/X86/pr22338.ll b/llvm/test/CodeGen/X86/pr22338.ll --- a/llvm/test/CodeGen/X86/pr22338.ll +++ b/llvm/test/CodeGen/X86/pr22338.ll @@ -5,51 +5,52 @@ define i32 @fn(i32 %a0, i32 %a1) { ; X86-LABEL: fn: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %ebx, -8 -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; X86-NEXT: sete %cl -; X86-NEXT: setne %al -; X86-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; X86-NEXT: sete %dl -; X86-NEXT: negl %eax -; X86-NEXT: addb %cl, %cl -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: addb %dl, %dl -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: shll %cl, %eax +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: setne %cl +; X86-NEXT: negl %ecx +; X86-NEXT: leal (,%ecx,4), %eax +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: jne .LBB0_2 +; X86-NEXT: # %bb.1: # %entry +; X86-NEXT: movl %eax, %edx +; 
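In pr22338.ll the folded operator is a shift rather than a multiply: the shift amount is evidently a select whose false arm is 0, the identity of shl, so the variable "addb %cl, %cl; shll %cl" sequences become a fixed x4 address computation (leal (,%reg,4), i.e. a left shift by two) plus a conditional move or branch. A sketch under that assumption:

    #include <cstdint>

    // shl by a select with a 0 arm: fold the shift into the select.
    int32_t shiftBySelect(bool cc, int32_t x) {
      // before: x << (cc ? 2 : 0)  -- variable shll %cl
      return cc ? (int32_t)((uint32_t)x << 2) : x; // leal (,%x,4) + cmov
    }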
X86-NEXT: .LBB0_2: # %entry +; X86-NEXT: cmpl $1, %esi +; X86-NEXT: je .LBB0_4 +; X86-NEXT: # %bb.3: # %entry +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB0_1: # %bb1 +; X86-NEXT: .LBB0_4: # %bb1 ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: je .LBB0_1 -; X86-NEXT: # %bb.2: # %bb2 -; X86-NEXT: popl %ebx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: je .LBB0_4 +; X86-NEXT: # %bb.5: # %bb2 +; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: fn: ; X64: # %bb.0: # %entry -; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl $1, %edi -; X64-NEXT: sete %cl -; X64-NEXT: setne %al +; X64-NEXT: setne %dl +; X64-NEXT: negl %edx +; X64-NEXT: leal (,%rdx,4), %eax +; X64-NEXT: cmpl $1, %edi +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: cmovnel %edx, %ecx ; X64-NEXT: cmpl $1, %esi -; X64-NEXT: sete %dl -; X64-NEXT: negl %eax -; X64-NEXT: addb %cl, %cl -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: addb %dl, %dl -; X64-NEXT: movl %edx, %ecx -; X64-NEXT: shll %cl, %eax +; X64-NEXT: cmovnel %edx, %eax ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB0_1: # %bb1 ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: testl %esi, %esi +; X64-NEXT: testl %ecx, %ecx ; X64-NEXT: je .LBB0_1 ; X64-NEXT: # %bb.2: # %bb2 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr33954.ll b/llvm/test/CodeGen/X86/pr33954.ll --- a/llvm/test/CodeGen/X86/pr33954.ll +++ b/llvm/test/CodeGen/X86/pr33954.ll @@ -41,7 +41,7 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; CHECK-NOT: jg -; CHECK: cmovle +; CHECK: cmovg define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 { entry: %cmp30 = icmp sgt i32 %n, 0 diff --git a/llvm/test/CodeGen/X86/pr47482.ll b/llvm/test/CodeGen/X86/pr47482.ll --- a/llvm/test/CodeGen/X86/pr47482.ll +++ b/llvm/test/CodeGen/X86/pr47482.ll @@ -10,17 +10,17 @@ ; CHECK-NEXT: movl {{.*}}(%rip), %eax ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movl (%rdi), %ecx +; CHECK-NEXT: movl %ecx, %edi +; CHECK-NEXT: orl $2, %edi ; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: sete %cl -; CHECK-NEXT: addl %ecx, %ecx -; CHECK-NEXT: orl (%rdi), %ecx +; CHECK-NEXT: cmovnel %ecx, %edi ; CHECK-NEXT: movl $0, (%rsi) ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: shll $8, %eax ; CHECK-NEXT: bextrl %eax, {{.*}}(%rip), %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: orl %edi, %eax ; CHECK-NEXT: movl %eax, (%rdx) ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll --- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll @@ -88,17 +88,18 @@ ; AVX512F_32_WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512F_32_WIN-NEXT: vcmpltss %xmm1, %xmm0, %k1 ; AVX512F_32_WIN-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512F_32_WIN-NEXT: xorl %edx, %edx -; AVX512F_32_WIN-NEXT: vucomiss %xmm0, %xmm1 -; AVX512F_32_WIN-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512F_32_WIN-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX512F_32_WIN-NEXT: vmovss %xmm0, (%esp) +; AVX512F_32_WIN-NEXT: vmovaps %xmm1, %xmm3 +; AVX512F_32_WIN-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512F_32_WIN-NEXT: vsubss %xmm3, %xmm0, %xmm2 +; AVX512F_32_WIN-NEXT: vmovss %xmm2, (%esp) ; AVX512F_32_WIN-NEXT: flds (%esp) ; AVX512F_32_WIN-NEXT: fisttpll (%esp) -; AVX512F_32_WIN-NEXT: setbe %dl -; 
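The scalar-fp-to-i64.ll changes exercise the xor case. The fptoui-to-u64 lowering converts a - (a < 2^63 ? 0.0 : 2^63) with fisttpll and then XORs the high 32 bits of the result with 0x80000000 only when the input was at least 2^63; since 0 is the xor identity, the combine now computes hi ^ 0x80000000 unconditionally and cmovs the plain hi back in, replacing the setbe + shll $31 + xorl tail. A sketch of the fixup semantics, with inputWasBig standing in for the ucomiss/fucompi result:

    #include <cstdint>

    // High-word fixup after the biased fptoui conversion.
    uint32_t fixupHigh(bool inputWasBig, uint32_t hi) {
      // before: hi ^ (inputWasBig ? 0x80000000u : 0u) // setbe+shll+xorl
      return inputWasBig ? (hi ^ 0x80000000u) : hi;    // xorl imm + cmoval
    }

Note that the folded form needs the predicate a second time after fisttpll, which is why the x87 paths below gain an extra fucompi plus an fstp to re-evaluate the compare and pop the stack.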
AVX512F_32_WIN-NEXT: shll $31, %edx -; AVX512F_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512F_32_WIN-NEXT: movl (%esp), %eax +; AVX512F_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F_32_WIN-NEXT: movl %ecx, %edx +; AVX512F_32_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512F_32_WIN-NEXT: vucomiss %xmm0, %xmm1 +; AVX512F_32_WIN-NEXT: cmoval %ecx, %edx ; AVX512F_32_WIN-NEXT: movl %ebp, %esp ; AVX512F_32_WIN-NEXT: popl %ebp ; AVX512F_32_WIN-NEXT: retl @@ -110,17 +111,18 @@ ; AVX512F_32_LIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512F_32_LIN-NEXT: vcmpltss %xmm1, %xmm0, %k1 ; AVX512F_32_LIN-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512F_32_LIN-NEXT: xorl %edx, %edx -; AVX512F_32_LIN-NEXT: vucomiss %xmm0, %xmm1 -; AVX512F_32_LIN-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512F_32_LIN-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX512F_32_LIN-NEXT: vmovss %xmm0, (%esp) +; AVX512F_32_LIN-NEXT: vmovaps %xmm1, %xmm3 +; AVX512F_32_LIN-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512F_32_LIN-NEXT: vsubss %xmm3, %xmm0, %xmm2 +; AVX512F_32_LIN-NEXT: vmovss %xmm2, (%esp) ; AVX512F_32_LIN-NEXT: flds (%esp) ; AVX512F_32_LIN-NEXT: fisttpll (%esp) -; AVX512F_32_LIN-NEXT: setbe %dl -; AVX512F_32_LIN-NEXT: shll $31, %edx -; AVX512F_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512F_32_LIN-NEXT: movl (%esp), %eax +; AVX512F_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F_32_LIN-NEXT: movl %ecx, %edx +; AVX512F_32_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512F_32_LIN-NEXT: vucomiss %xmm0, %xmm1 +; AVX512F_32_LIN-NEXT: cmoval %ecx, %edx ; AVX512F_32_LIN-NEXT: addl $12, %esp ; AVX512F_32_LIN-NEXT: retl ; @@ -133,18 +135,19 @@ ; SSE3_32_WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE3_32_WIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE3_32_WIN-NEXT: movaps %xmm0, %xmm2 -; SSE3_32_WIN-NEXT: xorl %edx, %edx -; SSE3_32_WIN-NEXT: ucomiss %xmm0, %xmm1 -; SSE3_32_WIN-NEXT: cmpltss %xmm1, %xmm0 -; SSE3_32_WIN-NEXT: andnps %xmm1, %xmm0 -; SSE3_32_WIN-NEXT: subss %xmm0, %xmm2 -; SSE3_32_WIN-NEXT: movss %xmm2, (%esp) +; SSE3_32_WIN-NEXT: cmpltss %xmm1, %xmm2 +; SSE3_32_WIN-NEXT: andnps %xmm1, %xmm2 +; SSE3_32_WIN-NEXT: movaps %xmm0, %xmm3 +; SSE3_32_WIN-NEXT: subss %xmm2, %xmm3 +; SSE3_32_WIN-NEXT: movss %xmm3, (%esp) ; SSE3_32_WIN-NEXT: flds (%esp) ; SSE3_32_WIN-NEXT: fisttpll (%esp) -; SSE3_32_WIN-NEXT: setbe %dl -; SSE3_32_WIN-NEXT: shll $31, %edx -; SSE3_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE3_32_WIN-NEXT: movl (%esp), %eax +; SSE3_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE3_32_WIN-NEXT: movl %ecx, %edx +; SSE3_32_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE3_32_WIN-NEXT: ucomiss %xmm0, %xmm1 +; SSE3_32_WIN-NEXT: cmoval %ecx, %edx ; SSE3_32_WIN-NEXT: movl %ebp, %esp ; SSE3_32_WIN-NEXT: popl %ebp ; SSE3_32_WIN-NEXT: retl @@ -155,18 +158,19 @@ ; SSE3_32_LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE3_32_LIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE3_32_LIN-NEXT: movaps %xmm0, %xmm2 -; SSE3_32_LIN-NEXT: xorl %edx, %edx -; SSE3_32_LIN-NEXT: ucomiss %xmm0, %xmm1 -; SSE3_32_LIN-NEXT: cmpltss %xmm1, %xmm0 -; SSE3_32_LIN-NEXT: andnps %xmm1, %xmm0 -; SSE3_32_LIN-NEXT: subss %xmm0, %xmm2 -; SSE3_32_LIN-NEXT: movss %xmm2, (%esp) +; SSE3_32_LIN-NEXT: cmpltss %xmm1, %xmm2 +; SSE3_32_LIN-NEXT: andnps %xmm1, %xmm2 +; SSE3_32_LIN-NEXT: movaps %xmm0, %xmm3 +; SSE3_32_LIN-NEXT: subss %xmm2, %xmm3 +; SSE3_32_LIN-NEXT: movss %xmm3, (%esp) ; SSE3_32_LIN-NEXT: flds (%esp) ; SSE3_32_LIN-NEXT: fisttpll (%esp) -; 
SSE3_32_LIN-NEXT: setbe %dl -; SSE3_32_LIN-NEXT: shll $31, %edx -; SSE3_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE3_32_LIN-NEXT: movl (%esp), %eax +; SSE3_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE3_32_LIN-NEXT: movl %ecx, %edx +; SSE3_32_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE3_32_LIN-NEXT: ucomiss %xmm0, %xmm1 +; SSE3_32_LIN-NEXT: cmoval %ecx, %edx ; SSE3_32_LIN-NEXT: addl $12, %esp ; SSE3_32_LIN-NEXT: retl ; @@ -205,12 +209,12 @@ ; SSE2_32_WIN-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE2_32_WIN-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE2_32_WIN-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE2_32_WIN-NEXT: xorl %edx, %edx -; SSE2_32_WIN-NEXT: ucomiss %xmm0, %xmm1 -; SSE2_32_WIN-NEXT: setbe %dl -; SSE2_32_WIN-NEXT: shll $31, %edx -; SSE2_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE2_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE2_32_WIN-NEXT: movl %ecx, %edx +; SSE2_32_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE2_32_WIN-NEXT: ucomiss %xmm0, %xmm1 +; SSE2_32_WIN-NEXT: cmoval %ecx, %edx ; SSE2_32_WIN-NEXT: movl %ebp, %esp ; SSE2_32_WIN-NEXT: popl %ebp ; SSE2_32_WIN-NEXT: retl @@ -234,12 +238,12 @@ ; SSE2_32_LIN-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE2_32_LIN-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE2_32_LIN-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE2_32_LIN-NEXT: xorl %edx, %edx -; SSE2_32_LIN-NEXT: ucomiss %xmm0, %xmm1 -; SSE2_32_LIN-NEXT: setbe %dl -; SSE2_32_LIN-NEXT: shll $31, %edx -; SSE2_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE2_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE2_32_LIN-NEXT: movl %ecx, %edx +; SSE2_32_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE2_32_LIN-NEXT: ucomiss %xmm0, %xmm1 +; SSE2_32_LIN-NEXT: cmoval %ecx, %edx ; SSE2_32_LIN-NEXT: addl $20, %esp ; SSE2_32_LIN-NEXT: retl ; @@ -266,10 +270,9 @@ ; X87_WIN-NEXT: flds __real@5f000000 ; X87_WIN-NEXT: fucom %st(1) ; X87_WIN-NEXT: fnstsw %ax -; X87_WIN-NEXT: xorl %edx, %edx ; X87_WIN-NEXT: # kill: def $ah killed $ah killed $ax ; X87_WIN-NEXT: sahf -; X87_WIN-NEXT: setbe %al +; X87_WIN-NEXT: seta %cl ; X87_WIN-NEXT: fldz ; X87_WIN-NEXT: ja LBB0_2 ; X87_WIN-NEXT: # %bb.1: @@ -280,16 +283,19 @@ ; X87_WIN-NEXT: fstp %st(1) ; X87_WIN-NEXT: fsubrp %st, %st(1) ; X87_WIN-NEXT: fnstcw {{[0-9]+}}(%esp) -; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X87_WIN-NEXT: orl $3072, %ecx # imm = 0xC00 -; X87_WIN-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X87_WIN-NEXT: orl $3072, %eax # imm = 0xC00 +; X87_WIN-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X87_WIN-NEXT: fistpll {{[0-9]+}}(%esp) ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp) -; X87_WIN-NEXT: movb %al, %dl -; X87_WIN-NEXT: shll $31, %edx -; X87_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %edx +; X87_WIN-NEXT: testb %cl, %cl +; X87_WIN-NEXT: jne LBB0_4 +; X87_WIN-NEXT: # %bb.3: +; X87_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X87_WIN-NEXT: LBB0_4: ; X87_WIN-NEXT: movl %ebp, %esp ; X87_WIN-NEXT: popl %ebp ; X87_WIN-NEXT: retl @@ -301,10 +307,9 @@ ; X87_LIN-NEXT: flds {{\.LCPI.*}} ; X87_LIN-NEXT: fucom %st(1) ; X87_LIN-NEXT: fnstsw %ax -; X87_LIN-NEXT: xorl %edx, %edx ; X87_LIN-NEXT: # kill: def $ah killed $ah killed $ax ; X87_LIN-NEXT: sahf -; X87_LIN-NEXT: setbe %al +; X87_LIN-NEXT: seta %cl ; X87_LIN-NEXT: fldz ; X87_LIN-NEXT: ja .LBB0_2 ; X87_LIN-NEXT: # %bb.1: @@ -315,16 +320,19 @@ ; X87_LIN-NEXT: fstp 
%st(1) ; X87_LIN-NEXT: fsubrp %st, %st(1) ; X87_LIN-NEXT: fnstcw {{[0-9]+}}(%esp) -; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X87_LIN-NEXT: orl $3072, %ecx # imm = 0xC00 -; X87_LIN-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X87_LIN-NEXT: orl $3072, %eax # imm = 0xC00 +; X87_LIN-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X87_LIN-NEXT: fistpll {{[0-9]+}}(%esp) ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp) -; X87_LIN-NEXT: movb %al, %dl -; X87_LIN-NEXT: shll $31, %edx -; X87_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %edx +; X87_LIN-NEXT: testb %cl, %cl +; X87_LIN-NEXT: jne .LBB0_4 +; X87_LIN-NEXT: # %bb.3: +; X87_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X87_LIN-NEXT: .LBB0_4: ; X87_LIN-NEXT: addl $20, %esp ; X87_LIN-NEXT: retl %r = fptoui float %a to i64 @@ -566,17 +574,18 @@ ; AVX512F_32_WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512F_32_WIN-NEXT: vcmpltsd %xmm1, %xmm0, %k1 ; AVX512F_32_WIN-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX512F_32_WIN-NEXT: xorl %edx, %edx -; AVX512F_32_WIN-NEXT: vucomisd %xmm0, %xmm1 -; AVX512F_32_WIN-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} -; AVX512F_32_WIN-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX512F_32_WIN-NEXT: vmovsd %xmm0, (%esp) +; AVX512F_32_WIN-NEXT: vmovapd %xmm1, %xmm3 +; AVX512F_32_WIN-NEXT: vmovsd %xmm2, %xmm3, %xmm3 {%k1} +; AVX512F_32_WIN-NEXT: vsubsd %xmm3, %xmm0, %xmm2 +; AVX512F_32_WIN-NEXT: vmovsd %xmm2, (%esp) ; AVX512F_32_WIN-NEXT: fldl (%esp) ; AVX512F_32_WIN-NEXT: fisttpll (%esp) -; AVX512F_32_WIN-NEXT: setbe %dl -; AVX512F_32_WIN-NEXT: shll $31, %edx -; AVX512F_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512F_32_WIN-NEXT: movl (%esp), %eax +; AVX512F_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F_32_WIN-NEXT: movl %ecx, %edx +; AVX512F_32_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512F_32_WIN-NEXT: vucomisd %xmm0, %xmm1 +; AVX512F_32_WIN-NEXT: cmoval %ecx, %edx ; AVX512F_32_WIN-NEXT: movl %ebp, %esp ; AVX512F_32_WIN-NEXT: popl %ebp ; AVX512F_32_WIN-NEXT: retl @@ -588,17 +597,18 @@ ; AVX512F_32_LIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512F_32_LIN-NEXT: vcmpltsd %xmm1, %xmm0, %k1 ; AVX512F_32_LIN-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX512F_32_LIN-NEXT: xorl %edx, %edx -; AVX512F_32_LIN-NEXT: vucomisd %xmm0, %xmm1 -; AVX512F_32_LIN-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} -; AVX512F_32_LIN-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX512F_32_LIN-NEXT: vmovsd %xmm0, (%esp) +; AVX512F_32_LIN-NEXT: vmovapd %xmm1, %xmm3 +; AVX512F_32_LIN-NEXT: vmovsd %xmm2, %xmm3, %xmm3 {%k1} +; AVX512F_32_LIN-NEXT: vsubsd %xmm3, %xmm0, %xmm2 +; AVX512F_32_LIN-NEXT: vmovsd %xmm2, (%esp) ; AVX512F_32_LIN-NEXT: fldl (%esp) ; AVX512F_32_LIN-NEXT: fisttpll (%esp) -; AVX512F_32_LIN-NEXT: setbe %dl -; AVX512F_32_LIN-NEXT: shll $31, %edx -; AVX512F_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512F_32_LIN-NEXT: movl (%esp), %eax +; AVX512F_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F_32_LIN-NEXT: movl %ecx, %edx +; AVX512F_32_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512F_32_LIN-NEXT: vucomisd %xmm0, %xmm1 +; AVX512F_32_LIN-NEXT: cmoval %ecx, %edx ; AVX512F_32_LIN-NEXT: addl $12, %esp ; AVX512F_32_LIN-NEXT: retl ; @@ -611,18 +621,19 @@ ; SSE3_32_WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE3_32_WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE3_32_WIN-NEXT: movapd %xmm0, %xmm2 -; SSE3_32_WIN-NEXT: xorl %edx, %edx -; SSE3_32_WIN-NEXT: ucomisd %xmm0, %xmm1 -; 
SSE3_32_WIN-NEXT: cmpltsd %xmm1, %xmm0 -; SSE3_32_WIN-NEXT: andnpd %xmm1, %xmm0 -; SSE3_32_WIN-NEXT: subsd %xmm0, %xmm2 -; SSE3_32_WIN-NEXT: movsd %xmm2, (%esp) +; SSE3_32_WIN-NEXT: cmpltsd %xmm1, %xmm2 +; SSE3_32_WIN-NEXT: andnpd %xmm1, %xmm2 +; SSE3_32_WIN-NEXT: movapd %xmm0, %xmm3 +; SSE3_32_WIN-NEXT: subsd %xmm2, %xmm3 +; SSE3_32_WIN-NEXT: movsd %xmm3, (%esp) ; SSE3_32_WIN-NEXT: fldl (%esp) ; SSE3_32_WIN-NEXT: fisttpll (%esp) -; SSE3_32_WIN-NEXT: setbe %dl -; SSE3_32_WIN-NEXT: shll $31, %edx -; SSE3_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE3_32_WIN-NEXT: movl (%esp), %eax +; SSE3_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE3_32_WIN-NEXT: movl %ecx, %edx +; SSE3_32_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE3_32_WIN-NEXT: ucomisd %xmm0, %xmm1 +; SSE3_32_WIN-NEXT: cmoval %ecx, %edx ; SSE3_32_WIN-NEXT: movl %ebp, %esp ; SSE3_32_WIN-NEXT: popl %ebp ; SSE3_32_WIN-NEXT: retl @@ -633,18 +644,19 @@ ; SSE3_32_LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE3_32_LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE3_32_LIN-NEXT: movapd %xmm0, %xmm2 -; SSE3_32_LIN-NEXT: xorl %edx, %edx -; SSE3_32_LIN-NEXT: ucomisd %xmm0, %xmm1 -; SSE3_32_LIN-NEXT: cmpltsd %xmm1, %xmm0 -; SSE3_32_LIN-NEXT: andnpd %xmm1, %xmm0 -; SSE3_32_LIN-NEXT: subsd %xmm0, %xmm2 -; SSE3_32_LIN-NEXT: movsd %xmm2, (%esp) +; SSE3_32_LIN-NEXT: cmpltsd %xmm1, %xmm2 +; SSE3_32_LIN-NEXT: andnpd %xmm1, %xmm2 +; SSE3_32_LIN-NEXT: movapd %xmm0, %xmm3 +; SSE3_32_LIN-NEXT: subsd %xmm2, %xmm3 +; SSE3_32_LIN-NEXT: movsd %xmm3, (%esp) ; SSE3_32_LIN-NEXT: fldl (%esp) ; SSE3_32_LIN-NEXT: fisttpll (%esp) -; SSE3_32_LIN-NEXT: setbe %dl -; SSE3_32_LIN-NEXT: shll $31, %edx -; SSE3_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE3_32_LIN-NEXT: movl (%esp), %eax +; SSE3_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE3_32_LIN-NEXT: movl %ecx, %edx +; SSE3_32_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE3_32_LIN-NEXT: ucomisd %xmm0, %xmm1 +; SSE3_32_LIN-NEXT: cmoval %ecx, %edx ; SSE3_32_LIN-NEXT: addl $12, %esp ; SSE3_32_LIN-NEXT: retl ; @@ -683,12 +695,12 @@ ; SSE2_32_WIN-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE2_32_WIN-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE2_32_WIN-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE2_32_WIN-NEXT: xorl %edx, %edx -; SSE2_32_WIN-NEXT: ucomisd %xmm0, %xmm1 -; SSE2_32_WIN-NEXT: setbe %dl -; SSE2_32_WIN-NEXT: shll $31, %edx -; SSE2_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE2_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE2_32_WIN-NEXT: movl %ecx, %edx +; SSE2_32_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE2_32_WIN-NEXT: ucomisd %xmm0, %xmm1 +; SSE2_32_WIN-NEXT: cmoval %ecx, %edx ; SSE2_32_WIN-NEXT: movl %ebp, %esp ; SSE2_32_WIN-NEXT: popl %ebp ; SSE2_32_WIN-NEXT: retl @@ -712,12 +724,12 @@ ; SSE2_32_LIN-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE2_32_LIN-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE2_32_LIN-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE2_32_LIN-NEXT: xorl %edx, %edx -; SSE2_32_LIN-NEXT: ucomisd %xmm0, %xmm1 -; SSE2_32_LIN-NEXT: setbe %dl -; SSE2_32_LIN-NEXT: shll $31, %edx -; SSE2_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE2_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE2_32_LIN-NEXT: movl %ecx, %edx +; SSE2_32_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE2_32_LIN-NEXT: ucomisd %xmm0, %xmm1 +; SSE2_32_LIN-NEXT: cmoval %ecx, %edx ; SSE2_32_LIN-NEXT: addl $20, %esp ; SSE2_32_LIN-NEXT: retl ; @@ -744,10 +756,9 @@ ; X87_WIN-NEXT: flds __real@5f000000 ; X87_WIN-NEXT: fucom %st(1) ; X87_WIN-NEXT: 
fnstsw %ax -; X87_WIN-NEXT: xorl %edx, %edx ; X87_WIN-NEXT: # kill: def $ah killed $ah killed $ax ; X87_WIN-NEXT: sahf -; X87_WIN-NEXT: setbe %al +; X87_WIN-NEXT: seta %cl ; X87_WIN-NEXT: fldz ; X87_WIN-NEXT: ja LBB2_2 ; X87_WIN-NEXT: # %bb.1: @@ -758,16 +769,19 @@ ; X87_WIN-NEXT: fstp %st(1) ; X87_WIN-NEXT: fsubrp %st, %st(1) ; X87_WIN-NEXT: fnstcw {{[0-9]+}}(%esp) -; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X87_WIN-NEXT: orl $3072, %ecx # imm = 0xC00 -; X87_WIN-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X87_WIN-NEXT: orl $3072, %eax # imm = 0xC00 +; X87_WIN-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X87_WIN-NEXT: fistpll {{[0-9]+}}(%esp) ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp) -; X87_WIN-NEXT: movb %al, %dl -; X87_WIN-NEXT: shll $31, %edx -; X87_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %edx +; X87_WIN-NEXT: testb %cl, %cl +; X87_WIN-NEXT: jne LBB2_4 +; X87_WIN-NEXT: # %bb.3: +; X87_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X87_WIN-NEXT: LBB2_4: ; X87_WIN-NEXT: movl %ebp, %esp ; X87_WIN-NEXT: popl %ebp ; X87_WIN-NEXT: retl @@ -779,10 +793,9 @@ ; X87_LIN-NEXT: flds {{\.LCPI.*}} ; X87_LIN-NEXT: fucom %st(1) ; X87_LIN-NEXT: fnstsw %ax -; X87_LIN-NEXT: xorl %edx, %edx ; X87_LIN-NEXT: # kill: def $ah killed $ah killed $ax ; X87_LIN-NEXT: sahf -; X87_LIN-NEXT: setbe %al +; X87_LIN-NEXT: seta %cl ; X87_LIN-NEXT: fldz ; X87_LIN-NEXT: ja .LBB2_2 ; X87_LIN-NEXT: # %bb.1: @@ -793,16 +806,19 @@ ; X87_LIN-NEXT: fstp %st(1) ; X87_LIN-NEXT: fsubrp %st, %st(1) ; X87_LIN-NEXT: fnstcw {{[0-9]+}}(%esp) -; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X87_LIN-NEXT: orl $3072, %ecx # imm = 0xC00 -; X87_LIN-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X87_LIN-NEXT: orl $3072, %eax # imm = 0xC00 +; X87_LIN-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X87_LIN-NEXT: fistpll {{[0-9]+}}(%esp) ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp) -; X87_LIN-NEXT: movb %al, %dl -; X87_LIN-NEXT: shll $31, %edx -; X87_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %edx +; X87_LIN-NEXT: testb %cl, %cl +; X87_LIN-NEXT: jne .LBB2_4 +; X87_LIN-NEXT: # %bb.3: +; X87_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X87_LIN-NEXT: .LBB2_4: ; X87_LIN-NEXT: addl $20, %esp ; X87_LIN-NEXT: retl %r = fptoui double %a to i64 @@ -1003,18 +1019,20 @@ ; AVX512_32_WIN-NEXT: subl $8, %esp ; AVX512_32_WIN-NEXT: fldt 8(%ebp) ; AVX512_32_WIN-NEXT: flds __real@5f000000 -; AVX512_32_WIN-NEXT: xorl %edx, %edx ; AVX512_32_WIN-NEXT: fucomi %st(1), %st ; AVX512_32_WIN-NEXT: fldz -; AVX512_32_WIN-NEXT: fxch %st(1) +; AVX512_32_WIN-NEXT: fld %st(1) ; AVX512_32_WIN-NEXT: fcmovnbe %st(1), %st ; AVX512_32_WIN-NEXT: fstp %st(1) -; AVX512_32_WIN-NEXT: fsubrp %st, %st(1) +; AVX512_32_WIN-NEXT: fsubr %st(2), %st ; AVX512_32_WIN-NEXT: fisttpll (%esp) -; AVX512_32_WIN-NEXT: setbe %dl -; AVX512_32_WIN-NEXT: shll $31, %edx -; AVX512_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512_32_WIN-NEXT: movl (%esp), %eax +; AVX512_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512_32_WIN-NEXT: movl %ecx, %edx +; AVX512_32_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512_32_WIN-NEXT: fucompi %st(1), %st +; AVX512_32_WIN-NEXT: fstp %st(0) +; AVX512_32_WIN-NEXT: cmoval %ecx, %edx ; AVX512_32_WIN-NEXT: movl %ebp, %esp ; AVX512_32_WIN-NEXT: popl %ebp ; 
AVX512_32_WIN-NEXT: retl @@ -1024,18 +1042,20 @@ ; AVX512_32_LIN-NEXT: subl $12, %esp ; AVX512_32_LIN-NEXT: fldt {{[0-9]+}}(%esp) ; AVX512_32_LIN-NEXT: flds {{\.LCPI.*}} -; AVX512_32_LIN-NEXT: xorl %edx, %edx ; AVX512_32_LIN-NEXT: fucomi %st(1), %st ; AVX512_32_LIN-NEXT: fldz -; AVX512_32_LIN-NEXT: fxch %st(1) +; AVX512_32_LIN-NEXT: fld %st(1) ; AVX512_32_LIN-NEXT: fcmovnbe %st(1), %st ; AVX512_32_LIN-NEXT: fstp %st(1) -; AVX512_32_LIN-NEXT: fsubrp %st, %st(1) +; AVX512_32_LIN-NEXT: fsubr %st(2), %st ; AVX512_32_LIN-NEXT: fisttpll (%esp) -; AVX512_32_LIN-NEXT: setbe %dl -; AVX512_32_LIN-NEXT: shll $31, %edx -; AVX512_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512_32_LIN-NEXT: movl (%esp), %eax +; AVX512_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512_32_LIN-NEXT: movl %ecx, %edx +; AVX512_32_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512_32_LIN-NEXT: fucompi %st(1), %st +; AVX512_32_LIN-NEXT: fstp %st(0) +; AVX512_32_LIN-NEXT: cmoval %ecx, %edx ; AVX512_32_LIN-NEXT: addl $12, %esp ; AVX512_32_LIN-NEXT: retl ; @@ -1083,18 +1103,20 @@ ; SSE3_32_WIN-NEXT: subl $8, %esp ; SSE3_32_WIN-NEXT: fldt 8(%ebp) ; SSE3_32_WIN-NEXT: flds __real@5f000000 -; SSE3_32_WIN-NEXT: xorl %edx, %edx ; SSE3_32_WIN-NEXT: fucomi %st(1), %st ; SSE3_32_WIN-NEXT: fldz -; SSE3_32_WIN-NEXT: fxch %st(1) +; SSE3_32_WIN-NEXT: fld %st(1) ; SSE3_32_WIN-NEXT: fcmovnbe %st(1), %st ; SSE3_32_WIN-NEXT: fstp %st(1) -; SSE3_32_WIN-NEXT: fsubrp %st, %st(1) +; SSE3_32_WIN-NEXT: fsubr %st(2), %st ; SSE3_32_WIN-NEXT: fisttpll (%esp) -; SSE3_32_WIN-NEXT: setbe %dl -; SSE3_32_WIN-NEXT: shll $31, %edx -; SSE3_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE3_32_WIN-NEXT: movl (%esp), %eax +; SSE3_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE3_32_WIN-NEXT: movl %ecx, %edx +; SSE3_32_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE3_32_WIN-NEXT: fucompi %st(1), %st +; SSE3_32_WIN-NEXT: fstp %st(0) +; SSE3_32_WIN-NEXT: cmoval %ecx, %edx ; SSE3_32_WIN-NEXT: movl %ebp, %esp ; SSE3_32_WIN-NEXT: popl %ebp ; SSE3_32_WIN-NEXT: retl @@ -1104,18 +1126,20 @@ ; SSE3_32_LIN-NEXT: subl $12, %esp ; SSE3_32_LIN-NEXT: fldt {{[0-9]+}}(%esp) ; SSE3_32_LIN-NEXT: flds {{\.LCPI.*}} -; SSE3_32_LIN-NEXT: xorl %edx, %edx ; SSE3_32_LIN-NEXT: fucomi %st(1), %st ; SSE3_32_LIN-NEXT: fldz -; SSE3_32_LIN-NEXT: fxch %st(1) +; SSE3_32_LIN-NEXT: fld %st(1) ; SSE3_32_LIN-NEXT: fcmovnbe %st(1), %st ; SSE3_32_LIN-NEXT: fstp %st(1) -; SSE3_32_LIN-NEXT: fsubrp %st, %st(1) +; SSE3_32_LIN-NEXT: fsubr %st(2), %st ; SSE3_32_LIN-NEXT: fisttpll (%esp) -; SSE3_32_LIN-NEXT: setbe %dl -; SSE3_32_LIN-NEXT: shll $31, %edx -; SSE3_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE3_32_LIN-NEXT: movl (%esp), %eax +; SSE3_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE3_32_LIN-NEXT: movl %ecx, %edx +; SSE3_32_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE3_32_LIN-NEXT: fucompi %st(1), %st +; SSE3_32_LIN-NEXT: fstp %st(0) +; SSE3_32_LIN-NEXT: cmoval %ecx, %edx ; SSE3_32_LIN-NEXT: addl $12, %esp ; SSE3_32_LIN-NEXT: retl ; @@ -1163,14 +1187,12 @@ ; SSE2_32_WIN-NEXT: subl $16, %esp ; SSE2_32_WIN-NEXT: fldt 8(%ebp) ; SSE2_32_WIN-NEXT: flds __real@5f000000 -; SSE2_32_WIN-NEXT: xorl %edx, %edx ; SSE2_32_WIN-NEXT: fucomi %st(1), %st -; SSE2_32_WIN-NEXT: setbe %dl ; SSE2_32_WIN-NEXT: fldz -; SSE2_32_WIN-NEXT: fxch %st(1) +; SSE2_32_WIN-NEXT: fld %st(1) ; SSE2_32_WIN-NEXT: fcmovnbe %st(1), %st ; SSE2_32_WIN-NEXT: fstp %st(1) -; SSE2_32_WIN-NEXT: fsubrp %st, %st(1) +; SSE2_32_WIN-NEXT: fsubr %st(2), %st ; SSE2_32_WIN-NEXT: fnstcw {{[0-9]+}}(%esp) ; 
SSE2_32_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; SSE2_32_WIN-NEXT: orl $3072, %eax # imm = 0xC00 @@ -1178,9 +1200,13 @@ ; SSE2_32_WIN-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE2_32_WIN-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE2_32_WIN-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE2_32_WIN-NEXT: shll $31, %edx -; SSE2_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE2_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2_32_WIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE2_32_WIN-NEXT: movl %ecx, %edx +; SSE2_32_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE2_32_WIN-NEXT: fucompi %st(1), %st +; SSE2_32_WIN-NEXT: fstp %st(0) +; SSE2_32_WIN-NEXT: cmoval %ecx, %edx ; SSE2_32_WIN-NEXT: movl %ebp, %esp ; SSE2_32_WIN-NEXT: popl %ebp ; SSE2_32_WIN-NEXT: retl @@ -1190,14 +1216,12 @@ ; SSE2_32_LIN-NEXT: subl $20, %esp ; SSE2_32_LIN-NEXT: fldt {{[0-9]+}}(%esp) ; SSE2_32_LIN-NEXT: flds {{\.LCPI.*}} -; SSE2_32_LIN-NEXT: xorl %edx, %edx ; SSE2_32_LIN-NEXT: fucomi %st(1), %st -; SSE2_32_LIN-NEXT: setbe %dl ; SSE2_32_LIN-NEXT: fldz -; SSE2_32_LIN-NEXT: fxch %st(1) +; SSE2_32_LIN-NEXT: fld %st(1) ; SSE2_32_LIN-NEXT: fcmovnbe %st(1), %st ; SSE2_32_LIN-NEXT: fstp %st(1) -; SSE2_32_LIN-NEXT: fsubrp %st, %st(1) +; SSE2_32_LIN-NEXT: fsubr %st(2), %st ; SSE2_32_LIN-NEXT: fnstcw {{[0-9]+}}(%esp) ; SSE2_32_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; SSE2_32_LIN-NEXT: orl $3072, %eax # imm = 0xC00 @@ -1205,9 +1229,13 @@ ; SSE2_32_LIN-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE2_32_LIN-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE2_32_LIN-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE2_32_LIN-NEXT: shll $31, %edx -; SSE2_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE2_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2_32_LIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE2_32_LIN-NEXT: movl %ecx, %edx +; SSE2_32_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; SSE2_32_LIN-NEXT: fucompi %st(1), %st +; SSE2_32_LIN-NEXT: fstp %st(0) +; SSE2_32_LIN-NEXT: cmoval %ecx, %edx ; SSE2_32_LIN-NEXT: addl $20, %esp ; SSE2_32_LIN-NEXT: retl ; @@ -1269,10 +1297,9 @@ ; X87_WIN-NEXT: flds __real@5f000000 ; X87_WIN-NEXT: fucom %st(1) ; X87_WIN-NEXT: fnstsw %ax -; X87_WIN-NEXT: xorl %edx, %edx ; X87_WIN-NEXT: # kill: def $ah killed $ah killed $ax ; X87_WIN-NEXT: sahf -; X87_WIN-NEXT: setbe %al +; X87_WIN-NEXT: seta %cl ; X87_WIN-NEXT: fldz ; X87_WIN-NEXT: ja LBB4_2 ; X87_WIN-NEXT: # %bb.1: @@ -1283,16 +1310,19 @@ ; X87_WIN-NEXT: fstp %st(1) ; X87_WIN-NEXT: fsubrp %st, %st(1) ; X87_WIN-NEXT: fnstcw {{[0-9]+}}(%esp) -; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X87_WIN-NEXT: orl $3072, %ecx # imm = 0xC00 -; X87_WIN-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X87_WIN-NEXT: orl $3072, %eax # imm = 0xC00 +; X87_WIN-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X87_WIN-NEXT: fistpll {{[0-9]+}}(%esp) ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp) -; X87_WIN-NEXT: movb %al, %dl -; X87_WIN-NEXT: shll $31, %edx -; X87_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %edx +; X87_WIN-NEXT: testb %cl, %cl +; X87_WIN-NEXT: jne LBB4_4 +; X87_WIN-NEXT: # %bb.3: +; X87_WIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X87_WIN-NEXT: LBB4_4: ; X87_WIN-NEXT: movl %ebp, %esp ; X87_WIN-NEXT: popl %ebp ; X87_WIN-NEXT: retl @@ -1304,10 +1334,9 @@ ; X87_LIN-NEXT: flds {{\.LCPI.*}} ; X87_LIN-NEXT: fucom %st(1) ; X87_LIN-NEXT: fnstsw %ax -; X87_LIN-NEXT: xorl %edx, %edx ; X87_LIN-NEXT: # kill: def $ah killed $ah killed $ax ; X87_LIN-NEXT: sahf -; X87_LIN-NEXT: setbe %al +; 
X87_LIN-NEXT: seta %cl ; X87_LIN-NEXT: fldz ; X87_LIN-NEXT: ja .LBB4_2 ; X87_LIN-NEXT: # %bb.1: @@ -1318,16 +1347,19 @@ ; X87_LIN-NEXT: fstp %st(1) ; X87_LIN-NEXT: fsubrp %st, %st(1) ; X87_LIN-NEXT: fnstcw {{[0-9]+}}(%esp) -; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X87_LIN-NEXT: orl $3072, %ecx # imm = 0xC00 -; X87_LIN-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X87_LIN-NEXT: orl $3072, %eax # imm = 0xC00 +; X87_LIN-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X87_LIN-NEXT: fistpll {{[0-9]+}}(%esp) ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp) -; X87_LIN-NEXT: movb %al, %dl -; X87_LIN-NEXT: shll $31, %edx -; X87_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %edx +; X87_LIN-NEXT: testb %cl, %cl +; X87_LIN-NEXT: jne .LBB4_4 +; X87_LIN-NEXT: # %bb.3: +; X87_LIN-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X87_LIN-NEXT: .LBB4_4: ; X87_LIN-NEXT: addl $20, %esp ; X87_LIN-NEXT: retl %r = fptoui x86_fp80 %a to i64 diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll --- a/llvm/test/CodeGen/X86/setcc-combine.ll +++ b/llvm/test/CodeGen/X86/setcc-combine.ll @@ -267,9 +267,12 @@ define i64 @PR40657(i8 %var2, i8 %var9) { ; CHECK-LABEL: PR40657: ; CHECK: # %bb.0: -; CHECK-NEXT: notb %sil -; CHECK-NEXT: addb %dil, %sil -; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: incb %sil +; CHECK-NEXT: leal 1(%rdi), %eax +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cmovel %edi, %eax ; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retq %var6 = trunc i8 %var9 to i1 diff --git a/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll b/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll --- a/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll +++ b/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll @@ -24,9 +24,9 @@ ; CHECK-NEXT: cmpq $-2, %rdx ; CHECK-NEXT: jg .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: cmpq $-2, %rdx +; CHECK-NEXT: cmovlq %rdi, %rax ; CHECK-NEXT: .LBB0_2: # %return ; CHECK-NEXT: retq entry: @@ -56,10 +56,9 @@ ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB1_1: # %if.end -; CHECK-NEXT: cmpq $-1, %rdx -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: cmpq $-1, %rdx +; CHECK-NEXT: cmovlq %rdi, %rax ; CHECK-NEXT: retq entry: %shl = shl i64 %a, %b @@ -88,9 +87,9 @@ ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB2_1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovsq %rdi, %rax ; CHECK-NEXT: retq entry: %shl = shl i64 %a, %b @@ -117,10 +116,9 @@ ; CHECK-NEXT: cmpq $1, %rdx ; CHECK-NEXT: jg .LBB3_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovleq %rcx, %rax ; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovleq %rdi, %rax ; CHECK-NEXT: .LBB3_2: # %return ; CHECK-NEXT: retq entry: @@ -148,9 +146,9 @@ ; CHECK-NEXT: cmpq $2, %rdx ; CHECK-NEXT: jg .LBB4_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: cmpq $2, %rdx +; CHECK-NEXT: cmovlq %rdi, 
%rax ; CHECK-NEXT: .LBB4_2: # %return ; CHECK-NEXT: retq entry: @@ -175,9 +173,9 @@ ; CHECK-NEXT: cmpq $-2, %rdi ; CHECK-NEXT: jg .LBB5_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: cmpq $-2, %rdi +; CHECK-NEXT: cmovlq %rdi, %rax ; CHECK-NEXT: .LBB5_2: # %return ; CHECK-NEXT: retq entry: @@ -203,10 +201,9 @@ ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB6_1: # %if.end -; CHECK-NEXT: cmpq $-1, %rdi -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: cmpq $-1, %rdi +; CHECK-NEXT: cmovlq %rdi, %rax ; CHECK-NEXT: retq entry: %cmp = icmp sgt i64 %a, -1 @@ -231,9 +228,9 @@ ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB7_1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: cmovsq %rdi, %rax ; CHECK-NEXT: retq entry: %cmp = icmp sgt i64 %a, 0 @@ -256,10 +253,9 @@ ; CHECK-NEXT: cmpq $1, %rdi ; CHECK-NEXT: jg .LBB8_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovleq %rcx, %rax ; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: cmovleq %rdi, %rax ; CHECK-NEXT: .LBB8_2: # %return ; CHECK-NEXT: retq entry: @@ -283,9 +279,9 @@ ; CHECK-NEXT: cmpq $2, %rdi ; CHECK-NEXT: jg .LBB9_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: cmpq $2, %rdi +; CHECK-NEXT: cmovlq %rdi, %rax ; CHECK-NEXT: .LBB9_2: # %return ; CHECK-NEXT: retq entry: @@ -311,9 +307,9 @@ ; CHECK-NEXT: cmpl $-2, %eax ; CHECK-NEXT: jg .LBB10_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: imull %edi, %ecx +; CHECK-NEXT: cmpl $-2, %eax +; CHECK-NEXT: cmovll %edi, %ecx ; CHECK-NEXT: .LBB10_2: # %return ; CHECK-NEXT: movslq %ecx, %rax ; CHECK-NEXT: retq @@ -346,10 +342,9 @@ ; CHECK-NEXT: movslq %ecx, %rax ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB11_1: # %if.end -; CHECK-NEXT: cmpl $-1, %eax -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: imull %edi, %ecx +; CHECK-NEXT: cmpl $-1, %eax +; CHECK-NEXT: cmovll %edi, %ecx ; CHECK-NEXT: movslq %ecx, %rax ; CHECK-NEXT: retq entry: @@ -381,9 +376,9 @@ ; CHECK-NEXT: movslq %ecx, %rax ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB12_1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovsl %eax, %ecx ; CHECK-NEXT: imull %edi, %ecx +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: cmovsl %edi, %ecx ; CHECK-NEXT: movslq %ecx, %rax ; CHECK-NEXT: retq entry: @@ -412,10 +407,9 @@ ; CHECK-NEXT: cmpl $1, %eax ; CHECK-NEXT: jg .LBB13_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovlel %eax, %ecx ; CHECK-NEXT: imull %edi, %ecx +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: cmovlel %edi, %ecx ; CHECK-NEXT: .LBB13_2: # %return ; CHECK-NEXT: movslq %ecx, %rax ; CHECK-NEXT: retq @@ -445,9 +439,9 @@ ; CHECK-NEXT: cmpl $2, %eax ; CHECK-NEXT: jg .LBB14_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: imull %edi, %ecx +; CHECK-NEXT: cmpl $2, %eax +; CHECK-NEXT: cmovll %edi, %ecx ; CHECK-NEXT: .LBB14_2: # %return ; CHECK-NEXT: movslq %ecx, %rax ; CHECK-NEXT: retq @@ -474,9 +468,9 @@ ; CHECK-NEXT: cmpl $-2, %edi ; CHECK-NEXT: jg .LBB15_2 ; 
CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %esi ; CHECK-NEXT: imull %edi, %esi +; CHECK-NEXT: cmpl $-2, %edi +; CHECK-NEXT: cmovll %edi, %esi ; CHECK-NEXT: .LBB15_2: # %return ; CHECK-NEXT: movslq %esi, %rax ; CHECK-NEXT: retq @@ -505,10 +499,9 @@ ; CHECK-NEXT: movslq %esi, %rax ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB16_1: # %if.end -; CHECK-NEXT: cmpl $-1, %edi -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %esi ; CHECK-NEXT: imull %edi, %esi +; CHECK-NEXT: cmpl $-1, %edi +; CHECK-NEXT: cmovll %edi, %esi ; CHECK-NEXT: movslq %esi, %rax ; CHECK-NEXT: retq entry: @@ -536,9 +529,9 @@ ; CHECK-NEXT: movslq %esi, %rax ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB17_1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovsl %eax, %esi ; CHECK-NEXT: imull %edi, %esi +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovsl %edi, %esi ; CHECK-NEXT: movslq %esi, %rax ; CHECK-NEXT: retq entry: @@ -563,10 +556,9 @@ ; CHECK-NEXT: cmpl $1, %edi ; CHECK-NEXT: jg .LBB18_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovlel %eax, %esi ; CHECK-NEXT: imull %edi, %esi +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovlel %edi, %esi ; CHECK-NEXT: .LBB18_2: # %return ; CHECK-NEXT: movslq %esi, %rax ; CHECK-NEXT: retq @@ -592,9 +584,9 @@ ; CHECK-NEXT: cmpl $2, %edi ; CHECK-NEXT: jg .LBB19_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %esi ; CHECK-NEXT: imull %edi, %esi +; CHECK-NEXT: cmpl $2, %edi +; CHECK-NEXT: cmovll %edi, %esi ; CHECK-NEXT: .LBB19_2: # %return ; CHECK-NEXT: movslq %esi, %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -220,38 +220,45 @@ ; SSE-32-NEXT: .cfi_offset %ebp, -8 ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: pushl %esi ; SSE-32-NEXT: andl $-8, %esp -; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-32-NEXT: comisd %xmm2, %xmm0 -; SSE-32-NEXT: xorpd %xmm1, %xmm1 -; SSE-32-NEXT: xorpd %xmm3, %xmm3 +; SSE-32-NEXT: subl $32, %esp +; SSE-32-NEXT: .cfi_offset %esi, -12 +; SSE-32-NEXT: movapd %xmm0, %xmm3 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-32-NEXT: comisd %xmm1, %xmm3 +; SSE-32-NEXT: setb %cl +; SSE-32-NEXT: xorpd %xmm2, %xmm2 +; SSE-32-NEXT: xorpd %xmm4, %xmm4 ; SSE-32-NEXT: jb .LBB1_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movapd %xmm2, %xmm3 +; SSE-32-NEXT: movapd %xmm1, %xmm4 ; SSE-32-NEXT: .LBB1_2: -; SSE-32-NEXT: movapd %xmm0, %xmm4 -; SSE-32-NEXT: subsd %xmm3, %xmm4 -; SSE-32-NEXT: movsd %xmm4, {{[0-9]+}}(%esp) -; SSE-32-NEXT: setae %al +; SSE-32-NEXT: subsd %xmm4, %xmm3 +; SSE-32-NEXT: movsd %xmm3, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait ; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; SSE-32-NEXT: orl $3072, %ecx # imm = 0xC00 -; SSE-32-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-32-NEXT: comisd %xmm2, %xmm0 +; 
SSE-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-32-NEXT: movl %edx, %eax +; SSE-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; SSE-32-NEXT: testb %cl, %cl +; SSE-32-NEXT: cmovnel %edx, %eax +; SSE-32-NEXT: comisd %xmm1, %xmm0 +; SSE-32-NEXT: setb %cl ; SSE-32-NEXT: jb .LBB1_4 ; SSE-32-NEXT: # %bb.3: -; SSE-32-NEXT: movapd %xmm2, %xmm1 +; SSE-32-NEXT: movapd %xmm1, %xmm2 ; SSE-32-NEXT: .LBB1_4: -; SSE-32-NEXT: subsd %xmm1, %xmm0 +; SSE-32-NEXT: subsd %xmm2, %xmm0 ; SSE-32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) -; SSE-32-NEXT: setae %cl ; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait ; SSE-32-NEXT: fnstcw (%esp) @@ -261,20 +268,20 @@ ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw (%esp) -; SSE-32-NEXT: movzbl %al, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-32-NEXT: movl %edx, %esi +; SSE-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; SSE-32-NEXT: testb %cl, %cl +; SSE-32-NEXT: cmovnel %edx, %esi +; SSE-32-NEXT: movd %esi, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-32-NEXT: movzbl %cl, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; SSE-32-NEXT: movd %eax, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: leal -4(%ebp), %esp +; SSE-32-NEXT: popl %esi ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: .cfi_def_cfa %esp, 4 ; SSE-32-NEXT: retl @@ -321,45 +328,52 @@ ; AVX-32-NEXT: .cfi_offset %ebp, -8 ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: pushl %esi ; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-32-NEXT: subl $24, %esp +; AVX-32-NEXT: .cfi_offset %esi, -12 ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: jb .LBB1_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: vmovapd %xmm1, %xmm3 ; AVX-32-NEXT: .LBB1_2: -; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovsd %xmm3, (%esp) -; AVX-32-NEXT: fldl (%esp) -; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %al -; AVX-32-NEXT: movzbl %al, %eax -; AVX-32-NEXT: shll $31, %eax -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %eax +; AVX-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %eax +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: jb .LBB1_4 ; AVX-32-NEXT: # %bb.3: ; AVX-32-NEXT: vmovapd %xmm1, %xmm2 ; AVX-32-NEXT: .LBB1_4: ; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 -; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) -; 
AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovsd %xmm0, (%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %cl -; AVX-32-NEXT: movzbl %cl, %ecx -; AVX-32-NEXT: shll $31, %ecx -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %esi +; AVX-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %esi ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX-32-NEXT: leal -4(%ebp), %esp +; AVX-32-NEXT: popl %esi ; AVX-32-NEXT: popl %ebp ; AVX-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX-32-NEXT: retl @@ -404,43 +418,50 @@ ; AVX512F-32-NEXT: .cfi_offset %ebp, -8 ; AVX512F-32-NEXT: movl %esp, %ebp ; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: andl $-8, %esp -; AVX512F-32-NEXT: subl $16, %esp -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512F-32-NEXT: xorl %eax, %eax -; AVX512F-32-NEXT: vcomisd %xmm2, %xmm1 +; AVX512F-32-NEXT: subl $24, %esp +; AVX512F-32-NEXT: .cfi_offset %esi, -12 +; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512F-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX512F-32-NEXT: setb %al ; AVX512F-32-NEXT: setb %cl ; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovapd %xmm2, %xmm4 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubsd %xmm4, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovsd %xmm1, (%esp) -; AVX512F-32-NEXT: fldl (%esp) -; AVX512F-32-NEXT: fisttpll (%esp) -; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %al -; AVX512F-32-NEXT: shll $31, %eax -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: xorl %ecx, %ecx -; AVX512F-32-NEXT: vcomisd %xmm2, %xmm0 -; AVX512F-32-NEXT: setb %dl -; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm2, %xmm2 {%k1} -; AVX512F-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 -; AVX512F-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX512F-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX512F-32-NEXT: vmovsd %xmm2, %xmm3, %xmm3 {%k1} +; AVX512F-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX512F-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %cl -; AVX512F-32-NEXT: shll $31, %ecx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: movl %ecx, %edx +; AVX512F-32-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512F-32-NEXT: testb %al, %al +; AVX512F-32-NEXT: cmovnel %ecx, %edx +; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX512F-32-NEXT: setb %al +; AVX512F-32-NEXT: setb %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 +; AVX512F-32-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512F-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX512F-32-NEXT: vmovsd %xmm0, (%esp) +; AVX512F-32-NEXT: fldl (%esp) +; AVX512F-32-NEXT: fisttpll (%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
AVX512F-32-NEXT: movl %ecx, %esi +; AVX512F-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX512F-32-NEXT: testb %al, %al +; AVX512F-32-NEXT: cmovnel %ecx, %esi ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX512F-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX512F-32-NEXT: leal -4(%ebp), %esp +; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: popl %ebp ; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX512F-32-NEXT: retl @@ -462,43 +483,50 @@ ; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 ; AVX512VL-32-NEXT: movl %esp, %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: pushl %esi ; AVX512VL-32-NEXT: andl $-8, %esp -; AVX512VL-32-NEXT: subl $16, %esp -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vcomisd %xmm2, %xmm1 +; AVX512VL-32-NEXT: subl $24, %esp +; AVX512VL-32-NEXT: .cfi_offset %esi, -12 +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovapd %xmm2, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovsd %xmm1, (%esp) -; AVX512VL-32-NEXT: fldl (%esp) -; AVX512VL-32-NEXT: fisttpll (%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomisd %xmm2, %xmm0 -; AVX512VL-32-NEXT: setb %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm2, %xmm2 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 -; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm3, %xmm3 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %cl -; AVX512VL-32-NEXT: shll $31, %ecx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl %ecx, %edx +; AVX512VL-32-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %al, %al +; AVX512VL-32-NEXT: cmovnel %ecx, %edx +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovsd %xmm0, (%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl %ecx, %esi +; AVX512VL-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %al, %al +; AVX512VL-32-NEXT: cmovnel %ecx, %esi ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: leal -4(%ebp), %esp +; AVX512VL-32-NEXT: popl %esi ; AVX512VL-32-NEXT: popl %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX512VL-32-NEXT: retl @@ -903,38 +931,45 @@ ; SSE-32-NEXT: .cfi_offset %ebp, -8 ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: pushl %esi ; SSE-32-NEXT: andl $-8, %esp -; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-32-NEXT: comiss %xmm2, %xmm0 -; SSE-32-NEXT: xorps %xmm1, %xmm1 -; SSE-32-NEXT: xorps %xmm3, %xmm3 +; SSE-32-NEXT: subl $32, %esp +; SSE-32-NEXT: .cfi_offset %esi, -12 +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-32-NEXT: comiss %xmm1, %xmm3 +; SSE-32-NEXT: setb %cl +; SSE-32-NEXT: xorps %xmm2, %xmm2 +; SSE-32-NEXT: xorps %xmm4, %xmm4 ; SSE-32-NEXT: jb .LBB4_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movaps %xmm2, %xmm3 +; SSE-32-NEXT: movaps %xmm1, %xmm4 ; SSE-32-NEXT: .LBB4_2: -; SSE-32-NEXT: movaps %xmm0, %xmm4 -; SSE-32-NEXT: subss %xmm3, %xmm4 -; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) -; SSE-32-NEXT: setae %al +; SSE-32-NEXT: subss %xmm4, %xmm3 +; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp) ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait ; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; SSE-32-NEXT: orl $3072, %ecx # imm = 0xC00 -; SSE-32-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-32-NEXT: comiss %xmm2, %xmm0 +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-32-NEXT: movl %edx, %eax +; SSE-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; SSE-32-NEXT: testb %cl, %cl +; SSE-32-NEXT: cmovnel %edx, %eax +; SSE-32-NEXT: comiss %xmm1, %xmm0 +; SSE-32-NEXT: setb %cl ; SSE-32-NEXT: jb .LBB4_4 ; SSE-32-NEXT: # %bb.3: -; SSE-32-NEXT: movaps %xmm2, %xmm1 +; SSE-32-NEXT: movaps %xmm1, %xmm2 ; SSE-32-NEXT: .LBB4_4: -; SSE-32-NEXT: subss %xmm1, %xmm0 +; SSE-32-NEXT: subss %xmm2, %xmm0 ; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; SSE-32-NEXT: setae %cl ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait ; SSE-32-NEXT: fnstcw (%esp) @@ -944,20 +979,20 @@ ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw (%esp) -; SSE-32-NEXT: movzbl %al, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-32-NEXT: movl %edx, %esi +; SSE-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; SSE-32-NEXT: testb %cl, %cl +; SSE-32-NEXT: cmovnel %edx, %esi +; SSE-32-NEXT: movd %esi, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-32-NEXT: movzbl %cl, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; 
SSE-32-NEXT: movd %eax, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: leal -4(%ebp), %esp +; SSE-32-NEXT: popl %esi ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: .cfi_def_cfa %esp, 4 ; SSE-32-NEXT: retl @@ -1004,45 +1039,52 @@ ; AVX-32-NEXT: .cfi_offset %ebp, -8 ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: pushl %esi ; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-32-NEXT: subl $24, %esp +; AVX-32-NEXT: .cfi_offset %esi, -12 ; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: jb .LBB4_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: vmovaps %xmm1, %xmm3 ; AVX-32-NEXT: .LBB4_2: -; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) -; AVX-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX-32-NEXT: vmovss %xmm3, (%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %al -; AVX-32-NEXT: movzbl %al, %eax -; AVX-32-NEXT: shll $31, %eax -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %eax +; AVX-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %eax +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: jb .LBB4_4 ; AVX-32-NEXT: # %bb.3: ; AVX-32-NEXT: vmovaps %xmm1, %xmm2 ; AVX-32-NEXT: .LBB4_4: ; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 -; AVX-32-NEXT: vmovss %xmm0, (%esp) -; AVX-32-NEXT: flds (%esp) -; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %cl -; AVX-32-NEXT: movzbl %cl, %ecx -; AVX-32-NEXT: shll $31, %ecx -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %esi +; AVX-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %esi ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX-32-NEXT: leal -4(%ebp), %esp +; AVX-32-NEXT: popl %esi ; AVX-32-NEXT: popl %ebp ; AVX-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX-32-NEXT: retl @@ -1087,43 +1129,50 @@ ; AVX512F-32-NEXT: .cfi_offset %ebp, -8 ; AVX512F-32-NEXT: movl %esp, %ebp ; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: andl $-8, %esp -; AVX512F-32-NEXT: subl $16, %esp -; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; 
AVX512F-32-NEXT: xorl %eax, %eax -; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512F-32-NEXT: subl $24, %esp +; AVX512F-32-NEXT: .cfi_offset %esi, -12 +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX512F-32-NEXT: setb %al ; AVX512F-32-NEXT: setb %cl ; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovaps %xmm2, %xmm4 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubss %xmm4, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %al -; AVX512F-32-NEXT: shll $31, %eax -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: xorl %ecx, %ecx -; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0 -; AVX512F-32-NEXT: setb %dl -; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} -; AVX512F-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 -; AVX512F-32-NEXT: vmovss %xmm0, (%esp) +; AVX512F-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512F-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX512F-32-NEXT: vmovss %xmm3, (%esp) ; AVX512F-32-NEXT: flds (%esp) ; AVX512F-32-NEXT: fisttpll (%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %cl -; AVX512F-32-NEXT: shll $31, %ecx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: movl %ecx, %edx +; AVX512F-32-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512F-32-NEXT: testb %al, %al +; AVX512F-32-NEXT: cmovnel %ecx, %edx +; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX512F-32-NEXT: setb %al +; AVX512F-32-NEXT: setb %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512F-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512F-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: movl %ecx, %esi +; AVX512F-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX512F-32-NEXT: testb %al, %al +; AVX512F-32-NEXT: cmovnel %ecx, %esi ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX512F-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX512F-32-NEXT: leal -4(%ebp), %esp +; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: popl %ebp ; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX512F-32-NEXT: retl @@ -1145,43 +1194,50 @@ ; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 ; AVX512VL-32-NEXT: movl %esp, %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: pushl %esi ; AVX512VL-32-NEXT: andl $-8, %esp -; AVX512VL-32-NEXT: subl $16, %esp -; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512VL-32-NEXT: subl $24, %esp +; AVX512VL-32-NEXT: .cfi_offset %esi, -12 +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = 
mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0 -; AVX512VL-32-NEXT: setb %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 -; AVX512VL-32-NEXT: vmovss %xmm0, (%esp) +; AVX512VL-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, (%esp) ; AVX512VL-32-NEXT: flds (%esp) ; AVX512VL-32-NEXT: fisttpll (%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %cl -; AVX512VL-32-NEXT: shll $31, %ecx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl %ecx, %edx +; AVX512VL-32-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %al, %al +; AVX512VL-32-NEXT: cmovnel %ecx, %edx +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl %ecx, %esi +; AVX512VL-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %al, %al +; AVX512VL-32-NEXT: cmovnel %ecx, %esi ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: leal -4(%ebp), %esp +; AVX512VL-32-NEXT: popl %esi ; AVX512VL-32-NEXT: popl %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX512VL-32-NEXT: retl @@ -1221,40 +1277,47 @@ ; SSE-32-NEXT: .cfi_offset %ebp, -8 ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: pushl %esi ; SSE-32-NEXT: andl $-8, %esp -; SSE-32-NEXT: subl $24, %esp +; SSE-32-NEXT: subl $32, %esp +; SSE-32-NEXT: .cfi_offset %esi, -12 ; SSE-32-NEXT: movl 8(%ebp), %eax ; SSE-32-NEXT: movaps (%eax), %xmm0 -; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-32-NEXT: comiss %xmm2, %xmm0 -; SSE-32-NEXT: xorps %xmm1, %xmm1 -; SSE-32-NEXT: xorps %xmm3, %xmm3 +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-32-NEXT: comiss %xmm1, %xmm3 +; 
SSE-32-NEXT: setb %cl +; SSE-32-NEXT: xorps %xmm2, %xmm2 +; SSE-32-NEXT: xorps %xmm4, %xmm4 ; SSE-32-NEXT: jb .LBB5_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movaps %xmm2, %xmm3 +; SSE-32-NEXT: movaps %xmm1, %xmm4 ; SSE-32-NEXT: .LBB5_2: -; SSE-32-NEXT: movaps %xmm0, %xmm4 -; SSE-32-NEXT: subss %xmm3, %xmm4 -; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) -; SSE-32-NEXT: setae %al +; SSE-32-NEXT: subss %xmm4, %xmm3 +; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp) ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait ; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; SSE-32-NEXT: orl $3072, %ecx # imm = 0xC00 -; SSE-32-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-32-NEXT: comiss %xmm2, %xmm0 +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-32-NEXT: movl %edx, %eax +; SSE-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; SSE-32-NEXT: testb %cl, %cl +; SSE-32-NEXT: cmovnel %edx, %eax +; SSE-32-NEXT: comiss %xmm1, %xmm0 +; SSE-32-NEXT: setb %cl ; SSE-32-NEXT: jb .LBB5_4 ; SSE-32-NEXT: # %bb.3: -; SSE-32-NEXT: movaps %xmm2, %xmm1 +; SSE-32-NEXT: movaps %xmm1, %xmm2 ; SSE-32-NEXT: .LBB5_4: -; SSE-32-NEXT: subss %xmm1, %xmm0 +; SSE-32-NEXT: subss %xmm2, %xmm0 ; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; SSE-32-NEXT: setae %cl ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait ; SSE-32-NEXT: fnstcw (%esp) @@ -1264,20 +1327,20 @@ ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw (%esp) -; SSE-32-NEXT: movzbl %al, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-32-NEXT: movl %edx, %esi +; SSE-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; SSE-32-NEXT: testb %cl, %cl +; SSE-32-NEXT: cmovnel %edx, %esi +; SSE-32-NEXT: movd %esi, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-32-NEXT: movzbl %cl, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; SSE-32-NEXT: movd %eax, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: leal -4(%ebp), %esp +; SSE-32-NEXT: popl %esi ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: .cfi_def_cfa %esp, 4 ; SSE-32-NEXT: retl @@ -1324,13 +1387,16 @@ ; AVX-32-NEXT: .cfi_offset %ebp, -8 ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: pushl %esi ; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: subl $24, %esp +; AVX-32-NEXT: .cfi_offset %esi, -12 ; AVX-32-NEXT: movl 8(%ebp), %eax -; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB5_2 @@ 
-1338,33 +1404,37 @@ ; AVX-32-NEXT: vmovaps %xmm1, %xmm4 ; AVX-32-NEXT: .LBB5_2: ; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) -; AVX-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovss %xmm3, (%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %al -; AVX-32-NEXT: movzbl %al, %eax -; AVX-32-NEXT: shll $31, %eax -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %eax +; AVX-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %eax ; AVX-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: jb .LBB5_4 ; AVX-32-NEXT: # %bb.3: ; AVX-32-NEXT: vmovaps %xmm1, %xmm2 ; AVX-32-NEXT: .LBB5_4: ; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 -; AVX-32-NEXT: vmovss %xmm0, (%esp) -; AVX-32-NEXT: flds (%esp) -; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %cl -; AVX-32-NEXT: movzbl %cl, %ecx -; AVX-32-NEXT: shll $31, %ecx -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %esi +; AVX-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %esi ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX-32-NEXT: leal -4(%ebp), %esp +; AVX-32-NEXT: popl %esi ; AVX-32-NEXT: popl %ebp ; AVX-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX-32-NEXT: retl @@ -1410,45 +1480,52 @@ ; AVX512F-32-NEXT: .cfi_offset %ebp, -8 ; AVX512F-32-NEXT: movl %esp, %ebp ; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: andl $-8, %esp -; AVX512F-32-NEXT: subl $16, %esp +; AVX512F-32-NEXT: subl $24, %esp +; AVX512F-32-NEXT: .cfi_offset %esi, -12 ; AVX512F-32-NEXT: movl 8(%ebp), %eax ; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: xorl %eax, %eax -; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0 +; AVX512F-32-NEXT: setb %al ; AVX512F-32-NEXT: setb %cl ; AVX512F-32-NEXT: kmovw %ecx, %k1 ; AVX512F-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX512F-32-NEXT: vmovaps %xmm2, %xmm4 ; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubss %xmm4, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %al -; AVX512F-32-NEXT: shll $31, %eax -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: xorl %ecx, %ecx -; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0 -; AVX512F-32-NEXT: setb %dl -; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} -; AVX512F-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX512F-32-NEXT: vsubss %xmm4, %xmm0, %xmm0 ; AVX512F-32-NEXT: vmovss %xmm0, (%esp) ; AVX512F-32-NEXT: flds 
(%esp) ; AVX512F-32-NEXT: fisttpll (%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %cl -; AVX512F-32-NEXT: shll $31, %ecx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: movl %ecx, %edx +; AVX512F-32-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512F-32-NEXT: testb %al, %al +; AVX512F-32-NEXT: cmovnel %ecx, %edx +; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512F-32-NEXT: setb %al +; AVX512F-32-NEXT: setb %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 +; AVX512F-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} +; AVX512F-32-NEXT: vsubss %xmm2, %xmm1, %xmm0 +; AVX512F-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: movl %ecx, %esi +; AVX512F-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX512F-32-NEXT: testb %al, %al +; AVX512F-32-NEXT: cmovnel %ecx, %esi ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX512F-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX512F-32-NEXT: leal -4(%ebp), %esp +; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: popl %ebp ; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX512F-32-NEXT: retl @@ -1469,45 +1546,52 @@ ; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 ; AVX512VL-32-NEXT: movl %esp, %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: pushl %esi ; AVX512VL-32-NEXT: andl $-8, %esp -; AVX512VL-32-NEXT: subl $16, %esp +; AVX512VL-32-NEXT: subl $24, %esp +; AVX512VL-32-NEXT: .cfi_offset %esi, -12 ; AVX512VL-32-NEXT: movl 8(%ebp), %eax ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0 +; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 ; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4 ; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0 -; AVX512VL-32-NEXT: setb %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovss %xmm0, (%esp) ; AVX512VL-32-NEXT: flds (%esp) ; AVX512VL-32-NEXT: fisttpll (%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %cl -; AVX512VL-32-NEXT: shll $31, %ecx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl %ecx, %edx +; AVX512VL-32-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %al, %al +; AVX512VL-32-NEXT: 
cmovnel %ecx, %edx +; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm2, %xmm1, %xmm0 +; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl %ecx, %esi +; AVX512VL-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %al, %al +; AVX512VL-32-NEXT: cmovnel %ecx, %esi ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: leal -4(%ebp), %esp +; AVX512VL-32-NEXT: popl %esi ; AVX512VL-32-NEXT: popl %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX512VL-32-NEXT: retl @@ -2414,38 +2498,45 @@ ; SSE-32-NEXT: .cfi_offset %ebp, -8 ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: pushl %esi ; SSE-32-NEXT: andl $-8, %esp -; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-32-NEXT: comisd %xmm2, %xmm0 -; SSE-32-NEXT: xorpd %xmm1, %xmm1 -; SSE-32-NEXT: xorpd %xmm3, %xmm3 +; SSE-32-NEXT: subl $32, %esp +; SSE-32-NEXT: .cfi_offset %esi, -12 +; SSE-32-NEXT: movapd %xmm0, %xmm3 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-32-NEXT: comisd %xmm1, %xmm3 +; SSE-32-NEXT: setb %cl +; SSE-32-NEXT: xorpd %xmm2, %xmm2 +; SSE-32-NEXT: xorpd %xmm4, %xmm4 ; SSE-32-NEXT: jb .LBB19_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movapd %xmm2, %xmm3 +; SSE-32-NEXT: movapd %xmm1, %xmm4 ; SSE-32-NEXT: .LBB19_2: -; SSE-32-NEXT: movapd %xmm0, %xmm4 -; SSE-32-NEXT: subsd %xmm3, %xmm4 -; SSE-32-NEXT: movsd %xmm4, {{[0-9]+}}(%esp) -; SSE-32-NEXT: setae %al +; SSE-32-NEXT: subsd %xmm4, %xmm3 +; SSE-32-NEXT: movsd %xmm3, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait ; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; SSE-32-NEXT: orl $3072, %ecx # imm = 0xC00 -; SSE-32-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-32-NEXT: comisd %xmm2, %xmm0 +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-32-NEXT: movl %edx, %eax +; SSE-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; SSE-32-NEXT: testb %cl, %cl +; SSE-32-NEXT: cmovnel %edx, %eax +; SSE-32-NEXT: comisd %xmm1, %xmm0 +; SSE-32-NEXT: setb %cl ; SSE-32-NEXT: jb .LBB19_4 ; SSE-32-NEXT: # %bb.3: -; SSE-32-NEXT: movapd %xmm2, %xmm1 +; SSE-32-NEXT: movapd %xmm1, %xmm2 ; SSE-32-NEXT: .LBB19_4: -; SSE-32-NEXT: subsd %xmm1, %xmm0 +; SSE-32-NEXT: subsd %xmm2, %xmm0 ; SSE-32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) -; SSE-32-NEXT: setae %cl ; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait ; SSE-32-NEXT: fnstcw (%esp) @@ -2455,20 +2546,20 @@ ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) 
; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw (%esp) -; SSE-32-NEXT: movzbl %al, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-32-NEXT: movl %edx, %esi +; SSE-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; SSE-32-NEXT: testb %cl, %cl +; SSE-32-NEXT: cmovnel %edx, %esi +; SSE-32-NEXT: movd %esi, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-32-NEXT: movzbl %cl, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; SSE-32-NEXT: movd %eax, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: leal -4(%ebp), %esp +; SSE-32-NEXT: popl %esi ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: .cfi_def_cfa %esp, 4 ; SSE-32-NEXT: retl @@ -2515,45 +2606,52 @@ ; AVX-32-NEXT: .cfi_offset %ebp, -8 ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: pushl %esi ; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-32-NEXT: subl $24, %esp +; AVX-32-NEXT: .cfi_offset %esi, -12 ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: jb .LBB19_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: vmovapd %xmm1, %xmm3 ; AVX-32-NEXT: .LBB19_2: -; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovsd %xmm3, (%esp) -; AVX-32-NEXT: fldl (%esp) -; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %al -; AVX-32-NEXT: movzbl %al, %eax -; AVX-32-NEXT: shll $31, %eax -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %eax +; AVX-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %eax +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: jb .LBB19_4 ; AVX-32-NEXT: # %bb.3: ; AVX-32-NEXT: vmovapd %xmm1, %xmm2 ; AVX-32-NEXT: .LBB19_4: ; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 -; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovsd %xmm0, (%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %cl -; AVX-32-NEXT: movzbl %cl, %ecx -; AVX-32-NEXT: shll $31, %ecx -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %esi +; AVX-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %esi ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm0, 
%xmm0 -; AVX-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX-32-NEXT: leal -4(%ebp), %esp +; AVX-32-NEXT: popl %esi ; AVX-32-NEXT: popl %ebp ; AVX-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX-32-NEXT: retl @@ -2790,38 +2888,45 @@ ; SSE-32-NEXT: .cfi_offset %ebp, -8 ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: pushl %esi ; SSE-32-NEXT: andl $-8, %esp -; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-32-NEXT: comiss %xmm2, %xmm0 -; SSE-32-NEXT: xorps %xmm1, %xmm1 -; SSE-32-NEXT: xorps %xmm3, %xmm3 +; SSE-32-NEXT: subl $32, %esp +; SSE-32-NEXT: .cfi_offset %esi, -12 +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-32-NEXT: comiss %xmm1, %xmm3 +; SSE-32-NEXT: setb %cl +; SSE-32-NEXT: xorps %xmm2, %xmm2 +; SSE-32-NEXT: xorps %xmm4, %xmm4 ; SSE-32-NEXT: jb .LBB21_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movaps %xmm2, %xmm3 +; SSE-32-NEXT: movaps %xmm1, %xmm4 ; SSE-32-NEXT: .LBB21_2: -; SSE-32-NEXT: movaps %xmm0, %xmm4 -; SSE-32-NEXT: subss %xmm3, %xmm4 -; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) -; SSE-32-NEXT: setae %al +; SSE-32-NEXT: subss %xmm4, %xmm3 +; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp) ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait ; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; SSE-32-NEXT: orl $3072, %ecx # imm = 0xC00 -; SSE-32-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-32-NEXT: comiss %xmm2, %xmm0 +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-32-NEXT: movl %edx, %eax +; SSE-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; SSE-32-NEXT: testb %cl, %cl +; SSE-32-NEXT: cmovnel %edx, %eax +; SSE-32-NEXT: comiss %xmm1, %xmm0 +; SSE-32-NEXT: setb %cl ; SSE-32-NEXT: jb .LBB21_4 ; SSE-32-NEXT: # %bb.3: -; SSE-32-NEXT: movaps %xmm2, %xmm1 +; SSE-32-NEXT: movaps %xmm1, %xmm2 ; SSE-32-NEXT: .LBB21_4: -; SSE-32-NEXT: subss %xmm1, %xmm0 +; SSE-32-NEXT: subss %xmm2, %xmm0 ; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; SSE-32-NEXT: setae %cl ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait ; SSE-32-NEXT: fnstcw (%esp) @@ -2831,20 +2936,20 @@ ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw (%esp) -; SSE-32-NEXT: movzbl %al, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-32-NEXT: movl %edx, %esi +; SSE-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; SSE-32-NEXT: testb %cl, %cl +; SSE-32-NEXT: cmovnel %edx, %esi +; SSE-32-NEXT: movd %esi, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-32-NEXT: movzbl %cl, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; SSE-32-NEXT: movd %eax, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0] -; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: leal -4(%ebp), %esp +; SSE-32-NEXT: popl %esi ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: .cfi_def_cfa %esp, 4 ; SSE-32-NEXT: retl @@ -2891,45 +2996,52 @@ ; AVX-32-NEXT: .cfi_offset %ebp, -8 ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: pushl %esi ; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-32-NEXT: subl $24, %esp +; AVX-32-NEXT: .cfi_offset %esi, -12 ; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: jb .LBB21_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: vmovaps %xmm1, %xmm3 ; AVX-32-NEXT: .LBB21_2: -; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) -; AVX-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX-32-NEXT: vmovss %xmm3, (%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %al -; AVX-32-NEXT: movzbl %al, %eax -; AVX-32-NEXT: shll $31, %eax -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %eax +; AVX-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %eax +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: jb .LBB21_4 ; AVX-32-NEXT: # %bb.3: ; AVX-32-NEXT: vmovaps %xmm1, %xmm2 ; AVX-32-NEXT: .LBB21_4: ; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 -; AVX-32-NEXT: vmovss %xmm0, (%esp) -; AVX-32-NEXT: flds (%esp) -; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %cl -; AVX-32-NEXT: movzbl %cl, %ecx -; AVX-32-NEXT: shll $31, %ecx -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %esi +; AVX-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %esi ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX-32-NEXT: leal -4(%ebp), %esp +; AVX-32-NEXT: popl %esi ; AVX-32-NEXT: popl %ebp ; AVX-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX-32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -224,82 +224,99 @@ ; AVX-32-NEXT: .cfi_offset %ebp, -8 ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: pushl %ebx +; AVX-32-NEXT: pushl %edi +; AVX-32-NEXT: pushl %esi ; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-32-NEXT: subl $40, %esp +; 
AVX-32-NEXT: .cfi_offset %esi, -20 +; AVX-32-NEXT: .cfi_offset %edi, -16 +; AVX-32-NEXT: .cfi_offset %ebx, -12 ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: jb .LBB1_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: vmovapd %xmm1, %xmm3 ; AVX-32-NEXT: .LBB1_2: -; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 ; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %al -; AVX-32-NEXT: movzbl %al, %eax -; AVX-32-NEXT: shll $31, %eax -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX-32-NEXT: vcomisd %xmm1, %xmm4 -; AVX-32-NEXT: vxorpd %xmm5, %xmm5, %xmm5 +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %eax +; AVX-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %eax +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX-32-NEXT: setb %dl +; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB1_4 ; AVX-32-NEXT: # %bb.3: -; AVX-32-NEXT: vmovapd %xmm1, %xmm5 +; AVX-32-NEXT: vmovapd %xmm1, %xmm4 ; AVX-32-NEXT: .LBB1_4: -; AVX-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 -; AVX-32-NEXT: vmovsd %xmm4, (%esp) -; AVX-32-NEXT: fldl (%esp) -; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %cl -; AVX-32-NEXT: movzbl %cl, %ecx -; AVX-32-NEXT: shll $31, %ecx -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vcomisd %xmm1, %xmm3 -; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX-32-NEXT: movl %esi, %ecx +; AVX-32-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 +; AVX-32-NEXT: testb %dl, %dl +; AVX-32-NEXT: cmovnel %esi, %ecx +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX-32-NEXT: setb %bl +; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: jb .LBB1_6 ; AVX-32-NEXT: # %bb.5: -; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: vmovapd %xmm1, %xmm3 ; AVX-32-NEXT: .LBB1_6: -; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 ; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %dl -; AVX-32-NEXT: movzbl %dl, %edx -; AVX-32-NEXT: shll $31, %edx -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX-32-NEXT: movl %esi, %edx +; AVX-32-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX-32-NEXT: testb %bl, %bl +; AVX-32-NEXT: cmovnel %esi, %edx +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX-32-NEXT: setb %bl ; AVX-32-NEXT: jb .LBB1_8 ; AVX-32-NEXT: # %bb.7: ; AVX-32-NEXT: vmovapd %xmm1, %xmm2 ; AVX-32-NEXT: .LBB1_8: ; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 -; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) -; 
AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovsd %xmm0, (%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX-32-NEXT: movl %esi, %edi +; AVX-32-NEXT: xorl $-2147483648, %edi # imm = 0x80000000 +; AVX-32-NEXT: testb %bl, %bl +; AVX-32-NEXT: cmovnel %esi, %edi ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX-32-NEXT: setae %cl -; AVX-32-NEXT: movzbl %cl, %ecx -; AVX-32-NEXT: shll $31, %ecx -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: leal -12(%ebp), %esp +; AVX-32-NEXT: popl %esi +; AVX-32-NEXT: popl %edi +; AVX-32-NEXT: popl %ebx ; AVX-32-NEXT: popl %ebp ; AVX-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX-32-NEXT: retl @@ -375,83 +392,93 @@ ; AVX512F-32-NEXT: movl %esp, %ebp ; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp ; AVX512F-32-NEXT: pushl %ebx +; AVX512F-32-NEXT: pushl %edi ; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: andl $-8, %esp -; AVX512F-32-NEXT: subl $32, %esp -; AVX512F-32-NEXT: .cfi_offset %esi, -16 +; AVX512F-32-NEXT: subl $40, %esp +; AVX512F-32-NEXT: .cfi_offset %esi, -20 +; AVX512F-32-NEXT: .cfi_offset %edi, -16 ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2 +; AVX512F-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX512F-32-NEXT: setb %cl +; AVX512F-32-NEXT: setb %al +; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX512F-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX512F-32-NEXT: vmovsd %xmm2, %xmm3, %xmm3 {%k1} +; AVX512F-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX512F-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512F-32-NEXT: testb %cl, %cl +; AVX512F-32-NEXT: cmovnel %edx, %eax +; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512F-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX512F-32-NEXT: setb %dl ; AVX512F-32-NEXT: setb %cl ; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX512F-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} +; AVX512F-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX512F-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: movl $0, %eax -; AVX512F-32-NEXT: setae %al -; AVX512F-32-NEXT: shll $31, %eax -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: movl %eax, %esi -; AVX512F-32-NEXT: vextractf128 $1, 
%ymm0, %xmm2 -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX512F-32-NEXT: xorl %ecx, %ecx -; AVX512F-32-NEXT: vcomisd %xmm1, %xmm4 +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512F-32-NEXT: movl %esi, %ecx +; AVX512F-32-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 +; AVX512F-32-NEXT: testb %dl, %dl +; AVX512F-32-NEXT: cmovnel %esi, %ecx +; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512F-32-NEXT: vcomisd %xmm1, %xmm0 ; AVX512F-32-NEXT: setb %dl -; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovapd %xmm1, %xmm5 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm5, %xmm5 {%k1} -; AVX512F-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 -; AVX512F-32-NEXT: vmovsd %xmm4, (%esp) -; AVX512F-32-NEXT: fldl (%esp) -; AVX512F-32-NEXT: fisttpll (%esp) -; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %cl -; AVX512F-32-NEXT: shll $31, %ecx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: xorl %edx, %edx -; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512F-32-NEXT: setb %bl ; AVX512F-32-NEXT: kmovw %ebx, %k1 -; AVX512F-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX512F-32-NEXT: vmovsd %xmm2, %xmm3, %xmm3 {%k1} +; AVX512F-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX512F-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %dl -; AVX512F-32-NEXT: shll $31, %edx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: xorl %ebx, %ebx +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512F-32-NEXT: movl %esi, %edi +; AVX512F-32-NEXT: xorl $-2147483648, %edi # imm = 0x80000000 +; AVX512F-32-NEXT: testb %dl, %dl +; AVX512F-32-NEXT: cmovnel %esi, %edi +; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512F-32-NEXT: vcomisd %xmm1, %xmm0 -; AVX512F-32-NEXT: setb %al -; AVX512F-32-NEXT: kmovw %eax, %k1 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm1, %xmm1 {%k1} +; AVX512F-32-NEXT: setb %dl +; AVX512F-32-NEXT: setb %bl +; AVX512F-32-NEXT: kmovw %ebx, %k1 +; AVX512F-32-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} ; AVX512F-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX512F-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vmovsd %xmm0, (%esp) +; AVX512F-32-NEXT: fldl (%esp) +; AVX512F-32-NEXT: fisttpll (%esp) ; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512F-32-NEXT: movl %esi, %ebx +; AVX512F-32-NEXT: xorl $-2147483648, %ebx # imm = 0x80000000 +; AVX512F-32-NEXT: testb %dl, %dl +; AVX512F-32-NEXT: cmovnel %esi, %ebx ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ; AVX512F-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX512F-32-NEXT: setae %bl -; AVX512F-32-NEXT: shll $31, %ebx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512F-32-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 +; AVX512F-32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; AVX512F-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512F-32-NEXT: vpinsrd 
$3, %ecx, %xmm1, %xmm1 ; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-32-NEXT: leal -8(%ebp), %esp +; AVX512F-32-NEXT: leal -12(%ebp), %esp ; AVX512F-32-NEXT: popl %esi +; AVX512F-32-NEXT: popl %edi ; AVX512F-32-NEXT: popl %ebx ; AVX512F-32-NEXT: popl %ebp ; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 @@ -483,83 +510,93 @@ ; AVX512VL-32-NEXT: movl %esp, %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp ; AVX512VL-32-NEXT: pushl %ebx +; AVX512VL-32-NEXT: pushl %edi ; AVX512VL-32-NEXT: pushl %esi ; AVX512VL-32-NEXT: andl $-8, %esp -; AVX512VL-32-NEXT: subl $32, %esp -; AVX512VL-32-NEXT: .cfi_offset %esi, -16 +; AVX512VL-32-NEXT: subl $40, %esp +; AVX512VL-32-NEXT: .cfi_offset %esi, -20 +; AVX512VL-32-NEXT: .cfi_offset %edi, -16 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm3, %xmm3 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: movl %edx, %eax +; AVX512VL-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %cl, %cl +; AVX512VL-32-NEXT: cmovnel %edx, %eax +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %dl ; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %esi -; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: movl %esi, %ecx +; AVX512VL-32-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %dl, %dl +; AVX512VL-32-NEXT: cmovnel %esi, %ecx +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 ; AVX512VL-32-NEXT: setb %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm4, (%esp) -; AVX512VL-32-NEXT: fldl (%esp) -; AVX512VL-32-NEXT: fisttpll (%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %cl -; AVX512VL-32-NEXT: shll $31, %ecx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX512VL-32-NEXT: 
xorl %edx, %edx -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512VL-32-NEXT: setb %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm3, %xmm3 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %dl -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx -; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: movl %esi, %edi +; AVX512VL-32-NEXT: xorl $-2147483648, %edi # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %dl, %dl +; AVX512VL-32-NEXT: cmovnel %esi, %edi +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} ; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovsd %xmm0, (%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: movl %esi, %ebx +; AVX512VL-32-NEXT: xorl $-2147483648, %ebx # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %dl, %dl +; AVX512VL-32-NEXT: cmovnel %esi, %ebx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX512VL-32-NEXT: setae %bl -; AVX512VL-32-NEXT: shll $31, %ebx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-32-NEXT: leal -8(%ebp), %esp +; AVX512VL-32-NEXT: leal -12(%ebp), %esp ; AVX512VL-32-NEXT: popl %esi +; AVX512VL-32-NEXT: popl %edi ; AVX512VL-32-NEXT: popl %ebx ; AVX512VL-32-NEXT: popl %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 @@ -786,44 +823,54 @@ ; AVX-32-NEXT: .cfi_offset %ebp, -8 ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: pushl %ebx +; AVX-32-NEXT: pushl %edi +; AVX-32-NEXT: pushl %esi ; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-32-NEXT: subl $40, %esp +; AVX-32-NEXT: .cfi_offset %esi, -20 +; AVX-32-NEXT: .cfi_offset %edi, -16 +; AVX-32-NEXT: .cfi_offset %ebx, -12 ; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; 
AVX-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX-32-NEXT: setb %cl ; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: jb .LBB3_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: vmovaps %xmm1, %xmm3 ; AVX-32-NEXT: .LBB3_2: -; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 ; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) ; AVX-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %al -; AVX-32-NEXT: movzbl %al, %eax -; AVX-32-NEXT: shll $31, %eax -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl %edx, %eax +; AVX-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX-32-NEXT: testb %cl, %cl +; AVX-32-NEXT: cmovnel %edx, %eax +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX-32-NEXT: setb %dl ; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB3_4 ; AVX-32-NEXT: # %bb.3: ; AVX-32-NEXT: vmovaps %xmm1, %xmm4 ; AVX-32-NEXT: .LBB3_4: ; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, (%esp) -; AVX-32-NEXT: flds (%esp) -; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %cl -; AVX-32-NEXT: movzbl %cl, %ecx -; AVX-32-NEXT: shll $31, %ecx -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX-32-NEXT: movl %esi, %ecx +; AVX-32-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 +; AVX-32-NEXT: testb %dl, %dl +; AVX-32-NEXT: cmovnel %esi, %ecx ; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX-32-NEXT: setb %bl ; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB3_6 ; AVX-32-NEXT: # %bb.5: @@ -834,34 +881,41 @@ ; AVX-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait -; AVX-32-NEXT: setae %dl -; AVX-32-NEXT: movzbl %dl, %edx -; AVX-32-NEXT: shll $31, %edx -; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX-32-NEXT: movl %esi, %edx +; AVX-32-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX-32-NEXT: testb %bl, %bl +; AVX-32-NEXT: cmovnel %esi, %edx +; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX-32-NEXT: setb %bl ; AVX-32-NEXT: jb .LBB3_8 ; AVX-32-NEXT: # %bb.7: ; AVX-32-NEXT: vmovaps %xmm1, %xmm2 ; AVX-32-NEXT: .LBB3_8: ; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 -; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovss %xmm0, (%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX-32-NEXT: movl %esi, %edi +; AVX-32-NEXT: xorl $-2147483648, %edi # imm = 0x80000000 +; AVX-32-NEXT: testb %bl, %bl +; AVX-32-NEXT: cmovnel %esi, %edi ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX-32-NEXT: setae %cl -; AVX-32-NEXT: movzbl %cl, %ecx -; AVX-32-NEXT: shll $31, %ecx -; 
AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: leal -12(%ebp), %esp +; AVX-32-NEXT: popl %esi +; AVX-32-NEXT: popl %edi +; AVX-32-NEXT: popl %ebx ; AVX-32-NEXT: popl %ebp ; AVX-32-NEXT: .cfi_def_cfa %esp, 4 ; AVX-32-NEXT: retl @@ -937,83 +991,93 @@ ; AVX512F-32-NEXT: movl %esp, %ebp ; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp ; AVX512F-32-NEXT: pushl %ebx +; AVX512F-32-NEXT: pushl %edi ; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: andl $-8, %esp -; AVX512F-32-NEXT: subl $32, %esp -; AVX512F-32-NEXT: .cfi_offset %esi, -16 +; AVX512F-32-NEXT: subl $40, %esp +; AVX512F-32-NEXT: .cfi_offset %esi, -20 +; AVX512F-32-NEXT: .cfi_offset %edi, -16 ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 -; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX512F-32-NEXT: vcomiss %xmm1, %xmm0 ; AVX512F-32-NEXT: setb %cl -; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: setb %al +; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512F-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX512F-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: movl $0, %eax -; AVX512F-32-NEXT: setae %al -; AVX512F-32-NEXT: shll $31, %eax -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: movl %eax, %esi -; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512F-32-NEXT: xorl %ecx, %ecx -; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512F-32-NEXT: testb %cl, %cl +; AVX512F-32-NEXT: cmovnel %edx, %eax +; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512F-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512F-32-NEXT: setb %dl -; AVX512F-32-NEXT: kmovw %edx, %k1 +; AVX512F-32-NEXT: setb %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 ; AVX512F-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovss %xmm2, (%esp) -; AVX512F-32-NEXT: flds (%esp) -; AVX512F-32-NEXT: fisttpll (%esp) +; AVX512F-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; AVX512F-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX512F-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %cl -; AVX512F-32-NEXT: shll $31, %ecx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512F-32-NEXT: xorl %edx, %edx -; 
AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512F-32-NEXT: movl %esi, %ecx +; AVX512F-32-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 +; AVX512F-32-NEXT: testb %dl, %dl +; AVX512F-32-NEXT: cmovnel %esi, %ecx +; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512F-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX512F-32-NEXT: setb %dl ; AVX512F-32-NEXT: setb %bl ; AVX512F-32-NEXT: kmovw %ebx, %k1 ; AVX512F-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; AVX512F-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX512F-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %dl -; AVX512F-32-NEXT: shll $31, %edx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: xorl %ebx, %ebx +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512F-32-NEXT: movl %esi, %edi +; AVX512F-32-NEXT: xorl $-2147483648, %edi # imm = 0x80000000 +; AVX512F-32-NEXT: testb %dl, %dl +; AVX512F-32-NEXT: cmovnel %esi, %edi +; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-32-NEXT: vcomiss %xmm1, %xmm0 -; AVX512F-32-NEXT: setb %al -; AVX512F-32-NEXT: kmovw %eax, %k1 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm1, %xmm1 {%k1} +; AVX512F-32-NEXT: setb %dl +; AVX512F-32-NEXT: setb %bl +; AVX512F-32-NEXT: kmovw %ebx, %k1 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512F-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX512F-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vmovss %xmm0, (%esp) +; AVX512F-32-NEXT: flds (%esp) +; AVX512F-32-NEXT: fisttpll (%esp) ; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512F-32-NEXT: movl %esi, %ebx +; AVX512F-32-NEXT: xorl $-2147483648, %ebx # imm = 0x80000000 +; AVX512F-32-NEXT: testb %dl, %dl +; AVX512F-32-NEXT: cmovnel %esi, %ebx ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ; AVX512F-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX512F-32-NEXT: setae %bl -; AVX512F-32-NEXT: shll $31, %ebx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512F-32-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 +; AVX512F-32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; AVX512F-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-32-NEXT: leal -8(%ebp), %esp +; AVX512F-32-NEXT: leal -12(%ebp), %esp ; AVX512F-32-NEXT: popl %esi +; AVX512F-32-NEXT: popl %edi ; AVX512F-32-NEXT: popl %ebx ; AVX512F-32-NEXT: popl %ebp ; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 @@ -1045,83 +1109,93 @@ ; AVX512VL-32-NEXT: movl %esp, %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp ; AVX512VL-32-NEXT: pushl %ebx +; AVX512VL-32-NEXT: pushl %edi ; AVX512VL-32-NEXT: pushl %esi ; AVX512VL-32-NEXT: andl $-8, %esp -; AVX512VL-32-NEXT: subl $32, %esp -; AVX512VL-32-NEXT: .cfi_offset %esi, -16 +; 
AVX512VL-32-NEXT: subl $40, %esp +; AVX512VL-32-NEXT: .cfi_offset %esi, -20 +; AVX512VL-32-NEXT: .cfi_offset %edi, -16 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 ; AVX512VL-32-NEXT: setb %cl -; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %esi -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: movl %edx, %eax +; AVX512VL-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %cl, %cl +; AVX512VL-32-NEXT: cmovnel %edx, %eax +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovss %xmm2, (%esp) -; AVX512VL-32-NEXT: flds (%esp) -; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %cl -; AVX512VL-32-NEXT: shll $31, %ecx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: movl %esi, %ecx +; AVX512VL-32-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %dl, %dl +; AVX512VL-32-NEXT: cmovnel %esi, %ecx +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %dl ; AVX512VL-32-NEXT: setb %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait 
-; AVX512VL-32-NEXT: setae %dl -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx -; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: movl %esi, %edi +; AVX512VL-32-NEXT: xorl $-2147483648, %edi # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %dl, %dl +; AVX512VL-32-NEXT: cmovnel %esi, %edi +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovss %xmm0, (%esp) +; AVX512VL-32-NEXT: flds (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: movl %esi, %ebx +; AVX512VL-32-NEXT: xorl $-2147483648, %ebx # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %dl, %dl +; AVX512VL-32-NEXT: cmovnel %esi, %ebx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX512VL-32-NEXT: setae %bl -; AVX512VL-32-NEXT: shll $31, %ebx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-32-NEXT: leal -8(%ebp), %esp +; AVX512VL-32-NEXT: leal -12(%ebp), %esp ; AVX512VL-32-NEXT: popl %esi +; AVX512VL-32-NEXT: popl %edi ; AVX512VL-32-NEXT: popl %ebx ; AVX512VL-32-NEXT: popl %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -145,16 +145,34 @@ ; AVX512VL-32-NEXT: pushl %edi ; AVX512VL-32-NEXT: pushl %esi ; AVX512VL-32-NEXT: andl $-8, %esp -; AVX512VL-32-NEXT: subl $80, %esp +; AVX512VL-32-NEXT: subl $88, %esp ; AVX512VL-32-NEXT: .cfi_offset %esi, -20 ; AVX512VL-32-NEXT: .cfi_offset %edi, -16 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm3, %xmm3 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; 
AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: movl %edx, %eax +; AVX512VL-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %cl, %cl +; AVX512VL-32-NEXT: cmovnel %edx, %eax +; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 ; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} ; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 @@ -162,29 +180,33 @@ ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: movl %esi, %eax +; AVX512VL-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %dl, %dl +; AVX512VL-32-NEXT: cmovnel %esi, %eax ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm4 ; AVX512VL-32-NEXT: vmovsd %xmm4, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: movl %esi, %eax +; AVX512VL-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %bl, %bl +; AVX512VL-32-NEXT: cmovnel %esi, %eax ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %bl ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 @@ -194,28 +216,29 @@ ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %bl, %bl +; AVX512VL-32-NEXT: cmovnel %eax, %esi ; AVX512VL-32-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: 
kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm4 ; AVX512VL-32-NEXT: vmovsd %xmm4, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: movl %ebx, %edi +; AVX512VL-32-NEXT: xorl $-2147483648, %edi # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %al, %al +; AVX512VL-32-NEXT: cmovnel %ebx, %edi +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 @@ -226,68 +249,59 @@ ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %esi -; AVX512VL-32-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm4, (%esp) -; AVX512VL-32-NEXT: fldl (%esp) -; AVX512VL-32-NEXT: fisttpll (%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %dl -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx -; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: movl %ebx, %eax +; AVX512VL-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %cl, %cl +; AVX512VL-32-NEXT: cmovnel %ebx, %eax +; AVX512VL-32-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 ; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm3, %xmm3 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm0, %xmm3 ; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl %ecx, %ebx +; AVX512VL-32-NEXT: xorl $-2147483648, %ebx # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %dl, %dl +; AVX512VL-32-NEXT: cmovnel %ecx, %ebx +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 -; AVX512VL-32-NEXT: setb %bl -; AVX512VL-32-NEXT: 
kmovw %ebx, %k1 +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 ; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} ; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl %ecx, %edx +; AVX512VL-32-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512VL-32-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload +; AVX512VL-32-NEXT: cmovnel %ecx, %edx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX512VL-32-NEXT: setae %cl -; AVX512VL-32-NEXT: shll $31, %ecx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 ; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -459,16 +473,34 @@ ; AVX512VL-32-NEXT: pushl %edi ; AVX512VL-32-NEXT: pushl %esi ; AVX512VL-32-NEXT: andl $-8, %esp -; AVX512VL-32-NEXT: subl $80, %esp +; AVX512VL-32-NEXT: subl $88, %esp ; AVX512VL-32-NEXT: .cfi_offset %esi, -20 ; AVX512VL-32-NEXT: .cfi_offset %edi, -16 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: movl %edx, %eax +; AVX512VL-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %cl, %cl +; AVX512VL-32-NEXT: cmovnel %edx, %eax +; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 ; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 @@ -476,15 +508,17 @@ ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: movl %esi, %eax +; AVX512VL-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %dl, %dl +; AVX512VL-32-NEXT: cmovnel %esi, %eax ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: kmovw %edx, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 ; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 @@ -492,13 +526,15 @@ ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: movl %esi, %eax +; AVX512VL-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %bl, %bl +; AVX512VL-32-NEXT: cmovnel %esi, %eax ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %bl ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 @@ -508,60 +544,46 @@ ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm4 +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %bl, %bl +; AVX512VL-32-NEXT: cmovnel %eax, %esi +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm5, %xmm4, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm4, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX512VL-32-NEXT: 
vmovss %xmm3, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: movl %ebx, %edi +; AVX512VL-32-NEXT: xorl $-2147483648, %edi # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %al, %al +; AVX512VL-32-NEXT: cmovnel %ebx, %edi +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 ; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm4, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %esi -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,3,3,3] -; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm4 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm5, %xmm4, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm4, (%esp) -; AVX512VL-32-NEXT: flds (%esp) -; AVX512VL-32-NEXT: fisttpll (%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %dl -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: movl %ebx, %eax +; AVX512VL-32-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %cl, %cl +; AVX512VL-32-NEXT: cmovnel %ebx, %eax +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 @@ -572,36 +594,42 @@ ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl %ecx, %ebx +; AVX512VL-32-NEXT: xorl $-2147483648, %ebx # imm = 0x80000000 +; AVX512VL-32-NEXT: testb %dl, %dl +; AVX512VL-32-NEXT: cmovnel %ecx, %ebx +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 -; AVX512VL-32-NEXT: setb %bl -; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 ; AVX512VL-32-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill 
+; AVX512VL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: movl %ecx, %edx +; AVX512VL-32-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; AVX512VL-32-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload +; AVX512VL-32-NEXT: cmovnel %ecx, %edx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX512VL-32-NEXT: setae %cl -; AVX512VL-32-NEXT: shll $31, %ecx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 ; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/zext-sext.ll b/llvm/test/CodeGen/X86/zext-sext.ll --- a/llvm/test/CodeGen/X86/zext-sext.ll +++ b/llvm/test/CodeGen/X86/zext-sext.ll @@ -15,28 +15,29 @@ ; CHECK-NEXT: subq %rax, %rsi ; CHECK-NEXT: movq (%rdx), %rax ; CHECK-NEXT: movswl 8(%rdi), %edx +; CHECK-NEXT: movabsq $5089792277106559579, %rdi # imm = 0x46A2931BF1768A5B ; CHECK-NEXT: movswl (%rax,%rsi,2), %eax ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: imull %edx, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: addl $2138875574, %eax # imm = 0x7F7CA6B6 ; CHECK-NEXT: cmpl $-8608074, %eax # imm = 0xFF7CA6B6 -; CHECK-NEXT: movslq %eax, %rdi ; CHECK-NEXT: setl %dl ; CHECK-NEXT: cmpl $2138875573, %eax # imm = 0x7F7CA6B5 -; CHECK-NEXT: movq %rdi, %r8 ; CHECK-NEXT: leal -1(%rdx,%rdx), %edx ; CHECK-NEXT: cmovlel %edx, %esi -; CHECK-NEXT: subq %rax, %r8 -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movslq %eax, %rdx +; CHECK-NEXT: addq %rdx, %rdi +; CHECK-NEXT: subq %rdx, %rax +; CHECK-NEXT: addq %rdi, %rax ; CHECK-NEXT: cmpl $1, %esi -; CHECK-NEXT: cmovneq %rax, %r8 -; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: cmovnsq %rax, %r8 -; CHECK-NEXT: movq (%rcx), %rax -; CHECK-NEXT: subq %r8, %rdi -; CHECK-NEXT: leaq -2138875574(%rax,%rdi), %rax -; CHECK-NEXT: movq %rax, (%rcx) +; CHECK-NEXT: cmovneq %rdi, %rax +; CHECK-NEXT: testl %edx, %edx +; CHECK-NEXT: movabsq $-5089792279245435153, %rdx # imm = 0xB95D6CE38F0CCEEF +; CHECK-NEXT: cmovnsq %rdi, %rax +; CHECK-NEXT: addq (%rcx), %rax +; CHECK-NEXT: addq %rax, %rdx +; CHECK-NEXT: movq %rdx, (%rcx) ; CHECK-NEXT: retq entry: %tmp103 = getelementptr inbounds [40 x i16], [40 x i16]* %a, i64 0, i64 4 diff --git a/llvm/test/tools/llvm-locstats/locstats.ll 
b/llvm/test/tools/llvm-locstats/locstats.ll --- a/llvm/test/tools/llvm-locstats/locstats.ll +++ b/llvm/test/tools/llvm-locstats/locstats.ll @@ -10,10 +10,10 @@ ; LOCSTATS: [20%,30%) 1 11% ; LOCSTATS: [30%,40%) 0 0% ; LOCSTATS: [40%,50%) 0 0% -; LOCSTATS: [50%,60%) 0 0% +; LOCSTATS: [50%,60%) 1 11% ; LOCSTATS: [60%,70%) 3 33% ; LOCSTATS: [70%,80%) 0 0% -; LOCSTATS: [80%,90%) 2 22% +; LOCSTATS: [80%,90%) 1 11% ; LOCSTATS: [90%,100%) 1 11% ; LOCSTATS: 100% 2 22% ;