diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1074,6 +1074,10 @@ if (Op.isUndef()) return false; + // We can't simplify target constants. + if (Op.getOpcode() == ISD::TargetConstant) + return false; + if (Op.getOpcode() == ISD::Constant) { // We know all of the bits for a constant! Known = KnownBits::makeConstant(cast(Op)->getAPIntValue()); @@ -1087,31 +1091,23 @@ return false; } - // Other users may use these bits. EVT VT = Op.getValueType(); - if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) { - if (Depth != 0) { - // If not at the root, Just compute the Known bits to - // simplify things downstream. - Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth); - return false; - } - // If this is the root being simplified, allow it to have multiple uses, - // just set the DemandedBits/Elts to all bits. + bool HasMultiUse = false; + if (Depth >= SelectionDAG::MaxRecursionDepth) { + // Limit search depth. + return false; + } else if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) { + // Allow multiple uses, just set the DemandedBits/Elts to all bits. DemandedBits = APInt::getAllOnes(BitWidth); DemandedElts = APInt::getAllOnes(NumElts); + HasMultiUse = true; } else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) { // Not demanding any bits/elts from Op. return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); - } else if (Depth >= SelectionDAG::MaxRecursionDepth) { - // Limit search depth. - return false; } KnownBits Known2; switch (Op.getOpcode()) { - case ISD::TargetConstant: - llvm_unreachable("Can't simplify this node"); case ISD::SCALAR_TO_VECTOR: { if (!DemandedElts[0]) return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); @@ -2674,6 +2670,12 @@ APFloat(TLO.DAG.EVTToAPFloatSemantics(VT), Known.One), dl, VT)); } + // A multi use 'all demanded elts' simplify failed to find any knownbits. + // Try again just for the original demanded elts. + // Ensure we do this AFTER constant folding above. 
+ if (HasMultiUse && Known.isUnknown() && !OriginalDemandedElts.isAllOnes()) + Known = TLO.DAG.computeKnownBits(Op, OriginalDemandedElts, Depth); + return false; } diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -2616,36 +2616,36 @@ ; CHECK-NEXT: mov w8, #1895825407 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov x25, #-34359738368 -; CHECK-NEXT: mov x23, #34359738367 +; CHECK-NEXT: mov x22, #34359738367 ; CHECK-NEXT: fmov s9, w8 ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x25, x1, lt +; CHECK-NEXT: csel x8, x25, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x9, x23, x9, gt -; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: csinv x9, x9, xzr, le +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csel x8, xzr, x8, vs ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill -; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: str x8, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: csel x8, xzr, x9, vs +; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x8, xzr, x8, vs -; CHECK-NEXT: csel x22, xzr, x9, vs +; CHECK-NEXT: csel x10, xzr, x8, vs +; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x8, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x8, x10, [sp, #8] // 16-byte Folded Spill ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, s10 @@ -2654,10 +2654,10 @@ ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x24, xzr, x8, vs +; CHECK-NEXT: csel x26, xzr, x8, vs ; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: str x8, [sp, #32] // 8-byte Folded Spill @@ -2669,40 +2669,39 @@ ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x27, xzr, x8, vs +; CHECK-NEXT: csel x28, xzr, x8, vs ; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x25, x1, lt +; CHECK-NEXT: csel x8, x25, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x9, x23, x9, gt -; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: csinv x9, x9, xzr, le +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 
-; CHECK-NEXT: csel x8, xzr, x8, vs -; CHECK-NEXT: csel x29, xzr, x9, vs +; CHECK-NEXT: csel x27, xzr, x8, vs +; CHECK-NEXT: csel x20, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x20, xzr, x8, vs -; CHECK-NEXT: csel x28, xzr, x9, vs +; CHECK-NEXT: csel x29, xzr, x8, vs +; CHECK-NEXT: csel x21, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload @@ -2712,65 +2711,54 @@ ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x21, xzr, x8, vs -; CHECK-NEXT: csel x26, xzr, x9, vs +; CHECK-NEXT: csel x23, xzr, x8, vs +; CHECK-NEXT: csel x24, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: fmov d0, x20 ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: ldr x11, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: lsr x10, x28, #28 -; CHECK-NEXT: ldr d1, [sp] // 8-byte Folded Reload -; CHECK-NEXT: lsr x12, x29, #28 -; CHECK-NEXT: mov v0.d[1], x28 +; CHECK-NEXT: extr x9, x21, x29, #28 +; CHECK-NEXT: bfi x23, x20, #36, #28 +; CHECK-NEXT: extr x11, x27, x20, #28 +; CHECK-NEXT: str x24, [x19] ; CHECK-NEXT: csel x8, x25, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: csel x10, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: stur x11, [x19, #75] -; CHECK-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: stur x9, [x19, #41] +; CHECK-NEXT: stp x23, x11, [x19, #8] +; CHECK-NEXT: lsr x11, x27, #28 +; CHECK-NEXT: csinv x9, x10, xzr, le +; CHECK-NEXT: lsr x10, x21, #28 +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: stur x13, [x19, #50] -; CHECK-NEXT: mov v1.d[1], x29 -; CHECK-NEXT: ldr d0, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: csel x9, xzr, x9, vs ; CHECK-NEXT: strb w10, [x19, #49] -; CHECK-NEXT: extr x10, x28, x11, #28 ; CHECK-NEXT: csel x8, xzr, x8, vs -; CHECK-NEXT: bfi x8, x11, #36, #28 -; CHECK-NEXT: strb w12, [x19, #24] +; CHECK-NEXT: ldr x10, [sp] // 8-byte Folded Reload +; CHECK-NEXT: csel x9, xzr, x9, vs +; CHECK-NEXT: bfi x8, x29, #36, #28 +; CHECK-NEXT: strb w11, [x19, #24] +; CHECK-NEXT: stur x10, [x19, #75] +; CHECK-NEXT: ldp x12, x11, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: stur x9, [x19, #25] -; CHECK-NEXT: fmov x12, d1 -; CHECK-NEXT: stur x10, [x19, #41] -; CHECK-NEXT: lsr x9, x22, #28 -; CHECK-NEXT: ldr d1, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x19, #33] +; CHECK-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: extr x10, x12, x11, #28 +; CHECK-NEXT: bfi x28, x11, #36, #28 +; CHECK-NEXT: stur x8, [x19, #50] +; CHECK-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: ldr x11, [sp, #72] // 8-byte Folded Reload -; CHECK-NEXT: extr x18, x29, x12, #28 -; CHECK-NEXT: mov v0.d[1], x22 -; CHECK-NEXT: bfi x21, x12, #36, #28 -; CHECK-NEXT: str x26, [x19] -; CHECK-NEXT: mov v1.d[1], x11 -; 
CHECK-NEXT: lsr x10, x11, #28 -; CHECK-NEXT: mov x13, x11 -; CHECK-NEXT: stp x21, x18, [x19, #8] -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: strb w9, [x19, #99] -; CHECK-NEXT: strb w10, [x19, #74] -; CHECK-NEXT: fmov x11, d1 -; CHECK-NEXT: extr x12, x22, x8, #28 -; CHECK-NEXT: bfi x27, x8, #36, #28 -; CHECK-NEXT: extr x8, x13, x11, #28 -; CHECK-NEXT: bfi x24, x11, #36, #28 -; CHECK-NEXT: stur x12, [x19, #91] -; CHECK-NEXT: stur x27, [x19, #83] +; CHECK-NEXT: stur x10, [x19, #91] +; CHECK-NEXT: stur x28, [x19, #83] +; CHECK-NEXT: extr x8, x11, x9, #28 +; CHECK-NEXT: bfi x26, x9, #36, #28 +; CHECK-NEXT: lsr x9, x12, #28 ; CHECK-NEXT: stur x8, [x19, #66] -; CHECK-NEXT: stur x24, [x19, #58] +; CHECK-NEXT: lsr x8, x11, #28 +; CHECK-NEXT: stur x26, [x19, #58] +; CHECK-NEXT: strb w9, [x19, #99] +; CHECK-NEXT: strb w8, [x19, #74] ; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload ; CHECK-NEXT: ldp x24, x23, [sp, #144] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -2195,28 +2195,28 @@ ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov w8, #1904214015 ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov x21, #68719476735 +; CHECK-NEXT: mov x23, #68719476735 ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: fmov s9, w8 -; CHECK-NEXT: csel x8, xzr, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x20, x21, x8, gt +; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp x8, x9, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, xzr, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x23, x21, x8, gt +; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csinv x24, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 @@ -2226,7 +2226,7 @@ ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x24, x21, x9, gt +; CHECK-NEXT: csel x25, x23, x9, gt ; CHECK-NEXT: str x8, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti @@ -2238,29 +2238,29 @@ ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x26, x21, x9, gt -; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: csel x27, x23, x9, gt +; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: csel x8, xzr, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: 
csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x29, x9, xzr, le -; CHECK-NEXT: csel x28, x21, x8, gt +; CHECK-NEXT: csel x29, x23, x9, gt +; CHECK-NEXT: csinv x26, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, xzr, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x27, x9, xzr, le -; CHECK-NEXT: csel x22, x21, x8, gt +; CHECK-NEXT: csel x28, x23, x9, gt +; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload @@ -2270,58 +2270,46 @@ ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x25, x21, x9, gt -; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill +; CHECK-NEXT: csel x21, x23, x9, gt +; CHECK-NEXT: csinv x22, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: ldr x11, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: fmov d0, x27 -; CHECK-NEXT: fmov d1, x29 ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: lsr x10, x22, #28 -; CHECK-NEXT: stur x11, [x19, #75] -; CHECK-NEXT: lsr x11, x28, #28 -; CHECK-NEXT: mov v0.d[1], x22 -; CHECK-NEXT: ldr x12, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: mov v1.d[1], x28 +; CHECK-NEXT: extr x8, x28, x20, #28 +; CHECK-NEXT: bfi x21, x26, #36, #28 +; CHECK-NEXT: extr x9, x29, x26, #28 +; CHECK-NEXT: lsr x11, x29, #28 +; CHECK-NEXT: str x22, [x19] +; CHECK-NEXT: stur x8, [x19, #41] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, xzr, x1, lt +; CHECK-NEXT: csel x10, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: stur x12, [x19, #50] -; CHECK-NEXT: fmov x12, d0 -; CHECK-NEXT: fmov x13, d1 +; CHECK-NEXT: stp x21, x9, [x19, #8] +; CHECK-NEXT: lsr x9, x28, #28 +; CHECK-NEXT: strb w11, [x19, #24] +; CHECK-NEXT: bfi x27, x24, #36, #28 +; CHECK-NEXT: csel x10, x23, x10, gt ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: ldp d0, d1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: csel x9, x21, x9, gt -; CHECK-NEXT: strb w10, [x19, #49] -; CHECK-NEXT: extr x10, x22, x12, #28 -; CHECK-NEXT: bfi x9, x12, #36, #28 +; CHECK-NEXT: bfi x10, x20, #36, #28 +; CHECK-NEXT: strb w9, [x19, #49] ; CHECK-NEXT: stur x8, [x19, #25] -; CHECK-NEXT: extr x8, x28, x13, #28 -; CHECK-NEXT: mov v0.d[1], x23 -; CHECK-NEXT: strb w11, [x19, #24] -; CHECK-NEXT: mov v1.d[1], x20 -; CHECK-NEXT: stur x10, [x19, #41] -; CHECK-NEXT: stur x9, [x19, #33] -; CHECK-NEXT: bfi x25, x13, #36, #28 -; CHECK-NEXT: str x8, [x19, #16] -; CHECK-NEXT: lsr x9, x23, #28 -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: ldr x12, [sp] // 8-byte Folded Reload -; CHECK-NEXT: fmov x11, d1 -; CHECK-NEXT: lsr x10, x20, #28 -; CHECK-NEXT: strb w9, [x19, #99] -; CHECK-NEXT: stp x12, x25, [x19] -; CHECK-NEXT: extr x12, x23, x8, #28 -; CHECK-NEXT: bfi x26, x8, #36, #28 -; CHECK-NEXT: extr x8, x20, x11, #28 -; CHECK-NEXT: bfi x24, x11, #36, #28 -; CHECK-NEXT: strb w10, [x19, #74] -; CHECK-NEXT: stur x12, [x19, #91] -; CHECK-NEXT: stur x26, [x19, #83] -; CHECK-NEXT: stur x8, [x19, #66] -; CHECK-NEXT: stur x24, [x19, #58] +; CHECK-NEXT: stur x10, [x19, #33] +; CHECK-NEXT: ldp x9, x12, [sp] // 16-byte Folded Reload +; CHECK-NEXT: stur x9, [x19, #75] +; 
CHECK-NEXT: extr x8, x12, x24, #28 +; CHECK-NEXT: ldr x9, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: stur x9, [x19, #50] +; CHECK-NEXT: ldp x11, x10, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: stur x8, [x19, #91] +; CHECK-NEXT: lsr x8, x12, #28 +; CHECK-NEXT: stur x27, [x19, #83] +; CHECK-NEXT: extr x9, x10, x11, #28 +; CHECK-NEXT: bfi x25, x11, #36, #28 +; CHECK-NEXT: strb w8, [x19, #99] +; CHECK-NEXT: stur x9, [x19, #66] +; CHECK-NEXT: lsr x9, x10, #28 +; CHECK-NEXT: stur x25, [x19, #58] +; CHECK-NEXT: strb w9, [x19, #74] ; CHECK-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload ; CHECK-NEXT: ldp x24, x23, [sp, #128] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -2753,67 +2753,63 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s8, s6, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: v_alignbit_b32 v4, s7, v4, 16 +; GFX6-NEXT: s_and_b32 s9, s6, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX6-NEXT: s_and_b32 s8, s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX6-NEXT: s_lshr_b32 s9, s6, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, v6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GFX6-NEXT: v_mad_f32 v2, -v2, v5, v6 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 +; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4 ; GFX6-NEXT: s_and_b32 s6, s7, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX6-NEXT: s_and_b32 s6, s5, 0xffff -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_lshr_b32 s4, s7, 16 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v2, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, 
s4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s8, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: s_lshr_b32 s6, s5, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX6-NEXT: v_mad_f32 v4, -v1, v3, v4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3029,7 +3025,7 @@ ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: s_sext_i32_i16 s6, s5 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_xor_b32 s4, s6, s4 @@ -3045,7 +3041,7 @@ ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 @@ -3280,74 +3276,73 @@ ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 -; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 -; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 -; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX6-NEXT: s_ashr_i32 s9, s6, 16 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 -; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 -; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 16 +; GFX6-NEXT: s_lshr_b32 s6, s6, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 -; GFX6-NEXT: 
v_cmp_ge_f32_e64 vcc, |v6|, |v4| -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: s_xor_b32 s4, s4, s9 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 +; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: s_sext_i32_i16 s4, s7 -; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: s_sext_i32_i16 s6, s5 ; GFX6-NEXT: s_xor_b32 s4, s6, s4 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mad_f32 v2, -v4, v3, v2 +; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX6-NEXT: s_ashr_i32 s4, s7, 16 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v3| -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: s_lshr_b32 s6, s7, 16 ; GFX6-NEXT: s_ashr_i32 s7, s5, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s7 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: s_xor_b32 s4, s7, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 +; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s6 ; GFX6-NEXT: s_lshr_b32 s4, s5, 16 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -3635,7 +3630,7 @@ ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3719,7 +3714,7 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_lshr_b32 s3, s4, 8 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -3999,54 +3994,50 @@ ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s8, s6, 0xffff +; GFX6-NEXT: s_and_b32 s9, s6, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX6-NEXT: s_and_b32 s8, s4, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 -; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc -; GFX6-NEXT: v_alignbit_b32 v0, s5, v0, 16 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 +; GFX6-NEXT: s_lshr_b32 s9, s6, 16 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_and_b32 s4, s7, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 -; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mad_f32 v3, -v4, v5, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: s_and_b32 s4, s5, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 -; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; 
GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v3 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -4225,7 +4216,7 @@ ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: s_sext_i32_i16 s5, s5 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 @@ -4415,49 +4406,48 @@ ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_ashr_i32 s9, s6, 16 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 -; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 -; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 -; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 -; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 16 +; GFX6-NEXT: s_lshr_b32 s6, s6, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 -; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: s_xor_b32 s4, s4, s9 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 +; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: s_sext_i32_i16 s4, s7 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 -; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX6-NEXT: s_sext_i32_i16 s6, s5 -; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_xor_b32 s4, s6, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: 
v_mul_f32_e32 v5, v3, v5 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3 -; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 +; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 @@ -5026,7 +5016,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v5, s1 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: s_xor_b32 s0, s1, s0 @@ -5251,7 +5241,7 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 ; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 @@ -5274,7 +5264,7 @@ ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 @@ -5287,11 +5277,11 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_mul_lo_u32 v5, v5, s8 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 ; GFX6-NEXT: s_lshr_b32 s3, s2, 15 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v5 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 @@ -5404,7 +5394,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -5683,9 +5673,9 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 @@ -6490,7 +6480,7 @@ ; GFX6-NEXT: v_mul_hi_u32 
v2, v0, v2 ; GFX6-NEXT: s_xor_b32 s11, s0, s1 ; GFX6-NEXT: s_sub_i32 s0, 0, s10 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s2 @@ -6504,7 +6494,7 @@ ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: s_add_i32 s1, s9, s0 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 @@ -6954,7 +6944,7 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_xor_b32 s4, s5, s9 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 @@ -7134,9 +7124,9 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 ; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -8217,7 +8207,7 @@ ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -8548,9 +8538,9 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -9297,9 +9287,9 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, s3 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 @@ -10528,8 +10518,8 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v2 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, s5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1735,94 +1735,94 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; 
GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 ; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_short_d16 v4, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v7i8_to_v7f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:4 -; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] offset:6 +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:4 ; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] offset:3 -; GFX9-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2 -; GFX9-NEXT: global_load_ubyte v5, v0, s[0:1] offset:1 -; GFX9-NEXT: global_load_ubyte v7, v0, s[0:1] +; GFX9-NEXT: global_load_ubyte v7, v0, s[0:1] offset:2 +; GFX9-NEXT: global_load_ubyte v8, v0, s[0:1] offset:1 +; GFX9-NEXT: global_load_ubyte v9, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v1 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v2 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v9 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] -; GFX9-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX9-NEXT: 
global_store_dwordx3 v10, v[4:6], s[0:1] offset:16 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v7i8_to_v7f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_lshlrev_b32 v0, 3, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:6 +; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 ; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 -; GFX11-NEXT: global_load_u8 v6, v0, s[2:3] offset:1 -; GFX11-NEXT: global_load_d16_b16 v4, v0, s[2:3] offset:4 +; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1 +; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v5 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v7 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b96 v7, v[4:6], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b96 v8, v[4:6], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -51,11 +51,10 @@ ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 12, v1 -; SI-NEXT: v_sub_i32_e32 v2, vcc, 16, v1 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_barrier -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v2 +; SI-NEXT: v_sub_i32_e32 v2, vcc, 28, v1 ; SI-NEXT: ds_read_b32 v0, v0 ; SI-NEXT: ds_read_b32 v3, v2 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -77,16 +76,13 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier -; CI-NEXT: v_sub_i32_e32 v2, vcc, 16, v1 -; CI-NEXT: ds_read_b32 v0, v0 offset:12 -; CI-NEXT: ds_read_b32 v3, v2 offset:12 +; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:3 offset1:7 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v2, 0 -; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:16 +; CI-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v4, v[1:2], s[0:3], 0 addr64 offset:16 ; CI-NEXT: s_endpgm entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- 
a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -38,8 +38,8 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -85,8 +85,8 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -56,19 +56,22 @@ ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s5 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x3 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, s0 -; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: s_and_b32 s3, s0, 0xffff +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: v_mov_b32_e32 v2, s0 ; HAWAII-NEXT: v_mov_b32_e32 v3, s2 -; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 +; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4 ; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; HAWAII-NEXT: v_or_b32_e32 v0, s3, v0 +; HAWAII-NEXT: v_bfe_u32 v0, v0, 16, 7 ; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 -; HAWAII-NEXT: ds_write_b32 v1, v2 +; HAWAII-NEXT: ds_write_b32 v1, v3 ; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i55: diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -675,7 +675,7 @@ ; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_load_dword s4, s[0:1], 0xd ; GCN-NEXT: s_load_dword s6, s[0:1], 0xc -; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, 0xffff +; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xffff @@ -687,7 +687,7 @@ ; GCN-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN-NEXT: s_and_b32 s8, s6, 0xffff ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 +; GCN-NEXT: v_mac_f32_e32 v1, 0, v2 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s9, s0, 0xff000000 diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -234,9 +234,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s0, 0xffff +; VI-NEXT: 
s_add_i32 s1, s0, 12 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_i32 s1, s1, 12 ; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; VI-NEXT: s_or_b32 s0, s1, 4 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD diff --git a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll --- a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll +++ b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll @@ -1356,54 +1356,77 @@ ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, lr} ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r4, d17 -; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d16 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vorr q9, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1] +; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d18[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r4, d18[0] +; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr +; CHECK-FIX-NOSCHED-NEXT: lsr r5, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s4, r4 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, r4, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r12 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r12, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r12, #16 ; CHECK-FIX-NOSCHED-NEXT: lsr lr, lr, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s14, r12 -; CHECK-FIX-NOSCHED-NEXT: vmov s7, lr -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r4 +; CHECK-FIX-NOSCHED-NEXT: vmov s4, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s10, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-FIX-NOSCHED-NEXT: vmov s14, r5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s6 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r4, #16 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: bne .LBB36_3 -; CHECK-FIX-NOSCHED-NEXT: b .LBB36_4 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB36_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_2: -; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #6] +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-FIX-NOSCHED-NEXT: vmov 
s13, r1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: b .LBB36_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB36_3: +; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #10] +; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #6] ; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r2, #2] ; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2, #14] -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5 ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #12] -; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr -; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #8] +; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12 +; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #8] ; CHECK-FIX-NOSCHED-NEXT: vmov s5, r6 -; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #4] +; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #4] ; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 ; CHECK-FIX-NOSCHED-NEXT: ldrh r8, [r2] ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s4 ; CHECK-FIX-NOSCHED-NEXT: vmov s4, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12 -; CHECK-FIX-NOSCHED-NEXT: vmov s12, r5 +; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s5 ; CHECK-FIX-NOSCHED-NEXT: vmov s5, r8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 @@ -1411,44 +1434,46 @@ ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_4 -; CHECK-FIX-NOSCHED-NEXT: .LBB36_3: -; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_4: -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 -; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q0, q0 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d0[1] +; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d16[0] +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov s13, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r1 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r0 ; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1 -; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r0 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 +; CHECK-FIX-NOSCHED-NEXT: lsr r0, r7, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-FIX-NOSCHED-NEXT: .LBB36_5: +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 ; CHECK-FIX-NOSCHED-NEXT: vmov r0, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s7 ; CHECK-FIX-NOSCHED-NEXT: vmov r1, s5 ; 
CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s9 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s2, s2 -; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov r1, s11 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 @@ -1494,39 +1519,63 @@ ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, lr} -; CHECK-CORTEX-FIX-NEXT: .vsave {d8} -; CHECK-CORTEX-FIX-NEXT: vpush {d8} +; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9} +; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9} ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB36_2 +; CHECK-CORTEX-FIX-NEXT: beq .LBB36_3 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-CORTEX-FIX-NEXT: vorr q9, q8, q8 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] ; CHECK-CORTEX-FIX-NEXT: vmov r5, r6, d17 +; CHECK-CORTEX-FIX-NEXT: vld1.16 {d18[0]}, [r1:16] ; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s4, r6 ; CHECK-CORTEX-FIX-NEXT: vmov s6, r5 -; CHECK-CORTEX-FIX-NEXT: vmov s14, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s7, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r12, r3, d16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s4 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s7 -; CHECK-CORTEX-FIX-NEXT: lsr lr, r12, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r12 +; CHECK-CORTEX-FIX-NEXT: vmov s12, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s5, r7 ; CHECK-CORTEX-FIX-NEXT: vmov s9, r8 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6 +; CHECK-CORTEX-FIX-NEXT: vmov.32 lr, d18[0] +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s8 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9 +; CHECK-CORTEX-FIX-NEXT: lsr r12, lr, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s11, lr -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11 +; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s11 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s13 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: bne .LBB36_3 -; CHECK-CORTEX-FIX-NEXT: b .LBB36_4 +; CHECK-CORTEX-FIX-NEXT: bne .LBB36_4 ; CHECK-CORTEX-FIX-NEXT: .LBB36_2: +; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1 +; 
CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d0 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r1, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s2, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s0, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s3, r1 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s13, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s9 +; CHECK-CORTEX-FIX-NEXT: vmov s0, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 +; CHECK-CORTEX-FIX-NEXT: b .LBB36_5 +; CHECK-CORTEX-FIX-NEXT: .LBB36_3: ; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r2] ; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r2, #2] ; CHECK-CORTEX-FIX-NEXT: ldrh r8, [r2, #4] @@ -1535,84 +1584,86 @@ ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #10] ; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r2, #12] ; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r2, #14] +; CHECK-CORTEX-FIX-NEXT: vmov s5, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s7, r8 ; CHECK-CORTEX-FIX-NEXT: vmov s4, r6 ; CHECK-CORTEX-FIX-NEXT: vmov s6, r7 -; CHECK-CORTEX-FIX-NEXT: vmov s5, r5 ; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s9, r8 -; CHECK-CORTEX-FIX-NEXT: vmov s11, lr -; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 +; CHECK-CORTEX-FIX-NEXT: vmov s12, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s9, lr +; CHECK-CORTEX-FIX-NEXT: vmov s11, r12 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s4 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s6 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s9 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s11 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB36_4 -; CHECK-CORTEX-FIX-NEXT: .LBB36_3: -; CHECK-CORTEX-FIX-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-CORTEX-FIX-NEXT: beq .LBB36_2 ; CHECK-CORTEX-FIX-NEXT: .LBB36_4: +; CHECK-CORTEX-FIX-NEXT: vorr q8, q0, q0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d0[1] ; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1 -; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d0 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s0, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r1, #16 +; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] ; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s1, r1 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 -; CHECK-CORTEX-FIX-NEXT: vmov r0, s12 -; CHECK-CORTEX-FIX-NEXT: vmov r1, s14 -; CHECK-CORTEX-FIX-NEXT: 
vcvtb.f16.f32 s12, s7 +; CHECK-CORTEX-FIX-NEXT: lsr r1, r5, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s2, r6 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s6 -; CHECK-CORTEX-FIX-NEXT: vmov s13, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s0, r5 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r3, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s3, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r1 ; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 ; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s0 -; CHECK-CORTEX-FIX-NEXT: vmov s0, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s9 -; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r1, s5 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r0, d16[0] +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s3 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11 -; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov s18, r0 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s0, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s18 +; CHECK-CORTEX-FIX-NEXT: .LBB36_5: +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3 +; CHECK-CORTEX-FIX-NEXT: vmov r0, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r1, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s6 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s8 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s3 +; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r1, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s1 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s11 ; CHECK-CORTEX-FIX-NEXT: vmov r0, s9 +; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s14 ; CHECK-CORTEX-FIX-NEXT: vmov r1, s0 ; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s5 ; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s4 ; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s8 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s12 ; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r4, s2 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r5 @@ -1620,7 +1671,7 @@ ; 
CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr ; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12 ; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s1 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s3 ; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r4, r1, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r1 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0 @@ -1629,7 +1680,7 @@ ; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: vpop {d8} +; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9} ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, pc} br i1 %0, label %5, label %12 @@ -1680,56 +1731,78 @@ ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s0 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s9 ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] ; CHECK-FIX-NOSCHED-NEXT: vmov r2, s0 +; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1] ; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r2 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, lr, d17 -; CHECK-FIX-NOSCHED-NEXT: vmov r2, r12, d16 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, lr +; CHECK-FIX-NOSCHED-NEXT: lsr lr, lr, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s2 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, lr, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s0, lr -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r12 -; CHECK-FIX-NOSCHED-NEXT: lsr r5, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r2, d16[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r4, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s2 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2 -; CHECK-FIX-NOSCHED-NEXT: lsr r2, r12, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov s14, r2 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r4 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r12, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s10, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vmov s3, r5 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-FIX-NOSCHED-NEXT: vmov s14, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: bne .LBB37_3 -; CHECK-FIX-NOSCHED-NEXT: b .LBB37_4 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB37_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB37_2: +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7 +; 
CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 +; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: b .LBB37_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB37_3: ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #6] -; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #6] +; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #2] ; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r1, #14] ; CHECK-FIX-NOSCHED-NEXT: vmov s8, r3 ; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #12] -; CHECK-FIX-NOSCHED-NEXT: vmov s12, r4 -; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #8] -; CHECK-FIX-NOSCHED-NEXT: vmov s1, lr -; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #4] +; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12 +; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #8] +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r5 +; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #4] ; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 ; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r1] ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s0 ; CHECK-FIX-NOSCHED-NEXT: vmov s0, r2 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12 -; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s1 ; CHECK-FIX-NOSCHED-NEXT: vmov s1, r6 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 @@ -1737,47 +1810,48 @@ ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_4 -; CHECK-FIX-NOSCHED-NEXT: .LBB37_3: -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s9, s9 -; CHECK-FIX-NOSCHED-NEXT: vmov r0, s9 -; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r0 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB37_4: -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2 -; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s9, s9 +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d3 +; CHECK-FIX-NOSCHED-NEXT: vmov r7, s9 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d2[1] +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r7 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d2[0] +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r3 +; CHECK-FIX-NOSCHED-NEXT: vmov s4, r2 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5 -; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r0 ; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2 -; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s7, r0 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 +; CHECK-FIX-NOSCHED-NEXT: vmov s13, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7 +; CHECK-FIX-NOSCHED-NEXT: lsr r0, r7, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5 +; CHECK-FIX-NOSCHED-NEXT: 
vmov s5, r2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: .LBB37_5: +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 ; CHECK-FIX-NOSCHED-NEXT: vmov r0, s1 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s3 ; CHECK-FIX-NOSCHED-NEXT: vmov r2, s1 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s9 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s6, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 -; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s4, s4 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov r2, s11 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 @@ -1822,129 +1896,153 @@ ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-CORTEX-FIX-NEXT: .vsave {d8} -; CHECK-CORTEX-FIX-NEXT: vpush {d8} +; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9} +; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9} ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s0 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB37_2 +; CHECK-CORTEX-FIX-NEXT: beq .LBB37_3 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s9 ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] ; CHECK-CORTEX-FIX-NEXT: vmov r2, s0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] ; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r2 ; CHECK-CORTEX-FIX-NEXT: vmov r4, r5, d17 +; CHECK-CORTEX-FIX-NEXT: lsr lr, r3, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s11, lr ; CHECK-CORTEX-FIX-NEXT: lsr r6, r4, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s0, r5 ; CHECK-CORTEX-FIX-NEXT: vmov s2, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s14, r7 -; CHECK-CORTEX-FIX-NEXT: vmov s3, r6 -; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s0 +; CHECK-CORTEX-FIX-NEXT: vmov s12, r7 +; CHECK-CORTEX-FIX-NEXT: vmov s1, r6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s8 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d16[0] +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s0 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s11 ; CHECK-CORTEX-FIX-NEXT: lsr r12, r2, #16 -; CHECK-CORTEX-FIX-NEXT: lsr lr, r3, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r2 -; CHECK-CORTEX-FIX-NEXT: vmov s11, lr -; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s11 -; 
CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13 +; CHECK-CORTEX-FIX-NEXT: vmov s13, r2 +; CHECK-CORTEX-FIX-NEXT: vmov s15, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s15 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: bne .LBB37_3 -; CHECK-CORTEX-FIX-NEXT: b .LBB37_4 +; CHECK-CORTEX-FIX-NEXT: bne .LBB37_4 ; CHECK-CORTEX-FIX-NEXT: .LBB37_2: -; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r1] -; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r1, #2] -; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #4] -; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r1, #6] -; CHECK-CORTEX-FIX-NEXT: ldrh r5, [r1, #8] -; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r1, #10] -; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12] -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #14] -; CHECK-CORTEX-FIX-NEXT: vmov s0, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s2, r2 +; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3 +; CHECK-CORTEX-FIX-NEXT: vmov r0, r2, d2 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s6, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s4, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s7, r2 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s13, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9 +; CHECK-CORTEX-FIX-NEXT: vmov s4, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 +; CHECK-CORTEX-FIX-NEXT: b .LBB37_5 +; CHECK-CORTEX-FIX-NEXT: .LBB37_3: +; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r1] +; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r1, #2] +; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #4] +; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r1, #6] +; CHECK-CORTEX-FIX-NEXT: ldrh r5, [r1, #8] +; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r1, #10] +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12] +; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #14] ; CHECK-CORTEX-FIX-NEXT: vmov s1, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s3, r7 +; CHECK-CORTEX-FIX-NEXT: vmov s0, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s2, r2 ; CHECK-CORTEX-FIX-NEXT: vmov s8, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r5 -; CHECK-CORTEX-FIX-NEXT: vmov s11, r7 -; CHECK-CORTEX-FIX-NEXT: vmov s13, lr -; CHECK-CORTEX-FIX-NEXT: vmov s15, r12 +; CHECK-CORTEX-FIX-NEXT: vmov s12, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s11, lr +; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s0 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s2 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s2 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s11 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s11 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s13 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB37_4 -; CHECK-CORTEX-FIX-NEXT: .LBB37_3: +; CHECK-CORTEX-FIX-NEXT: beq .LBB37_2 +; CHECK-CORTEX-FIX-NEXT: .LBB37_4: ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9 +; CHECK-CORTEX-FIX-NEXT: vmov.32 
r2, d2[1] ; CHECK-CORTEX-FIX-NEXT: vmov r0, s9 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 ; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r0 -; CHECK-CORTEX-FIX-NEXT: .LBB37_4: ; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3 -; CHECK-CORTEX-FIX-NEXT: vmov r0, r2, d2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s4, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s7, r2 ; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s5, r2 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 -; CHECK-CORTEX-FIX-NEXT: vmov r0, s12 -; CHECK-CORTEX-FIX-NEXT: vmov r2, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s3 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s6, r6 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s2 -; CHECK-CORTEX-FIX-NEXT: vmov s13, r3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r3 ; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s4 -; CHECK-CORTEX-FIX-NEXT: vmov s4, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9 -; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r2, s1 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s12 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r0, d2[0] +; CHECK-CORTEX-FIX-NEXT: vmov s4, r5 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s7 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11 -; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4 +; CHECK-CORTEX-FIX-NEXT: vmov s18, r0 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s4, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s18 +; CHECK-CORTEX-FIX-NEXT: .LBB37_5: +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s4 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s14 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r0, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r2, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 +; CHECK-CORTEX-FIX-NEXT: vmov r5, 
s2 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s8 +; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r2, s3 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s1 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s11 ; CHECK-CORTEX-FIX-NEXT: vmov r0, s9 +; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s14 ; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s5 ; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s0 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s8 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s12 ; CHECK-CORTEX-FIX-NEXT: vmov r2, s0 ; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r4, s6 @@ -1953,7 +2051,7 @@ ; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr ; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12 ; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s5 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s7 ; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r4, r2, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0 @@ -1962,7 +2060,7 @@ ; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] -; CHECK-CORTEX-FIX-NEXT: vpop {d8} +; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9} ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r11, pc} br i1 %0, label %5, label %11 @@ -3726,54 +3824,77 @@ ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, lr} ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r4, d17 -; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d16 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vorr q9, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1] +; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d18[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r4, d18[0] +; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr +; CHECK-FIX-NOSCHED-NEXT: lsr r5, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s4, r4 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, r4, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r12 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r12, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r12, #16 ; CHECK-FIX-NOSCHED-NEXT: lsr lr, lr, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s14, r12 -; CHECK-FIX-NOSCHED-NEXT: vmov s7, lr -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r4 +; CHECK-FIX-NOSCHED-NEXT: vmov s4, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s10, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-FIX-NOSCHED-NEXT: vmov s14, r5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s6 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 +; 
CHECK-FIX-NOSCHED-NEXT: lsr r3, r4, #16 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: bne .LBB82_3 -; CHECK-FIX-NOSCHED-NEXT: b .LBB82_4 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB82_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_2: -; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #6] +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: b .LBB82_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB82_3: +; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #10] +; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #6] ; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r2, #2] ; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2, #14] -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5 ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #12] -; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr -; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #8] +; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12 +; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #8] ; CHECK-FIX-NOSCHED-NEXT: vmov s5, r6 -; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #4] +; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #4] ; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 ; CHECK-FIX-NOSCHED-NEXT: ldrh r8, [r2] ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s4 ; CHECK-FIX-NOSCHED-NEXT: vmov s4, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12 -; CHECK-FIX-NOSCHED-NEXT: vmov s12, r5 +; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s5 ; CHECK-FIX-NOSCHED-NEXT: vmov s5, r8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 @@ -3781,44 +3902,46 @@ ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_4 -; CHECK-FIX-NOSCHED-NEXT: .LBB82_3: -; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_4: -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 -; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q0, q0 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d0[1] +; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d16[0] +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; 
CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov s13, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r1 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r0 ; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1 -; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r0 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 +; CHECK-FIX-NOSCHED-NEXT: lsr r0, r7, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-FIX-NOSCHED-NEXT: .LBB82_5: +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 ; CHECK-FIX-NOSCHED-NEXT: vmov r0, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s7 ; CHECK-FIX-NOSCHED-NEXT: vmov r1, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s9 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s2, s2 -; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov r1, s11 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 @@ -3864,39 +3987,63 @@ ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, lr} -; CHECK-CORTEX-FIX-NEXT: .vsave {d8} -; CHECK-CORTEX-FIX-NEXT: vpush {d8} +; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9} +; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9} ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB82_2 +; CHECK-CORTEX-FIX-NEXT: beq .LBB82_3 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-CORTEX-FIX-NEXT: vorr q9, q8, q8 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] ; CHECK-CORTEX-FIX-NEXT: vmov r5, r6, d17 +; CHECK-CORTEX-FIX-NEXT: vld1.16 {d18[0]}, [r1:16] ; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s4, r6 ; CHECK-CORTEX-FIX-NEXT: vmov s6, r5 -; CHECK-CORTEX-FIX-NEXT: vmov s14, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s7, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r12, r3, d16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s4 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s7 -; CHECK-CORTEX-FIX-NEXT: lsr lr, r12, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r12 +; 
CHECK-CORTEX-FIX-NEXT: vmov s12, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s5, r7 ; CHECK-CORTEX-FIX-NEXT: vmov s9, r8 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6 +; CHECK-CORTEX-FIX-NEXT: vmov.32 lr, d18[0] +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s8 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9 +; CHECK-CORTEX-FIX-NEXT: lsr r12, lr, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s11, lr -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11 +; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s11 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s13 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: bne .LBB82_3 -; CHECK-CORTEX-FIX-NEXT: b .LBB82_4 +; CHECK-CORTEX-FIX-NEXT: bne .LBB82_4 ; CHECK-CORTEX-FIX-NEXT: .LBB82_2: +; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1 +; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d0 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r1, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s2, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s0, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s3, r1 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s13, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s9 +; CHECK-CORTEX-FIX-NEXT: vmov s0, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 +; CHECK-CORTEX-FIX-NEXT: b .LBB82_5 +; CHECK-CORTEX-FIX-NEXT: .LBB82_3: ; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r2] ; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r2, #2] ; CHECK-CORTEX-FIX-NEXT: ldrh r8, [r2, #4] @@ -3905,84 +4052,86 @@ ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #10] ; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r2, #12] ; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r2, #14] +; CHECK-CORTEX-FIX-NEXT: vmov s5, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s7, r8 ; CHECK-CORTEX-FIX-NEXT: vmov s4, r6 ; CHECK-CORTEX-FIX-NEXT: vmov s6, r7 -; CHECK-CORTEX-FIX-NEXT: vmov s5, r5 ; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s9, r8 -; CHECK-CORTEX-FIX-NEXT: vmov s11, lr -; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 +; CHECK-CORTEX-FIX-NEXT: vmov s12, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s9, lr +; CHECK-CORTEX-FIX-NEXT: vmov s11, r12 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s4 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s6 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s9 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s11 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB82_4 -; CHECK-CORTEX-FIX-NEXT: 
.LBB82_3: -; CHECK-CORTEX-FIX-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-CORTEX-FIX-NEXT: beq .LBB82_2 ; CHECK-CORTEX-FIX-NEXT: .LBB82_4: +; CHECK-CORTEX-FIX-NEXT: vorr q8, q0, q0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d0[1] ; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1 -; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d0 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s0, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r1, #16 +; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] ; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s1, r1 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 -; CHECK-CORTEX-FIX-NEXT: vmov r0, s12 -; CHECK-CORTEX-FIX-NEXT: vmov r1, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s7 +; CHECK-CORTEX-FIX-NEXT: lsr r1, r5, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s2, r6 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s6 -; CHECK-CORTEX-FIX-NEXT: vmov s13, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s0, r5 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r3, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s3, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r1 ; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 ; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s0 -; CHECK-CORTEX-FIX-NEXT: vmov s0, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s9 -; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r1, s5 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r0, d16[0] +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s3 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11 -; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov s18, r0 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s0, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s18 +; CHECK-CORTEX-FIX-NEXT: .LBB82_5: +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3 +; CHECK-CORTEX-FIX-NEXT: vmov r0, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r1, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11 +; 
CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s6 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s8 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s3 +; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r1, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s1 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s11 ; CHECK-CORTEX-FIX-NEXT: vmov r0, s9 +; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s14 ; CHECK-CORTEX-FIX-NEXT: vmov r1, s0 ; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s5 ; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s4 ; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s8 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s12 ; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r4, s2 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r5 @@ -3990,7 +4139,7 @@ ; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr ; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12 ; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s1 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s3 ; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r4, r1, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r1 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0 @@ -3999,7 +4148,7 @@ ; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: vpop {d8} +; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9} ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, pc} br i1 %0, label %5, label %12 @@ -4050,56 +4199,78 @@ ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s0 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s9 ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] ; CHECK-FIX-NOSCHED-NEXT: vmov r2, s0 +; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1] ; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r2 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, lr, d17 -; CHECK-FIX-NOSCHED-NEXT: vmov r2, r12, d16 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, lr +; CHECK-FIX-NOSCHED-NEXT: lsr lr, lr, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s2 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, lr, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s0, lr -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r12 -; CHECK-FIX-NOSCHED-NEXT: lsr r5, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r2, d16[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r4, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s2 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2 -; CHECK-FIX-NOSCHED-NEXT: lsr r2, r12, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov s14, r2 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r4 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r12, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s10, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vmov s3, r5 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-FIX-NOSCHED-NEXT: vmov s14, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10 ; 
CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: bne .LBB83_3 -; CHECK-FIX-NOSCHED-NEXT: b .LBB83_4 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB83_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB83_2: +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 +; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: b .LBB83_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB83_3: ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #6] -; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #6] +; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #2] ; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r1, #14] ; CHECK-FIX-NOSCHED-NEXT: vmov s8, r3 ; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #12] -; CHECK-FIX-NOSCHED-NEXT: vmov s12, r4 -; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #8] -; CHECK-FIX-NOSCHED-NEXT: vmov s1, lr -; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #4] +; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12 +; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #8] +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r5 +; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #4] ; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 ; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r1] ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s0 ; CHECK-FIX-NOSCHED-NEXT: vmov s0, r2 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12 -; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s1 ; CHECK-FIX-NOSCHED-NEXT: vmov s1, r6 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 @@ -4107,47 +4278,48 @@ ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_4 -; CHECK-FIX-NOSCHED-NEXT: .LBB83_3: -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s9, s9 -; CHECK-FIX-NOSCHED-NEXT: vmov r0, s9 -; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r0 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB83_4: -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2 -; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 +; 
CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s9, s9 +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d3 +; CHECK-FIX-NOSCHED-NEXT: vmov r7, s9 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d2[1] +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r7 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d2[0] +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r3 +; CHECK-FIX-NOSCHED-NEXT: vmov s4, r2 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5 -; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r0 ; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2 -; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s7, r0 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 +; CHECK-FIX-NOSCHED-NEXT: vmov s13, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7 +; CHECK-FIX-NOSCHED-NEXT: lsr r0, r7, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: .LBB83_5: +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 ; CHECK-FIX-NOSCHED-NEXT: vmov r0, s1 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s3 ; CHECK-FIX-NOSCHED-NEXT: vmov r2, s1 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s9 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s6, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 -; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s4, s4 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov r2, s11 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 @@ -4192,42 +4364,65 @@ ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-CORTEX-FIX-NEXT: .vsave {d8} -; CHECK-CORTEX-FIX-NEXT: vpush {d8} +; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9} +; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9} ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s0 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB83_2 +; CHECK-CORTEX-FIX-NEXT: beq .LBB83_3 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s9 ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] ; CHECK-CORTEX-FIX-NEXT: vmov r2, s0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] ; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r2 ; CHECK-CORTEX-FIX-NEXT: vmov r4, r5, d17 +; CHECK-CORTEX-FIX-NEXT: lsr lr, r3, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s11, lr ; CHECK-CORTEX-FIX-NEXT: lsr r6, r4, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16 ; CHECK-CORTEX-FIX-NEXT: vmov 
s0, r5 ; CHECK-CORTEX-FIX-NEXT: vmov s2, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s14, r7 -; CHECK-CORTEX-FIX-NEXT: vmov s3, r6 -; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s0 +; CHECK-CORTEX-FIX-NEXT: vmov s12, r7 +; CHECK-CORTEX-FIX-NEXT: vmov s1, r6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s8 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d16[0] +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s0 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s11 ; CHECK-CORTEX-FIX-NEXT: lsr r12, r2, #16 -; CHECK-CORTEX-FIX-NEXT: lsr lr, r3, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r2 -; CHECK-CORTEX-FIX-NEXT: vmov s11, lr -; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s11 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13 +; CHECK-CORTEX-FIX-NEXT: vmov s13, r2 +; CHECK-CORTEX-FIX-NEXT: vmov s15, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s15 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: bne .LBB83_3 -; CHECK-CORTEX-FIX-NEXT: b .LBB83_4 +; CHECK-CORTEX-FIX-NEXT: bne .LBB83_4 ; CHECK-CORTEX-FIX-NEXT: .LBB83_2: +; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3 +; CHECK-CORTEX-FIX-NEXT: vmov r0, r2, d2 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s6, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s4, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s7, r2 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s13, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9 +; CHECK-CORTEX-FIX-NEXT: vmov s4, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 +; CHECK-CORTEX-FIX-NEXT: b .LBB83_5 +; CHECK-CORTEX-FIX-NEXT: .LBB83_3: ; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r1] ; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r1, #2] ; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #4] @@ -4236,85 +4431,86 @@ ; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r1, #10] ; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12] ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #14] +; CHECK-CORTEX-FIX-NEXT: vmov s1, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s3, r7 ; CHECK-CORTEX-FIX-NEXT: vmov s0, r3 ; CHECK-CORTEX-FIX-NEXT: vmov s2, r2 -; CHECK-CORTEX-FIX-NEXT: vmov s1, r6 ; CHECK-CORTEX-FIX-NEXT: vmov s8, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r5 -; CHECK-CORTEX-FIX-NEXT: vmov s11, r7 -; CHECK-CORTEX-FIX-NEXT: vmov s13, lr -; CHECK-CORTEX-FIX-NEXT: vmov s15, r12 +; CHECK-CORTEX-FIX-NEXT: vmov s12, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s11, lr +; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s0 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s2 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s2 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s1 -; 
CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s11 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s11 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s13 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB83_4 -; CHECK-CORTEX-FIX-NEXT: .LBB83_3: +; CHECK-CORTEX-FIX-NEXT: beq .LBB83_2 +; CHECK-CORTEX-FIX-NEXT: .LBB83_4: ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d2[1] ; CHECK-CORTEX-FIX-NEXT: vmov r0, s9 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 ; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r0 -; CHECK-CORTEX-FIX-NEXT: .LBB83_4: ; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3 -; CHECK-CORTEX-FIX-NEXT: vmov r0, r2, d2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s4, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s7, r2 ; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s5, r2 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 -; CHECK-CORTEX-FIX-NEXT: vmov r0, s12 -; CHECK-CORTEX-FIX-NEXT: vmov r2, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s3 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s6, r6 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s2 -; CHECK-CORTEX-FIX-NEXT: vmov s13, r3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r3 ; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s4 -; CHECK-CORTEX-FIX-NEXT: vmov s4, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9 -; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r2, s1 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s12 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r0, d2[0] +; CHECK-CORTEX-FIX-NEXT: vmov s4, r5 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s7 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11 -; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4 +; CHECK-CORTEX-FIX-NEXT: vmov s18, r0 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s4, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s18 +; CHECK-CORTEX-FIX-NEXT: .LBB83_5: +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s4 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6 -; 
CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s14 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r0, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r2, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s2 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s8 +; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r2, s3 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s1 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s11 ; CHECK-CORTEX-FIX-NEXT: vmov r0, s9 +; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s14 ; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s5 ; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s0 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s8 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s12 ; CHECK-CORTEX-FIX-NEXT: vmov r2, s0 ; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r4, s6 @@ -4323,7 +4519,7 @@ ; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr ; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12 ; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s5 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s7 ; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r4, r2, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0 @@ -4332,7 +4528,7 @@ ; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] -; CHECK-CORTEX-FIX-NEXT: vpop {d8} +; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9} ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r11, pc} br i1 %0, label %5, label %11 diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll --- a/llvm/test/CodeGen/RISCV/rv32zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll @@ -1110,29 +1110,28 @@ ; ; RV32ZBP-LABEL: gorc2b_i64: ; RV32ZBP: # %bb.0: -; RV32ZBP-NEXT: srli a2, a1, 2 -; RV32ZBP-NEXT: srli a3, a0, 2 -; RV32ZBP-NEXT: lui a4, 209715 -; RV32ZBP-NEXT: addi a4, a4, 819 -; RV32ZBP-NEXT: and a3, a3, a4 -; RV32ZBP-NEXT: or a3, a3, a0 -; RV32ZBP-NEXT: or a2, a2, a1 -; RV32ZBP-NEXT: orc2.n a1, a1 +; RV32ZBP-NEXT: srli a2, a0, 2 +; RV32ZBP-NEXT: srli a3, a1, 2 +; RV32ZBP-NEXT: or a3, a3, a1 +; RV32ZBP-NEXT: or a2, a2, a0 ; RV32ZBP-NEXT: orc2.n a0, a0 +; RV32ZBP-NEXT: orc2.n a1, a1 ; RV32ZBP-NEXT: slli a2, a2, 2 ; RV32ZBP-NEXT: slli a3, a3, 2 -; RV32ZBP-NEXT: lui a5, 838861 -; RV32ZBP-NEXT: addi a5, a5, -820 -; RV32ZBP-NEXT: and a3, a3, a5 -; RV32ZBP-NEXT: and a2, a2, a5 +; RV32ZBP-NEXT: lui a4, 838861 +; RV32ZBP-NEXT: addi a4, a4, -820 +; RV32ZBP-NEXT: and a3, a3, a4 +; RV32ZBP-NEXT: and a2, a2, a4 +; RV32ZBP-NEXT: srli a4, a1, 2 ; RV32ZBP-NEXT: srli a5, a0, 2 -; RV32ZBP-NEXT: srli a6, a1, 2 -; RV32ZBP-NEXT: and a6, a6, a4 -; RV32ZBP-NEXT: and a4, a5, a4 -; RV32ZBP-NEXT: or a0, a4, a0 -; RV32ZBP-NEXT: or a1, a6, a1 -; RV32ZBP-NEXT: or a1, a1, a2 -; RV32ZBP-NEXT: or a0, a0, a3 +; RV32ZBP-NEXT: 
lui a6, 209715 +; RV32ZBP-NEXT: addi a6, a6, 819 +; RV32ZBP-NEXT: and a5, a5, a6 +; RV32ZBP-NEXT: and a4, a4, a6 +; RV32ZBP-NEXT: or a1, a4, a1 +; RV32ZBP-NEXT: or a0, a5, a0 +; RV32ZBP-NEXT: or a0, a0, a2 +; RV32ZBP-NEXT: or a1, a1, a3 ; RV32ZBP-NEXT: ret %and1 = shl i64 %a, 2 %shl1 = and i64 %and1, -3689348814741910324 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll @@ -6,55 +6,55 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d9} -; CHECK-NEXT: vpush {d9} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q3, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: vmov.i64 q2, #0xffffffff ; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov r4, r1, d6 -; CHECK-NEXT: vmov r0, r12, d7 -; CHECK-NEXT: vldrw.u32 q3, [r2] -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov.f32 s0, s12 -; CHECK-NEXT: vmov.f32 s6, s13 -; CHECK-NEXT: adds r2, r5, r4 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: asr.w r6, r5, #31 -; CHECK-NEXT: adcs r1, r6 -; CHECK-NEXT: asrl r2, r1, r4 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: adds r6, r1, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: asr.w r4, r1, #31 -; CHECK-NEXT: adc.w r1, r4, lr -; CHECK-NEXT: asrl r6, r1, r3 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r3, r1, d1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vand q3, q1, q2 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vmov lr, r12, d7 +; CHECK-NEXT: vmov.f32 s16, s6 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vand q2, q4, q2 +; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r5, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: asrl r0, r5, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: asrs r4, r2, #31 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adcs r1, r4 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: asrl r2, r1, r3 +; CHECK-NEXT: vmov r4, r5, d6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: adds.w r6, r1, lr ; CHECK-NEXT: asr.w r3, r1, #31 ; CHECK-NEXT: adc.w r1, r3, r12 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: asrl r0, r1, r3 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: adds r6, r1, r5 -; CHECK-NEXT: asr.w r2, r1, #31 -; CHECK-NEXT: adc.w r1, r2, r4 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: asrl r6, r1, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r0 -; CHECK-NEXT: vpop {d9} +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: asrl r6, r1, r3 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r4, r4, r1 +; CHECK-NEXT: asr.w r3, r1, #31 +; CHECK-NEXT: adc.w r1, r3, r5 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: asrl r4, r1, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: 
vmov q0[3], q0[1], r6, r2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %a = load <4 x i32>, <4 x i32> *%A, align 4 @@ -142,56 +142,56 @@ ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d9} -; CHECK-NEXT: vpush {d9} -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q3, q2, q0 -; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, lr, d2 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vmov.i64 q4, #0xffffffff ; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov r5, r1, d6 -; CHECK-NEXT: vmov r0, r12, d7 -; CHECK-NEXT: vldrw.u32 q3, [r2] -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s14, s1 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov.f32 s4, s12 -; CHECK-NEXT: vmov.f32 s2, s13 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vand q2, q0, q4 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vand q1, q1, q4 +; CHECK-NEXT: vmov r5, r1, d3 +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r0, r12, d2 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vmov r4, lr, d5 +; CHECK-NEXT: vmov.f32 s20, s6 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s22, s7 +; CHECK-NEXT: vand q4, q5, q4 +; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: adds r2, r6, r5 -; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov r5, s18 ; CHECK-NEXT: asr.w r7, r6, #31 ; CHECK-NEXT: adcs r1, r7 ; CHECK-NEXT: asrl r2, r1, r5 -; CHECK-NEXT: vmov r7, s4 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r7, s2 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: adds r4, r4, r1 ; CHECK-NEXT: asr.w r5, r1, #31 ; CHECK-NEXT: adc.w r1, r5, lr ; CHECK-NEXT: asrl r4, r1, r7 -; CHECK-NEXT: vmov r6, r5, d3 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r2 +; CHECK-NEXT: vmov r6, r5, d4 +; CHECK-NEXT: vmov r1, s12 ; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: asr.w r7, r1, #31 ; CHECK-NEXT: adc.w r1, r7, r12 -; CHECK-NEXT: vmov r7, s18 +; CHECK-NEXT: vmov r7, s16 ; CHECK-NEXT: asrl r0, r1, r7 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: adds r6, r6, r1 -; CHECK-NEXT: asr.w r2, r1, #31 -; CHECK-NEXT: adc.w r1, r2, r5 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: asrl r6, r1, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r6, r0 -; CHECK-NEXT: vstrw.32 q1, [r3] -; CHECK-NEXT: vpop {d9} +; CHECK-NEXT: asr.w r7, r1, #31 +; CHECK-NEXT: adc.w r1, r7, r5 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: asrl r6, r1, r7 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r2 +; CHECK-NEXT: vstrw.32 q0, [r3] +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: @@ -276,8 +276,8 @@ define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) { ; CHECK-LABEL: load_one_store_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s2, s3 @@ -285,27 +285,27 @@ ; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: adds.w r12, r2, r2 ; CHECK-NEXT: asr.w r3, r2, #31 
-; CHECK-NEXT: adc.w r7, r3, r2, asr #31 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: asrl r12, r7, r2 -; CHECK-NEXT: adds r0, r3, r3 -; CHECK-NEXT: asr.w r5, r3, #31 -; CHECK-NEXT: adc.w r5, r5, r3, asr #31 -; CHECK-NEXT: asrl r0, r5, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: adds r4, r3, r3 -; CHECK-NEXT: asr.w r5, r3, #31 -; CHECK-NEXT: adc.w r5, r5, r3, asr #31 -; CHECK-NEXT: asrl r4, r5, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: adc.w r3, r3, r2, asr #31 +; CHECK-NEXT: asrl r12, r3, r2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adds r2, r3, r3 +; CHECK-NEXT: asr.w r0, r3, #31 +; CHECK-NEXT: adc.w r5, r0, r3, asr #31 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: asrl r2, r5, r3 ; CHECK-NEXT: adds r4, r0, r0 -; CHECK-NEXT: asr.w r2, r0, #31 -; CHECK-NEXT: adc.w r3, r2, r0, asr #31 +; CHECK-NEXT: asr.w r3, r0, #31 +; CHECK-NEXT: adc.w r3, r3, r0, asr #31 ; CHECK-NEXT: asrl r4, r3, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r12 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: adds r6, r0, r0 +; CHECK-NEXT: asr.w r3, r0, #31 +; CHECK-NEXT: adc.w r3, r3, r0, asr #31 +; CHECK-NEXT: asrl r6, r3, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %a = load <4 x i32>, <4 x i32> *%A, align 4 %sa = sext <4 x i32> %a to <4 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -180,44 +180,44 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: ext_add_ashr_trunc_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff ; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r3, r7, d2 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov r0, r1, d6 ; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r12, lr, d7 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: asr.w r5, r4, #31 +; CHECK-NEXT: vmov lr, r12, d7 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: asrs r5, r2, #31 +; CHECK-NEXT: adds r2, r2, r0 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: adcs r1, r5 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: lsrl r2, r1, #1 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: asrs r4, r5, #31 +; CHECK-NEXT: adds r6, r5, r3 +; CHECK-NEXT: vmov r3, r5, d3 +; CHECK-NEXT: vmov.f32 s6, s1 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: adds r2, r2, r1 -; CHECK-NEXT: asr.w r4, r1, #31 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: lsrl r2, r3, #1 -; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adds.w r4, r3, r12 -; CHECK-NEXT: asr.w r6, r3, #31 -; CHECK-NEXT: adc.w r3, r6, lr -; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adcs r7, r4 +; CHECK-NEXT: lsrl r6, r7, #1 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 +; CHECK-NEXT: vmov r1, 
s6 +; CHECK-NEXT: adds r6, r1, r3 +; CHECK-NEXT: asr.w r2, r1, #31 ; CHECK-NEXT: adc.w r1, r2, r5 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: lsrl r6, r1, #1 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r0 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> %sb = zext <4 x i32> %b to <4 x i64> @@ -328,107 +328,98 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: ext_ops_trunc_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s2 -; CHECK-NEXT: vmov.i64 q3, #0xffffffff +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r1, r7, d4 -; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vmov r2, r12, d5 -; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov r10, s8 +; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov r6, s2 ; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: adds r0, r3, r1 -; CHECK-NEXT: asr.w r5, r3, #31 -; CHECK-NEXT: adcs r5, r7 -; CHECK-NEXT: asrl r0, r5, r1 -; CHECK-NEXT: subs.w lr, r0, r1 -; CHECK-NEXT: asr.w r0, r6, #31 -; CHECK-NEXT: sbc.w r8, r5, r7 -; CHECK-NEXT: adds r4, r6, r2 -; CHECK-NEXT: adc.w r5, r0, r12 -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: asrl r4, r5, r2 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: asr.w r0, r10, #31 +; CHECK-NEXT: asrs r7, r6, #31 +; CHECK-NEXT: adds.w r4, r10, r2 +; CHECK-NEXT: adc r3, r0, #0 +; CHECK-NEXT: asrl r4, r3, r2 ; CHECK-NEXT: subs r0, r4, r2 -; CHECK-NEXT: sbc.w r5, r5, r12 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: umull r0, r4, r0, r2 -; CHECK-NEXT: mla r5, r5, r2, r4 -; CHECK-NEXT: eor.w r4, r3, r1 -; CHECK-NEXT: orr.w r4, r4, r3, asr #31 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, eq -; CHECK-NEXT: bfi r7, r4, #0, #8 -; CHECK-NEXT: eor.w r4, r6, r2 -; CHECK-NEXT: orr.w r4, r4, r6, asr #31 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: lsll r0, r5, r6 -; CHECK-NEXT: csetm r4, eq -; CHECK-NEXT: lsll r0, r5, r2 -; CHECK-NEXT: bfi r7, r4, #8, #8 -; CHECK-NEXT: rsbs r2, r3, #0 +; CHECK-NEXT: sbc lr, r3, #0 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: umull r0, r8, r0, r2 +; CHECK-NEXT: adds r4, r6, r3 +; CHECK-NEXT: eor.w r1, r6, r3 +; CHECK-NEXT: adc r5, r7, #0 +; CHECK-NEXT: eor.w r7, r10, r2 +; CHECK-NEXT: asrl r4, r5, r3 +; CHECK-NEXT: orr.w r7, r7, r10, asr #31 +; CHECK-NEXT: subs r4, r4, r3 +; CHECK-NEXT: orr.w r1, r1, r6, asr #31 +; CHECK-NEXT: sbc r5, r5, #0 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: umull r4, r12, r4, r3 +; CHECK-NEXT: csetm r9, eq +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: bfi r7, r9, #0, #8 +; CHECK-NEXT: csetm r1, eq +; CHECK-NEXT: bfi r7, r1, #8, #8 +; CHECK-NEXT: mla r5, r5, r3, r12 +; CHECK-NEXT: rsbs r1, r6, #0 ; CHECK-NEXT: vmsr p0, r7 -; CHECK-NEXT: umull r4, r7, lr, r1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: mla r7, r8, r1, r7 -; CHECK-NEXT: lsll r4, r7, r2 -; CHECK-NEXT: vmov r2, lr, d3 -; CHECK-NEXT: lsll r4, r7, r1 -; CHECK-NEXT: vmov r1, r7, d2 -; CHECK-NEXT: vmov 
q4[2], q4[0], r4, r0 -; CHECK-NEXT: vpsel q2, q4, q2 -; CHECK-NEXT: asrs r0, r3, #31 -; CHECK-NEXT: adds r4, r3, r1 -; CHECK-NEXT: adc.w r5, r0, r7 -; CHECK-NEXT: asrl r4, r5, r1 -; CHECK-NEXT: subs r0, r4, r1 -; CHECK-NEXT: sbc.w r7, r5, r7 -; CHECK-NEXT: umull r0, r4, r0, r1 -; CHECK-NEXT: mla r9, r7, r1, r4 -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: adds r6, r7, r2 -; CHECK-NEXT: asr.w r4, r7, #31 -; CHECK-NEXT: adc.w r5, r4, lr -; CHECK-NEXT: asrl r6, r5, r2 -; CHECK-NEXT: subs r4, r6, r2 -; CHECK-NEXT: sbc.w r6, r5, lr -; CHECK-NEXT: eor.w r5, r3, r1 -; CHECK-NEXT: orr.w r5, r5, r3, asr #31 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: lsll r0, r9, r3 -; CHECK-NEXT: csetm r5, eq -; CHECK-NEXT: rsbs r3, r7, #0 -; CHECK-NEXT: bfi r12, r5, #0, #8 -; CHECK-NEXT: eor.w r5, r7, r2 -; CHECK-NEXT: orr.w r5, r5, r7, asr #31 -; CHECK-NEXT: lsll r0, r9, r1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, eq -; CHECK-NEXT: bfi r12, r5, #8, #8 -; CHECK-NEXT: umull r4, r5, r4, r2 -; CHECK-NEXT: vmsr p0, r12 -; CHECK-NEXT: mla r5, r6, r2, r5 +; CHECK-NEXT: mla r7, lr, r2, r8 +; CHECK-NEXT: lsll r4, r5, r1 +; CHECK-NEXT: rsb.w r1, r10, #0 +; CHECK-NEXT: lsll r0, r7, r1 +; CHECK-NEXT: vmov lr, s2 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: lsll r0, r7, r2 ; CHECK-NEXT: lsll r4, r5, r3 -; CHECK-NEXT: lsll r4, r5, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vmov q3[2], q3[0], r0, r4 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: adds.w r2, lr, r1 +; CHECK-NEXT: asr.w r0, lr, #31 +; CHECK-NEXT: adc r3, r0, #0 +; CHECK-NEXT: asrl r2, r3, r1 +; CHECK-NEXT: subs r0, r2, r1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sbc r7, r3, #0 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: umull r0, r6, r0, r1 +; CHECK-NEXT: asrs r5, r2, #31 +; CHECK-NEXT: adds r4, r2, r3 +; CHECK-NEXT: adc r5, r5, #0 +; CHECK-NEXT: asrl r4, r5, r3 +; CHECK-NEXT: subs r4, r4, r3 +; CHECK-NEXT: sbc r8, r5, #0 +; CHECK-NEXT: mla r5, r7, r1, r6 +; CHECK-NEXT: eor.w r6, lr, r1 +; CHECK-NEXT: orr.w r6, r6, lr, asr #31 +; CHECK-NEXT: eor.w r7, r2, r3 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: orr.w r7, r7, r2, asr #31 +; CHECK-NEXT: csetm r6, eq +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: csetm r7, eq +; CHECK-NEXT: rsb.w lr, lr, #0 +; CHECK-NEXT: bfi r12, r7, #0, #8 +; CHECK-NEXT: lsll r0, r5, lr +; CHECK-NEXT: bfi r12, r6, #8, #8 +; CHECK-NEXT: umull r4, r6, r4, r3 +; CHECK-NEXT: lsll r0, r5, r1 +; CHECK-NEXT: rsbs r1, r2, #0 +; CHECK-NEXT: vmsr p0, r12 +; CHECK-NEXT: mla r7, r8, r3, r6 +; CHECK-NEXT: lsll r4, r7, r1 +; CHECK-NEXT: lsll r4, r7, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s10 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> %sb = zext <4 x i32> %b to <4 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll @@ -57,19 +57,19 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vpt.s32 lt, q0, zr ; CHECK-NEXT: vldrwt.u32 q5, [r0] -; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vmov.f32 s2, s23 +; CHECK-NEXT: vmov.f32 s16, s22 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov 
r2, s20 +; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov.f32 s2, s23 +; CHECK-NEXT: vmov.f32 s2, s21 ; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: vmov.f32 s20, s22 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 @@ -82,8 +82,8 @@ ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d ; CHECK-NEXT: vmov d10, r0, r1 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -401,26 +401,26 @@ ; CHECK-NEXT: subs r4, r4, r6 ; CHECK-NEXT: sbc.w r9, r3, r6, asr #31 ; CHECK-NEXT: vmov r6, s8 +; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: subs r5, r7, r6 +; CHECK-NEXT: asr.w r7, r7, #31 ; CHECK-NEXT: vmov q2[2], q2[0], r5, r8 -; CHECK-NEXT: asr.w r5, r7, #31 -; CHECK-NEXT: sbc.w r5, r5, r6, asr #31 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r7, s6 -; CHECK-NEXT: subs r3, r7, r6 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r3 -; CHECK-NEXT: asr.w r3, r5, #31 -; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: bfi r4, r3, #0, #4 -; CHECK-NEXT: asr.w r3, r9, #31 -; CHECK-NEXT: bfi r4, r3, #4, #4 -; CHECK-NEXT: asr.w r3, r12, #31 -; CHECK-NEXT: bfi r4, r3, #8, #4 -; CHECK-NEXT: asr.w r3, r7, #31 -; CHECK-NEXT: sbc.w r3, r3, r6, asr #31 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: sbc.w r6, r7, r6, asr #31 +; CHECK-NEXT: asrs r6, r6, #31 +; CHECK-NEXT: subs r7, r3, r5 +; CHECK-NEXT: asr.w r3, r3, #31 +; CHECK-NEXT: vmov q2[3], q2[1], r4, r7 +; CHECK-NEXT: mov.w r7, #0 +; CHECK-NEXT: sbc.w r3, r3, r5, asr #31 +; CHECK-NEXT: bfi r7, r6, #0, #4 +; CHECK-NEXT: asr.w r4, r9, #31 +; CHECK-NEXT: asr.w r6, r12, #31 +; CHECK-NEXT: bfi r7, r4, #4, #4 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: bfi r4, r3, #12, #4 -; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: bfi r7, r6, #8, #4 +; CHECK-NEXT: bfi r7, r3, #12, #4 +; CHECK-NEXT: vmsr p0, r7 ; CHECK-NEXT: vpst ; CHECK-NEXT: vsubt.i32 q2, q0, q2 ; CHECK-NEXT: vstrb.8 q2, [r2], #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll @@ -232,34 +232,33 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: umull lr, r12, r1, r0 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: umull r2, r5, r3, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: umull lr, r12, r1, r0 +; CHECK-NEXT: vmov q1[2], q1[0], r2, lr ; CHECK-NEXT: asrs r2, r0, #31 ; CHECK-NEXT: mla r4, r1, r2, r12 ; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: mla r5, r3, r2, r5 ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: mla r1, r1, r0, r4 -; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: mla r3, r3, r0, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: umull r5, lr, r4, r0 -; CHECK-NEXT: umull r3, r12, r1, r0 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r3 -; CHECK-NEXT: mla r3, r1, r2, r12 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r1 +; 
CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: umull r3, r5, r1, r0 +; CHECK-NEXT: mla r5, r1, r2, r5 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r2, r4, r2, lr -; CHECK-NEXT: mla r1, r1, r0, r3 -; CHECK-NEXT: asrs r3, r4, #31 -; CHECK-NEXT: mla r0, r3, r0, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: mla r12, r1, r0, r5 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: umull r4, r1, r5, r0 +; CHECK-NEXT: mla r1, r5, r2, r1 +; CHECK-NEXT: asrs r2, r5, #31 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: mla r0, r2, r0, r1 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -276,34 +275,33 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s4, s1 ; CHECK-NEXT: asrs r4, r0, #31 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: umull lr, r12, r0, r1 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: umull r2, r5, r0, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: umull lr, r12, r0, r1 +; CHECK-NEXT: vmov q1[2], q1[0], r2, lr ; CHECK-NEXT: asrs r2, r1, #31 ; CHECK-NEXT: mla r2, r0, r2, r12 ; CHECK-NEXT: mla r1, r4, r1, r2 ; CHECK-NEXT: asrs r2, r3, #31 ; CHECK-NEXT: mla r2, r0, r2, r5 -; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: mla r2, r4, r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: umull r3, lr, r0, r5 -; CHECK-NEXT: umull r2, r12, r0, r1 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: mla r2, r0, r2, r12 -; CHECK-NEXT: mla r1, r4, r1, r2 -; CHECK-NEXT: asrs r2, r5, #31 -; CHECK-NEXT: mla r0, r0, r2, lr -; CHECK-NEXT: mla r0, r4, r5, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: umull r2, r3, r0, r1 +; CHECK-NEXT: asrs r5, r1, #31 +; CHECK-NEXT: mla r3, r0, r5, r3 +; CHECK-NEXT: mla r12, r4, r1, r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: umull r5, r1, r0, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 +; CHECK-NEXT: asrs r2, r3, #31 +; CHECK-NEXT: mla r0, r0, r2, r1 +; CHECK-NEXT: mla r0, r4, r3, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -8,21 +8,18 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] +; CHECK-NEXT: ldrd r12, r3, [r0] +; CHECK-NEXT: ldrd lr, r2, [r0, #8] ; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 -; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov.f32 s8, s7 -; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmov q1[2], q1[0], r12, lr +; CHECK-NEXT: strd r2, r0, [r1, #16] +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vmov.f32 s9, s6 ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s5 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: strd r2, r0, [r1, #16] ; CHECK-NEXT: pop {r4, pc} entry: %s1 = 
getelementptr <2 x i32>, <2 x i32>* %src, i32 0 diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1764,104 +1764,159 @@ ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: pushq %rax ; SSE2-NEXT: movaps (%rdi), %xmm1 ; SSE2-NEXT: movaps (%rsi), %xmm0 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: addq %rax, %r9 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: addq %rdi, %rbx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: addq %rcx, %rax +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d +; SSE2-NEXT: addq %rdx, %r11 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSE2-NEXT: addq %rbp, %r12 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: addq %rsi, %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: addq %r10, %rax +; SSE2-NEXT: movq %rax, %rbp +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: addq %r8, %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE2-NEXT: addq %r15, %r14 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: addq %rsi, %rcx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: addq %rbp, %rax +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: leaq -1(%rax,%rsi), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: 
leaq -1(%rdx,%rsi), %r11 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%rdi,%rdx), %rsi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r8,%rdx), %rdi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r9,%rdx), %r8 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%rbx,%rdx), %rbx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r10,%rdx), %r9 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r13,%rdx), %r13 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r12,%rdx), %r12 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r14,%rdx), %r14 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r15,%rdx), %r15 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %r10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: xorl %ebp, %ebp +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: leaq -1(%rax,%rsi), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leaq -1(%r13,%rsi), %rsi +; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi +; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: addq $-1, %r9 +; SSE2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movl $0, %r15d +; SSE2-NEXT: adcq $-1, %r15 +; SSE2-NEXT: addq $-1, %rbx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movl $0, %r10d +; SSE2-NEXT: adcq $-1, %r10 ; SSE2-NEXT: addq $-1, %rcx -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: adcq $-1, %rdx -; SSE2-NEXT: addq $-1, %rax +; SSE2-NEXT: movq %rcx, (%rsp) # 8-byte Spill +; SSE2-NEXT: movl $0, %ebx +; SSE2-NEXT: adcq $-1, %rbx +; SSE2-NEXT: addq $-1, %r11 +; SSE2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: adcq $-1, %rsi +; SSE2-NEXT: addq $-1, %r12 +; SSE2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movl $0, %r13d +; SSE2-NEXT: adcq $-1, %r13 +; SSE2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: adcq $-1, %r11 +; SSE2-NEXT: addq $-1, %rbp +; SSE2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movl $0, %r12d +; SSE2-NEXT: adcq $-1, %r12 +; SSE2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE2-NEXT: movl $0, %r9d +; SSE2-NEXT: adcq $-1, %r9 +; SSE2-NEXT: addq $-1, %r14 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: addq $-1, %r8 +; 
SSE2-NEXT: movl $0, %ebp ; SSE2-NEXT: adcq $-1, %rbp -; SSE2-NEXT: shldq $63, %rax, %rbp -; SSE2-NEXT: shldq $63, %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm8 -; SSE2-NEXT: movq %rbp, %xmm0 -; SSE2-NEXT: shrq %r11 -; SSE2-NEXT: movq %r11, %xmm9 -; SSE2-NEXT: shrq %rsi +; SSE2-NEXT: addq $-1, %rdi +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: adcq $-1, %rcx +; SSE2-NEXT: addq $-1, %rdx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: shldq $63, %rdx, %rax +; SSE2-NEXT: shldq $63, %rdi, %rcx +; SSE2-NEXT: movq %rcx, %rdx +; SSE2-NEXT: shldq $63, %r8, %rbp +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; SSE2-NEXT: shldq $63, %r14, %rdi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %r9 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %r12 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %r11 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %r13 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %rsi +; SSE2-NEXT: movq (%rsp), %rcx # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %rbx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %r10 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %r15 +; SSE2-NEXT: movq %r15, %xmm8 +; SSE2-NEXT: movq %r10, %xmm0 +; SSE2-NEXT: movq %rbx, %xmm9 ; SSE2-NEXT: movq %rsi, %xmm2 -; SSE2-NEXT: shrq %rdi -; SSE2-NEXT: movq %rdi, %xmm10 -; SSE2-NEXT: shrq %r8 -; SSE2-NEXT: movq %r8, %xmm4 -; SSE2-NEXT: shrq %rbx -; SSE2-NEXT: movq %rbx, %xmm11 -; SSE2-NEXT: shrq %r9 +; SSE2-NEXT: movq %r13, %xmm10 +; SSE2-NEXT: movq %r11, %xmm4 +; SSE2-NEXT: movq %r12, %xmm11 ; SSE2-NEXT: movq %r9, %xmm7 -; SSE2-NEXT: shrq %r13 -; SSE2-NEXT: movq %r13, %xmm12 -; SSE2-NEXT: shrq %r12 -; SSE2-NEXT: movq %r12, %xmm1 -; SSE2-NEXT: shrq %r14 -; SSE2-NEXT: movq %r14, %xmm13 -; SSE2-NEXT: shrq %r15 -; SSE2-NEXT: movq %r15, %xmm6 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm14 -; SSE2-NEXT: shrq %r10 -; SSE2-NEXT: movq %r10, %xmm5 +; SSE2-NEXT: movq %rdi, %xmm12 +; SSE2-NEXT: movq %rbp, %xmm5 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: movq %rcx, %xmm13 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: movq %rcx, %xmm1 +; SSE2-NEXT: movq %rdx, %xmm14 +; SSE2-NEXT: movq %rax, %xmm6 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax ; SSE2-NEXT: movq %rax, %xmm15 @@ -1885,24 +1940,25 @@ ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE2-NEXT: movupd %xmm1, (%rax) +; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE2-NEXT: movupd %xmm0, (%rax) +; SSE2-NEXT: addq $8, %rsp ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -1923,98 +1979,150 @@ ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: vpextrw $6, %xmm0, %eax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $7, %xmm0, %r11d -; AVX1-NEXT: vpextrw $0, %xmm3, %r14d -; AVX1-NEXT: vpextrw $1, %xmm3, %r15d -; AVX1-NEXT: vpextrw $2, %xmm3, %r10d -; AVX1-NEXT: vpextrw $3, %xmm3, %r9d -; AVX1-NEXT: vpextrw $4, %xmm3, %r8d -; AVX1-NEXT: vpextrw $5, %xmm3, %ebx -; AVX1-NEXT: vpextrw $6, %xmm3, %ebp -; AVX1-NEXT: vpextrw $7, %xmm3, %edi -; AVX1-NEXT: vpextrw $1, %xmm0, %edx -; AVX1-NEXT: vpextrw $0, %xmm0, %esi -; AVX1-NEXT: vpextrw $1, %xmm1, %ecx +; AVX1-NEXT: vpextrw $7, %xmm0, %eax +; 
AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $7, %xmm3, %ebp +; AVX1-NEXT: vpextrw $6, %xmm3, %ebx +; AVX1-NEXT: vpextrw $5, %xmm3, %edx +; AVX1-NEXT: vpextrw $4, %xmm3, %eax +; AVX1-NEXT: vpextrw $1, %xmm3, %r14d +; AVX1-NEXT: vpextrw $0, %xmm3, %r9d +; AVX1-NEXT: vpextrw $3, %xmm3, %r13d +; AVX1-NEXT: vpextrw $2, %xmm3, %r12d +; AVX1-NEXT: vpextrw $5, %xmm0, %r15d +; AVX1-NEXT: vpextrw $4, %xmm0, %esi +; AVX1-NEXT: vpextrw $1, %xmm0, %ecx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $0, %xmm0, %ecx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $7, %xmm2, %edi +; AVX1-NEXT: addq %rbp, %rdi +; AVX1-NEXT: vpextrw $6, %xmm2, %r8d +; AVX1-NEXT: addq %rbx, %r8 +; AVX1-NEXT: vpextrw $5, %xmm2, %ecx ; AVX1-NEXT: addq %rdx, %rcx -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: addq %rsi, %rax -; AVX1-NEXT: vpextrw $7, %xmm2, %edx -; AVX1-NEXT: leaq -1(%rdi,%rdx), %rdi -; AVX1-NEXT: vpextrw $6, %xmm2, %edx -; AVX1-NEXT: leaq -1(%rbp,%rdx), %rbp -; AVX1-NEXT: vpextrw $5, %xmm2, %edx -; AVX1-NEXT: leaq -1(%rbx,%rdx), %rbx -; AVX1-NEXT: vpextrw $4, %xmm2, %edx -; AVX1-NEXT: leaq -1(%r8,%rdx), %r8 -; AVX1-NEXT: vpextrw $3, %xmm2, %edx -; AVX1-NEXT: leaq -1(%r9,%rdx), %r9 -; AVX1-NEXT: vpextrw $2, %xmm2, %edx -; AVX1-NEXT: leaq -1(%r10,%rdx), %r10 -; AVX1-NEXT: vpextrw $1, %xmm2, %edx -; AVX1-NEXT: leaq -1(%r15,%rdx), %r13 -; AVX1-NEXT: vpextrw $0, %xmm2, %edx -; AVX1-NEXT: leaq -1(%r14,%rdx), %r12 -; AVX1-NEXT: vpextrw $7, %xmm1, %edx -; AVX1-NEXT: leaq -1(%r11,%rdx), %r15 -; AVX1-NEXT: vpextrw $6, %xmm1, %edx +; AVX1-NEXT: movq %rcx, %r11 +; AVX1-NEXT: vpextrw $4, %xmm2, %r10d +; AVX1-NEXT: addq %rax, %r10 +; AVX1-NEXT: vpextrw $1, %xmm2, %eax +; AVX1-NEXT: addq %r14, %rax +; AVX1-NEXT: movq %rax, %r14 +; AVX1-NEXT: vpextrw $0, %xmm2, %eax +; AVX1-NEXT: addq %r9, %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm2, %eax +; AVX1-NEXT: addq %r13, %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $2, %xmm2, %eax +; AVX1-NEXT: addq %r12, %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $5, %xmm1, %ebp +; AVX1-NEXT: addq %r15, %rbp +; AVX1-NEXT: vpextrw $4, %xmm1, %ebx +; AVX1-NEXT: addq %rsi, %rbx +; AVX1-NEXT: vpextrw $1, %xmm1, %r9d +; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX1-NEXT: vpextrw $0, %xmm1, %esi +; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX1-NEXT: vpextrw $7, %xmm1, %ecx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: leaq -1(%rax,%rcx), %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm1, %ecx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: leaq -1(%rax,%rcx), %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm0, %ecx +; AVX1-NEXT: vpextrw $3, %xmm1, %edx +; AVX1-NEXT: leaq -1(%rcx,%rdx), %rcx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $2, %xmm0, %ecx +; AVX1-NEXT: vpextrw $2, %xmm1, %edx +; AVX1-NEXT: leaq -1(%rcx,%rdx), %rcx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %rdi +; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: adcq $-1, %rax +; AVX1-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %r8 +; AVX1-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %r13d +; AVX1-NEXT: adcq $-1, %r13 +; AVX1-NEXT: addq $-1, %r11 +; AVX1-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %r12d +; AVX1-NEXT: adcq $-1, %r12 +; AVX1-NEXT: addq $-1, %r10 +; AVX1-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %r15d +; AVX1-NEXT: adcq $-1, %r15 +; AVX1-NEXT: addq $-1, %r14 +; AVX1-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %r14d +; AVX1-NEXT: adcq $-1, %r14 +; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: movl $0, %r11d +; AVX1-NEXT: adcq $-1, %r11 +; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: movl $0, %r10d +; AVX1-NEXT: adcq $-1, %r10 +; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: adcq $-1, %rdx +; AVX1-NEXT: addq $-1, %rbp +; AVX1-NEXT: movl $0, %edi +; AVX1-NEXT: adcq $-1, %rdi +; AVX1-NEXT: addq $-1, %rbx +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: adcq $-1, %rcx +; AVX1-NEXT: addq $-1, %r9 +; AVX1-NEXT: movl $0, %r8d +; AVX1-NEXT: adcq $-1, %r8 +; AVX1-NEXT: addq $-1, %rsi +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: adcq $-1, %rax +; AVX1-NEXT: shldq $63, %rsi, %rax +; AVX1-NEXT: shldq $63, %r9, %r8 +; AVX1-NEXT: shldq $63, %rbx, %rcx +; AVX1-NEXT: shldq $63, %rbp, %rdi ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rsi,%rdx), %r14 -; AVX1-NEXT: vpextrw $5, %xmm1, %edx +; AVX1-NEXT: shldq $63, %rsi, %rdx ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rsi,%rdx), %r11 -; AVX1-NEXT: vpextrw $4, %xmm1, %edx +; AVX1-NEXT: shldq $63, %rsi, %r10 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rsi,%rdx), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $3, %xmm0, %edx -; AVX1-NEXT: vpextrw $3, %xmm1, %esi -; AVX1-NEXT: leaq -1(%rdx,%rsi), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $2, %xmm0, %edx -; AVX1-NEXT: vpextrw $2, %xmm1, %esi -; AVX1-NEXT: leaq -1(%rdx,%rsi), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: xorl %edx, %edx -; AVX1-NEXT: addq $-1, %rcx -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: adcq $-1, %rsi -; AVX1-NEXT: addq $-1, %rax -; AVX1-NEXT: adcq $-1, %rdx -; AVX1-NEXT: shldq $63, %rax, %rdx -; AVX1-NEXT: shldq $63, %rcx, %rsi -; AVX1-NEXT: shrq %rdi -; AVX1-NEXT: vmovq %rdi, %xmm8 -; AVX1-NEXT: shrq %rbp -; AVX1-NEXT: vmovq %rbp, %xmm9 -; AVX1-NEXT: shrq %rbx -; AVX1-NEXT: vmovq %rbx, %xmm0 -; AVX1-NEXT: shrq %r8 -; AVX1-NEXT: vmovq %r8, %xmm1 -; AVX1-NEXT: shrq %r9 -; AVX1-NEXT: vmovq %r9, %xmm12 -; AVX1-NEXT: shrq %r10 -; AVX1-NEXT: vmovq %r10, %xmm13 -; AVX1-NEXT: shrq %r13 -; AVX1-NEXT: vmovq %r13, %xmm14 -; AVX1-NEXT: shrq %r12 -; AVX1-NEXT: vmovq %r12, %xmm15 -; AVX1-NEXT: shrq %r15 -; AVX1-NEXT: vmovq %r15, %xmm10 -; AVX1-NEXT: shrq %r14 -; AVX1-NEXT: vmovq %r14, %xmm11 -; AVX1-NEXT: shrq %r11 -; AVX1-NEXT: vmovq %r11, %xmm2 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm3 -; AVX1-NEXT: vmovq %rsi, %xmm4 -; AVX1-NEXT: vmovq %rdx, %xmm5 +; AVX1-NEXT: shldq $63, %rsi, %r11 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload 
+; AVX1-NEXT: shldq $63, %rsi, %r14 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX1-NEXT: shldq $63, %rsi, %r15 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX1-NEXT: shldq $63, %rsi, %r12 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX1-NEXT: shldq $63, %rsi, %r13 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbp, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm8 +; AVX1-NEXT: vmovq %r13, %xmm9 +; AVX1-NEXT: vmovq %r12, %xmm0 +; AVX1-NEXT: vmovq %r15, %xmm1 +; AVX1-NEXT: vmovq %r14, %xmm12 +; AVX1-NEXT: vmovq %r11, %xmm13 +; AVX1-NEXT: vmovq %r10, %xmm14 +; AVX1-NEXT: vmovq %rdx, %xmm15 +; AVX1-NEXT: vmovq %rdi, %xmm10 +; AVX1-NEXT: vmovq %rcx, %xmm11 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: vmovq %rcx, %xmm3 +; AVX1-NEXT: vmovq %r8, %xmm4 +; AVX1-NEXT: vmovq %rax, %xmm5 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: vmovq %rax, %xmm6 @@ -2023,26 +2131,26 @@ ; AVX1-NEXT: vmovq %rax, %xmm7 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7] +; AVX1-NEXT: vpsllq $48, %xmm8, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm8[6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX1-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; AVX1-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %r12 @@ -2061,187 +2169,209 @@ ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm8 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm1 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; 
AVX2-NEXT: vmovq %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm7, %rbx +; AVX2-NEXT: vmovq %xmm7, %rbp +; AVX2-NEXT: vpextrq $1, %xmm6, %rdi +; AVX2-NEXT: vmovq %xmm6, %rsi ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm2, %r11 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %r14 -; AVX2-NEXT: vpextrq $1, %xmm0, %rbx -; AVX2-NEXT: vpextrq $1, %xmm2, %rsi -; AVX2-NEXT: vpextrq $1, %xmm7, %r12 -; AVX2-NEXT: vpextrq $1, %xmm6, %r15 -; AVX2-NEXT: vpextrq $1, %xmm5, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vmovq %xmm3, %rbp -; AVX2-NEXT: vpextrq $1, %xmm9, %r9 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-NEXT: vpextrq $1, %xmm2, %rdx +; AVX2-NEXT: vmovq %xmm2, %r9 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %r13 +; AVX2-NEXT: vmovq %xmm2, %r12 +; AVX2-NEXT: vpextrq $1, %xmm4, %r15 +; AVX2-NEXT: vmovq %xmm4, %r14 +; AVX2-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vpextrq $1, %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: addq %rbx, %rcx +; AVX2-NEXT: movq %rcx, %rbx +; AVX2-NEXT: vmovq %xmm0, %r10 +; AVX2-NEXT: addq %rbp, %r10 +; AVX2-NEXT: vpextrq $1, %xmm7, %rcx +; AVX2-NEXT: addq %rdi, %rcx +; AVX2-NEXT: movq %rcx, %rdi +; AVX2-NEXT: vmovq %xmm7, %r11 +; AVX2-NEXT: addq %rsi, %r11 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: addq %rdx, %rax +; AVX2-NEXT: vmovq %xmm0, %r8 +; AVX2-NEXT: addq %r9, %r8 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; 
AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rdi -; AVX2-NEXT: addq %rbx, %rdi -; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vpextrq $1, %xmm8, %r10 -; AVX2-NEXT: addq %rsi, %r10 -; AVX2-NEXT: vpextrq $1, %xmm7, %rsi -; AVX2-NEXT: addq %r12, %rsi -; AVX2-NEXT: movq %rsi, %r12 -; AVX2-NEXT: vpextrq $1, %xmm4, %r13 -; AVX2-NEXT: addq %r15, %r13 -; AVX2-NEXT: vpextrq $1, %xmm5, %r15 -; AVX2-NEXT: addq %rdx, %r15 -; AVX2-NEXT: vpextrq $1, %xmm3, %r8 -; AVX2-NEXT: addq %rcx, %r8 -; AVX2-NEXT: vpextrq $1, %xmm6, %rsi -; AVX2-NEXT: addq %rax, %rsi -; AVX2-NEXT: vmovq %xmm6, %rdx -; AVX2-NEXT: addq %rbp, %rdx -; AVX2-NEXT: vpextrq $1, %xmm2, %rcx -; AVX2-NEXT: addq %r9, %rcx -; AVX2-NEXT: vmovq %xmm0, %rdi -; AVX2-NEXT: leaq -1(%r14,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm8, %rdi -; AVX2-NEXT: leaq -1(%r11,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm7, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm4, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm5, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: vmovq %xmm3, %rbp -; AVX2-NEXT: leaq -1(%rdi,%rbp), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm9, %rdi -; AVX2-NEXT: vmovq %xmm2, %rbp -; AVX2-NEXT: leaq -1(%rdi,%rbp), %rdi -; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: addq %r13, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: addq %r12, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrq $1, %xmm6, %rcx +; AVX2-NEXT: addq %r15, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vmovq %xmm6, %rcx +; AVX2-NEXT: addq %r14, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrq $1, %xmm3, %rbp +; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; AVX2-NEXT: vpextrq $1, %xmm4, %r14 +; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; AVX2-NEXT: vmovq %xmm4, %r9 +; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX2-NEXT: vpextrq $1, %xmm2, %rsi +; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm3, %rdx +; AVX2-NEXT: leaq -1(%rcx,%rdx), %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vmovq %xmm8, %rcx +; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: leaq -1(%rcx,%rdx), %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: addq $-1, %rbx ; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %r9d -; AVX2-NEXT: adcq $-1, %r9 +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: adcq $-1, %rcx +; AVX2-NEXT: movq %rcx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: addq $-1, %r10 ; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %edi -; AVX2-NEXT: adcq $-1, %rdi -; AVX2-NEXT: addq $-1, %r12 -; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %r11d -; AVX2-NEXT: adcq $-1, %r11 -; AVX2-NEXT: addq $-1, %r13 -; AVX2-NEXT: movl $0, %r10d -; AVX2-NEXT: adcq $-1, %r10 -; AVX2-NEXT: addq $-1, %r15 -; AVX2-NEXT: movl $0, %r14d -; AVX2-NEXT: adcq $-1, %r14 -; AVX2-NEXT: addq $-1, %r8 -; AVX2-NEXT: movl $0, %ebp -; AVX2-NEXT: adcq $-1, %rbp -; AVX2-NEXT: addq $-1, %rsi +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: adcq $-1, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %rdi +; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: adcq $-1, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %r11 +; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %r13d +; AVX2-NEXT: adcq $-1, %r13 +; AVX2-NEXT: addq $-1, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: movl $0, %r12d ; AVX2-NEXT: adcq $-1, %r12 -; AVX2-NEXT: addq $-1, %rdx +; AVX2-NEXT: addq $-1, %r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %r15d +; AVX2-NEXT: adcq $-1, %r15 +; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: movl $0, %ebx ; AVX2-NEXT: adcq $-1, %rbx -; AVX2-NEXT: addq $-1, %rcx +; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movl $0, %r10d +; AVX2-NEXT: adcq $-1, %r10 +; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movl $0, %r11d +; AVX2-NEXT: adcq $-1, %r11 +; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movl $0, %r8d +; AVX2-NEXT: adcq $-1, %r8 +; AVX2-NEXT: addq $-1, %rbp +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: adcq $-1, %rdx +; AVX2-NEXT: addq $-1, %r14 +; AVX2-NEXT: movl $0, %edi +; AVX2-NEXT: adcq $-1, %rdi +; AVX2-NEXT: addq $-1, %r9 +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: adcq $-1, %rcx +; AVX2-NEXT: addq $-1, %rsi ; AVX2-NEXT: movl $0, %eax ; AVX2-NEXT: adcq $-1, %rax -; AVX2-NEXT: shldq $63, %rcx, %rax -; AVX2-NEXT: shldq $63, %rdx, %rbx +; AVX2-NEXT: shldq $63, %rsi, %rax +; AVX2-NEXT: shldq $63, %r9, %rcx +; AVX2-NEXT: shldq $63, %r14, %rdi +; AVX2-NEXT: shldq $63, %rbp, %rdx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: shldq $63, %rsi, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: shldq $63, %rsi, %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: shldq $63, %rsi, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: shldq $63, %rsi, %rbx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: shldq $63, %rsi, %r15 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX2-NEXT: shldq $63, %rsi, %r12 -; AVX2-NEXT: shldq $63, %r8, %rbp -; AVX2-NEXT: shldq $63, %r15, %r14 -; AVX2-NEXT: shldq $63, %r13, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq 
$63, %rcx, %r9 -; AVX2-NEXT: vmovq %r9, %xmm8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm9 -; AVX2-NEXT: vmovq %rdi, %xmm0 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vmovq %r11, %xmm12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm13 -; AVX2-NEXT: vmovq %r10, %xmm14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm15 -; AVX2-NEXT: vmovq %r14, %xmm10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm11 -; AVX2-NEXT: vmovq %rbp, %xmm2 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm3 -; AVX2-NEXT: vmovq %r12, %xmm4 -; AVX2-NEXT: vmovq %rbx, %xmm5 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: shldq $63, %rsi, %r13 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: shldq $63, %rsi, %r14 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: shldq $63, %rsi, %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX2-NEXT: shldq $63, %rbp, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm8 +; AVX2-NEXT: vmovq %r9, %xmm9 +; AVX2-NEXT: vmovq %r14, %xmm0 +; AVX2-NEXT: vmovq %r13, %xmm1 +; AVX2-NEXT: vmovq %r12, %xmm12 +; AVX2-NEXT: vmovq %r15, %xmm13 +; AVX2-NEXT: vmovq %rbx, %xmm14 +; AVX2-NEXT: vmovq %r10, %xmm15 +; AVX2-NEXT: vmovq %r11, %xmm10 +; AVX2-NEXT: vmovq %r8, %xmm11 +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX2-NEXT: shrq %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vmovq %rdi, %xmm4 +; AVX2-NEXT: vmovq %rcx, %xmm5 ; AVX2-NEXT: vmovq %rax, %xmm6 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: vmovq %rax, %xmm7 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpbroadcastw %xmm8, %xmm8 +; AVX2-NEXT: vpsllq $48, %xmm8, %xmm8 ; AVX2-NEXT: vpbroadcastw %xmm9, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm9, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3] +; AVX2-NEXT: vpunpcklbw {{.*#+}} 
xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll --- a/llvm/test/CodeGen/X86/combine-bitreverse.ll +++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll @@ -349,19 +349,17 @@ ; X64-LABEL: test_bitreverse_shli_bitreverse_i64: ; X64: # %bb.0: ; X64-NEXT: bswapq %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq $4, %rax -; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: andq %rcx, %rdi -; X64-NEXT: shlq $4, %rdi -; X64-NEXT: orq %rax, %rdi -; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rax, %rdi -; X64-NEXT: leaq (%rdi,%rcx,4), %rax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X64-NEXT: shll $4, %eax +; X64-NEXT: shrl $4, %edi +; X64-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; X64-NEXT: orl %eax, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X64-NEXT: shrl $2, %edi +; X64-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X64-NEXT: leal (%rdi,%rax,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555 ; X64-NEXT: shrl %eax diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -50,55 +50,59 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; 
X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %edi +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: setb %al +; X86-NEXT: movzbl %al, %ecx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb %al -; X86-NEXT: movzbl %al, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: addb $255, %cl +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %eax -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: adcl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, %eax -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: movl %edi, %ebx +; X86-NEXT: setb %ah +; X86-NEXT: addb $255, %al +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movzbl %ah, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %edx, %eax -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %eax -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %edx, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: addl $8, %esp +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/fshl-splat-undef.ll b/llvm/test/CodeGen/X86/fshl-splat-undef.ll --- a/llvm/test/CodeGen/X86/fshl-splat-undef.ll +++ b/llvm/test/CodeGen/X86/fshl-splat-undef.ll @@ -20,14 +20,15 @@ define void @test_fshl(<8 x i64> %lo, <8 x i64> %hi, <8 x i64>* %arr) { ; CHECK-LABEL: test_fshl: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl $63, %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: movl $12, %eax -; CHECK-NEXT: vmovd %eax, %xmm3 -; CHECK-NEXT: vpand %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vpsllq %xmm2, %zmm1, %zmm1 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: vpsrlq $52, %zmm0, %zmm0 +; CHECK-NEXT: movl $12, %ecx +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; CHECK-NEXT: vmovd %ecx, %xmm3 +; CHECK-NEXT: vpand %xmm2, %xmm3, 
%xmm4 +; CHECK-NEXT: vpsllq %xmm4, %zmm1, %zmm1 +; CHECK-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpsrlq $1, %zmm0, %zmm0 +; CHECK-NEXT: vpsrlq %xmm2, %zmm0, %zmm0 ; CHECK-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm0 ; CHECK-NEXT: vmovdqa64 %zmm0, (%eax) ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -556,18 +556,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE2-NEXT: pand %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -583,18 +581,16 @@ ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: movd %eax, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE2-NEXT: pand %xmm1, %xmm0 ; X64-SSE2-NEXT: pxor %xmm1, %xmm1 ; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X64-SSE2-NEXT: retq @@ -654,18 +650,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE2-NEXT: 
pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE2-NEXT: pand %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -681,18 +675,16 @@ ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: movd %eax, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE2-NEXT: pand %xmm1, %xmm0 ; X64-SSE2-NEXT: pxor %xmm1, %xmm1 ; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X64-SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -117,7 +117,7 @@ ; X64-NEXT: movzwl 4(%rdi), %eax ; X64-NEXT: movzbl 6(%rdi), %ecx ; X64-NEXT: movb %cl, 6(%rdi) -; X64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx ; X64-NEXT: shll $16, %ecx ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: shlq $32, %rcx @@ -149,7 +149,7 @@ ; X64-NEXT: movzwl 4(%rdi), %eax ; X64-NEXT: movzbl 6(%rdi), %ecx ; X64-NEXT: movb %cl, 6(%rdi) -; X64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx ; X64-NEXT: shll $16, %ecx ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: shlq $32, %rcx @@ -187,19 +187,18 @@ ; X64-NEXT: movzwl 4(%rdi), %ecx ; X64-NEXT: movzbl 6(%rdi), %edx ; X64-NEXT: movb %dl, 6(%rdi) -; X64-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; X64-NEXT: # kill: def $edx killed $edx def $rdx ; X64-NEXT: shll $16, %edx ; X64-NEXT: orl %ecx, %edx ; X64-NEXT: shlq $32, %rdx ; X64-NEXT: movl (%rdi), %ecx ; X64-NEXT: orq %rdx, %rcx ; X64-NEXT: shlq $13, %rax -; X64-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: orq %rax, %rdx -; X64-NEXT: movl %edx, (%rdi) -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: movw %dx, 4(%rdi) +; X64-NEXT: andq $-8193, %rcx # imm = 0xDFFF +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl %ecx, (%rdi) +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: movw %cx, 4(%rdi) ; X64-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, ptr %a, align 1 diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll --- a/llvm/test/CodeGen/X86/smul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll @@ -191,16 +191,17 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $188, %esp +; X86-NEXT: subl $184, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; 
X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: andl $1, %ebp -; X86-NEXT: negl %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: negl %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -208,8 +209,9 @@ ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %ecx @@ -229,18 +231,18 @@ ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp ; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %edx, %edi ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %edi @@ -269,139 +271,140 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %cl +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi ; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edi +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edi, %ebx ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) 
# 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movzbl %bl, %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -447,113 +450,118 @@ ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: addl %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl %ebp, %esi +; X86-NEXT: addl %ebp, %edi ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: setb %al -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movzbl %al, %esi -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: movzbl %al, %edi +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax ; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edx, %edi ; X86-NEXT: setb %cl -; X86-NEXT: addl %eax, %ebx +; X86-NEXT: addl %eax, %edi ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: addl %edi, %edx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %edi, %edx +; X86-NEXT: addl %ebp, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: adcl $0, %eax ; X86-NEXT: addl %esi, %edx ; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: setb %bl +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %eax, %edx -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %ebx, %eax ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ebx @@ -566,55 +574,54 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl %edi, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: setb %bl +; X86-NEXT: setb %al ; X86-NEXT: addl %esi, %edx -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %al -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: movzbl %al, %ebp +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl $0, %ebp ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl 
%esi, %edi +; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: setb %bl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull %eax, %edx @@ -628,36 +635,38 @@ ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull %eax, %edx +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: addl %esi, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: imull %ebx ; X86-NEXT: addl %eax, %eax ; X86-NEXT: adcl %edx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: addl %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %edi @@ -679,127 +688,127 @@ ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, %eax ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X86-NEXT: adcl %ebp, %ebx -; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl %ecx, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl %ebp, %esi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: adcl %edx, %esi +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %eax, %esi +; X86-NEXT: adcl %edx, %edi ; X86-NEXT: setb %al -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: imull %edx, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: imull %edx, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: imull %edx, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %edx, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: imull {{[0-9]+}}(%esp), %edx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl %eax, %ebp +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: addl %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %eax
+; X86-NEXT: adcl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %ebp
-; X86-NEXT: orl %eax, %ebp
-; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: orl %ebp, %esi
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: orl %eax, %ebx
; X86-NEXT: xorl %ecx, %edi
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: xorl %ecx, %ebx
-; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: xorl %ecx, %edx
; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl %edi, %edx
; X86-NEXT: andl $1, %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: negl %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: xorl %eax, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %ebp, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: xorl %eax, %ebx
; X86-NEXT: orl %esi, %ebx
@@ -817,7 +826,7 @@
; X86-NEXT: movl %ecx, 12(%eax)
; X86-NEXT: movb %dl, 16(%eax)
; X86-NEXT: setne 20(%eax)
-; X86-NEXT: addl $188, %esp
+; X86-NEXT: addl $184, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll b/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
--- a/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
@@ -12,14 +12,13 @@
define i64 @func() nounwind {
; X64-LABEL: func:
; X64: # %bb.0:
-; X64-NEXT: movl $2, %ecx
-; X64-NEXT: movl $3, %eax
-; X64-NEXT: imulq %rcx
-; X64-NEXT: cmpq $2, %rdx
+; X64-NEXT: movl $2, %eax
+; X64-NEXT: negq %rax
; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: movl $1, %ecx
; X64-NEXT: cmovgeq %rax, %rcx
-; X64-NEXT: cmpq $-2, %rdx
+; X64-NEXT: movq $-2, %rax
+; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
@@ -42,16 +41,15 @@
define i64 @func3() nounwind {
; X64-LABEL: func3:
; X64: # %bb.0:
-; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT: movl $2, %edx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: imulq %rdx
-; X64-NEXT: cmpq $2, %rdx
-; X64-NEXT: movabsq $4611686018427387903, %rsi # imm = 0x3FFFFFFFFFFFFFFF
-; X64-NEXT: cmovgeq %rcx, %rsi
-; X64-NEXT: cmpq $-2, %rdx
+; X64-NEXT: movl $2, %eax
+; X64-NEXT: negq %rax
+; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: movabsq $4611686018427387903, %rcx # imm = 0x3FFFFFFFFFFFFFFF
+; X64-NEXT: cmovgeq %rax, %rcx
+; X64-NEXT: movq $-2, %rax
+; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; X64-NEXT: cmovgeq %rsi, %rax
+; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
%tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 2)
ret i64 %tmp
@@ -60,16 +58,15 @@
define i64 @func4() nounwind {
; X64-LABEL: func4:
; X64: # %bb.0:
-; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT: movl $2, %edx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: imulq %rdx
-; X64-NEXT: cmpq $2147483647, %rdx # imm = 0x7FFFFFFF
-; X64-NEXT: movl $4294967295, %esi # imm = 0xFFFFFFFF
-; X64-NEXT: cmovgq %rcx, %rsi
-; X64-NEXT: cmpq $-2147483648, %rdx # imm = 0x80000000
+; X64-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: negq %rax
+; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT: cmovgq %rax, %rcx
+; X64-NEXT: movq $-2147483648, %rax # imm = 0x80000000
+; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; X64-NEXT: cmovgeq %rsi, %rax
+; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
%tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 32)
ret i64 %tmp
@@ -78,18 +75,15 @@
define i64 @func5() nounwind {
; X64-LABEL: func5:
; X64: # %bb.0:
-; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT: movl $2, %edx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: imulq %rdx
; X64-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF
-; X64-NEXT: cmpq %rax, %rdx
-; X64-NEXT: movl $1, %esi
-; X64-NEXT: cmovgq %rcx, %rsi
+; X64-NEXT: negq %rax
+; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: movl $1, %ecx
+; X64-NEXT: cmovgq %rax, %rcx
; X64-NEXT: movabsq $-4611686018427387904, %rax # imm = 0xC000000000000000
-; X64-NEXT: cmpq %rax, %rdx
+; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; X64-NEXT: cmovgeq %rsi, %rax
+; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
%tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 63)
ret i64 %tmp
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -558,12 +558,11 @@
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -572,7 +571,7 @@
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo:
@@ -648,19 +647,18 @@
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u>
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,268435456,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -1135,7 +1133,7 @@
; CHECK-SSE2-LABEL: test_srem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3067833783,u,1,u>
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,1,3067833783]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -1143,19 +1141,18 @@
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <2147483648,u,2,u>
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1
@@ -1379,12 +1376,11 @@
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1393,7 +1389,7 @@
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo:
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -163,19 +163,18 @@
; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -241,19 +240,18 @@
; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
@@ -479,21 +477,20 @@
; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
@@ -559,19 +556,18 @@
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -926,21 +922,20 @@
; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN:
@@ -1006,19 +1001,18 @@
; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -1167,21 +1161,20 @@
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
@@ -1842,21 +1835,20 @@
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
@@ -1921,21 +1913,20 @@
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: