diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -4377,18 +4377,45 @@ void DAGTypeLegalizer::ExpandIntRes_Rotate(SDNode *N, SDValue &Lo, SDValue &Hi) { - // Lower the rotate to shifts and ORs which can be expanded. - SDValue Res; - TLI.expandROT(N, true /*AllowVectorOps*/, Res, DAG); + // Delegate to funnel-shift expansion. + SDLoc DL(N); + unsigned Opcode = N->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR; + SDValue Res = DAG.getNode(Opcode, DL, N->getValueType(0), N->getOperand(0), + N->getOperand(0), N->getOperand(1)); SplitInteger(Res, Lo, Hi); } -void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, - SDValue &Lo, SDValue &Hi) { - // Lower the funnel shift to shifts and ORs which can be expanded. - SDValue Res; - TLI.expandFunnelShift(N, Res, DAG); - SplitInteger(Res, Lo, Hi); +void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, SDValue &Lo, + SDValue &Hi) { + // Values numbered from least significant to most significant. + SDValue In1, In2, In3, In4; + GetExpandedInteger(N->getOperand(0), In3, In4); + GetExpandedInteger(N->getOperand(1), In1, In2); + EVT HalfVT = In1.getValueType(); + + SDLoc DL(N); + unsigned Opc = N->getOpcode(); + SDValue ShAmt = N->getOperand(2); + EVT ShAmtVT = ShAmt.getValueType(); + EVT ShAmtCCVT = getSetCCResultType(ShAmtVT); + + // If the shift amount is at least half the bitwidth, swap the inputs. + unsigned HalfVTBits = HalfVT.getScalarSizeInBits(); + SDValue AndNode = DAG.getNode(ISD::AND, DL, ShAmtVT, ShAmt, + DAG.getConstant(HalfVTBits, DL, ShAmtVT)); + SDValue Cond = + DAG.getSetCC(DL, ShAmtCCVT, AndNode, DAG.getConstant(0, DL, ShAmtVT), + Opc == ISD::FSHL ? ISD::SETNE : ISD::SETEQ); + + // Expand to a pair of funnel shifts. 
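+  // E.g. splitting an i64 FSHL into i32 halves, with Op0 = In4:In3 and
+  // Op1 = In2:In1, the result is
+  //   (Z & 32) == 0:  Lo = fshl(In3, In2, Z), Hi = fshl(In4, In3, Z)
+  //   (Z & 32) != 0:  Lo = fshl(In2, In1, Z), Hi = fshl(In3, In2, Z)
+  // and FSHR is the same with the two cases swapped. The half-width funnel
+  // shift only uses the shift amount modulo 32, so one truncated amount
+  // serves both cases and only the operands need selecting.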
+ EVT NewShAmtVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout()); + SDValue NewShAmt = DAG.getAnyExtOrTrunc(ShAmt, DL, NewShAmtVT); + + SDValue Select1 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In1, In2); + SDValue Select2 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In2, In3); + SDValue Select3 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In3, In4); + Lo = DAG.getNode(Opc, DL, HalfVT, Select2, Select1, NewShAmt); + Hi = DAG.getNode(Opc, DL, HalfVT, Select3, Select2, NewShAmt); } void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo, diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -46,29 +46,19 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; CHECK-LABEL: fshl_i128: ; CHECK: // %bb.0: +; CHECK-NEXT: tst x4, #0x40 ; CHECK-NEXT: mvn w8, w4 -; CHECK-NEXT: extr x9, x3, x2, #1 -; CHECK-NEXT: lsr x10, x3, #1 -; CHECK-NEXT: and x12, x8, #0x7f -; CHECK-NEXT: lsl x11, x10, #1 -; CHECK-NEXT: tst x12, #0x40 -; CHECK-NEXT: lsl x11, x11, x4 +; CHECK-NEXT: csel x9, x2, x3, ne +; CHECK-NEXT: csel x10, x3, x0, ne +; CHECK-NEXT: lsr x9, x9, #1 +; CHECK-NEXT: lsl x11, x10, x4 +; CHECK-NEXT: csel x12, x0, x1, ne +; CHECK-NEXT: lsr x10, x10, #1 ; CHECK-NEXT: lsr x9, x9, x8 -; CHECK-NEXT: orr x9, x11, x9 -; CHECK-NEXT: lsr x11, x0, #1 -; CHECK-NEXT: lsr x10, x10, x8 -; CHECK-NEXT: lsl x12, x1, x4 -; CHECK-NEXT: lsr x8, x11, x8 -; CHECK-NEXT: and x11, x4, #0x7f -; CHECK-NEXT: csel x9, x10, x9, ne -; CHECK-NEXT: csel x10, xzr, x10, ne -; CHECK-NEXT: orr x8, x12, x8 -; CHECK-NEXT: lsl x12, x0, x4 -; CHECK-NEXT: tst x11, #0x40 -; CHECK-NEXT: csel x8, x12, x8, ne -; CHECK-NEXT: csel x11, xzr, x12, ne -; CHECK-NEXT: orr x1, x8, x10 +; CHECK-NEXT: lsl x12, x12, x4 +; CHECK-NEXT: lsr x8, x10, x8 ; CHECK-NEXT: orr x0, x11, x9 +; CHECK-NEXT: orr x1, x12, x8 ; CHECK-NEXT: ret %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) ret i128 %f diff --git a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll --- a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll @@ -67,61 +67,24 @@ } define i64 @rotl_i64(i64 %x, i64 %z) { -; SCALAR-LABEL: rotl_i64: -; SCALAR: @ %bb.0: -; SCALAR-NEXT: .save {r4, r5, r11, lr} -; SCALAR-NEXT: push {r4, r5, r11, lr} -; SCALAR-NEXT: rsb r3, r2, #0 -; SCALAR-NEXT: and r4, r2, #63 -; SCALAR-NEXT: and lr, r3, #63 -; SCALAR-NEXT: rsb r3, lr, #32 -; SCALAR-NEXT: lsl r2, r0, r4 -; SCALAR-NEXT: lsr r12, r0, lr -; SCALAR-NEXT: orr r3, r12, r1, lsl r3 -; SCALAR-NEXT: subs r12, lr, #32 -; SCALAR-NEXT: lsrpl r3, r1, r12 -; SCALAR-NEXT: subs r5, r4, #32 -; SCALAR-NEXT: movwpl r2, #0 -; SCALAR-NEXT: cmp r5, #0 -; SCALAR-NEXT: orr r2, r2, r3 -; SCALAR-NEXT: rsb r3, r4, #32 -; SCALAR-NEXT: lsr r3, r0, r3 -; SCALAR-NEXT: orr r3, r3, r1, lsl r4 -; SCALAR-NEXT: lslpl r3, r0, r5 -; SCALAR-NEXT: lsr r0, r1, lr -; SCALAR-NEXT: cmp r12, #0 -; SCALAR-NEXT: movwpl r0, #0 -; SCALAR-NEXT: orr r1, r3, r0 -; SCALAR-NEXT: mov r0, r2 -; SCALAR-NEXT: pop {r4, r5, r11, pc} -; -; NEON-LABEL: rotl_i64: -; NEON: @ %bb.0: -; NEON-NEXT: .save {r4, r5, r11, lr} -; NEON-NEXT: push {r4, r5, r11, lr} -; NEON-NEXT: and r12, r2, #63 -; NEON-NEXT: rsb r2, r2, #0 -; NEON-NEXT: rsb r3, r12, #32 -; NEON-NEXT: and r4, r2, #63 -; NEON-NEXT: subs lr, r12, #32 -; NEON-NEXT: lsr r3, r0, r3 -; NEON-NEXT: lsr r2, r1, r4 -; NEON-NEXT: orr r3, r3, r1, lsl r12 -; NEON-NEXT: lslpl r3, r0, lr 
-; NEON-NEXT: subs r5, r4, #32 -; NEON-NEXT: movwpl r2, #0 -; NEON-NEXT: cmp r5, #0 -; NEON-NEXT: orr r2, r3, r2 -; NEON-NEXT: lsr r3, r0, r4 -; NEON-NEXT: rsb r4, r4, #32 -; NEON-NEXT: lsl r0, r0, r12 -; NEON-NEXT: orr r3, r3, r1, lsl r4 -; NEON-NEXT: lsrpl r3, r1, r5 -; NEON-NEXT: cmp lr, #0 -; NEON-NEXT: movwpl r0, #0 -; NEON-NEXT: mov r1, r2 -; NEON-NEXT: orr r0, r0, r3 -; NEON-NEXT: pop {r4, r5, r11, pc} +; CHECK-LABEL: rotl_i64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: ands r3, r2, #32 +; CHECK-NEXT: and r12, r2, #31 +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: mov r4, #31 +; CHECK-NEXT: movne r3, r1 +; CHECK-NEXT: movne r1, r0 +; CHECK-NEXT: bic r2, r4, r2 +; CHECK-NEXT: lsl lr, r3, r12 +; CHECK-NEXT: lsr r0, r1, #1 +; CHECK-NEXT: lsl r1, r1, r12 +; CHECK-NEXT: lsr r3, r3, #1 +; CHECK-NEXT: orr r0, lr, r0, lsr r2 +; CHECK-NEXT: orr r1, r1, r3, lsr r2 +; CHECK-NEXT: pop {r4, pc} %f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %z) ret i64 %f } @@ -243,31 +206,21 @@ define i64 @rotr_i64(i64 %x, i64 %z) { ; CHECK-LABEL: rotr_i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r11, lr} -; CHECK-NEXT: push {r4, r5, r11, lr} -; CHECK-NEXT: and lr, r2, #63 -; CHECK-NEXT: rsb r2, r2, #0 -; CHECK-NEXT: rsb r3, lr, #32 -; CHECK-NEXT: and r4, r2, #63 -; CHECK-NEXT: lsr r12, r0, lr -; CHECK-NEXT: orr r3, r12, r1, lsl r3 -; CHECK-NEXT: subs r12, lr, #32 -; CHECK-NEXT: lsl r2, r0, r4 -; CHECK-NEXT: lsrpl r3, r1, r12 -; CHECK-NEXT: subs r5, r4, #32 -; CHECK-NEXT: movwpl r2, #0 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: orr r2, r3, r2 -; CHECK-NEXT: rsb r3, r4, #32 -; CHECK-NEXT: lsr r3, r0, r3 -; CHECK-NEXT: orr r3, r3, r1, lsl r4 -; CHECK-NEXT: lslpl r3, r0, r5 -; CHECK-NEXT: lsr r0, r1, lr -; CHECK-NEXT: cmp r12, #0 -; CHECK-NEXT: movwpl r0, #0 -; CHECK-NEXT: orr r1, r0, r3 -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop {r4, r5, r11, pc} +; CHECK-NEXT: ands r3, r2, #32 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: moveq r3, r0 +; CHECK-NEXT: moveq r0, r1 +; CHECK-NEXT: mov r1, #31 +; CHECK-NEXT: lsl r12, r0, #1 +; CHECK-NEXT: bic r1, r1, r2 +; CHECK-NEXT: and r2, r2, #31 +; CHECK-NEXT: lsl r12, r12, r1 +; CHECK-NEXT: orr r12, r12, r3, lsr r2 +; CHECK-NEXT: lsl r3, r3, #1 +; CHECK-NEXT: lsl r1, r3, r1 +; CHECK-NEXT: orr r1, r1, r0, lsr r2 +; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: bx lr %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z) ret i64 %f } diff --git a/llvm/test/CodeGen/ARM/funnel-shift.ll b/llvm/test/CodeGen/ARM/funnel-shift.ll --- a/llvm/test/CodeGen/ARM/funnel-shift.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift.ll @@ -45,46 +45,69 @@ ; Verify that weird types are minimally supported. 
declare i37 @llvm.fshl.i37(i37, i37, i37) define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) { -; CHECK-LABEL: fshl_i37: -; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: mov r8, r1 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: ldr r0, [sp, #24] -; CHECK-NEXT: mov r6, r3 -; CHECK-NEXT: ldr r1, [sp, #28] -; CHECK-NEXT: mov r7, r2 -; CHECK-NEXT: mov r2, #37 -; CHECK-NEXT: mov r3, #0 -; CHECK-NEXT: bl __aeabi_uldivmod -; CHECK-NEXT: mov r0, #63 -; CHECK-NEXT: bic r1, r0, r2 -; CHECK-NEXT: lsl r0, r6, #27 -; CHECK-NEXT: lsl r3, r7, #27 -; CHECK-NEXT: orr r0, r0, r7, lsr #5 -; CHECK-NEXT: and r2, r2, #63 -; CHECK-NEXT: lsrs r7, r0, #1 -; CHECK-NEXT: rrx r0, r3 -; CHECK-NEXT: rsb r3, r1, #32 -; CHECK-NEXT: lsr r0, r0, r1 -; CHECK-NEXT: lsl r6, r4, r2 -; CHECK-NEXT: orr r0, r0, r7, lsl r3 -; CHECK-NEXT: subs r3, r1, #32 -; CHECK-NEXT: lsr r1, r7, r1 -; CHECK-NEXT: lsrpl r0, r7, r3 -; CHECK-NEXT: subs r5, r2, #32 -; CHECK-NEXT: movwpl r6, #0 -; CHECK-NEXT: orr r0, r6, r0 -; CHECK-NEXT: rsb r6, r2, #32 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: lsr r6, r4, r6 -; CHECK-NEXT: orr r2, r6, r8, lsl r2 -; CHECK-NEXT: lslpl r2, r4, r5 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: movwpl r1, #0 -; CHECK-NEXT: orr r1, r2, r1 -; CHECK-NEXT: pop {r4, r5, r6, r7, r8, pc} +; SCALAR-LABEL: fshl_i37: +; SCALAR: @ %bb.0: +; SCALAR-NEXT: .save {r4, r5, r6, r7, r8, lr} +; SCALAR-NEXT: push {r4, r5, r6, r7, r8, lr} +; SCALAR-NEXT: mov r4, r1 +; SCALAR-NEXT: mov r8, r0 +; SCALAR-NEXT: ldr r0, [sp, #24] +; SCALAR-NEXT: mov r5, r3 +; SCALAR-NEXT: ldr r1, [sp, #28] +; SCALAR-NEXT: mov r6, r2 +; SCALAR-NEXT: mov r2, #37 +; SCALAR-NEXT: mov r3, #0 +; SCALAR-NEXT: bl __aeabi_uldivmod +; SCALAR-NEXT: lsl r1, r5, #27 +; SCALAR-NEXT: ands r12, r2, #32 +; SCALAR-NEXT: orr r1, r1, r6, lsr #5 +; SCALAR-NEXT: mov r3, r8 +; SCALAR-NEXT: and r5, r2, #31 +; SCALAR-NEXT: mov r0, #31 +; SCALAR-NEXT: movne r3, r1 +; SCALAR-NEXT: cmp r12, #0 +; SCALAR-NEXT: bic r2, r0, r2 +; SCALAR-NEXT: lslne r1, r6, #27 +; SCALAR-NEXT: movne r4, r8 +; SCALAR-NEXT: lsl r7, r3, r5 +; SCALAR-NEXT: lsr r0, r1, #1 +; SCALAR-NEXT: lsl r1, r4, r5 +; SCALAR-NEXT: lsr r3, r3, #1 +; SCALAR-NEXT: orr r0, r7, r0, lsr r2 +; SCALAR-NEXT: orr r1, r1, r3, lsr r2 +; SCALAR-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; NEON-LABEL: fshl_i37: +; NEON: @ %bb.0: +; NEON-NEXT: .save {r4, r5, r6, r7, r11, lr} +; NEON-NEXT: push {r4, r5, r6, r7, r11, lr} +; NEON-NEXT: mov r4, r1 +; NEON-NEXT: mov r5, r0 +; NEON-NEXT: ldr r0, [sp, #24] +; NEON-NEXT: mov r7, r3 +; NEON-NEXT: ldr r1, [sp, #28] +; NEON-NEXT: mov r6, r2 +; NEON-NEXT: mov r2, #37 +; NEON-NEXT: mov r3, #0 +; NEON-NEXT: bl __aeabi_uldivmod +; NEON-NEXT: mov r0, #31 +; NEON-NEXT: bic r1, r0, r2 +; NEON-NEXT: lsl r0, r7, #27 +; NEON-NEXT: ands r12, r2, #32 +; NEON-NEXT: orr r0, r0, r6, lsr #5 +; NEON-NEXT: mov r7, r5 +; NEON-NEXT: and r2, r2, #31 +; NEON-NEXT: movne r7, r0 +; NEON-NEXT: lslne r0, r6, #27 +; NEON-NEXT: cmp r12, #0 +; NEON-NEXT: lsl r3, r7, r2 +; NEON-NEXT: lsr r0, r0, #1 +; NEON-NEXT: movne r4, r5 +; NEON-NEXT: orr r0, r3, r0, lsr r1 +; NEON-NEXT: lsr r3, r7, #1 +; NEON-NEXT: lsl r2, r4, r2 +; NEON-NEXT: orr r1, r2, r3, lsr r1 +; NEON-NEXT: pop {r4, r5, r6, r7, r11, pc} %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z) ret i37 %f } @@ -157,8 +180,8 @@ define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) { ; CHECK-LABEL: fshl_i64_const_overshift: ; CHECK: @ %bb.0: -; CHECK-NEXT: lsr r1, r2, #23 -; CHECK-NEXT: orr r2, r1, r3, lsl #9 +; CHECK-NEXT: lsl 
r1, r3, #9 +; CHECK-NEXT: orr r2, r1, r2, lsr #23 ; CHECK-NEXT: lsl r0, r0, #9 ; CHECK-NEXT: orr r1, r0, r3, lsr #23 ; CHECK-NEXT: mov r0, r2 @@ -212,46 +235,36 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-LABEL: fshr_i37: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-NEXT: mov r8, r1 -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: ldr r0, [sp, #32] -; CHECK-NEXT: mov r6, r3 -; CHECK-NEXT: ldr r1, [sp, #36] +; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: ldr r0, [sp, #24] +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: ldr r1, [sp, #28] ; CHECK-NEXT: mov r7, r2 ; CHECK-NEXT: mov r2, #37 ; CHECK-NEXT: mov r3, #0 ; CHECK-NEXT: bl __aeabi_uldivmod +; CHECK-NEXT: lsl r3, r5, #27 ; CHECK-NEXT: add r0, r2, #27 -; CHECK-NEXT: lsl r6, r6, #27 -; CHECK-NEXT: and r1, r0, #63 -; CHECK-NEXT: lsl r2, r7, #27 -; CHECK-NEXT: orr r7, r6, r7, lsr #5 -; CHECK-NEXT: mov r6, #63 -; CHECK-NEXT: rsb r3, r1, #32 -; CHECK-NEXT: lsr r2, r2, r1 -; CHECK-NEXT: subs r12, r1, #32 -; CHECK-NEXT: bic r6, r6, r0 -; CHECK-NEXT: orr r2, r2, r7, lsl r3 -; CHECK-NEXT: lsl r5, r9, #1 -; CHECK-NEXT: lsrpl r2, r7, r12 -; CHECK-NEXT: lsl r0, r5, r6 -; CHECK-NEXT: subs r4, r6, #32 -; CHECK-NEXT: lsl r3, r8, #1 -; CHECK-NEXT: movwpl r0, #0 -; CHECK-NEXT: orr r3, r3, r9, lsr #31 -; CHECK-NEXT: orr r0, r0, r2 -; CHECK-NEXT: rsb r2, r6, #32 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: lsr r1, r7, r1 -; CHECK-NEXT: lsr r2, r5, r2 -; CHECK-NEXT: orr r2, r2, r3, lsl r6 -; CHECK-NEXT: lslpl r2, r5, r4 +; CHECK-NEXT: orr r3, r3, r7, lsr #5 +; CHECK-NEXT: mov r1, #31 +; CHECK-NEXT: ands r12, r0, #32 +; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: moveq r5, r3 +; CHECK-NEXT: bic r1, r1, r0 +; CHECK-NEXT: lsl r2, r5, #1 +; CHECK-NEXT: lsleq r3, r7, #27 ; CHECK-NEXT: cmp r12, #0 -; CHECK-NEXT: movwpl r1, #0 -; CHECK-NEXT: orr r1, r2, r1 -; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-NEXT: and r7, r0, #31 +; CHECK-NEXT: lsl r2, r2, r1 +; CHECK-NEXT: moveq r4, r6 +; CHECK-NEXT: orr r0, r2, r3, lsr r7 +; CHECK-NEXT: lsl r2, r4, #1 +; CHECK-NEXT: lsl r1, r2, r1 +; CHECK-NEXT: orr r1, r1, r5, lsr r7 +; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc} %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z) ret i37 %f } diff --git a/llvm/test/CodeGen/Mips/funnel-shift-rot.ll b/llvm/test/CodeGen/Mips/funnel-shift-rot.ll --- a/llvm/test/CodeGen/Mips/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/Mips/funnel-shift-rot.ll @@ -76,59 +76,43 @@ define i64 @rotl_i64(i64 %x, i64 %z) { ; CHECK-BE-LABEL: rotl_i64: ; CHECK-BE: # %bb.0: -; CHECK-BE-NEXT: negu $1, $7 -; CHECK-BE-NEXT: andi $3, $1, 63 -; CHECK-BE-NEXT: srlv $6, $4, $3 -; CHECK-BE-NEXT: andi $1, $1, 32 -; CHECK-BE-NEXT: andi $2, $7, 63 -; CHECK-BE-NEXT: move $8, $6 -; CHECK-BE-NEXT: movn $8, $zero, $1 -; CHECK-BE-NEXT: sllv $9, $4, $2 -; CHECK-BE-NEXT: srl $10, $5, 1 -; CHECK-BE-NEXT: not $11, $2 -; CHECK-BE-NEXT: srlv $10, $10, $11 -; CHECK-BE-NEXT: or $9, $9, $10 -; CHECK-BE-NEXT: sllv $10, $5, $2 -; CHECK-BE-NEXT: andi $7, $7, 32 -; CHECK-BE-NEXT: movn $9, $10, $7 -; CHECK-BE-NEXT: or $2, $9, $8 -; CHECK-BE-NEXT: srlv $5, $5, $3 -; CHECK-BE-NEXT: not $3, $3 -; CHECK-BE-NEXT: sll $4, $4, 1 -; CHECK-BE-NEXT: sllv $3, $4, $3 -; CHECK-BE-NEXT: or $3, $3, $5 -; CHECK-BE-NEXT: movn $3, $6, $1 -; CHECK-BE-NEXT: movn $10, $zero, $7 +; CHECK-BE-NEXT: srl $1, $7, 5 +; CHECK-BE-NEXT: andi $1, $1, 1 +; CHECK-BE-NEXT: 
move $3, $4 +; CHECK-BE-NEXT: movn $3, $5, $1 +; CHECK-BE-NEXT: andi $6, $7, 31 +; CHECK-BE-NEXT: sllv $2, $3, $6 +; CHECK-BE-NEXT: movn $5, $4, $1 +; CHECK-BE-NEXT: srl $1, $5, 1 +; CHECK-BE-NEXT: not $4, $7 +; CHECK-BE-NEXT: andi $4, $4, 31 +; CHECK-BE-NEXT: srlv $1, $1, $4 +; CHECK-BE-NEXT: or $2, $2, $1 +; CHECK-BE-NEXT: sllv $1, $5, $6 +; CHECK-BE-NEXT: srl $3, $3, 1 +; CHECK-BE-NEXT: srlv $3, $3, $4 ; CHECK-BE-NEXT: jr $ra -; CHECK-BE-NEXT: or $3, $10, $3 +; CHECK-BE-NEXT: or $3, $1, $3 ; ; CHECK-LE-LABEL: rotl_i64: ; CHECK-LE: # %bb.0: -; CHECK-LE-NEXT: negu $1, $6 -; CHECK-LE-NEXT: andi $2, $1, 63 -; CHECK-LE-NEXT: srlv $7, $5, $2 -; CHECK-LE-NEXT: andi $1, $1, 32 -; CHECK-LE-NEXT: andi $3, $6, 63 -; CHECK-LE-NEXT: move $8, $7 -; CHECK-LE-NEXT: movn $8, $zero, $1 -; CHECK-LE-NEXT: sllv $9, $5, $3 -; CHECK-LE-NEXT: srl $10, $4, 1 -; CHECK-LE-NEXT: not $11, $3 -; CHECK-LE-NEXT: srlv $10, $10, $11 -; CHECK-LE-NEXT: or $9, $9, $10 -; CHECK-LE-NEXT: sllv $10, $4, $3 -; CHECK-LE-NEXT: andi $6, $6, 32 -; CHECK-LE-NEXT: movn $9, $10, $6 -; CHECK-LE-NEXT: or $3, $9, $8 -; CHECK-LE-NEXT: srlv $4, $4, $2 -; CHECK-LE-NEXT: not $2, $2 -; CHECK-LE-NEXT: sll $5, $5, 1 -; CHECK-LE-NEXT: sllv $2, $5, $2 -; CHECK-LE-NEXT: or $2, $2, $4 -; CHECK-LE-NEXT: movn $2, $7, $1 -; CHECK-LE-NEXT: movn $10, $zero, $6 +; CHECK-LE-NEXT: srl $1, $6, 5 +; CHECK-LE-NEXT: andi $1, $1, 1 +; CHECK-LE-NEXT: move $3, $4 +; CHECK-LE-NEXT: movn $3, $5, $1 +; CHECK-LE-NEXT: andi $7, $6, 31 +; CHECK-LE-NEXT: sllv $2, $3, $7 +; CHECK-LE-NEXT: movn $5, $4, $1 +; CHECK-LE-NEXT: srl $1, $5, 1 +; CHECK-LE-NEXT: not $4, $6 +; CHECK-LE-NEXT: andi $4, $4, 31 +; CHECK-LE-NEXT: srlv $1, $1, $4 +; CHECK-LE-NEXT: or $2, $2, $1 +; CHECK-LE-NEXT: sllv $1, $5, $7 +; CHECK-LE-NEXT: srl $3, $3, 1 +; CHECK-LE-NEXT: srlv $3, $3, $4 ; CHECK-LE-NEXT: jr $ra -; CHECK-LE-NEXT: or $2, $10, $2 +; CHECK-LE-NEXT: or $3, $1, $3 %f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %z) ret i64 %f } @@ -254,59 +238,41 @@ define i64 @rotr_i64(i64 %x, i64 %z) { ; CHECK-BE-LABEL: rotr_i64: ; CHECK-BE: # %bb.0: -; CHECK-BE-NEXT: negu $1, $7 -; CHECK-BE-NEXT: andi $2, $1, 63 -; CHECK-BE-NEXT: sllv $6, $5, $2 -; CHECK-BE-NEXT: andi $1, $1, 32 -; CHECK-BE-NEXT: andi $3, $7, 63 -; CHECK-BE-NEXT: move $8, $6 -; CHECK-BE-NEXT: movn $8, $zero, $1 -; CHECK-BE-NEXT: srlv $9, $5, $3 -; CHECK-BE-NEXT: sll $10, $4, 1 -; CHECK-BE-NEXT: not $11, $3 -; CHECK-BE-NEXT: sllv $10, $10, $11 -; CHECK-BE-NEXT: or $9, $10, $9 -; CHECK-BE-NEXT: srlv $10, $4, $3 -; CHECK-BE-NEXT: andi $7, $7, 32 -; CHECK-BE-NEXT: movn $9, $10, $7 -; CHECK-BE-NEXT: or $3, $9, $8 -; CHECK-BE-NEXT: sllv $4, $4, $2 -; CHECK-BE-NEXT: not $2, $2 -; CHECK-BE-NEXT: srl $5, $5, 1 -; CHECK-BE-NEXT: srlv $2, $5, $2 -; CHECK-BE-NEXT: or $2, $4, $2 -; CHECK-BE-NEXT: movn $2, $6, $1 -; CHECK-BE-NEXT: movn $10, $zero, $7 +; CHECK-BE-NEXT: andi $1, $7, 32 +; CHECK-BE-NEXT: move $3, $5 +; CHECK-BE-NEXT: movz $3, $4, $1 +; CHECK-BE-NEXT: andi $6, $7, 31 +; CHECK-BE-NEXT: srlv $2, $3, $6 +; CHECK-BE-NEXT: movz $4, $5, $1 +; CHECK-BE-NEXT: sll $1, $4, 1 +; CHECK-BE-NEXT: not $5, $7 +; CHECK-BE-NEXT: andi $5, $5, 31 +; CHECK-BE-NEXT: sllv $1, $1, $5 +; CHECK-BE-NEXT: or $2, $1, $2 +; CHECK-BE-NEXT: srlv $1, $4, $6 +; CHECK-BE-NEXT: sll $3, $3, 1 +; CHECK-BE-NEXT: sllv $3, $3, $5 ; CHECK-BE-NEXT: jr $ra -; CHECK-BE-NEXT: or $2, $10, $2 +; CHECK-BE-NEXT: or $3, $3, $1 ; ; CHECK-LE-LABEL: rotr_i64: ; CHECK-LE: # %bb.0: -; CHECK-LE-NEXT: negu $1, $6 -; CHECK-LE-NEXT: andi $3, $1, 63 -; CHECK-LE-NEXT: sllv $7, $4, $3 -; 
CHECK-LE-NEXT: andi $1, $1, 32 -; CHECK-LE-NEXT: andi $2, $6, 63 -; CHECK-LE-NEXT: move $8, $7 -; CHECK-LE-NEXT: movn $8, $zero, $1 -; CHECK-LE-NEXT: srlv $9, $4, $2 -; CHECK-LE-NEXT: sll $10, $5, 1 -; CHECK-LE-NEXT: not $11, $2 -; CHECK-LE-NEXT: sllv $10, $10, $11 -; CHECK-LE-NEXT: or $9, $10, $9 -; CHECK-LE-NEXT: srlv $10, $5, $2 -; CHECK-LE-NEXT: andi $6, $6, 32 -; CHECK-LE-NEXT: movn $9, $10, $6 -; CHECK-LE-NEXT: or $2, $9, $8 -; CHECK-LE-NEXT: sllv $5, $5, $3 -; CHECK-LE-NEXT: not $3, $3 -; CHECK-LE-NEXT: srl $4, $4, 1 -; CHECK-LE-NEXT: srlv $3, $4, $3 -; CHECK-LE-NEXT: or $3, $5, $3 -; CHECK-LE-NEXT: movn $3, $7, $1 -; CHECK-LE-NEXT: movn $10, $zero, $6 +; CHECK-LE-NEXT: andi $1, $6, 32 +; CHECK-LE-NEXT: move $3, $5 +; CHECK-LE-NEXT: movz $3, $4, $1 +; CHECK-LE-NEXT: andi $7, $6, 31 +; CHECK-LE-NEXT: srlv $2, $3, $7 +; CHECK-LE-NEXT: movz $4, $5, $1 +; CHECK-LE-NEXT: sll $1, $4, 1 +; CHECK-LE-NEXT: not $5, $6 +; CHECK-LE-NEXT: andi $5, $5, 31 +; CHECK-LE-NEXT: sllv $1, $1, $5 +; CHECK-LE-NEXT: or $2, $1, $2 +; CHECK-LE-NEXT: srlv $1, $4, $7 +; CHECK-LE-NEXT: sll $3, $3, 1 +; CHECK-LE-NEXT: sllv $3, $3, $5 ; CHECK-LE-NEXT: jr $ra -; CHECK-LE-NEXT: or $3, $10, $3 +; CHECK-LE-NEXT: or $3, $3, $1 %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z) ret i64 %f } diff --git a/llvm/test/CodeGen/Mips/funnel-shift.ll b/llvm/test/CodeGen/Mips/funnel-shift.ll --- a/llvm/test/CodeGen/Mips/funnel-shift.ll +++ b/llvm/test/CodeGen/Mips/funnel-shift.ll @@ -72,37 +72,25 @@ ; CHECK-BE-NEXT: jal __umoddi3 ; CHECK-BE-NEXT: addiu $7, $zero, 37 ; CHECK-BE-NEXT: not $1, $3 -; CHECK-BE-NEXT: andi $2, $3, 63 -; CHECK-BE-NEXT: not $4, $2 -; CHECK-BE-NEXT: srl $5, $18, 1 -; CHECK-BE-NEXT: sllv $6, $19, $2 -; CHECK-BE-NEXT: srlv $4, $5, $4 -; CHECK-BE-NEXT: andi $5, $1, 63 -; CHECK-BE-NEXT: srl $7, $16, 5 -; CHECK-BE-NEXT: sll $8, $17, 27 -; CHECK-BE-NEXT: or $7, $8, $7 -; CHECK-BE-NEXT: srl $8, $7, 1 -; CHECK-BE-NEXT: srlv $9, $8, $5 -; CHECK-BE-NEXT: andi $1, $1, 32 -; CHECK-BE-NEXT: move $10, $9 -; CHECK-BE-NEXT: movn $10, $zero, $1 -; CHECK-BE-NEXT: or $4, $6, $4 -; CHECK-BE-NEXT: sllv $6, $18, $2 -; CHECK-BE-NEXT: andi $3, $3, 32 -; CHECK-BE-NEXT: movn $4, $6, $3 -; CHECK-BE-NEXT: sll $7, $7, 31 -; CHECK-BE-NEXT: sll $2, $16, 27 -; CHECK-BE-NEXT: srl $11, $2, 1 -; CHECK-BE-NEXT: or $2, $4, $10 -; CHECK-BE-NEXT: movn $6, $zero, $3 -; CHECK-BE-NEXT: or $3, $11, $7 -; CHECK-BE-NEXT: srlv $3, $3, $5 -; CHECK-BE-NEXT: not $4, $5 -; CHECK-BE-NEXT: sll $5, $8, 1 -; CHECK-BE-NEXT: sllv $4, $5, $4 -; CHECK-BE-NEXT: or $3, $4, $3 -; CHECK-BE-NEXT: movn $3, $9, $1 -; CHECK-BE-NEXT: or $3, $6, $3 +; CHECK-BE-NEXT: srl $2, $3, 5 +; CHECK-BE-NEXT: andi $4, $2, 1 +; CHECK-BE-NEXT: movn $19, $18, $4 +; CHECK-BE-NEXT: andi $3, $3, 31 +; CHECK-BE-NEXT: sllv $2, $19, $3 +; CHECK-BE-NEXT: andi $1, $1, 31 +; CHECK-BE-NEXT: srl $5, $16, 5 +; CHECK-BE-NEXT: sll $6, $17, 27 +; CHECK-BE-NEXT: or $5, $6, $5 +; CHECK-BE-NEXT: movn $18, $5, $4 +; CHECK-BE-NEXT: srl $6, $18, 1 +; CHECK-BE-NEXT: srlv $6, $6, $1 +; CHECK-BE-NEXT: or $2, $2, $6 +; CHECK-BE-NEXT: sllv $3, $18, $3 +; CHECK-BE-NEXT: sll $6, $16, 27 +; CHECK-BE-NEXT: movn $5, $6, $4 +; CHECK-BE-NEXT: srl $4, $5, 1 +; CHECK-BE-NEXT: srlv $1, $4, $1 +; CHECK-BE-NEXT: or $3, $3, $1 ; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload ; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload @@ -134,38 +122,27 @@ ; CHECK-LE-NEXT: addiu $6, $zero, 37 ; CHECK-LE-NEXT: jal __umoddi3 ; CHECK-LE-NEXT: addiu $7, $zero, 0 -; 
CHECK-LE-NEXT: not $1, $2 -; CHECK-LE-NEXT: andi $3, $2, 63 -; CHECK-LE-NEXT: not $4, $3 -; CHECK-LE-NEXT: srl $5, $19, 1 -; CHECK-LE-NEXT: sllv $6, $18, $3 -; CHECK-LE-NEXT: srlv $4, $5, $4 -; CHECK-LE-NEXT: andi $5, $1, 63 -; CHECK-LE-NEXT: srl $7, $17, 5 -; CHECK-LE-NEXT: sll $8, $16, 27 -; CHECK-LE-NEXT: or $7, $8, $7 -; CHECK-LE-NEXT: srl $8, $7, 1 -; CHECK-LE-NEXT: srlv $9, $8, $5 -; CHECK-LE-NEXT: andi $1, $1, 32 -; CHECK-LE-NEXT: move $10, $9 -; CHECK-LE-NEXT: movn $10, $zero, $1 -; CHECK-LE-NEXT: or $4, $6, $4 -; CHECK-LE-NEXT: sllv $6, $19, $3 -; CHECK-LE-NEXT: andi $2, $2, 32 -; CHECK-LE-NEXT: movn $4, $6, $2 -; CHECK-LE-NEXT: sll $7, $7, 31 -; CHECK-LE-NEXT: sll $3, $17, 27 -; CHECK-LE-NEXT: srl $11, $3, 1 -; CHECK-LE-NEXT: or $3, $4, $10 -; CHECK-LE-NEXT: movn $6, $zero, $2 -; CHECK-LE-NEXT: or $2, $11, $7 -; CHECK-LE-NEXT: srlv $2, $2, $5 -; CHECK-LE-NEXT: not $4, $5 -; CHECK-LE-NEXT: sll $5, $8, 1 -; CHECK-LE-NEXT: sllv $4, $5, $4 -; CHECK-LE-NEXT: or $2, $4, $2 -; CHECK-LE-NEXT: movn $2, $9, $1 +; CHECK-LE-NEXT: srl $1, $2, 5 +; CHECK-LE-NEXT: andi $1, $1, 1 +; CHECK-LE-NEXT: srl $3, $17, 5 +; CHECK-LE-NEXT: sll $4, $16, 27 +; CHECK-LE-NEXT: or $3, $4, $3 +; CHECK-LE-NEXT: move $4, $19 +; CHECK-LE-NEXT: movn $4, $3, $1 +; CHECK-LE-NEXT: andi $5, $2, 31 +; CHECK-LE-NEXT: sllv $6, $4, $5 +; CHECK-LE-NEXT: not $2, $2 +; CHECK-LE-NEXT: andi $7, $2, 31 +; CHECK-LE-NEXT: sll $2, $17, 27 +; CHECK-LE-NEXT: movn $3, $2, $1 +; CHECK-LE-NEXT: srl $2, $3, 1 +; CHECK-LE-NEXT: srlv $2, $2, $7 ; CHECK-LE-NEXT: or $2, $6, $2 +; CHECK-LE-NEXT: movn $18, $19, $1 +; CHECK-LE-NEXT: sllv $1, $18, $5 +; CHECK-LE-NEXT: srl $3, $4, 1 +; CHECK-LE-NEXT: srlv $3, $3, $7 +; CHECK-LE-NEXT: or $3, $1, $3 ; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload ; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload @@ -250,15 +227,15 @@ ; CHECK-BE-NEXT: srl $1, $6, 23 ; CHECK-BE-NEXT: sll $2, $5, 9 ; CHECK-BE-NEXT: or $2, $2, $1 -; CHECK-BE-NEXT: sll $1, $6, 9 -; CHECK-BE-NEXT: srl $3, $7, 23 +; CHECK-BE-NEXT: srl $1, $7, 23 +; CHECK-BE-NEXT: sll $3, $6, 9 ; CHECK-BE-NEXT: jr $ra ; CHECK-BE-NEXT: or $3, $3, $1 ; ; CHECK-LE-LABEL: fshl_i64_const_overshift: ; CHECK-LE: # %bb.0: -; CHECK-LE-NEXT: sll $1, $7, 9 -; CHECK-LE-NEXT: srl $2, $6, 23 +; CHECK-LE-NEXT: srl $1, $6, 23 +; CHECK-LE-NEXT: sll $2, $7, 9 ; CHECK-LE-NEXT: or $2, $2, $1 ; CHECK-LE-NEXT: srl $1, $7, 23 ; CHECK-LE-NEXT: sll $3, $4, 9 @@ -338,40 +315,25 @@ ; CHECK-BE-NEXT: jal __umoddi3 ; CHECK-BE-NEXT: addiu $7, $zero, 37 ; CHECK-BE-NEXT: addiu $1, $3, 27 -; CHECK-BE-NEXT: andi $2, $1, 63 -; CHECK-BE-NEXT: not $3, $2 -; CHECK-BE-NEXT: srl $4, $16, 5 -; CHECK-BE-NEXT: sll $5, $17, 27 -; CHECK-BE-NEXT: or $4, $5, $4 -; CHECK-BE-NEXT: sll $5, $4, 1 -; CHECK-BE-NEXT: sll $6, $16, 27 -; CHECK-BE-NEXT: srlv $6, $6, $2 -; CHECK-BE-NEXT: sllv $3, $5, $3 -; CHECK-BE-NEXT: not $5, $1 -; CHECK-BE-NEXT: andi $7, $5, 63 -; CHECK-BE-NEXT: sll $8, $18, 1 -; CHECK-BE-NEXT: sllv $8, $8, $7 -; CHECK-BE-NEXT: andi $5, $5, 32 -; CHECK-BE-NEXT: move $9, $8 -; CHECK-BE-NEXT: movn $9, $zero, $5 -; CHECK-BE-NEXT: or $3, $3, $6 -; CHECK-BE-NEXT: srlv $2, $4, $2 -; CHECK-BE-NEXT: andi $1, $1, 32 -; CHECK-BE-NEXT: movn $3, $2, $1 -; CHECK-BE-NEXT: srl $4, $18, 31 +; CHECK-BE-NEXT: andi $3, $1, 32 +; CHECK-BE-NEXT: srl $2, $16, 5 +; CHECK-BE-NEXT: sll $4, $17, 27 +; CHECK-BE-NEXT: or $4, $4, $2 +; CHECK-BE-NEXT: movz $19, $18, $3 +; CHECK-BE-NEXT: movz $18, $4, $3 +; CHECK-BE-NEXT: andi $5, $1, 31 +; CHECK-BE-NEXT: 
srlv $2, $18, $5 +; CHECK-BE-NEXT: not $1, $1 +; CHECK-BE-NEXT: andi $1, $1, 31 ; CHECK-BE-NEXT: sll $6, $19, 1 -; CHECK-BE-NEXT: or $4, $6, $4 -; CHECK-BE-NEXT: or $3, $9, $3 -; CHECK-BE-NEXT: movn $2, $zero, $1 -; CHECK-BE-NEXT: sllv $1, $4, $7 -; CHECK-BE-NEXT: not $4, $7 -; CHECK-BE-NEXT: lui $6, 32767 -; CHECK-BE-NEXT: ori $6, $6, 65535 -; CHECK-BE-NEXT: and $6, $18, $6 -; CHECK-BE-NEXT: srlv $4, $6, $4 -; CHECK-BE-NEXT: or $1, $1, $4 -; CHECK-BE-NEXT: movn $1, $8, $5 -; CHECK-BE-NEXT: or $2, $1, $2 +; CHECK-BE-NEXT: sllv $6, $6, $1 +; CHECK-BE-NEXT: or $2, $6, $2 +; CHECK-BE-NEXT: sll $6, $16, 27 +; CHECK-BE-NEXT: movz $4, $6, $3 +; CHECK-BE-NEXT: srlv $3, $4, $5 +; CHECK-BE-NEXT: sll $4, $18, 1 +; CHECK-BE-NEXT: sllv $1, $4, $1 +; CHECK-BE-NEXT: or $3, $1, $3 ; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload ; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload @@ -404,39 +366,25 @@ ; CHECK-LE-NEXT: jal __umoddi3 ; CHECK-LE-NEXT: addiu $7, $zero, 0 ; CHECK-LE-NEXT: addiu $1, $2, 27 -; CHECK-LE-NEXT: andi $2, $1, 63 -; CHECK-LE-NEXT: not $3, $2 -; CHECK-LE-NEXT: srl $4, $17, 5 -; CHECK-LE-NEXT: sll $5, $16, 27 -; CHECK-LE-NEXT: or $4, $5, $4 -; CHECK-LE-NEXT: sll $5, $4, 1 -; CHECK-LE-NEXT: sll $6, $17, 27 -; CHECK-LE-NEXT: srlv $6, $6, $2 -; CHECK-LE-NEXT: sllv $3, $5, $3 -; CHECK-LE-NEXT: not $5, $1 -; CHECK-LE-NEXT: andi $7, $5, 63 -; CHECK-LE-NEXT: sll $8, $19, 1 -; CHECK-LE-NEXT: sllv $8, $8, $7 -; CHECK-LE-NEXT: andi $5, $5, 32 -; CHECK-LE-NEXT: move $9, $8 -; CHECK-LE-NEXT: movn $9, $zero, $5 -; CHECK-LE-NEXT: or $3, $3, $6 -; CHECK-LE-NEXT: srlv $4, $4, $2 -; CHECK-LE-NEXT: andi $1, $1, 32 -; CHECK-LE-NEXT: movn $3, $4, $1 -; CHECK-LE-NEXT: srl $2, $19, 31 -; CHECK-LE-NEXT: sll $6, $18, 1 -; CHECK-LE-NEXT: or $6, $6, $2 -; CHECK-LE-NEXT: or $2, $9, $3 -; CHECK-LE-NEXT: movn $4, $zero, $1 -; CHECK-LE-NEXT: sllv $1, $6, $7 -; CHECK-LE-NEXT: not $3, $7 -; CHECK-LE-NEXT: lui $6, 32767 -; CHECK-LE-NEXT: ori $6, $6, 65535 -; CHECK-LE-NEXT: and $6, $19, $6 -; CHECK-LE-NEXT: srlv $3, $6, $3 -; CHECK-LE-NEXT: or $1, $1, $3 -; CHECK-LE-NEXT: movn $1, $8, $5 +; CHECK-LE-NEXT: andi $3, $1, 32 +; CHECK-LE-NEXT: srl $2, $17, 5 +; CHECK-LE-NEXT: sll $4, $16, 27 +; CHECK-LE-NEXT: or $2, $4, $2 +; CHECK-LE-NEXT: sll $4, $17, 27 +; CHECK-LE-NEXT: move $5, $19 +; CHECK-LE-NEXT: movz $5, $2, $3 +; CHECK-LE-NEXT: movz $2, $4, $3 +; CHECK-LE-NEXT: andi $4, $1, 31 +; CHECK-LE-NEXT: srlv $2, $2, $4 +; CHECK-LE-NEXT: not $1, $1 +; CHECK-LE-NEXT: andi $1, $1, 31 +; CHECK-LE-NEXT: sll $6, $5, 1 +; CHECK-LE-NEXT: sllv $6, $6, $1 +; CHECK-LE-NEXT: or $2, $6, $2 +; CHECK-LE-NEXT: srlv $4, $5, $4 +; CHECK-LE-NEXT: movz $18, $19, $3 +; CHECK-LE-NEXT: sll $3, $18, 1 +; CHECK-LE-NEXT: sllv $1, $3, $1 ; CHECK-LE-NEXT: or $3, $1, $4 ; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll b/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll --- a/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll @@ -87,52 +87,44 @@ define i64 @rotl_i64(i64 %x, i64 %z) { ; CHECK32_32-LABEL: rotl_i64: ; CHECK32_32: # %bb.0: -; CHECK32_32-NEXT: clrlwi 5, 6, 26 -; CHECK32_32-NEXT: subfic 8, 5, 32 -; CHECK32_32-NEXT: neg 6, 6 -; CHECK32_32-NEXT: slw 7, 3, 5 -; CHECK32_32-NEXT: addi 9, 5, -32 -; CHECK32_32-NEXT: srw 8, 4, 8 -; CHECK32_32-NEXT: clrlwi 6, 6, 26 -; CHECK32_32-NEXT: slw 9, 4, 9 -; CHECK32_32-NEXT: or 7, 
7, 8 -; CHECK32_32-NEXT: subfic 8, 6, 32 -; CHECK32_32-NEXT: or 7, 7, 9 -; CHECK32_32-NEXT: addi 9, 6, -32 -; CHECK32_32-NEXT: slw 8, 3, 8 -; CHECK32_32-NEXT: srw 9, 3, 9 -; CHECK32_32-NEXT: srw 3, 3, 6 -; CHECK32_32-NEXT: srw 6, 4, 6 -; CHECK32_32-NEXT: or 6, 6, 8 -; CHECK32_32-NEXT: or 6, 6, 9 -; CHECK32_32-NEXT: slw 4, 4, 5 -; CHECK32_32-NEXT: or 3, 7, 3 -; CHECK32_32-NEXT: or 4, 4, 6 +; CHECK32_32-NEXT: andi. 5, 6, 32 +; CHECK32_32-NEXT: clrlwi 5, 6, 27 +; CHECK32_32-NEXT: subfic 6, 5, 32 +; CHECK32_32-NEXT: bc 12, 2, .LBB4_2 +; CHECK32_32-NEXT: # %bb.1: +; CHECK32_32-NEXT: ori 7, 3, 0 +; CHECK32_32-NEXT: ori 3, 4, 0 +; CHECK32_32-NEXT: b .LBB4_3 +; CHECK32_32-NEXT: .LBB4_2: +; CHECK32_32-NEXT: addi 7, 4, 0 +; CHECK32_32-NEXT: .LBB4_3: +; CHECK32_32-NEXT: srw 4, 7, 6 +; CHECK32_32-NEXT: slw 8, 3, 5 +; CHECK32_32-NEXT: srw 6, 3, 6 +; CHECK32_32-NEXT: slw 5, 7, 5 +; CHECK32_32-NEXT: or 3, 8, 4 +; CHECK32_32-NEXT: or 4, 5, 6 ; CHECK32_32-NEXT: blr ; ; CHECK32_64-LABEL: rotl_i64: ; CHECK32_64: # %bb.0: -; CHECK32_64-NEXT: clrlwi 5, 6, 26 -; CHECK32_64-NEXT: neg 6, 6 -; CHECK32_64-NEXT: subfic 8, 5, 32 -; CHECK32_64-NEXT: slw 7, 3, 5 -; CHECK32_64-NEXT: clrlwi 6, 6, 26 -; CHECK32_64-NEXT: srw 8, 4, 8 -; CHECK32_64-NEXT: addi 9, 5, -32 -; CHECK32_64-NEXT: or 7, 7, 8 -; CHECK32_64-NEXT: subfic 8, 6, 32 -; CHECK32_64-NEXT: slw 5, 4, 5 -; CHECK32_64-NEXT: slw 9, 4, 9 -; CHECK32_64-NEXT: srw 10, 3, 6 -; CHECK32_64-NEXT: srw 4, 4, 6 -; CHECK32_64-NEXT: addi 6, 6, -32 -; CHECK32_64-NEXT: slw 8, 3, 8 -; CHECK32_64-NEXT: srw 3, 3, 6 -; CHECK32_64-NEXT: or 4, 4, 8 -; CHECK32_64-NEXT: or 6, 7, 9 -; CHECK32_64-NEXT: or 4, 4, 3 -; CHECK32_64-NEXT: or 3, 6, 10 -; CHECK32_64-NEXT: or 4, 5, 4 +; CHECK32_64-NEXT: andi. 5, 6, 32 +; CHECK32_64-NEXT: clrlwi 5, 6, 27 +; CHECK32_64-NEXT: bc 12, 2, .LBB4_2 +; CHECK32_64-NEXT: # %bb.1: +; CHECK32_64-NEXT: ori 7, 3, 0 +; CHECK32_64-NEXT: ori 3, 4, 0 +; CHECK32_64-NEXT: b .LBB4_3 +; CHECK32_64-NEXT: .LBB4_2: +; CHECK32_64-NEXT: addi 7, 4, 0 +; CHECK32_64-NEXT: .LBB4_3: +; CHECK32_64-NEXT: subfic 6, 5, 32 +; CHECK32_64-NEXT: srw 4, 7, 6 +; CHECK32_64-NEXT: slw 8, 3, 5 +; CHECK32_64-NEXT: srw 6, 3, 6 +; CHECK32_64-NEXT: slw 5, 7, 5 +; CHECK32_64-NEXT: or 3, 8, 4 +; CHECK32_64-NEXT: or 4, 5, 6 ; CHECK32_64-NEXT: blr ; ; CHECK64-LABEL: rotl_i64: @@ -256,52 +248,44 @@ define i64 @rotr_i64(i64 %x, i64 %z) { ; CHECK32_32-LABEL: rotr_i64: ; CHECK32_32: # %bb.0: -; CHECK32_32-NEXT: clrlwi 5, 6, 26 -; CHECK32_32-NEXT: subfic 8, 5, 32 -; CHECK32_32-NEXT: neg 6, 6 -; CHECK32_32-NEXT: srw 7, 4, 5 -; CHECK32_32-NEXT: addi 9, 5, -32 -; CHECK32_32-NEXT: slw 8, 3, 8 -; CHECK32_32-NEXT: clrlwi 6, 6, 26 -; CHECK32_32-NEXT: srw 9, 3, 9 -; CHECK32_32-NEXT: or 7, 7, 8 -; CHECK32_32-NEXT: subfic 8, 6, 32 -; CHECK32_32-NEXT: or 7, 7, 9 -; CHECK32_32-NEXT: addi 9, 6, -32 -; CHECK32_32-NEXT: srw 8, 4, 8 -; CHECK32_32-NEXT: slw 9, 4, 9 -; CHECK32_32-NEXT: slw 4, 4, 6 -; CHECK32_32-NEXT: slw 6, 3, 6 -; CHECK32_32-NEXT: or 6, 6, 8 -; CHECK32_32-NEXT: or 6, 6, 9 -; CHECK32_32-NEXT: srw 3, 3, 5 -; CHECK32_32-NEXT: or 4, 7, 4 -; CHECK32_32-NEXT: or 3, 3, 6 +; CHECK32_32-NEXT: andi. 
5, 6, 32 +; CHECK32_32-NEXT: clrlwi 5, 6, 27 +; CHECK32_32-NEXT: subfic 6, 5, 32 +; CHECK32_32-NEXT: bc 12, 2, .LBB11_2 +; CHECK32_32-NEXT: # %bb.1: +; CHECK32_32-NEXT: ori 7, 4, 0 +; CHECK32_32-NEXT: b .LBB11_3 +; CHECK32_32-NEXT: .LBB11_2: +; CHECK32_32-NEXT: addi 7, 3, 0 +; CHECK32_32-NEXT: addi 3, 4, 0 +; CHECK32_32-NEXT: .LBB11_3: +; CHECK32_32-NEXT: srw 4, 7, 5 +; CHECK32_32-NEXT: slw 8, 3, 6 +; CHECK32_32-NEXT: srw 5, 3, 5 +; CHECK32_32-NEXT: slw 6, 7, 6 +; CHECK32_32-NEXT: or 3, 8, 4 +; CHECK32_32-NEXT: or 4, 6, 5 ; CHECK32_32-NEXT: blr ; ; CHECK32_64-LABEL: rotr_i64: ; CHECK32_64: # %bb.0: -; CHECK32_64-NEXT: clrlwi 5, 6, 26 -; CHECK32_64-NEXT: neg 6, 6 -; CHECK32_64-NEXT: subfic 8, 5, 32 -; CHECK32_64-NEXT: srw 7, 4, 5 -; CHECK32_64-NEXT: clrlwi 6, 6, 26 -; CHECK32_64-NEXT: slw 8, 3, 8 -; CHECK32_64-NEXT: addi 9, 5, -32 -; CHECK32_64-NEXT: or 7, 7, 8 -; CHECK32_64-NEXT: subfic 8, 6, 32 +; CHECK32_64-NEXT: andi. 5, 6, 32 +; CHECK32_64-NEXT: clrlwi 5, 6, 27 +; CHECK32_64-NEXT: bc 12, 2, .LBB11_2 +; CHECK32_64-NEXT: # %bb.1: +; CHECK32_64-NEXT: ori 7, 4, 0 +; CHECK32_64-NEXT: b .LBB11_3 +; CHECK32_64-NEXT: .LBB11_2: +; CHECK32_64-NEXT: addi 7, 3, 0 +; CHECK32_64-NEXT: addi 3, 4, 0 +; CHECK32_64-NEXT: .LBB11_3: +; CHECK32_64-NEXT: subfic 6, 5, 32 +; CHECK32_64-NEXT: srw 4, 7, 5 +; CHECK32_64-NEXT: slw 8, 3, 6 ; CHECK32_64-NEXT: srw 5, 3, 5 -; CHECK32_64-NEXT: srw 9, 3, 9 -; CHECK32_64-NEXT: slw 10, 4, 6 -; CHECK32_64-NEXT: slw 3, 3, 6 -; CHECK32_64-NEXT: addi 6, 6, -32 -; CHECK32_64-NEXT: srw 8, 4, 8 -; CHECK32_64-NEXT: slw 4, 4, 6 -; CHECK32_64-NEXT: or 3, 3, 8 -; CHECK32_64-NEXT: or 6, 7, 9 -; CHECK32_64-NEXT: or 3, 3, 4 -; CHECK32_64-NEXT: or 4, 6, 10 -; CHECK32_64-NEXT: or 3, 5, 3 +; CHECK32_64-NEXT: slw 6, 7, 6 +; CHECK32_64-NEXT: or 3, 8, 4 +; CHECK32_64-NEXT: or 4, 6, 5 ; CHECK32_64-NEXT: blr ; ; CHECK64-LABEL: rotr_i64: diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll --- a/llvm/test/CodeGen/PowerPC/funnel-shift.ll +++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll @@ -43,58 +43,47 @@ define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) { ; CHECK32_32-LABEL: fshl_i64: ; CHECK32_32: # %bb.0: -; CHECK32_32-NEXT: clrlwi 7, 8, 26 -; CHECK32_32-NEXT: not 8, 8 -; CHECK32_32-NEXT: rotlwi 6, 6, 31 -; CHECK32_32-NEXT: subfic 10, 7, 32 -; CHECK32_32-NEXT: srwi 9, 5, 1 +; CHECK32_32-NEXT: andi. 
7, 8, 32 +; CHECK32_32-NEXT: clrlwi 7, 8, 27 +; CHECK32_32-NEXT: subfic 8, 7, 32 +; CHECK32_32-NEXT: bc 12, 2, .LBB1_2 +; CHECK32_32-NEXT: # %bb.1: +; CHECK32_32-NEXT: ori 9, 5, 0 +; CHECK32_32-NEXT: ori 3, 4, 0 +; CHECK32_32-NEXT: ori 4, 6, 0 +; CHECK32_32-NEXT: b .LBB1_3 +; CHECK32_32-NEXT: .LBB1_2: +; CHECK32_32-NEXT: addi 9, 4, 0 +; CHECK32_32-NEXT: addi 4, 5, 0 +; CHECK32_32-NEXT: .LBB1_3: +; CHECK32_32-NEXT: srw 5, 9, 8 ; CHECK32_32-NEXT: slw 3, 3, 7 -; CHECK32_32-NEXT: clrlwi 8, 8, 26 -; CHECK32_32-NEXT: rlwimi 6, 5, 31, 0, 0 -; CHECK32_32-NEXT: srw 5, 4, 10 -; CHECK32_32-NEXT: srw 10, 9, 8 -; CHECK32_32-NEXT: srw 6, 6, 8 +; CHECK32_32-NEXT: srw 4, 4, 8 +; CHECK32_32-NEXT: slw 6, 9, 7 ; CHECK32_32-NEXT: or 3, 3, 5 -; CHECK32_32-NEXT: subfic 5, 8, 32 -; CHECK32_32-NEXT: addi 8, 8, -32 -; CHECK32_32-NEXT: slw 5, 9, 5 -; CHECK32_32-NEXT: srw 8, 9, 8 -; CHECK32_32-NEXT: addi 9, 7, -32 -; CHECK32_32-NEXT: slw 9, 4, 9 -; CHECK32_32-NEXT: or 5, 6, 5 -; CHECK32_32-NEXT: or 3, 3, 9 -; CHECK32_32-NEXT: or 5, 5, 8 -; CHECK32_32-NEXT: slw 4, 4, 7 -; CHECK32_32-NEXT: or 3, 3, 10 -; CHECK32_32-NEXT: or 4, 4, 5 +; CHECK32_32-NEXT: or 4, 6, 4 ; CHECK32_32-NEXT: blr ; ; CHECK32_64-LABEL: fshl_i64: ; CHECK32_64: # %bb.0: -; CHECK32_64-NEXT: clrlwi 7, 8, 26 -; CHECK32_64-NEXT: not 8, 8 -; CHECK32_64-NEXT: subfic 9, 7, 32 -; CHECK32_64-NEXT: rotlwi 6, 6, 31 +; CHECK32_64-NEXT: andi. 7, 8, 32 +; CHECK32_64-NEXT: clrlwi 7, 8, 27 +; CHECK32_64-NEXT: bc 12, 2, .LBB1_2 +; CHECK32_64-NEXT: # %bb.1: +; CHECK32_64-NEXT: ori 9, 5, 0 +; CHECK32_64-NEXT: ori 3, 4, 0 +; CHECK32_64-NEXT: ori 5, 6, 0 +; CHECK32_64-NEXT: b .LBB1_3 +; CHECK32_64-NEXT: .LBB1_2: +; CHECK32_64-NEXT: addi 9, 4, 0 +; CHECK32_64-NEXT: .LBB1_3: +; CHECK32_64-NEXT: subfic 8, 7, 32 +; CHECK32_64-NEXT: srw 4, 9, 8 ; CHECK32_64-NEXT: slw 3, 3, 7 -; CHECK32_64-NEXT: clrlwi 8, 8, 26 -; CHECK32_64-NEXT: srw 9, 4, 9 -; CHECK32_64-NEXT: rlwimi 6, 5, 31, 0, 0 -; CHECK32_64-NEXT: srwi 5, 5, 1 -; CHECK32_64-NEXT: addi 10, 7, -32 -; CHECK32_64-NEXT: or 3, 3, 9 -; CHECK32_64-NEXT: subfic 9, 8, 32 -; CHECK32_64-NEXT: slw 7, 4, 7 -; CHECK32_64-NEXT: slw 4, 4, 10 -; CHECK32_64-NEXT: srw 10, 5, 8 -; CHECK32_64-NEXT: srw 6, 6, 8 -; CHECK32_64-NEXT: addi 8, 8, -32 -; CHECK32_64-NEXT: slw 9, 5, 9 ; CHECK32_64-NEXT: srw 5, 5, 8 -; CHECK32_64-NEXT: or 6, 6, 9 +; CHECK32_64-NEXT: slw 6, 9, 7 ; CHECK32_64-NEXT: or 3, 3, 4 ; CHECK32_64-NEXT: or 4, 6, 5 -; CHECK32_64-NEXT: or 3, 3, 10 -; CHECK32_64-NEXT: or 4, 7, 4 ; CHECK32_64-NEXT: blr ; ; CHECK64-LABEL: fshl_i64: @@ -112,387 +101,128 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; CHECK32_32-LABEL: fshl_i128: ; CHECK32_32: # %bb.0: -; CHECK32_32-NEXT: stwu 1, -64(1) -; CHECK32_32-NEXT: lwz 0, 84(1) -; CHECK32_32-NEXT: rotlwi 12, 8, 31 -; CHECK32_32-NEXT: srwi 11, 7, 1 -; CHECK32_32-NEXT: rlwimi 12, 7, 31, 0, 0 -; CHECK32_32-NEXT: andi. 
7, 0, 127 -; CHECK32_32-NEXT: stw 27, 44(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: rotlwi 10, 10, 31 -; CHECK32_32-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: rotlwi 30, 9, 31 -; CHECK32_32-NEXT: subfic 27, 7, 32 -; CHECK32_32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: rlwimi 10, 9, 31, 0, 0 -; CHECK32_32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: rlwimi 30, 8, 31, 0, 0 -; CHECK32_32-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: not 8, 0 -; CHECK32_32-NEXT: subfic 9, 7, 96 -; CHECK32_32-NEXT: addi 0, 7, -64 -; CHECK32_32-NEXT: slw 28, 3, 7 -; CHECK32_32-NEXT: subfic 25, 7, 64 -; CHECK32_32-NEXT: srw 22, 4, 27 -; CHECK32_32-NEXT: stw 20, 16(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: srw 9, 6, 9 -; CHECK32_32-NEXT: stw 23, 28(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: slw 23, 5, 0 -; CHECK32_32-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: addi 29, 7, -96 -; CHECK32_32-NEXT: srw 20, 5, 25 -; CHECK32_32-NEXT: or 28, 28, 22 -; CHECK32_32-NEXT: srw 22, 6, 25 -; CHECK32_32-NEXT: subfic 25, 25, 32 -; CHECK32_32-NEXT: stw 24, 32(1) # 4-byte Folded Spill +; CHECK32_32-NEXT: lwz 11, 20(1) +; CHECK32_32-NEXT: andi. 12, 11, 64 ; CHECK32_32-NEXT: mcrf 1, 0 -; CHECK32_32-NEXT: stw 26, 40(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: addi 26, 7, -32 -; CHECK32_32-NEXT: andi. 8, 8, 127 -; CHECK32_32-NEXT: slw 24, 5, 7 -; CHECK32_32-NEXT: slw 29, 6, 29 -; CHECK32_32-NEXT: or 9, 23, 9 -; CHECK32_32-NEXT: slw 25, 5, 25 -; CHECK32_32-NEXT: srw 5, 5, 27 -; CHECK32_32-NEXT: srw 27, 6, 27 -; CHECK32_32-NEXT: stw 21, 20(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: slw 21, 4, 26 -; CHECK32_32-NEXT: subfic 23, 8, 32 -; CHECK32_32-NEXT: or 27, 24, 27 -; CHECK32_32-NEXT: subfic 24, 8, 96 -; CHECK32_32-NEXT: or 9, 9, 29 -; CHECK32_32-NEXT: addi 29, 8, -64 -; CHECK32_32-NEXT: or 25, 22, 25 -; CHECK32_32-NEXT: stw 19, 12(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: srw 19, 12, 8 -; CHECK32_32-NEXT: or 28, 28, 21 -; CHECK32_32-NEXT: slw 21, 11, 23 -; CHECK32_32-NEXT: slw 24, 11, 24 -; CHECK32_32-NEXT: srw 22, 12, 29 -; CHECK32_32-NEXT: slw 26, 6, 26 -; CHECK32_32-NEXT: or 5, 25, 5 -; CHECK32_32-NEXT: addi 25, 8, -96 -; CHECK32_32-NEXT: or 21, 19, 21 -; CHECK32_32-NEXT: srw 19, 10, 8 -; CHECK32_32-NEXT: or 24, 22, 24 -; CHECK32_32-NEXT: slw 22, 30, 23 -; CHECK32_32-NEXT: or 27, 27, 26 -; CHECK32_32-NEXT: addi 26, 8, -32 -; CHECK32_32-NEXT: srw 25, 11, 25 -; CHECK32_32-NEXT: or 22, 19, 22 -; CHECK32_32-NEXT: or 28, 28, 20 -; CHECK32_32-NEXT: srw 20, 11, 26 -; CHECK32_32-NEXT: or 25, 24, 25 -; CHECK32_32-NEXT: subfic 24, 8, 64 -; CHECK32_32-NEXT: srw 26, 30, 26 -; CHECK32_32-NEXT: or 26, 22, 26 -; CHECK32_32-NEXT: subfic 22, 24, 32 -; CHECK32_32-NEXT: slw 23, 12, 23 -; CHECK32_32-NEXT: srw 22, 12, 22 -; CHECK32_32-NEXT: slw 12, 12, 24 -; CHECK32_32-NEXT: slw 24, 11, 24 -; CHECK32_32-NEXT: cmplwi 5, 7, 64 -; CHECK32_32-NEXT: or 24, 24, 22 -; CHECK32_32-NEXT: slw 22, 6, 0 -; CHECK32_32-NEXT: slw 6, 6, 7 -; CHECK32_32-NEXT: slw 7, 4, 7 -; CHECK32_32-NEXT: srw 29, 11, 29 -; CHECK32_32-NEXT: srw 11, 11, 8 -; CHECK32_32-NEXT: cmplwi 6, 8, 64 -; CHECK32_32-NEXT: srw 8, 30, 8 -; CHECK32_32-NEXT: or 5, 7, 5 -; CHECK32_32-NEXT: or 7, 26, 12 -; CHECK32_32-NEXT: or 12, 24, 23 -; CHECK32_32-NEXT: bc 12, 20, .LBB2_1 -; CHECK32_32-NEXT: b .LBB2_2 -; CHECK32_32-NEXT: .LBB2_1: -; CHECK32_32-NEXT: addi 9, 28, 0 +; CHECK32_32-NEXT: andi. 
12, 11, 32 +; CHECK32_32-NEXT: clrlwi 11, 11, 27 +; CHECK32_32-NEXT: bc 12, 6, .LBB2_2 +; CHECK32_32-NEXT: # %bb.1: +; CHECK32_32-NEXT: ori 4, 6, 0 +; CHECK32_32-NEXT: ori 12, 7, 0 +; CHECK32_32-NEXT: ori 3, 5, 0 +; CHECK32_32-NEXT: ori 5, 8, 0 +; CHECK32_32-NEXT: ori 6, 9, 0 +; CHECK32_32-NEXT: ori 7, 10, 0 +; CHECK32_32-NEXT: b .LBB2_3 ; CHECK32_32-NEXT: .LBB2_2: -; CHECK32_32-NEXT: li 28, 0 -; CHECK32_32-NEXT: bc 12, 20, .LBB2_4 -; CHECK32_32-NEXT: # %bb.3: -; CHECK32_32-NEXT: ori 5, 22, 0 -; CHECK32_32-NEXT: b .LBB2_4 -; CHECK32_32-NEXT: .LBB2_4: -; CHECK32_32-NEXT: bc 12, 24, .LBB2_6 -; CHECK32_32-NEXT: # %bb.5: -; CHECK32_32-NEXT: ori 7, 25, 0 +; CHECK32_32-NEXT: addi 12, 5, 0 +; CHECK32_32-NEXT: addi 5, 6, 0 +; CHECK32_32-NEXT: addi 6, 7, 0 +; CHECK32_32-NEXT: addi 7, 8, 0 +; CHECK32_32-NEXT: .LBB2_3: +; CHECK32_32-NEXT: subfic 8, 11, 32 +; CHECK32_32-NEXT: bc 12, 2, .LBB2_5 +; CHECK32_32-NEXT: # %bb.4: +; CHECK32_32-NEXT: ori 9, 12, 0 +; CHECK32_32-NEXT: ori 3, 4, 0 +; CHECK32_32-NEXT: ori 4, 5, 0 +; CHECK32_32-NEXT: ori 5, 6, 0 +; CHECK32_32-NEXT: ori 6, 7, 0 ; CHECK32_32-NEXT: b .LBB2_6 +; CHECK32_32-NEXT: .LBB2_5: +; CHECK32_32-NEXT: addi 9, 4, 0 +; CHECK32_32-NEXT: addi 4, 12, 0 ; CHECK32_32-NEXT: .LBB2_6: -; CHECK32_32-NEXT: or 8, 8, 12 -; CHECK32_32-NEXT: or 21, 21, 20 -; CHECK32_32-NEXT: bc 12, 20, .LBB2_8 -; CHECK32_32-NEXT: # %bb.7: -; CHECK32_32-NEXT: ori 6, 28, 0 -; CHECK32_32-NEXT: b .LBB2_8 -; CHECK32_32-NEXT: .LBB2_8: -; CHECK32_32-NEXT: bc 12, 6, .LBB2_10 -; CHECK32_32-NEXT: # %bb.9: -; CHECK32_32-NEXT: ori 4, 5, 0 -; CHECK32_32-NEXT: b .LBB2_10 -; CHECK32_32-NEXT: .LBB2_10: -; CHECK32_32-NEXT: bc 12, 2, .LBB2_12 -; CHECK32_32-NEXT: # %bb.11: -; CHECK32_32-NEXT: ori 5, 7, 0 -; CHECK32_32-NEXT: b .LBB2_13 -; CHECK32_32-NEXT: .LBB2_12: -; CHECK32_32-NEXT: addi 5, 10, 0 -; CHECK32_32-NEXT: .LBB2_13: -; CHECK32_32-NEXT: bc 12, 24, .LBB2_15 -; CHECK32_32-NEXT: # %bb.14: -; CHECK32_32-NEXT: ori 7, 29, 0 -; CHECK32_32-NEXT: ori 11, 28, 0 -; CHECK32_32-NEXT: ori 0, 28, 0 -; CHECK32_32-NEXT: b .LBB2_16 -; CHECK32_32-NEXT: .LBB2_15: -; CHECK32_32-NEXT: addi 7, 8, 0 -; CHECK32_32-NEXT: addi 0, 21, 0 -; CHECK32_32-NEXT: .LBB2_16: -; CHECK32_32-NEXT: bc 12, 6, .LBB2_18 -; CHECK32_32-NEXT: # %bb.17: -; CHECK32_32-NEXT: ori 3, 9, 0 -; CHECK32_32-NEXT: b .LBB2_18 -; CHECK32_32-NEXT: .LBB2_18: -; CHECK32_32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: or 6, 6, 5 -; CHECK32_32-NEXT: bc 12, 20, .LBB2_20 -; CHECK32_32-NEXT: # %bb.19: -; CHECK32_32-NEXT: ori 5, 28, 0 -; CHECK32_32-NEXT: b .LBB2_21 -; CHECK32_32-NEXT: .LBB2_20: -; CHECK32_32-NEXT: addi 5, 27, 0 -; CHECK32_32-NEXT: .LBB2_21: -; CHECK32_32-NEXT: bc 12, 2, .LBB2_22 -; CHECK32_32-NEXT: b .LBB2_23 -; CHECK32_32-NEXT: .LBB2_22: -; CHECK32_32-NEXT: addi 7, 30, 0 -; CHECK32_32-NEXT: .LBB2_23: -; CHECK32_32-NEXT: or 3, 3, 11 -; CHECK32_32-NEXT: or 4, 4, 0 -; CHECK32_32-NEXT: or 5, 5, 7 -; CHECK32_32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 28, 48(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 27, 44(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 26, 40(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 25, 36(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 24, 32(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 20, 16(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 19, 12(1) # 4-byte Folded Reload -; 
CHECK32_32-NEXT: addi 1, 1, 64 +; CHECK32_32-NEXT: srw 7, 9, 8 +; CHECK32_32-NEXT: slw 3, 3, 11 +; CHECK32_32-NEXT: srw 10, 4, 8 +; CHECK32_32-NEXT: slw 9, 9, 11 +; CHECK32_32-NEXT: srw 12, 5, 8 +; CHECK32_32-NEXT: slw 0, 4, 11 +; CHECK32_32-NEXT: srw 6, 6, 8 +; CHECK32_32-NEXT: slw 8, 5, 11 +; CHECK32_32-NEXT: or 3, 3, 7 +; CHECK32_32-NEXT: or 4, 9, 10 +; CHECK32_32-NEXT: or 5, 0, 12 +; CHECK32_32-NEXT: or 6, 8, 6 ; CHECK32_32-NEXT: blr ; ; CHECK32_64-LABEL: fshl_i128: ; CHECK32_64: # %bb.0: -; CHECK32_64-NEXT: stwu 1, -64(1) -; CHECK32_64-NEXT: lwz 12, 84(1) -; CHECK32_64-NEXT: rotlwi 11, 8, 31 -; CHECK32_64-NEXT: rotlwi 10, 10, 31 -; CHECK32_64-NEXT: rlwimi 10, 9, 31, 0, 0 -; CHECK32_64-NEXT: rlwimi 11, 7, 31, 0, 0 -; CHECK32_64-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: rotlwi 30, 9, 31 -; CHECK32_64-NEXT: stw 27, 44(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: not 9, 12 -; CHECK32_64-NEXT: rlwimi 30, 8, 31, 0, 0 -; CHECK32_64-NEXT: andi. 8, 12, 127 -; CHECK32_64-NEXT: stw 22, 24(1) # 4-byte Folded Spill +; CHECK32_64-NEXT: stwu 1, -16(1) +; CHECK32_64-NEXT: lwz 11, 36(1) +; CHECK32_64-NEXT: andi. 12, 11, 64 +; CHECK32_64-NEXT: stw 30, 8(1) # 4-byte Folded Spill ; CHECK32_64-NEXT: mcrf 1, 0 -; CHECK32_64-NEXT: subfic 12, 8, 96 -; CHECK32_64-NEXT: addi 0, 8, -64 -; CHECK32_64-NEXT: subfic 27, 8, 32 -; CHECK32_64-NEXT: stw 23, 28(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: andi. 9, 9, 127 -; CHECK32_64-NEXT: srw 12, 6, 12 -; CHECK32_64-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: subfic 25, 8, 64 -; CHECK32_64-NEXT: slw 23, 5, 0 -; CHECK32_64-NEXT: stw 26, 40(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: addi 26, 8, -32 -; CHECK32_64-NEXT: srw 22, 4, 27 -; CHECK32_64-NEXT: srwi 7, 7, 1 -; CHECK32_64-NEXT: or 12, 23, 12 -; CHECK32_64-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: slw 28, 3, 8 -; CHECK32_64-NEXT: srw 23, 6, 25 -; CHECK32_64-NEXT: stw 18, 8(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: subfic 18, 9, 32 -; CHECK32_64-NEXT: or 28, 28, 22 -; CHECK32_64-NEXT: srw 22, 5, 27 -; CHECK32_64-NEXT: srw 27, 6, 27 -; CHECK32_64-NEXT: stw 20, 16(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: srw 20, 5, 25 -; CHECK32_64-NEXT: subfic 25, 25, 32 -; CHECK32_64-NEXT: stw 21, 20(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: slw 21, 4, 26 -; CHECK32_64-NEXT: slw 26, 6, 26 -; CHECK32_64-NEXT: or 28, 28, 21 -; CHECK32_64-NEXT: slw 21, 7, 18 -; CHECK32_64-NEXT: stw 24, 32(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: slw 24, 5, 8 -; CHECK32_64-NEXT: slw 5, 5, 25 -; CHECK32_64-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: addi 29, 8, -96 -; CHECK32_64-NEXT: subfic 25, 9, 96 -; CHECK32_64-NEXT: slw 29, 6, 29 -; CHECK32_64-NEXT: or 27, 24, 27 -; CHECK32_64-NEXT: stw 19, 12(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: srw 19, 11, 9 -; CHECK32_64-NEXT: addi 24, 9, -64 -; CHECK32_64-NEXT: or 12, 12, 29 -; CHECK32_64-NEXT: srw 29, 10, 9 -; CHECK32_64-NEXT: slw 25, 7, 25 -; CHECK32_64-NEXT: or 21, 19, 21 -; CHECK32_64-NEXT: srw 19, 11, 24 -; CHECK32_64-NEXT: or 5, 23, 5 -; CHECK32_64-NEXT: slw 23, 30, 18 -; CHECK32_64-NEXT: or 27, 27, 26 -; CHECK32_64-NEXT: addi 26, 9, -96 -; CHECK32_64-NEXT: or 25, 19, 25 -; CHECK32_64-NEXT: lwz 19, 12(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: or 29, 29, 23 -; CHECK32_64-NEXT: addi 23, 9, -32 -; CHECK32_64-NEXT: srw 26, 7, 26 -; CHECK32_64-NEXT: or 28, 28, 20 -; CHECK32_64-NEXT: srw 20, 7, 23 -; CHECK32_64-NEXT: or 26, 25, 26 -; CHECK32_64-NEXT: subfic 25, 9, 64 -; CHECK32_64-NEXT: srw 23, 30, 23 -; 
CHECK32_64-NEXT: or 29, 29, 23 -; CHECK32_64-NEXT: subfic 23, 25, 32 -; CHECK32_64-NEXT: or 5, 5, 22 -; CHECK32_64-NEXT: slw 22, 11, 18 -; CHECK32_64-NEXT: lwz 18, 8(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: srw 23, 11, 23 -; CHECK32_64-NEXT: slw 11, 11, 25 -; CHECK32_64-NEXT: slw 25, 7, 25 -; CHECK32_64-NEXT: cmplwi 5, 8, 64 -; CHECK32_64-NEXT: bc 12, 20, .LBB2_1 -; CHECK32_64-NEXT: b .LBB2_2 -; CHECK32_64-NEXT: .LBB2_1: -; CHECK32_64-NEXT: addi 12, 28, 0 +; CHECK32_64-NEXT: clrlwi 12, 11, 27 +; CHECK32_64-NEXT: andi. 11, 11, 32 +; CHECK32_64-NEXT: bc 12, 6, .LBB2_2 +; CHECK32_64-NEXT: # %bb.1: +; CHECK32_64-NEXT: ori 4, 6, 0 +; CHECK32_64-NEXT: ori 30, 7, 0 +; CHECK32_64-NEXT: ori 3, 5, 0 +; CHECK32_64-NEXT: ori 7, 9, 0 +; CHECK32_64-NEXT: b .LBB2_3 ; CHECK32_64-NEXT: .LBB2_2: -; CHECK32_64-NEXT: lwz 28, 48(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: or 25, 25, 23 -; CHECK32_64-NEXT: bc 12, 6, .LBB2_4 -; CHECK32_64-NEXT: # %bb.3: -; CHECK32_64-NEXT: ori 3, 12, 0 -; CHECK32_64-NEXT: b .LBB2_4 -; CHECK32_64-NEXT: .LBB2_4: -; CHECK32_64-NEXT: slw 23, 6, 0 -; CHECK32_64-NEXT: slw 6, 6, 8 -; CHECK32_64-NEXT: slw 8, 4, 8 -; CHECK32_64-NEXT: cmplwi 6, 9, 64 -; CHECK32_64-NEXT: or 5, 8, 5 -; CHECK32_64-NEXT: bc 12, 20, .LBB2_6 -; CHECK32_64-NEXT: # %bb.5: -; CHECK32_64-NEXT: ori 5, 23, 0 +; CHECK32_64-NEXT: addi 30, 5, 0 +; CHECK32_64-NEXT: .LBB2_3: +; CHECK32_64-NEXT: bc 12, 2, .LBB2_5 +; CHECK32_64-NEXT: # %bb.4: +; CHECK32_64-NEXT: ori 5, 30, 0 +; CHECK32_64-NEXT: ori 3, 4, 0 ; CHECK32_64-NEXT: b .LBB2_6 +; CHECK32_64-NEXT: .LBB2_5: +; CHECK32_64-NEXT: addi 5, 4, 0 ; CHECK32_64-NEXT: .LBB2_6: -; CHECK32_64-NEXT: lwz 23, 28(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: li 8, 0 -; CHECK32_64-NEXT: srw 24, 7, 24 ; CHECK32_64-NEXT: bc 12, 6, .LBB2_8 ; CHECK32_64-NEXT: # %bb.7: -; CHECK32_64-NEXT: ori 4, 5, 0 -; CHECK32_64-NEXT: b .LBB2_8 +; CHECK32_64-NEXT: ori 4, 8, 0 +; CHECK32_64-NEXT: ori 8, 10, 0 +; CHECK32_64-NEXT: b .LBB2_9 ; CHECK32_64-NEXT: .LBB2_8: -; CHECK32_64-NEXT: bc 12, 20, .LBB2_10 -; CHECK32_64-NEXT: # %bb.9: -; CHECK32_64-NEXT: ori 6, 8, 0 -; CHECK32_64-NEXT: b .LBB2_10 -; CHECK32_64-NEXT: .LBB2_10: -; CHECK32_64-NEXT: srw 7, 7, 9 -; CHECK32_64-NEXT: srw 9, 30, 9 -; CHECK32_64-NEXT: bc 12, 24, .LBB2_12 -; CHECK32_64-NEXT: # %bb.11: +; CHECK32_64-NEXT: addi 4, 6, 0 +; CHECK32_64-NEXT: .LBB2_9: +; CHECK32_64-NEXT: subfic 11, 12, 32 +; CHECK32_64-NEXT: bc 12, 2, .LBB2_11 +; CHECK32_64-NEXT: # %bb.10: +; CHECK32_64-NEXT: ori 0, 4, 0 +; CHECK32_64-NEXT: ori 4, 7, 0 ; CHECK32_64-NEXT: ori 7, 8, 0 ; CHECK32_64-NEXT: b .LBB2_12 +; CHECK32_64-NEXT: .LBB2_11: +; CHECK32_64-NEXT: addi 0, 30, 0 ; CHECK32_64-NEXT: .LBB2_12: -; CHECK32_64-NEXT: or 0, 25, 22 -; CHECK32_64-NEXT: or 11, 29, 11 -; CHECK32_64-NEXT: lwz 29, 52(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: bc 12, 24, .LBB2_14 -; CHECK32_64-NEXT: # %bb.13: -; CHECK32_64-NEXT: ori 5, 26, 0 -; CHECK32_64-NEXT: b .LBB2_15 -; CHECK32_64-NEXT: .LBB2_14: -; CHECK32_64-NEXT: addi 5, 11, 0 -; CHECK32_64-NEXT: .LBB2_15: -; CHECK32_64-NEXT: or 9, 9, 0 -; CHECK32_64-NEXT: or 21, 21, 20 -; CHECK32_64-NEXT: bc 12, 2, .LBB2_16 -; CHECK32_64-NEXT: b .LBB2_17 -; CHECK32_64-NEXT: .LBB2_16: -; CHECK32_64-NEXT: addi 5, 10, 0 -; CHECK32_64-NEXT: .LBB2_17: -; CHECK32_64-NEXT: bc 12, 24, .LBB2_19 -; CHECK32_64-NEXT: # %bb.18: -; CHECK32_64-NEXT: ori 0, 8, 0 -; CHECK32_64-NEXT: b .LBB2_20 -; CHECK32_64-NEXT: .LBB2_19: -; CHECK32_64-NEXT: addi 0, 21, 0 -; CHECK32_64-NEXT: .LBB2_20: -; CHECK32_64-NEXT: bc 12, 20, .LBB2_21 -; CHECK32_64-NEXT: b .LBB2_22 
-; CHECK32_64-NEXT: .LBB2_21: -; CHECK32_64-NEXT: addi 8, 27, 0 -; CHECK32_64-NEXT: .LBB2_22: -; CHECK32_64-NEXT: lwz 27, 44(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: or 3, 3, 7 -; CHECK32_64-NEXT: bc 12, 24, .LBB2_24 -; CHECK32_64-NEXT: # %bb.23: -; CHECK32_64-NEXT: ori 7, 24, 0 -; CHECK32_64-NEXT: b .LBB2_25 -; CHECK32_64-NEXT: .LBB2_24: -; CHECK32_64-NEXT: addi 7, 9, 0 -; CHECK32_64-NEXT: .LBB2_25: -; CHECK32_64-NEXT: or 4, 4, 0 -; CHECK32_64-NEXT: bc 12, 2, .LBB2_26 -; CHECK32_64-NEXT: b .LBB2_27 -; CHECK32_64-NEXT: .LBB2_26: -; CHECK32_64-NEXT: addi 7, 30, 0 -; CHECK32_64-NEXT: .LBB2_27: -; CHECK32_64-NEXT: or 6, 6, 5 -; CHECK32_64-NEXT: or 5, 8, 7 -; CHECK32_64-NEXT: lwz 30, 56(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 26, 40(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 25, 36(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 24, 32(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 22, 24(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 21, 20(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 20, 16(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: addi 1, 1, 64 +; CHECK32_64-NEXT: srw 6, 5, 11 +; CHECK32_64-NEXT: lwz 30, 8(1) # 4-byte Folded Reload +; CHECK32_64-NEXT: slw 3, 3, 12 +; CHECK32_64-NEXT: srw 9, 0, 11 +; CHECK32_64-NEXT: slw 5, 5, 12 +; CHECK32_64-NEXT: srw 10, 4, 11 +; CHECK32_64-NEXT: slw 0, 0, 12 +; CHECK32_64-NEXT: srw 7, 7, 11 +; CHECK32_64-NEXT: slw 8, 4, 12 +; CHECK32_64-NEXT: or 3, 3, 6 +; CHECK32_64-NEXT: or 4, 5, 9 +; CHECK32_64-NEXT: or 5, 0, 10 +; CHECK32_64-NEXT: or 6, 8, 7 +; CHECK32_64-NEXT: addi 1, 1, 16 ; CHECK32_64-NEXT: blr ; ; CHECK64-LABEL: fshl_i128: ; CHECK64: # %bb.0: -; CHECK64-NEXT: clrlwi 8, 7, 25 -; CHECK64-NEXT: rotldi 5, 5, 63 -; CHECK64-NEXT: not 7, 7 -; CHECK64-NEXT: rldicl 9, 6, 63, 1 -; CHECK64-NEXT: subfic 10, 8, 64 -; CHECK64-NEXT: addi 11, 8, -64 -; CHECK64-NEXT: rldimi 5, 6, 63, 0 -; CHECK64-NEXT: clrlwi 6, 7, 25 -; CHECK64-NEXT: srd 7, 3, 10 -; CHECK64-NEXT: sld 10, 3, 11 -; CHECK64-NEXT: subfic 11, 6, 64 -; CHECK64-NEXT: addi 12, 6, -64 -; CHECK64-NEXT: sld 4, 4, 8 -; CHECK64-NEXT: srd 5, 5, 6 -; CHECK64-NEXT: sld 11, 9, 11 -; CHECK64-NEXT: or 4, 4, 7 -; CHECK64-NEXT: or 5, 5, 11 -; CHECK64-NEXT: srd 7, 9, 12 -; CHECK64-NEXT: or 4, 4, 10 -; CHECK64-NEXT: srd 6, 9, 6 -; CHECK64-NEXT: or 5, 5, 7 -; CHECK64-NEXT: sld 3, 3, 8 -; CHECK64-NEXT: or 4, 4, 6 -; CHECK64-NEXT: or 3, 3, 5 +; CHECK64-NEXT: andi. 
8, 7, 64 +; CHECK64-NEXT: clrlwi 7, 7, 26 +; CHECK64-NEXT: iseleq 5, 6, 5 +; CHECK64-NEXT: subfic 8, 7, 64 +; CHECK64-NEXT: iseleq 6, 3, 6 +; CHECK64-NEXT: iseleq 3, 4, 3 +; CHECK64-NEXT: srd 4, 5, 8 +; CHECK64-NEXT: sld 5, 6, 7 +; CHECK64-NEXT: srd 6, 6, 8 +; CHECK64-NEXT: sld 7, 3, 7 +; CHECK64-NEXT: or 3, 5, 4 +; CHECK64-NEXT: or 4, 7, 6 ; CHECK64-NEXT: blr %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) ret i128 %f @@ -525,31 +255,29 @@ ; CHECK32_32-NEXT: li 5, 0 ; CHECK32_32-NEXT: li 6, 37 ; CHECK32_32-NEXT: bl __umoddi3 -; CHECK32_32-NEXT: clrlwi 6, 4, 26 -; CHECK32_32-NEXT: srwi 3, 30, 6 -; CHECK32_32-NEXT: not 4, 4 -; CHECK32_32-NEXT: subfic 8, 6, 32 -; CHECK32_32-NEXT: slwi 5, 30, 26 -; CHECK32_32-NEXT: rlwimi 3, 29, 26, 1, 5 -; CHECK32_32-NEXT: slw 7, 27, 6 -; CHECK32_32-NEXT: clrlwi 4, 4, 26 -; CHECK32_32-NEXT: srw 8, 28, 8 -; CHECK32_32-NEXT: srw 9, 3, 4 -; CHECK32_32-NEXT: srw 5, 5, 4 -; CHECK32_32-NEXT: or 7, 7, 8 -; CHECK32_32-NEXT: subfic 8, 4, 32 -; CHECK32_32-NEXT: addi 4, 4, -32 -; CHECK32_32-NEXT: slw 8, 3, 8 -; CHECK32_32-NEXT: srw 4, 3, 4 -; CHECK32_32-NEXT: addi 3, 6, -32 -; CHECK32_32-NEXT: slw 3, 28, 3 -; CHECK32_32-NEXT: or 5, 5, 8 -; CHECK32_32-NEXT: or 3, 7, 3 -; CHECK32_32-NEXT: or 4, 5, 4 -; CHECK32_32-NEXT: slw 5, 28, 6 -; CHECK32_32-NEXT: or 3, 3, 9 -; CHECK32_32-NEXT: or 4, 5, 4 +; CHECK32_32-NEXT: rotlwi 3, 30, 27 +; CHECK32_32-NEXT: slwi 5, 30, 27 +; CHECK32_32-NEXT: andi. 6, 4, 32 +; CHECK32_32-NEXT: rlwimi 3, 29, 27, 0, 4 +; CHECK32_32-NEXT: clrlwi 4, 4, 27 +; CHECK32_32-NEXT: subfic 6, 4, 32 +; CHECK32_32-NEXT: bc 12, 2, .LBB3_2 +; CHECK32_32-NEXT: # %bb.1: +; CHECK32_32-NEXT: ori 7, 3, 0 +; CHECK32_32-NEXT: ori 8, 28, 0 +; CHECK32_32-NEXT: ori 3, 5, 0 +; CHECK32_32-NEXT: b .LBB3_3 +; CHECK32_32-NEXT: .LBB3_2: +; CHECK32_32-NEXT: addi 7, 28, 0 +; CHECK32_32-NEXT: addi 8, 27, 0 +; CHECK32_32-NEXT: .LBB3_3: ; CHECK32_32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK32_32-NEXT: srw 5, 7, 6 +; CHECK32_32-NEXT: slw 8, 8, 4 +; CHECK32_32-NEXT: srw 6, 3, 6 +; CHECK32_32-NEXT: slw 4, 7, 4 +; CHECK32_32-NEXT: or 3, 8, 5 +; CHECK32_32-NEXT: or 4, 4, 6 ; CHECK32_32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload ; CHECK32_32-NEXT: lwz 28, 16(1) # 4-byte Folded Reload ; CHECK32_32-NEXT: lwz 27, 12(1) # 4-byte Folded Reload @@ -582,35 +310,42 @@ ; CHECK32_64-NEXT: mr 30, 6 ; CHECK32_64-NEXT: li 6, 37 ; CHECK32_64-NEXT: bl __umoddi3 -; CHECK32_64-NEXT: clrlwi 6, 4, 26 -; CHECK32_64-NEXT: not 4, 4 -; CHECK32_64-NEXT: subfic 8, 6, 32 -; CHECK32_64-NEXT: srwi 3, 30, 6 -; CHECK32_64-NEXT: slw 7, 27, 6 -; CHECK32_64-NEXT: clrlwi 4, 4, 26 +; CHECK32_64-NEXT: rotlwi 3, 30, 27 +; CHECK32_64-NEXT: andi. 
5, 4, 32 +; CHECK32_64-NEXT: bc 12, 2, .LBB3_2 +; CHECK32_64-NEXT: # %bb.1: +; CHECK32_64-NEXT: ori 8, 28, 0 +; CHECK32_64-NEXT: b .LBB3_3 +; CHECK32_64-NEXT: .LBB3_2: +; CHECK32_64-NEXT: addi 8, 27, 0 +; CHECK32_64-NEXT: .LBB3_3: ; CHECK32_64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: srw 8, 28, 8 -; CHECK32_64-NEXT: rlwimi 3, 29, 26, 1, 5 -; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: slwi 5, 30, 26 -; CHECK32_64-NEXT: or 7, 7, 8 -; CHECK32_64-NEXT: subfic 8, 4, 32 +; CHECK32_64-NEXT: rlwimi 3, 29, 27, 0, 4 +; CHECK32_64-NEXT: clrlwi 4, 4, 27 +; CHECK32_64-NEXT: bc 12, 2, .LBB3_5 +; CHECK32_64-NEXT: # %bb.4: +; CHECK32_64-NEXT: ori 7, 3, 0 +; CHECK32_64-NEXT: b .LBB3_6 +; CHECK32_64-NEXT: .LBB3_5: +; CHECK32_64-NEXT: addi 7, 28, 0 +; CHECK32_64-NEXT: .LBB3_6: +; CHECK32_64-NEXT: slwi 5, 30, 27 ; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: addi 9, 6, -32 -; CHECK32_64-NEXT: srw 10, 3, 4 -; CHECK32_64-NEXT: srw 5, 5, 4 -; CHECK32_64-NEXT: addi 4, 4, -32 -; CHECK32_64-NEXT: slw 8, 3, 8 -; CHECK32_64-NEXT: slw 9, 28, 9 -; CHECK32_64-NEXT: srw 3, 3, 4 -; CHECK32_64-NEXT: or 4, 5, 8 -; CHECK32_64-NEXT: slw 6, 28, 6 -; CHECK32_64-NEXT: or 5, 7, 9 +; CHECK32_64-NEXT: bc 12, 2, .LBB3_8 +; CHECK32_64-NEXT: # %bb.7: +; CHECK32_64-NEXT: ori 3, 5, 0 +; CHECK32_64-NEXT: b .LBB3_8 +; CHECK32_64-NEXT: .LBB3_8: +; CHECK32_64-NEXT: subfic 6, 4, 32 +; CHECK32_64-NEXT: slw 8, 8, 4 +; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK32_64-NEXT: srw 9, 7, 6 +; CHECK32_64-NEXT: srw 5, 3, 6 +; CHECK32_64-NEXT: slw 4, 7, 4 +; CHECK32_64-NEXT: or 3, 8, 9 ; CHECK32_64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: or 4, 4, 3 -; CHECK32_64-NEXT: or 3, 5, 10 +; CHECK32_64-NEXT: or 4, 4, 5 ; CHECK32_64-NEXT: lwz 0, 36(1) -; CHECK32_64-NEXT: or 4, 6, 4 ; CHECK32_64-NEXT: addi 1, 1, 32 ; CHECK32_64-NEXT: mtlr 0 ; CHECK32_64-NEXT: blr @@ -737,58 +472,47 @@ define i64 @fshr_i64(i64 %x, i64 %y, i64 %z) { ; CHECK32_32-LABEL: fshr_i64: ; CHECK32_32: # %bb.0: -; CHECK32_32-NEXT: clrlwi 7, 8, 26 -; CHECK32_32-NEXT: slwi 9, 4, 1 -; CHECK32_32-NEXT: not 8, 8 -; CHECK32_32-NEXT: rotlwi 4, 4, 1 -; CHECK32_32-NEXT: subfic 10, 7, 32 -; CHECK32_32-NEXT: srw 6, 6, 7 -; CHECK32_32-NEXT: clrlwi 8, 8, 26 -; CHECK32_32-NEXT: rlwimi 4, 3, 1, 0, 30 -; CHECK32_32-NEXT: slw 3, 5, 10 -; CHECK32_32-NEXT: slw 10, 9, 8 -; CHECK32_32-NEXT: slw 4, 4, 8 -; CHECK32_32-NEXT: or 3, 6, 3 -; CHECK32_32-NEXT: subfic 6, 8, 32 -; CHECK32_32-NEXT: addi 8, 8, -32 -; CHECK32_32-NEXT: srw 6, 9, 6 -; CHECK32_32-NEXT: slw 8, 9, 8 -; CHECK32_32-NEXT: addi 9, 7, -32 -; CHECK32_32-NEXT: srw 9, 5, 9 -; CHECK32_32-NEXT: or 3, 3, 9 -; CHECK32_32-NEXT: or 6, 4, 6 -; CHECK32_32-NEXT: or 4, 10, 3 -; CHECK32_32-NEXT: or 3, 6, 8 -; CHECK32_32-NEXT: srw 5, 5, 7 +; CHECK32_32-NEXT: andi. 
7, 8, 32 +; CHECK32_32-NEXT: clrlwi 7, 8, 27 +; CHECK32_32-NEXT: subfic 8, 7, 32 +; CHECK32_32-NEXT: bc 12, 2, .LBB10_2 +; CHECK32_32-NEXT: # %bb.1: +; CHECK32_32-NEXT: ori 9, 4, 0 +; CHECK32_32-NEXT: ori 4, 5, 0 +; CHECK32_32-NEXT: b .LBB10_3 +; CHECK32_32-NEXT: .LBB10_2: +; CHECK32_32-NEXT: addi 9, 5, 0 +; CHECK32_32-NEXT: addi 3, 4, 0 +; CHECK32_32-NEXT: addi 4, 6, 0 +; CHECK32_32-NEXT: .LBB10_3: +; CHECK32_32-NEXT: srw 5, 9, 7 +; CHECK32_32-NEXT: slw 3, 3, 8 +; CHECK32_32-NEXT: srw 4, 4, 7 +; CHECK32_32-NEXT: slw 6, 9, 8 ; CHECK32_32-NEXT: or 3, 3, 5 +; CHECK32_32-NEXT: or 4, 6, 4 ; CHECK32_32-NEXT: blr ; ; CHECK32_64-LABEL: fshr_i64: ; CHECK32_64: # %bb.0: -; CHECK32_64-NEXT: rotlwi 7, 4, 1 -; CHECK32_64-NEXT: slwi 4, 4, 1 -; CHECK32_64-NEXT: rlwimi 7, 3, 1, 0, 30 -; CHECK32_64-NEXT: clrlwi 3, 8, 26 -; CHECK32_64-NEXT: not 8, 8 -; CHECK32_64-NEXT: subfic 9, 3, 32 -; CHECK32_64-NEXT: srw 6, 6, 3 -; CHECK32_64-NEXT: clrlwi 8, 8, 26 -; CHECK32_64-NEXT: slw 9, 5, 9 -; CHECK32_64-NEXT: addi 10, 3, -32 -; CHECK32_64-NEXT: or 6, 6, 9 -; CHECK32_64-NEXT: subfic 9, 8, 32 -; CHECK32_64-NEXT: srw 3, 5, 3 -; CHECK32_64-NEXT: srw 5, 5, 10 -; CHECK32_64-NEXT: slw 10, 4, 8 -; CHECK32_64-NEXT: slw 7, 7, 8 -; CHECK32_64-NEXT: addi 8, 8, -32 -; CHECK32_64-NEXT: srw 9, 4, 9 -; CHECK32_64-NEXT: slw 4, 4, 8 -; CHECK32_64-NEXT: or 7, 7, 9 -; CHECK32_64-NEXT: or 5, 6, 5 -; CHECK32_64-NEXT: or 6, 7, 4 -; CHECK32_64-NEXT: or 4, 10, 5 -; CHECK32_64-NEXT: or 3, 6, 3 +; CHECK32_64-NEXT: andi. 7, 8, 32 +; CHECK32_64-NEXT: clrlwi 7, 8, 27 +; CHECK32_64-NEXT: bc 12, 2, .LBB10_2 +; CHECK32_64-NEXT: # %bb.1: +; CHECK32_64-NEXT: ori 9, 4, 0 +; CHECK32_64-NEXT: b .LBB10_3 +; CHECK32_64-NEXT: .LBB10_2: +; CHECK32_64-NEXT: addi 9, 5, 0 +; CHECK32_64-NEXT: addi 3, 4, 0 +; CHECK32_64-NEXT: addi 5, 6, 0 +; CHECK32_64-NEXT: .LBB10_3: +; CHECK32_64-NEXT: subfic 8, 7, 32 +; CHECK32_64-NEXT: srw 4, 9, 7 +; CHECK32_64-NEXT: slw 3, 3, 8 +; CHECK32_64-NEXT: srw 5, 5, 7 +; CHECK32_64-NEXT: slw 6, 9, 8 +; CHECK32_64-NEXT: or 3, 3, 4 +; CHECK32_64-NEXT: or 4, 6, 5 ; CHECK32_64-NEXT: blr ; ; CHECK64-LABEL: fshr_i64: @@ -830,35 +554,30 @@ ; CHECK32_32-NEXT: li 5, 0 ; CHECK32_32-NEXT: li 6, 37 ; CHECK32_32-NEXT: bl __umoddi3 +; CHECK32_32-NEXT: rotlwi 3, 30, 27 ; CHECK32_32-NEXT: addi 4, 4, 27 -; CHECK32_32-NEXT: rotlwi 5, 30, 27 -; CHECK32_32-NEXT: clrlwi 8, 4, 26 -; CHECK32_32-NEXT: slwi 3, 30, 27 -; CHECK32_32-NEXT: rotlwi 7, 28, 1 -; CHECK32_32-NEXT: rlwimi 5, 29, 27, 0, 4 -; CHECK32_32-NEXT: not 4, 4 -; CHECK32_32-NEXT: subfic 9, 8, 32 -; CHECK32_32-NEXT: slwi 6, 28, 1 -; CHECK32_32-NEXT: rlwimi 7, 27, 1, 0, 30 -; CHECK32_32-NEXT: srw 3, 3, 8 -; CHECK32_32-NEXT: clrlwi 4, 4, 26 -; CHECK32_32-NEXT: slw 9, 5, 9 -; CHECK32_32-NEXT: slw 10, 6, 4 -; CHECK32_32-NEXT: slw 7, 7, 4 -; CHECK32_32-NEXT: or 3, 3, 9 -; CHECK32_32-NEXT: subfic 9, 4, 32 -; CHECK32_32-NEXT: addi 4, 4, -32 -; CHECK32_32-NEXT: srw 9, 6, 9 -; CHECK32_32-NEXT: slw 6, 6, 4 -; CHECK32_32-NEXT: addi 4, 8, -32 -; CHECK32_32-NEXT: srw 4, 5, 4 -; CHECK32_32-NEXT: or 3, 3, 4 -; CHECK32_32-NEXT: or 7, 7, 9 -; CHECK32_32-NEXT: or 4, 10, 3 -; CHECK32_32-NEXT: or 3, 7, 6 -; CHECK32_32-NEXT: srw 5, 5, 8 -; CHECK32_32-NEXT: or 3, 3, 5 +; CHECK32_32-NEXT: slwi 5, 30, 27 +; CHECK32_32-NEXT: rlwimi 3, 29, 27, 0, 4 +; CHECK32_32-NEXT: andi. 
6, 4, 32 +; CHECK32_32-NEXT: clrlwi 4, 4, 27 +; CHECK32_32-NEXT: subfic 6, 4, 32 +; CHECK32_32-NEXT: bc 12, 2, .LBB11_2 +; CHECK32_32-NEXT: # %bb.1: +; CHECK32_32-NEXT: ori 7, 28, 0 +; CHECK32_32-NEXT: ori 8, 27, 0 +; CHECK32_32-NEXT: b .LBB11_3 +; CHECK32_32-NEXT: .LBB11_2: +; CHECK32_32-NEXT: addi 7, 3, 0 +; CHECK32_32-NEXT: addi 8, 28, 0 +; CHECK32_32-NEXT: addi 3, 5, 0 +; CHECK32_32-NEXT: .LBB11_3: ; CHECK32_32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK32_32-NEXT: srw 5, 7, 4 +; CHECK32_32-NEXT: slw 8, 8, 6 +; CHECK32_32-NEXT: srw 4, 3, 4 +; CHECK32_32-NEXT: slw 6, 7, 6 +; CHECK32_32-NEXT: or 3, 8, 5 +; CHECK32_32-NEXT: or 4, 6, 4 ; CHECK32_32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload ; CHECK32_32-NEXT: lwz 28, 16(1) # 4-byte Folded Reload ; CHECK32_32-NEXT: lwz 27, 12(1) # 4-byte Folded Reload @@ -893,37 +612,36 @@ ; CHECK32_64-NEXT: bl __umoddi3 ; CHECK32_64-NEXT: addi 4, 4, 27 ; CHECK32_64-NEXT: rotlwi 3, 30, 27 -; CHECK32_64-NEXT: clrlwi 8, 4, 26 +; CHECK32_64-NEXT: andi. 5, 4, 32 ; CHECK32_64-NEXT: rlwimi 3, 29, 27, 0, 4 ; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: slwi 6, 30, 27 -; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: not 4, 4 -; CHECK32_64-NEXT: subfic 9, 8, 32 -; CHECK32_64-NEXT: rotlwi 5, 28, 1 -; CHECK32_64-NEXT: srw 6, 6, 8 -; CHECK32_64-NEXT: clrlwi 4, 4, 26 -; CHECK32_64-NEXT: slw 9, 3, 9 -; CHECK32_64-NEXT: rlwimi 5, 27, 1, 0, 30 -; CHECK32_64-NEXT: slwi 7, 28, 1 +; CHECK32_64-NEXT: bc 12, 2, .LBB11_2 +; CHECK32_64-NEXT: # %bb.1: +; CHECK32_64-NEXT: ori 7, 28, 0 +; CHECK32_64-NEXT: ori 8, 27, 0 +; CHECK32_64-NEXT: b .LBB11_3 +; CHECK32_64-NEXT: .LBB11_2: +; CHECK32_64-NEXT: addi 7, 3, 0 +; CHECK32_64-NEXT: addi 8, 28, 0 +; CHECK32_64-NEXT: .LBB11_3: +; CHECK32_64-NEXT: clrlwi 4, 4, 27 ; CHECK32_64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: addi 10, 8, -32 +; CHECK32_64-NEXT: slwi 5, 30, 27 +; CHECK32_64-NEXT: subfic 6, 4, 32 +; CHECK32_64-NEXT: bc 12, 2, .LBB11_4 +; CHECK32_64-NEXT: b .LBB11_5 +; CHECK32_64-NEXT: .LBB11_4: +; CHECK32_64-NEXT: addi 3, 5, 0 +; CHECK32_64-NEXT: .LBB11_5: +; CHECK32_64-NEXT: srw 9, 7, 4 +; CHECK32_64-NEXT: slw 8, 8, 6 +; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK32_64-NEXT: srw 4, 3, 4 +; CHECK32_64-NEXT: slw 5, 7, 6 ; CHECK32_64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: or 6, 6, 9 -; CHECK32_64-NEXT: subfic 9, 4, 32 -; CHECK32_64-NEXT: srw 8, 3, 8 -; CHECK32_64-NEXT: srw 3, 3, 10 +; CHECK32_64-NEXT: or 3, 8, 9 +; CHECK32_64-NEXT: or 4, 5, 4 ; CHECK32_64-NEXT: lwz 0, 36(1) -; CHECK32_64-NEXT: slw 10, 7, 4 -; CHECK32_64-NEXT: slw 5, 5, 4 -; CHECK32_64-NEXT: addi 4, 4, -32 -; CHECK32_64-NEXT: srw 9, 7, 9 -; CHECK32_64-NEXT: slw 4, 7, 4 -; CHECK32_64-NEXT: or 5, 5, 9 -; CHECK32_64-NEXT: or 3, 6, 3 -; CHECK32_64-NEXT: or 5, 5, 4 -; CHECK32_64-NEXT: or 4, 10, 3 -; CHECK32_64-NEXT: or 3, 5, 8 ; CHECK32_64-NEXT: addi 1, 1, 32 ; CHECK32_64-NEXT: mtlr 0 ; CHECK32_64-NEXT: blr diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll --- a/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll @@ -176,124 +176,76 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: rol_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: mv a7, a1 -; RV32I-NEXT: andi a1, a2, 63 -; RV32I-NEXT: addi t0, a1, -32 -; RV32I-NEXT: addi a6, zero, 31 -; RV32I-NEXT: bltz t0, .LBB7_2 +; RV32I-NEXT: srli a3, a2, 5 +; RV32I-NEXT: andi a3, a3, 1 +; RV32I-NEXT: mv a4, a1 +; RV32I-NEXT: 
bnez a3, .LBB7_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sll a1, a0, t0 -; RV32I-NEXT: j .LBB7_3 +; RV32I-NEXT: mv a4, a0 ; RV32I-NEXT: .LBB7_2: -; RV32I-NEXT: sll a3, a7, a2 -; RV32I-NEXT: sub a1, a6, a1 -; RV32I-NEXT: srli a4, a0, 1 -; RV32I-NEXT: srl a1, a4, a1 -; RV32I-NEXT: or a1, a3, a1 -; RV32I-NEXT: .LBB7_3: -; RV32I-NEXT: neg a5, a2 -; RV32I-NEXT: andi a3, a5, 63 -; RV32I-NEXT: addi a4, a3, -32 -; RV32I-NEXT: bltz a4, .LBB7_5 -; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: srl a3, a7, a4 -; RV32I-NEXT: bltz t0, .LBB7_6 -; RV32I-NEXT: j .LBB7_7 -; RV32I-NEXT: .LBB7_5: -; RV32I-NEXT: srl a4, a7, a5 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: srl a4, a0, a5 -; RV32I-NEXT: sub a3, a6, a3 -; RV32I-NEXT: slli a5, a7, 1 -; RV32I-NEXT: sll a3, a5, a3 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: bgez t0, .LBB7_7 -; RV32I-NEXT: .LBB7_6: +; RV32I-NEXT: sll a6, a4, a2 +; RV32I-NEXT: bnez a3, .LBB7_4 +; RV32I-NEXT: # %bb.3: +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: .LBB7_4: +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: not a5, a2 +; RV32I-NEXT: srl a1, a1, a5 +; RV32I-NEXT: or a3, a6, a1 ; RV32I-NEXT: sll a0, a0, a2 -; RV32I-NEXT: or a3, a3, a0 -; RV32I-NEXT: .LBB7_7: +; RV32I-NEXT: srli a1, a4, 1 +; RV32I-NEXT: srl a1, a1, a5 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: rol_i64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: mv a7, a1 -; RV32ZBB-NEXT: andi a1, a2, 63 -; RV32ZBB-NEXT: addi t0, a1, -32 -; RV32ZBB-NEXT: addi a6, zero, 31 -; RV32ZBB-NEXT: bltz t0, .LBB7_2 +; RV32ZBB-NEXT: srli a3, a2, 5 +; RV32ZBB-NEXT: andi a3, a3, 1 +; RV32ZBB-NEXT: mv a4, a1 +; RV32ZBB-NEXT: bnez a3, .LBB7_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sll a1, a0, t0 -; RV32ZBB-NEXT: j .LBB7_3 +; RV32ZBB-NEXT: mv a4, a0 ; RV32ZBB-NEXT: .LBB7_2: -; RV32ZBB-NEXT: sll a3, a7, a2 -; RV32ZBB-NEXT: sub a1, a6, a1 -; RV32ZBB-NEXT: srli a4, a0, 1 -; RV32ZBB-NEXT: srl a1, a4, a1 -; RV32ZBB-NEXT: or a1, a3, a1 -; RV32ZBB-NEXT: .LBB7_3: -; RV32ZBB-NEXT: neg a5, a2 -; RV32ZBB-NEXT: andi a3, a5, 63 -; RV32ZBB-NEXT: addi a4, a3, -32 -; RV32ZBB-NEXT: bltz a4, .LBB7_5 -; RV32ZBB-NEXT: # %bb.4: -; RV32ZBB-NEXT: srl a3, a7, a4 -; RV32ZBB-NEXT: bltz t0, .LBB7_6 -; RV32ZBB-NEXT: j .LBB7_7 -; RV32ZBB-NEXT: .LBB7_5: -; RV32ZBB-NEXT: srl a4, a7, a5 -; RV32ZBB-NEXT: or a1, a1, a4 -; RV32ZBB-NEXT: srl a4, a0, a5 -; RV32ZBB-NEXT: sub a3, a6, a3 -; RV32ZBB-NEXT: slli a5, a7, 1 -; RV32ZBB-NEXT: sll a3, a5, a3 -; RV32ZBB-NEXT: or a3, a4, a3 -; RV32ZBB-NEXT: bgez t0, .LBB7_7 -; RV32ZBB-NEXT: .LBB7_6: +; RV32ZBB-NEXT: sll a6, a4, a2 +; RV32ZBB-NEXT: bnez a3, .LBB7_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: mv a0, a1 +; RV32ZBB-NEXT: .LBB7_4: +; RV32ZBB-NEXT: srli a1, a0, 1 +; RV32ZBB-NEXT: not a5, a2 +; RV32ZBB-NEXT: srl a1, a1, a5 +; RV32ZBB-NEXT: or a3, a6, a1 ; RV32ZBB-NEXT: sll a0, a0, a2 -; RV32ZBB-NEXT: or a3, a3, a0 -; RV32ZBB-NEXT: .LBB7_7: +; RV32ZBB-NEXT: srli a1, a4, 1 +; RV32ZBB-NEXT: srl a1, a1, a5 +; RV32ZBB-NEXT: or a1, a0, a1 ; RV32ZBB-NEXT: mv a0, a3 ; RV32ZBB-NEXT: ret ; ; RV32ZBP-LABEL: rol_i64: ; RV32ZBP: # %bb.0: -; RV32ZBP-NEXT: mv a7, a1 -; RV32ZBP-NEXT: andi a1, a2, 63 -; RV32ZBP-NEXT: addi t0, a1, -32 -; RV32ZBP-NEXT: addi a6, zero, 31 -; RV32ZBP-NEXT: bltz t0, .LBB7_2 +; RV32ZBP-NEXT: srli a3, a2, 5 +; RV32ZBP-NEXT: andi a3, a3, 1 +; RV32ZBP-NEXT: mv a4, a1 +; RV32ZBP-NEXT: bnez a3, .LBB7_2 ; RV32ZBP-NEXT: # %bb.1: -; RV32ZBP-NEXT: sll a1, a0, t0 -; RV32ZBP-NEXT: j .LBB7_3 +; RV32ZBP-NEXT: mv a4, a0 ; RV32ZBP-NEXT: .LBB7_2: -; RV32ZBP-NEXT: sll a3, a7, a2 -; RV32ZBP-NEXT: sub a1, a6, a1 -; 
RV32ZBP-NEXT: srli a4, a0, 1 -; RV32ZBP-NEXT: srl a1, a4, a1 -; RV32ZBP-NEXT: or a1, a3, a1 -; RV32ZBP-NEXT: .LBB7_3: -; RV32ZBP-NEXT: neg a5, a2 -; RV32ZBP-NEXT: andi a3, a5, 63 -; RV32ZBP-NEXT: addi a4, a3, -32 -; RV32ZBP-NEXT: bltz a4, .LBB7_5 -; RV32ZBP-NEXT: # %bb.4: -; RV32ZBP-NEXT: srl a3, a7, a4 -; RV32ZBP-NEXT: bltz t0, .LBB7_6 -; RV32ZBP-NEXT: j .LBB7_7 -; RV32ZBP-NEXT: .LBB7_5: -; RV32ZBP-NEXT: srl a4, a7, a5 -; RV32ZBP-NEXT: or a1, a1, a4 -; RV32ZBP-NEXT: srl a4, a0, a5 -; RV32ZBP-NEXT: sub a3, a6, a3 -; RV32ZBP-NEXT: slli a5, a7, 1 -; RV32ZBP-NEXT: sll a3, a5, a3 -; RV32ZBP-NEXT: or a3, a4, a3 -; RV32ZBP-NEXT: bgez t0, .LBB7_7 -; RV32ZBP-NEXT: .LBB7_6: +; RV32ZBP-NEXT: sll a6, a4, a2 +; RV32ZBP-NEXT: bnez a3, .LBB7_4 +; RV32ZBP-NEXT: # %bb.3: +; RV32ZBP-NEXT: mv a0, a1 +; RV32ZBP-NEXT: .LBB7_4: +; RV32ZBP-NEXT: srli a1, a0, 1 +; RV32ZBP-NEXT: not a5, a2 +; RV32ZBP-NEXT: srl a1, a1, a5 +; RV32ZBP-NEXT: or a3, a6, a1 ; RV32ZBP-NEXT: sll a0, a0, a2 -; RV32ZBP-NEXT: or a3, a3, a0 -; RV32ZBP-NEXT: .LBB7_7: +; RV32ZBP-NEXT: srli a1, a4, 1 +; RV32ZBP-NEXT: srl a1, a1, a5 +; RV32ZBP-NEXT: or a1, a0, a1 ; RV32ZBP-NEXT: mv a0, a3 ; RV32ZBP-NEXT: ret %or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b) @@ -332,125 +284,71 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: ror_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: mv t0, a0 -; RV32I-NEXT: andi a0, a2, 63 -; RV32I-NEXT: addi a7, a0, -32 -; RV32I-NEXT: addi a6, zero, 31 -; RV32I-NEXT: bltz a7, .LBB9_2 +; RV32I-NEXT: andi a4, a2, 32 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: beqz a4, .LBB9_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srl a0, a1, a7 -; RV32I-NEXT: j .LBB9_3 +; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: .LBB9_2: -; RV32I-NEXT: srl a3, t0, a2 -; RV32I-NEXT: sub a0, a6, a0 -; RV32I-NEXT: slli a4, a1, 1 -; RV32I-NEXT: sll a0, a4, a0 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: .LBB9_3: -; RV32I-NEXT: neg a5, a2 -; RV32I-NEXT: andi a4, a5, 63 -; RV32I-NEXT: addi a3, a4, -32 -; RV32I-NEXT: bltz a3, .LBB9_5 -; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: sll a3, t0, a3 -; RV32I-NEXT: bltz a7, .LBB9_6 -; RV32I-NEXT: j .LBB9_7 -; RV32I-NEXT: .LBB9_5: -; RV32I-NEXT: sll a3, t0, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: sll a3, a1, a5 -; RV32I-NEXT: sub a4, a6, a4 -; RV32I-NEXT: srli a5, t0, 1 -; RV32I-NEXT: srl a4, a5, a4 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: bgez a7, .LBB9_7 -; RV32I-NEXT: .LBB9_6: +; RV32I-NEXT: srl a5, a3, a2 +; RV32I-NEXT: beqz a4, .LBB9_4 +; RV32I-NEXT: # %bb.3: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB9_4: +; RV32I-NEXT: slli a0, a1, 1 +; RV32I-NEXT: not a4, a2 +; RV32I-NEXT: sll a0, a0, a4 +; RV32I-NEXT: or a0, a0, a5 ; RV32I-NEXT: srl a1, a1, a2 -; RV32I-NEXT: or a3, a3, a1 -; RV32I-NEXT: .LBB9_7: -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: slli a2, a3, 1 +; RV32I-NEXT: sll a2, a2, a4 +; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ror_i64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: mv t0, a0 -; RV32ZBB-NEXT: andi a0, a2, 63 -; RV32ZBB-NEXT: addi a7, a0, -32 -; RV32ZBB-NEXT: addi a6, zero, 31 -; RV32ZBB-NEXT: bltz a7, .LBB9_2 +; RV32ZBB-NEXT: andi a4, a2, 32 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: beqz a4, .LBB9_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: srl a0, a1, a7 -; RV32ZBB-NEXT: j .LBB9_3 +; RV32ZBB-NEXT: mv a3, a1 ; RV32ZBB-NEXT: .LBB9_2: -; RV32ZBB-NEXT: srl a3, t0, a2 -; RV32ZBB-NEXT: sub a0, a6, a0 -; RV32ZBB-NEXT: slli a4, a1, 1 -; RV32ZBB-NEXT: sll a0, a4, a0 -; RV32ZBB-NEXT: or a0, a3, a0 -; RV32ZBB-NEXT: .LBB9_3: -; RV32ZBB-NEXT: neg a5, a2 -; RV32ZBB-NEXT: andi a4, a5, 63 
-; RV32ZBB-NEXT: addi a3, a4, -32 -; RV32ZBB-NEXT: bltz a3, .LBB9_5 -; RV32ZBB-NEXT: # %bb.4: -; RV32ZBB-NEXT: sll a3, t0, a3 -; RV32ZBB-NEXT: bltz a7, .LBB9_6 -; RV32ZBB-NEXT: j .LBB9_7 -; RV32ZBB-NEXT: .LBB9_5: -; RV32ZBB-NEXT: sll a3, t0, a5 -; RV32ZBB-NEXT: or a0, a0, a3 -; RV32ZBB-NEXT: sll a3, a1, a5 -; RV32ZBB-NEXT: sub a4, a6, a4 -; RV32ZBB-NEXT: srli a5, t0, 1 -; RV32ZBB-NEXT: srl a4, a5, a4 -; RV32ZBB-NEXT: or a3, a3, a4 -; RV32ZBB-NEXT: bgez a7, .LBB9_7 -; RV32ZBB-NEXT: .LBB9_6: +; RV32ZBB-NEXT: srl a5, a3, a2 +; RV32ZBB-NEXT: beqz a4, .LBB9_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: mv a1, a0 +; RV32ZBB-NEXT: .LBB9_4: +; RV32ZBB-NEXT: slli a0, a1, 1 +; RV32ZBB-NEXT: not a4, a2 +; RV32ZBB-NEXT: sll a0, a0, a4 +; RV32ZBB-NEXT: or a0, a0, a5 ; RV32ZBB-NEXT: srl a1, a1, a2 -; RV32ZBB-NEXT: or a3, a3, a1 -; RV32ZBB-NEXT: .LBB9_7: -; RV32ZBB-NEXT: mv a1, a3 +; RV32ZBB-NEXT: slli a2, a3, 1 +; RV32ZBB-NEXT: sll a2, a2, a4 +; RV32ZBB-NEXT: or a1, a2, a1 ; RV32ZBB-NEXT: ret ; ; RV32ZBP-LABEL: ror_i64: ; RV32ZBP: # %bb.0: -; RV32ZBP-NEXT: mv t0, a0 -; RV32ZBP-NEXT: andi a0, a2, 63 -; RV32ZBP-NEXT: addi a7, a0, -32 -; RV32ZBP-NEXT: addi a6, zero, 31 -; RV32ZBP-NEXT: bltz a7, .LBB9_2 +; RV32ZBP-NEXT: andi a4, a2, 32 +; RV32ZBP-NEXT: mv a3, a0 +; RV32ZBP-NEXT: beqz a4, .LBB9_2 ; RV32ZBP-NEXT: # %bb.1: -; RV32ZBP-NEXT: srl a0, a1, a7 -; RV32ZBP-NEXT: j .LBB9_3 +; RV32ZBP-NEXT: mv a3, a1 ; RV32ZBP-NEXT: .LBB9_2: -; RV32ZBP-NEXT: srl a3, t0, a2 -; RV32ZBP-NEXT: sub a0, a6, a0 -; RV32ZBP-NEXT: slli a4, a1, 1 -; RV32ZBP-NEXT: sll a0, a4, a0 -; RV32ZBP-NEXT: or a0, a3, a0 -; RV32ZBP-NEXT: .LBB9_3: -; RV32ZBP-NEXT: neg a5, a2 -; RV32ZBP-NEXT: andi a4, a5, 63 -; RV32ZBP-NEXT: addi a3, a4, -32 -; RV32ZBP-NEXT: bltz a3, .LBB9_5 -; RV32ZBP-NEXT: # %bb.4: -; RV32ZBP-NEXT: sll a3, t0, a3 -; RV32ZBP-NEXT: bltz a7, .LBB9_6 -; RV32ZBP-NEXT: j .LBB9_7 -; RV32ZBP-NEXT: .LBB9_5: -; RV32ZBP-NEXT: sll a3, t0, a5 -; RV32ZBP-NEXT: or a0, a0, a3 -; RV32ZBP-NEXT: sll a3, a1, a5 -; RV32ZBP-NEXT: sub a4, a6, a4 -; RV32ZBP-NEXT: srli a5, t0, 1 -; RV32ZBP-NEXT: srl a4, a5, a4 -; RV32ZBP-NEXT: or a3, a3, a4 -; RV32ZBP-NEXT: bgez a7, .LBB9_7 -; RV32ZBP-NEXT: .LBB9_6: +; RV32ZBP-NEXT: srl a5, a3, a2 +; RV32ZBP-NEXT: beqz a4, .LBB9_4 +; RV32ZBP-NEXT: # %bb.3: +; RV32ZBP-NEXT: mv a1, a0 +; RV32ZBP-NEXT: .LBB9_4: +; RV32ZBP-NEXT: slli a0, a1, 1 +; RV32ZBP-NEXT: not a4, a2 +; RV32ZBP-NEXT: sll a0, a0, a4 +; RV32ZBP-NEXT: or a0, a0, a5 ; RV32ZBP-NEXT: srl a1, a1, a2 -; RV32ZBP-NEXT: or a3, a3, a1 -; RV32ZBP-NEXT: .LBB9_7: -; RV32ZBP-NEXT: mv a1, a3 +; RV32ZBP-NEXT: slli a2, a3, 1 +; RV32ZBP-NEXT: sll a2, a2, a4 +; RV32ZBP-NEXT: or a1, a2, a1 ; RV32ZBP-NEXT: ret %or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b) ret i64 %or @@ -501,8 +399,8 @@ define i64 @rori_i64(i64 %a) nounwind { ; RV32I-LABEL: rori_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a2, a1, 31 -; RV32I-NEXT: srli a3, a0, 1 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: slli a3, a1, 31 ; RV32I-NEXT: or a2, a3, a2 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: slli a0, a0, 31 @@ -512,8 +410,8 @@ ; ; RV32ZBB-LABEL: rori_i64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: slli a2, a1, 31 -; RV32ZBB-NEXT: srli a3, a0, 1 +; RV32ZBB-NEXT: srli a2, a0, 1 +; RV32ZBB-NEXT: slli a3, a1, 31 ; RV32ZBB-NEXT: or a2, a3, a2 ; RV32ZBB-NEXT: srli a1, a1, 1 ; RV32ZBB-NEXT: slli a0, a0, 31 @@ -523,8 +421,8 @@ ; ; RV32ZBP-LABEL: rori_i64: ; RV32ZBP: # %bb.0: -; RV32ZBP-NEXT: slli a2, a1, 31 -; RV32ZBP-NEXT: srli a3, a0, 1 +; RV32ZBP-NEXT: srli a2, a0, 1 +; RV32ZBP-NEXT: slli a3, a1, 31 ; 
RV32ZBP-NEXT: or a2, a3, a2 ; RV32ZBP-NEXT: srli a1, a1, 1 ; RV32ZBP-NEXT: slli a0, a0, 31 @@ -538,8 +436,8 @@ define i64 @rori_i64_fshr(i64 %a) nounwind { ; RV32I-LABEL: rori_i64_fshr: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a2, a0, 1 -; RV32I-NEXT: srli a3, a1, 31 +; RV32I-NEXT: srli a2, a1, 31 +; RV32I-NEXT: slli a3, a0, 1 ; RV32I-NEXT: or a2, a3, a2 ; RV32I-NEXT: srli a0, a0, 31 ; RV32I-NEXT: slli a1, a1, 1 @@ -549,8 +447,8 @@ ; ; RV32ZBB-LABEL: rori_i64_fshr: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: slli a2, a0, 1 -; RV32ZBB-NEXT: srli a3, a1, 31 +; RV32ZBB-NEXT: srli a2, a1, 31 +; RV32ZBB-NEXT: slli a3, a0, 1 ; RV32ZBB-NEXT: or a2, a3, a2 ; RV32ZBB-NEXT: srli a0, a0, 31 ; RV32ZBB-NEXT: slli a1, a1, 1 @@ -560,8 +458,8 @@ ; ; RV32ZBP-LABEL: rori_i64_fshr: ; RV32ZBP: # %bb.0: -; RV32ZBP-NEXT: slli a2, a0, 1 -; RV32ZBP-NEXT: srli a3, a1, 31 +; RV32ZBP-NEXT: srli a2, a1, 31 +; RV32ZBP-NEXT: slli a3, a0, 1 ; RV32ZBP-NEXT: or a2, a3, a2 ; RV32ZBP-NEXT: srli a0, a0, 31 ; RV32ZBP-NEXT: slli a1, a1, 1 diff --git a/llvm/test/CodeGen/RISCV/rv32zbt.ll b/llvm/test/CodeGen/RISCV/rv32zbt.ll --- a/llvm/test/CodeGen/RISCV/rv32zbt.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbt.ll @@ -340,82 +340,44 @@ define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) nounwind { ; RV32I-LABEL: fshl_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a5, a4, 63 -; RV32I-NEXT: addi a7, a5, -32 -; RV32I-NEXT: addi a6, zero, 31 -; RV32I-NEXT: bltz a7, .LBB13_2 +; RV32I-NEXT: srli a5, a4, 5 +; RV32I-NEXT: andi a5, a5, 1 +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: bnez a5, .LBB13_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sll a1, a0, a7 -; RV32I-NEXT: j .LBB13_3 +; RV32I-NEXT: mv a6, a0 ; RV32I-NEXT: .LBB13_2: -; RV32I-NEXT: sll t0, a1, a4 -; RV32I-NEXT: sub a5, a6, a5 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: srl a1, a1, a5 -; RV32I-NEXT: or a1, t0, a1 -; RV32I-NEXT: .LBB13_3: -; RV32I-NEXT: not t2, a4 -; RV32I-NEXT: andi t1, t2, 63 -; RV32I-NEXT: addi a5, t1, -32 -; RV32I-NEXT: srli t0, a3, 1 -; RV32I-NEXT: bltz a5, .LBB13_5 -; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: srl a2, t0, a5 -; RV32I-NEXT: bltz a7, .LBB13_6 -; RV32I-NEXT: j .LBB13_7 -; RV32I-NEXT: .LBB13_5: -; RV32I-NEXT: srl a5, t0, t2 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: slli a3, a3, 31 +; RV32I-NEXT: sll a7, a6, a4 +; RV32I-NEXT: bnez a5, .LBB13_4 +; RV32I-NEXT: # %bb.3: +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: .LBB13_4: ; RV32I-NEXT: srli a2, a2, 1 -; RV32I-NEXT: or a2, a2, a3 -; RV32I-NEXT: srl a2, a2, t2 -; RV32I-NEXT: sub a3, a6, t1 -; RV32I-NEXT: slli a5, t0, 1 -; RV32I-NEXT: sll a3, a5, a3 -; RV32I-NEXT: or a2, a2, a3 -; RV32I-NEXT: bgez a7, .LBB13_7 +; RV32I-NEXT: not a3, a4 +; RV32I-NEXT: srl a2, a2, a3 +; RV32I-NEXT: or a2, a7, a2 +; RV32I-NEXT: bnez a5, .LBB13_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: .LBB13_6: ; RV32I-NEXT: sll a0, a0, a4 -; RV32I-NEXT: or a2, a2, a0 -; RV32I-NEXT: .LBB13_7: +; RV32I-NEXT: srli a1, a6, 1 +; RV32I-NEXT: srl a1, a1, a3 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; ; RV32ZBT-LABEL: fshl_i64: ; RV32ZBT: # %bb.0: -; RV32ZBT-NEXT: sll a7, a1, a4 -; RV32ZBT-NEXT: andi a5, a4, 63 -; RV32ZBT-NEXT: addi a6, zero, 31 -; RV32ZBT-NEXT: sub t0, a6, a5 -; RV32ZBT-NEXT: srli a1, a0, 1 -; RV32ZBT-NEXT: srl a1, a1, t0 -; RV32ZBT-NEXT: or a7, a7, a1 -; RV32ZBT-NEXT: addi t1, a5, -32 -; RV32ZBT-NEXT: sll t0, a0, t1 -; RV32ZBT-NEXT: slti a1, t1, 0 -; RV32ZBT-NEXT: cmov t0, a1, a7, t0 -; RV32ZBT-NEXT: not a5, a4 -; RV32ZBT-NEXT: srli a7, a3, 1 -; RV32ZBT-NEXT: srl t4, a7, a5 -; RV32ZBT-NEXT: andi t2, a5, 63 -; 
RV32ZBT-NEXT: addi t3, t2, -32 -; RV32ZBT-NEXT: srai a1, t3, 31 -; RV32ZBT-NEXT: and a1, a1, t4 -; RV32ZBT-NEXT: or a1, t0, a1 -; RV32ZBT-NEXT: fsri a2, a2, a3, 1 -; RV32ZBT-NEXT: srl a2, a2, a5 -; RV32ZBT-NEXT: sub a3, a6, t2 -; RV32ZBT-NEXT: slli a5, a7, 1 -; RV32ZBT-NEXT: sll a3, a5, a3 -; RV32ZBT-NEXT: or a2, a2, a3 -; RV32ZBT-NEXT: srl a3, a7, t3 -; RV32ZBT-NEXT: slti a5, t3, 0 +; RV32ZBT-NEXT: srli a5, a4, 5 +; RV32ZBT-NEXT: andi a5, a5, 1 ; RV32ZBT-NEXT: cmov a2, a5, a2, a3 -; RV32ZBT-NEXT: sll a0, a0, a4 -; RV32ZBT-NEXT: srai a3, t1, 31 -; RV32ZBT-NEXT: and a0, a3, a0 -; RV32ZBT-NEXT: or a0, a0, a2 +; RV32ZBT-NEXT: cmov a3, a5, a3, a0 +; RV32ZBT-NEXT: andi a4, a4, 31 +; RV32ZBT-NEXT: fsl a2, a3, a2, a4 +; RV32ZBT-NEXT: cmov a0, a5, a0, a1 +; RV32ZBT-NEXT: fsl a1, a0, a3, a4 +; RV32ZBT-NEXT: mv a0, a2 ; RV32ZBT-NEXT: ret %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) ret i64 %1 @@ -453,87 +415,41 @@ define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind { ; RV32I-LABEL: fshr_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: mv t0, a0 -; RV32I-NEXT: andi a0, a4, 63 -; RV32I-NEXT: addi a6, a0, -32 -; RV32I-NEXT: addi a7, zero, 31 -; RV32I-NEXT: bltz a6, .LBB15_2 +; RV32I-NEXT: andi a5, a4, 32 +; RV32I-NEXT: beqz a5, .LBB15_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srl a0, a3, a6 -; RV32I-NEXT: j .LBB15_3 +; RV32I-NEXT: mv a2, a3 ; RV32I-NEXT: .LBB15_2: -; RV32I-NEXT: srl a2, a2, a4 -; RV32I-NEXT: sub a0, a7, a0 -; RV32I-NEXT: slli a5, a3, 1 -; RV32I-NEXT: sll a0, a5, a0 -; RV32I-NEXT: or a0, a2, a0 -; RV32I-NEXT: .LBB15_3: -; RV32I-NEXT: not t2, a4 -; RV32I-NEXT: andi a5, t2, 63 -; RV32I-NEXT: addi a2, a5, -32 -; RV32I-NEXT: slli t1, t0, 1 -; RV32I-NEXT: bltz a2, .LBB15_5 -; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: sll a1, t1, a2 -; RV32I-NEXT: bltz a6, .LBB15_6 -; RV32I-NEXT: j .LBB15_7 -; RV32I-NEXT: .LBB15_5: -; RV32I-NEXT: sll a2, t1, t2 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: addi a2, a2, -1 -; RV32I-NEXT: and a2, t0, a2 -; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: srl a2, a2, a5 -; RV32I-NEXT: srli a5, t0, 31 -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: sll a1, a1, t2 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: bgez a6, .LBB15_7 +; RV32I-NEXT: srl a6, a2, a4 +; RV32I-NEXT: beqz a5, .LBB15_4 +; RV32I-NEXT: # %bb.3: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: .LBB15_4: +; RV32I-NEXT: slli a7, a3, 1 +; RV32I-NEXT: not t0, a4 +; RV32I-NEXT: sll a2, a7, t0 +; RV32I-NEXT: or a6, a2, a6 +; RV32I-NEXT: srl a3, a3, a4 +; RV32I-NEXT: beqz a5, .LBB15_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: .LBB15_6: -; RV32I-NEXT: srl a2, a3, a4 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: .LBB15_7: +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: sll a0, a0, t0 +; RV32I-NEXT: or a1, a0, a3 +; RV32I-NEXT: mv a0, a6 ; RV32I-NEXT: ret ; ; RV32ZBT-LABEL: fshr_i64: ; RV32ZBT: # %bb.0: -; RV32ZBT-NEXT: srl a7, a2, a4 -; RV32ZBT-NEXT: andi a5, a4, 63 -; RV32ZBT-NEXT: addi a6, zero, 31 -; RV32ZBT-NEXT: sub t0, a6, a5 -; RV32ZBT-NEXT: slli a2, a3, 1 -; RV32ZBT-NEXT: sll a2, a2, t0 -; RV32ZBT-NEXT: or a7, a7, a2 -; RV32ZBT-NEXT: addi t2, a5, -32 -; RV32ZBT-NEXT: srl t0, a3, t2 -; RV32ZBT-NEXT: slti a2, t2, 0 -; RV32ZBT-NEXT: cmov a7, a2, a7, t0 -; RV32ZBT-NEXT: not t4, a4 -; RV32ZBT-NEXT: slli t0, a0, 1 -; RV32ZBT-NEXT: sll t1, t0, t4 -; RV32ZBT-NEXT: andi t3, t4, 63 -; RV32ZBT-NEXT: addi a5, t3, -32 -; RV32ZBT-NEXT: srai a2, a5, 31 -; RV32ZBT-NEXT: and a2, a2, t1 -; RV32ZBT-NEXT: or a7, a2, a7 -; RV32ZBT-NEXT: lui a2, 524288 -; RV32ZBT-NEXT: 
addi a2, a2, -1 -; RV32ZBT-NEXT: and t1, a0, a2 -; RV32ZBT-NEXT: sub a2, a6, t3 -; RV32ZBT-NEXT: srl a2, t1, a2 -; RV32ZBT-NEXT: fsri a0, a0, a1, 31 -; RV32ZBT-NEXT: sll a0, a0, t4 -; RV32ZBT-NEXT: or a0, a0, a2 -; RV32ZBT-NEXT: sll a1, t0, a5 -; RV32ZBT-NEXT: slti a2, a5, 0 -; RV32ZBT-NEXT: cmov a0, a2, a0, a1 -; RV32ZBT-NEXT: srl a1, a3, a4 -; RV32ZBT-NEXT: srai a2, t2, 31 -; RV32ZBT-NEXT: and a1, a2, a1 -; RV32ZBT-NEXT: or a1, a0, a1 -; RV32ZBT-NEXT: mv a0, a7 +; RV32ZBT-NEXT: andi a5, a4, 32 +; RV32ZBT-NEXT: cmov a6, a5, a0, a3 +; RV32ZBT-NEXT: cmov a2, a5, a3, a2 +; RV32ZBT-NEXT: andi a3, a4, 31 +; RV32ZBT-NEXT: fsr a2, a2, a6, a3 +; RV32ZBT-NEXT: cmov a0, a5, a1, a0 +; RV32ZBT-NEXT: fsr a1, a6, a0, a3 +; RV32ZBT-NEXT: mv a0, a2 ; RV32ZBT-NEXT: ret %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) ret i64 %1 @@ -558,8 +474,8 @@ define i64 @fshri_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: fshri_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a1, a3, 27 -; RV32I-NEXT: srli a2, a2, 5 +; RV32I-NEXT: srli a1, a2, 5 +; RV32I-NEXT: slli a2, a3, 27 ; RV32I-NEXT: or a2, a2, a1 ; RV32I-NEXT: srli a1, a3, 5 ; RV32I-NEXT: slli a0, a0, 27 diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -571,34 +571,25 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind { ; RV32I-LABEL: fshr64_minsize: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: andi a2, a2, 63 -; RV32I-NEXT: call __lshrdi3@plt -; RV32I-NEXT: mv s3, a0 -; RV32I-NEXT: mv s4, a1 -; RV32I-NEXT: neg a0, s0 -; RV32I-NEXT: andi a2, a0, 63 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s2 -; RV32I-NEXT: call __ashldi3@plt -; RV32I-NEXT: or a0, s3, a0 -; RV32I-NEXT: or a1, s4, a1 -; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: andi a4, a2, 32 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: beqz a4, .LBB9_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: .LBB9_2: +; RV32I-NEXT: srl a5, a3, a2 +; RV32I-NEXT: beqz a4, .LBB9_4 +; RV32I-NEXT: # %bb.3: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB9_4: +; RV32I-NEXT: slli a0, a1, 1 +; RV32I-NEXT: not a4, a2 +; RV32I-NEXT: sll a0, a0, a4 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: slli a2, a3, 1 +; RV32I-NEXT: sll a2, a2, a4 +; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: fshr64_minsize: @@ -615,182 +606,92 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV32I-LABEL: fshr128_minsize: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 52(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 48(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 40(sp) 
# 4-byte Folded Spill -; RV32I-NEXT: sw s5, 36(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 32(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s5, 0(a1) -; RV32I-NEXT: lw s6, 4(a1) -; RV32I-NEXT: lw s4, 8(a1) -; RV32I-NEXT: lw s3, 12(a1) -; RV32I-NEXT: lw s11, 0(a2) -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: andi s0, s11, 127 -; RV32I-NEXT: addi a2, s0, -64 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __lshrdi3@plt -; RV32I-NEXT: mv s8, a0 -; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv a0, s5 -; RV32I-NEXT: mv a1, s6 -; RV32I-NEXT: mv a2, s0 -; RV32I-NEXT: call __lshrdi3@plt -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: sw a1, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi s9, zero, 64 -; RV32I-NEXT: sub a2, s9, s0 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __ashldi3@plt -; RV32I-NEXT: mv s10, a1 -; RV32I-NEXT: bgeu s0, s9, .LBB10_2 +; RV32I-NEXT: lw t2, 8(a1) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a2, 0(a2) +; RV32I-NEXT: lw a7, 4(a1) +; RV32I-NEXT: lw t1, 12(a1) +; RV32I-NEXT: andi a1, a2, 64 +; RV32I-NEXT: mv a5, a7 +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: beqz a1, .LBB10_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: or s8, s1, a0 +; RV32I-NEXT: mv a5, t1 +; RV32I-NEXT: mv a6, t2 ; RV32I-NEXT: .LBB10_2: -; RV32I-NEXT: mv s7, s5 -; RV32I-NEXT: beqz s0, .LBB10_4 +; RV32I-NEXT: andi a4, a2, 32 +; RV32I-NEXT: mv t0, a6 +; RV32I-NEXT: bnez a4, .LBB10_13 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv s7, s8 +; RV32I-NEXT: bnez a1, .LBB10_14 ; RV32I-NEXT: .LBB10_4: -; RV32I-NEXT: neg a0, s11 -; RV32I-NEXT: andi s1, a0, 127 -; RV32I-NEXT: mv a0, s5 -; RV32I-NEXT: mv a1, s6 -; RV32I-NEXT: mv a2, s1 -; RV32I-NEXT: call __ashldi3@plt -; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: bgeu s1, s9, .LBB10_6 -; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: or s7, s7, a0 +; RV32I-NEXT: beqz a4, .LBB10_6 +; RV32I-NEXT: .LBB10_5: +; RV32I-NEXT: mv a5, t2 ; RV32I-NEXT: .LBB10_6: -; RV32I-NEXT: bltu s0, s9, .LBB10_8 +; RV32I-NEXT: slli t3, a5, 1 +; RV32I-NEXT: not a3, a2 +; RV32I-NEXT: beqz a1, .LBB10_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: j .LBB10_9 +; RV32I-NEXT: mv t1, a7 ; RV32I-NEXT: .LBB10_8: -; RV32I-NEXT: lw a0, 0(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a0, a0, s10 -; RV32I-NEXT: .LBB10_9: -; RV32I-NEXT: mv s8, s6 -; RV32I-NEXT: beqz s0, .LBB10_11 -; RV32I-NEXT: # %bb.10: -; RV32I-NEXT: mv s8, a0 -; RV32I-NEXT: .LBB10_11: -; RV32I-NEXT: sub a2, s9, s1 -; RV32I-NEXT: mv a0, s5 -; RV32I-NEXT: mv a1, s6 -; RV32I-NEXT: call __lshrdi3@plt -; RV32I-NEXT: mv s10, a0 -; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: mv a2, s1 -; RV32I-NEXT: call __ashldi3@plt -; RV32I-NEXT: mv s11, a0 -; RV32I-NEXT: sw a1, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a2, s1, -64 -; RV32I-NEXT: mv a0, s5 -; RV32I-NEXT: mv a1, s6 -; RV32I-NEXT: call __ashldi3@plt -; RV32I-NEXT: mv s5, a1 -; RV32I-NEXT: bgeu s1, s9, .LBB10_13 -; RV32I-NEXT: # %bb.12: -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or s8, s8, a0 -; RV32I-NEXT: or a0, s11, s10 -; RV32I-NEXT: .LBB10_13: -; RV32I-NEXT: mv s6, s4 -; RV32I-NEXT: beqz s1, .LBB10_15 -; RV32I-NEXT: # %bb.14: -; RV32I-NEXT: mv s6, 
a0 -; RV32I-NEXT: .LBB10_15: -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: mv a2, s0 -; RV32I-NEXT: call __lshrdi3@plt -; RV32I-NEXT: bltu s0, s9, .LBB10_21 -; RV32I-NEXT: # %bb.16: -; RV32I-NEXT: bltu s1, s9, .LBB10_22 -; RV32I-NEXT: .LBB10_17: -; RV32I-NEXT: bnez s1, .LBB10_23 -; RV32I-NEXT: .LBB10_18: -; RV32I-NEXT: bgeu s0, s9, .LBB10_20 -; RV32I-NEXT: .LBB10_19: -; RV32I-NEXT: or s3, s3, a1 -; RV32I-NEXT: .LBB10_20: -; RV32I-NEXT: sw s8, 4(s2) -; RV32I-NEXT: sw s7, 0(s2) -; RV32I-NEXT: sw s3, 12(s2) -; RV32I-NEXT: sw s6, 8(s2) -; RV32I-NEXT: lw s11, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 32(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 36(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 40(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 44(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 48(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 52(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: srl a7, t0, a2 +; RV32I-NEXT: sll a1, t3, a3 +; RV32I-NEXT: srl a5, a5, a2 +; RV32I-NEXT: beqz a4, .LBB10_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t2, t1 +; RV32I-NEXT: .LBB10_10: +; RV32I-NEXT: or a7, a1, a7 +; RV32I-NEXT: slli a1, t2, 1 +; RV32I-NEXT: sll a1, a1, a3 +; RV32I-NEXT: or a5, a1, a5 +; RV32I-NEXT: srl a1, t2, a2 +; RV32I-NEXT: beqz a4, .LBB10_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: .LBB10_12: +; RV32I-NEXT: slli a4, t1, 1 +; RV32I-NEXT: sll a4, a4, a3 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: srl a2, t1, a2 +; RV32I-NEXT: slli a4, t0, 1 +; RV32I-NEXT: sll a3, a4, a3 +; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw a7, 0(a0) ; RV32I-NEXT: ret -; RV32I-NEXT: .LBB10_21: -; RV32I-NEXT: or s6, s6, a0 -; RV32I-NEXT: bgeu s1, s9, .LBB10_17 -; RV32I-NEXT: .LBB10_22: -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw a2, 0(sp) # 4-byte Folded Reload -; RV32I-NEXT: or s5, a2, a0 -; RV32I-NEXT: beqz s1, .LBB10_18 -; RV32I-NEXT: .LBB10_23: -; RV32I-NEXT: mv s3, s5 -; RV32I-NEXT: bltu s0, s9, .LBB10_19 -; RV32I-NEXT: j .LBB10_20 +; RV32I-NEXT: .LBB10_13: +; RV32I-NEXT: mv t0, a5 +; RV32I-NEXT: beqz a1, .LBB10_4 +; RV32I-NEXT: .LBB10_14: +; RV32I-NEXT: mv t2, a3 +; RV32I-NEXT: bnez a4, .LBB10_5 +; RV32I-NEXT: j .LBB10_6 ; ; RV64I-LABEL: fshr128_minsize: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a2 -; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: andi a2, a2, 127 -; RV64I-NEXT: call __lshrti3@plt -; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: mv s4, a1 -; RV64I-NEXT: neg a0, s0 -; RV64I-NEXT: andi a2, a0, 127 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s2 -; RV64I-NEXT: call __ashlti3@plt -; RV64I-NEXT: or a0, s3, a0 -; RV64I-NEXT: or a1, s4, a1 -; RV64I-NEXT: ld s4, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld 
s2, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: andi a4, a2, 64 +; RV64I-NEXT: mv a3, a0 +; RV64I-NEXT: beqz a4, .LBB10_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a3, a1 +; RV64I-NEXT: .LBB10_2: +; RV64I-NEXT: srl a5, a3, a2 +; RV64I-NEXT: beqz a4, .LBB10_4 +; RV64I-NEXT: # %bb.3: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB10_4: +; RV64I-NEXT: slli a0, a1, 1 +; RV64I-NEXT: not a4, a2 +; RV64I-NEXT: sll a0, a0, a4 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: srl a1, a1, a2 +; RV64I-NEXT: slli a2, a3, 1 +; RV64I-NEXT: sll a2, a2, a4 +; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: ret %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %b) ret i128 %res diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -179,102 +179,62 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-FAST-LABEL: var_shift_i64: ; X86-FAST: # %bb.0: -; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-FAST-NEXT: movb %ch, %cl -; X86-FAST-NEXT: notb %cl -; X86-FAST-NEXT: shrdl $1, %edi, %esi -; X86-FAST-NEXT: shrl %edi -; X86-FAST-NEXT: shrdl %cl, %edi, %esi -; X86-FAST-NEXT: shrl %cl, %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: je .LBB5_2 -; X86-FAST-NEXT: # %bb.1: -; X86-FAST-NEXT: movl %edi, %esi -; X86-FAST-NEXT: xorl %edi, %edi -; X86-FAST-NEXT: .LBB5_2: -; X86-FAST-NEXT: movl %ebx, %eax -; X86-FAST-NEXT: movb %ch, %cl -; X86-FAST-NEXT: shll %cl, %eax -; X86-FAST-NEXT: shldl %cl, %ebx, %edx -; X86-FAST-NEXT: testb $32, %ch -; X86-FAST-NEXT: je .LBB5_4 -; X86-FAST-NEXT: # %bb.3: -; X86-FAST-NEXT: movl %eax, %edx -; X86-FAST-NEXT: xorl %eax, %eax -; X86-FAST-NEXT: .LBB5_4: -; X86-FAST-NEXT: orl %edi, %edx -; X86-FAST-NEXT: orl %esi, %eax +; X86-FAST-NEXT: jne .LBB5_1 +; X86-FAST-NEXT: # %bb.2: +; X86-FAST-NEXT: movl %edx, %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: jmp .LBB5_3 +; X86-FAST-NEXT: .LBB5_1: +; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: .LBB5_3: +; X86-FAST-NEXT: movl %edi, %eax +; X86-FAST-NEXT: shldl %cl, %esi, %eax +; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-FAST-NEXT: shldl %cl, %edi, %edx ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi -; X86-FAST-NEXT: popl %ebx ; X86-FAST-NEXT: retl ; ; X86-SLOW-LABEL: var_shift_i64: ; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: pushl %ebp ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: shrl %eax -; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: shll $31, %edi -; X86-SLOW-NEXT: orl %eax, %edi -; X86-SLOW-NEXT: movl %ecx, %eax -; X86-SLOW-NEXT: movb %cl, %ch -; X86-SLOW-NEXT: notb %ch -; 
X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: shrl %esi -; X86-SLOW-NEXT: leal (%esi,%esi), %ebp -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: shll %cl, %ebx -; X86-SLOW-NEXT: movl %edx, %eax -; X86-SLOW-NEXT: shrl %eax -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: testb $32, {{[0-9]+}}(%esp) +; X86-SLOW-NEXT: testb $32, %bl ; X86-SLOW-NEXT: jne .LBB5_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: orl %eax, %ebx +; X86-SLOW-NEXT: movl %edx, %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: jmp .LBB5_3 ; X86-SLOW-NEXT: .LBB5_1: -; X86-SLOW-NEXT: movl %edx, %ebx -; X86-SLOW-NEXT: xorl %edx, %edx +; X86-SLOW-NEXT: movl %eax, %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: .LBB5_3: -; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: shrl %eax +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: orl %edi, %eax +; X86-SLOW-NEXT: shrl %esi ; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: testb $32, %ch -; X86-SLOW-NEXT: jne .LBB5_4 -; X86-SLOW-NEXT: # %bb.5: -; X86-SLOW-NEXT: orl %edi, %ebp -; X86-SLOW-NEXT: jmp .LBB5_6 -; X86-SLOW-NEXT: .LBB5_4: -; X86-SLOW-NEXT: movl %esi, %ebp -; X86-SLOW-NEXT: xorl %esi, %esi -; X86-SLOW-NEXT: .LBB5_6: -; X86-SLOW-NEXT: orl %ebp, %edx -; X86-SLOW-NEXT: orl %esi, %ebx -; X86-SLOW-NEXT: movl %edx, %eax -; X86-SLOW-NEXT: movl %ebx, %edx +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: orl %esi, %edx ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx -; X86-SLOW-NEXT: popl %ebp ; X86-SLOW-NEXT: retl ; ; X64-FAST-LABEL: var_shift_i64: @@ -307,226 +267,50 @@ ; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: subl $72, %esp -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl %edx, %edi -; X86-FAST-NEXT: shldl $31, %eax, %edi -; X86-FAST-NEXT: movl %ebx, %eax -; X86-FAST-NEXT: notl %ebx -; X86-FAST-NEXT: andl $127, %ebx -; X86-FAST-NEXT: movb $64, %cl -; X86-FAST-NEXT: subb %bl, %cl -; X86-FAST-NEXT: shrl %edx -; X86-FAST-NEXT: movl %edx, %ebp -; X86-FAST-NEXT: shldl %cl, %edi, %edx -; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %edi, %edx -; X86-FAST-NEXT: shll %cl, %edx -; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: testb $64, %cl ; X86-FAST-NEXT: jne .LBB6_1 ; X86-FAST-NEXT: # %bb.2: -; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: jmp .LBB6_3 -; X86-FAST-NEXT: .LBB6_1: -; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: .LBB6_3: -; X86-FAST-NEXT: andl $127, %eax -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movb %al, %ch -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movb %ch, %cl -; X86-FAST-NEXT: shldl 
%cl, %esi, %eax -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movb %bl, %cl -; X86-FAST-NEXT: addb $-64, %cl ; X86-FAST-NEXT: movl %edi, %eax -; X86-FAST-NEXT: movl %ebp, %edx -; X86-FAST-NEXT: shrdl %cl, %ebp, %eax -; X86-FAST-NEXT: shrl %cl, %ebp +; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl %ebx, %ebp +; X86-FAST-NEXT: movl %edx, %ebx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: jne .LBB6_4 -; X86-FAST-NEXT: # %bb.5: -; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: jmp .LBB6_6 +; X86-FAST-NEXT: je .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: .LBB6_6: +; X86-FAST-NEXT: movl %edx, %esi +; X86-FAST-NEXT: movl %edi, %edx +; X86-FAST-NEXT: movl %ebx, %edi +; X86-FAST-NEXT: movl %eax, %ebx +; X86-FAST-NEXT: jmp .LBB6_6 +; X86-FAST-NEXT: .LBB6_1: ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: movb %ch, %cl -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: shldl %cl, %eax, %ebp -; X86-FAST-NEXT: shll %cl, %eax -; X86-FAST-NEXT: shll %cl, %esi -; X86-FAST-NEXT: testb $32, %ch -; X86-FAST-NEXT: jne .LBB6_7 -; X86-FAST-NEXT: # %bb.8: -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: jmp .LBB6_9 -; X86-FAST-NEXT: .LBB6_7: -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %eax, %ebp -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: .LBB6_9: -; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: jb .LBB6_11 -; X86-FAST-NEXT: # %bb.10: -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: .LBB6_11: -; X86-FAST-NEXT: movb %bl, %cl -; X86-FAST-NEXT: shrdl %cl, %edx, %edi -; X86-FAST-NEXT: shrl %cl, %edx -; X86-FAST-NEXT: shldl $31, %eax, %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: shrdl $1, %ebp, %eax -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: shrdl %cl, %esi, %eax -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %esi, %eax -; X86-FAST-NEXT: shrl %cl, %eax -; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: je .LBB6_13 -; X86-FAST-NEXT: # %bb.12: -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %edx, %edi -; X86-FAST-NEXT: xorl %eax, %eax -; X86-FAST-NEXT: xorl %edx, %edx -; X86-FAST-NEXT: .LBB6_13: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-FAST-NEXT: jb .LBB6_15 -; X86-FAST-NEXT: # %bb.14: -; X86-FAST-NEXT: xorl %ebp, %ebp -; X86-FAST-NEXT: .LBB6_15: -; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movb $64, %cl -; X86-FAST-NEXT: subb %ch, %cl -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: shrl %cl, %ebp -; X86-FAST-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: movl $0, %edx -; X86-FAST-NEXT: jne .LBB6_17 -; X86-FAST-NEXT: # %bb.16: -; X86-FAST-NEXT: movl %ebp, %edx -; X86-FAST-NEXT: .LBB6_17: -; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: addb $-64, %ch -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl %edi, %esi -; X86-FAST-NEXT: movb %ch, %cl -; X86-FAST-NEXT: shll %cl, %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: shldl %cl, %edi, %edx -; X86-FAST-NEXT: testb $32, %ch -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: jne .LBB6_19 -; X86-FAST-NEXT: # %bb.18: -; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_19: -; X86-FAST-NEXT: cmpl $64, %ebx -; X86-FAST-NEXT: jb .LBB6_21 -; X86-FAST-NEXT: # %bb.20: -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: .LBB6_21: -; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-FAST-NEXT: jae .LBB6_23 -; X86-FAST-NEXT: # %bb.22: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-FAST-NEXT: .LBB6_23: -; X86-FAST-NEXT: testb $32, %ch -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: jne .LBB6_25 -; X86-FAST-NEXT: # %bb.24: -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_25: -; X86-FAST-NEXT: cmpl $64, %ebx -; X86-FAST-NEXT: jb .LBB6_27 -; X86-FAST-NEXT: # %bb.26: -; X86-FAST-NEXT: xorl %edx, %edx -; X86-FAST-NEXT: .LBB6_27: -; X86-FAST-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: shrdl %cl, %esi, %edi ; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-FAST-NEXT: jne .LBB6_29 -; X86-FAST-NEXT: # %bb.28: -; X86-FAST-NEXT: movl %edi, %ebp -; X86-FAST-NEXT: .LBB6_29: -; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-FAST-NEXT: jae .LBB6_31 -; X86-FAST-NEXT: # %bb.30: -; X86-FAST-NEXT: orl %ebp, %esi -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_31: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-FAST-NEXT: cmpl $64, %ebx -; X86-FAST-NEXT: jae .LBB6_33 -; X86-FAST-NEXT: # %bb.32: -; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-FAST-NEXT: jne .LBB6_4 +; X86-FAST-NEXT: .LBB6_5: ; X86-FAST-NEXT: movl %eax, %ebp -; X86-FAST-NEXT: .LBB6_33: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-FAST-NEXT: cmpl $64, %ebx 
-; X86-FAST-NEXT: jae .LBB6_35 -; X86-FAST-NEXT: # %bb.34: -; X86-FAST-NEXT: movl %edx, %ecx -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-FAST-NEXT: orl %eax, %edx -; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %ecx, %edx -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-FAST-NEXT: .LBB6_35: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: je .LBB6_37 -; X86-FAST-NEXT: # %bb.36: -; X86-FAST-NEXT: movl %ebp, %ecx -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-FAST-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_37: -; X86-FAST-NEXT: orl %ecx, %edi -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-FAST-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-FAST-NEXT: je .LBB6_39 -; X86-FAST-NEXT: # %bb.38: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-FAST-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-FAST-NEXT: .LBB6_39: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-FAST-NEXT: orl %edx, %esi -; X86-FAST-NEXT: movl %ecx, 12(%eax) -; X86-FAST-NEXT: movl %esi, 8(%eax) -; X86-FAST-NEXT: movl %edi, 4(%eax) -; X86-FAST-NEXT: movl %ebx, (%eax) -; X86-FAST-NEXT: addl $72, %esp +; X86-FAST-NEXT: .LBB6_6: +; X86-FAST-NEXT: movl %ebx, %eax +; X86-FAST-NEXT: shldl %cl, %ebp, %eax +; X86-FAST-NEXT: movl %edi, %ebp +; X86-FAST-NEXT: shldl %cl, %ebx, %ebp +; X86-FAST-NEXT: movl %edx, %ebx +; X86-FAST-NEXT: shldl %cl, %edi, %ebx +; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-FAST-NEXT: shldl %cl, %edx, %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-NEXT: movl %esi, 12(%ecx) +; X86-FAST-NEXT: movl %ebx, 8(%ecx) +; X86-FAST-NEXT: movl %ebp, 4(%ecx) +; X86-FAST-NEXT: movl %eax, (%ecx) +; X86-FAST-NEXT: movl %ecx, %eax ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi ; X86-FAST-NEXT: popl %ebx @@ -539,289 +323,76 @@ ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: subl $76, %esp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: pushl %eax +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: andl $127, %eax -; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: # kill: def $al killed $al killed $eax -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: shrl %esi -; X86-SLOW-NEXT: movb %al, %ah -; X86-SLOW-NEXT: notb %ah -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movb %ah, %cl -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: movb %ah, %cl -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: movl %eax, %ecx -; 
X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: shll %cl, %ebx -; X86-SLOW-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-SLOW-NEXT: testb $32, %al +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: testb $64, %al ; X86-SLOW-NEXT: jne .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: orl (%esp), %edx # 4-byte Folded Reload -; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: orl %edi, %ebp +; X86-SLOW-NEXT: movl %ebp, %ecx +; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl %edx, %ebx +; X86-SLOW-NEXT: movl %esi, %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl %ebx, %ebp -; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: xorl %ebx, %ebx -; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: .LBB6_3: -; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload -; X86-SLOW-NEXT: jb .LBB6_5 -; X86-SLOW-NEXT: # %bb.4: -; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: notl %ebx -; X86-SLOW-NEXT: andl $127, %ebx -; X86-SLOW-NEXT: movl %edi, %ebp -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: movl %esi, %ecx -; X86-SLOW-NEXT: shrl %ecx -; X86-SLOW-NEXT: movl %eax, %esi -; X86-SLOW-NEXT: shll $31, %esi -; X86-SLOW-NEXT: orl %ecx, %esi -; X86-SLOW-NEXT: movl %esi, %ecx -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: movl $0, %esi -; X86-SLOW-NEXT: movl $0, %ecx -; X86-SLOW-NEXT: jne .LBB6_7 -; X86-SLOW-NEXT: # %bb.6: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SLOW-NEXT: movl %ebp, %ecx -; X86-SLOW-NEXT: .LBB6_7: -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: shrl %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: shll $31, %esi -; X86-SLOW-NEXT: orl %eax, %esi -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: addl %edi, %edi -; X86-SLOW-NEXT: notb %cl -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: jne .LBB6_9 -; X86-SLOW-NEXT: # %bb.8: -; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: movl %edi, %ebp -; X86-SLOW-NEXT: .LBB6_9: -; 
X86-SLOW-NEXT: movb %bl, %dh -; X86-SLOW-NEXT: addb $-64, %dh -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SLOW-NEXT: movb %dh, %cl -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: testb $32, %dh -; X86-SLOW-NEXT: movl $0, %ecx -; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-SLOW-NEXT: jne .LBB6_11 -; X86-SLOW-NEXT: # %bb.10: -; X86-SLOW-NEXT: movl %esi, %ecx -; X86-SLOW-NEXT: .LBB6_11: -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: jb .LBB6_13 -; X86-SLOW-NEXT: # %bb.12: -; X86-SLOW-NEXT: xorl %eax, %eax -; X86-SLOW-NEXT: .LBB6_13: -; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movb $64, %ch -; X86-SLOW-NEXT: movb $64, %ah -; X86-SLOW-NEXT: subb %dl, %ah -; X86-SLOW-NEXT: movb %ah, %cl -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: notb %cl -; X86-SLOW-NEXT: leal (%ebp,%ebp), %edi +; X86-SLOW-NEXT: testb $32, %al +; X86-SLOW-NEXT: jne .LBB6_4 +; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: movl %ecx, %ebx +; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: .LBB6_4: +; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebp, %esi +; X86-SLOW-NEXT: movl %edx, %ebp +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: .LBB6_6: +; X86-SLOW-NEXT: movl %edx, %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl %eax, %ecx ; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: movb %ah, %cl -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: testb $32, %ah -; X86-SLOW-NEXT: jne .LBB6_14 -; X86-SLOW-NEXT: # %bb.15: -; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %edi, %ebp -; X86-SLOW-NEXT: jmp .LBB6_16 -; X86-SLOW-NEXT: .LBB6_14: -; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-SLOW-NEXT: .LBB6_16: -; X86-SLOW-NEXT: addb $-64, %dl -; X86-SLOW-NEXT: movb %dl, %cl -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: shrl %ebx +; X86-SLOW-NEXT: movb %al, %ch +; X86-SLOW-NEXT: notb %ch +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: orl %edi, %ebx +; X86-SLOW-NEXT: movl %ebp, %edi +; X86-SLOW-NEXT: movb %al, %cl ; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: notb %cl -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: movb %dl, %cl -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: testb $32, %dl -; X86-SLOW-NEXT: jne .LBB6_17 -; X86-SLOW-NEXT: # %bb.18: -; X86-SLOW-NEXT: orl %eax, %edi -; X86-SLOW-NEXT: cmpl $64, %ebx -; X86-SLOW-NEXT: jae .LBB6_20 -; X86-SLOW-NEXT: jmp .LBB6_21 -; X86-SLOW-NEXT: .LBB6_17: +; X86-SLOW-NEXT: shrl %edx +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: shrl %cl, %edx +; X86-SLOW-NEXT: orl %edi, %edx ; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: xorl %esi, %esi -; X86-SLOW-NEXT: cmpl $64, %ebx -; X86-SLOW-NEXT: jb .LBB6_21 -; X86-SLOW-NEXT: .LBB6_20: -; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-SLOW-NEXT: 
.LBB6_21: -; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: jae .LBB6_23 -; X86-SLOW-NEXT: # %bb.22: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SLOW-NEXT: orl %ebp, %esi -; X86-SLOW-NEXT: .LBB6_23: -; X86-SLOW-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: jae .LBB6_25 -; X86-SLOW-NEXT: # %bb.24: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_25: -; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: shll $31, %esi -; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movb %bl, %cl -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-SLOW-NEXT: addl %edi, %edi -; X86-SLOW-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-SLOW-NEXT: movb %al, %cl ; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: jne .LBB6_27 -; X86-SLOW-NEXT: # %bb.26: -; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_27: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-SLOW-NEXT: movl %edi, %eax -; X86-SLOW-NEXT: movb %dh, %cl -; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: notb %cl -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: testb $32, %dh -; X86-SLOW-NEXT: jne .LBB6_29 -; X86-SLOW-NEXT: # %bb.28: -; X86-SLOW-NEXT: orl %eax, %esi -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_29: -; X86-SLOW-NEXT: subb %bl, %ch -; X86-SLOW-NEXT: movl %edi, %eax +; X86-SLOW-NEXT: shrl %ebp ; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: shrl %cl, %ebp +; X86-SLOW-NEXT: orl %edi, %ebp +; X86-SLOW-NEXT: movb %al, %cl +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: notb %cl -; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: shrl %esi ; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: testb $32, %ch -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl %edi, %ecx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: jne .LBB6_30 -; X86-SLOW-NEXT: # %bb.31: -; X86-SLOW-NEXT: orl %ecx, %edx -; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: cmpl $64, %ebx -; X86-SLOW-NEXT: jb .LBB6_33 -; X86-SLOW-NEXT: jmp .LBB6_34 -; X86-SLOW-NEXT: .LBB6_30: -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: xorl %eax, %eax -; X86-SLOW-NEXT: cmpl $64, %ebx -; X86-SLOW-NEXT: jae .LBB6_34 -; X86-SLOW-NEXT: .LBB6_33: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-SLOW-NEXT: orl %eax, %edx -; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_34: -; X86-SLOW-NEXT: cmpl $64, %ebx -; X86-SLOW-NEXT: jb .LBB6_35 -; X86-SLOW-NEXT: # 
%bb.36: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SLOW-NEXT: jmp .LBB6_37 -; X86-SLOW-NEXT: .LBB6_35: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SLOW-NEXT: orl %ecx, %eax -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: .LBB6_37: +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: orl %eax, %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: je .LBB6_39 -; X86-SLOW-NEXT: # %bb.38: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ecx, %ebx -; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_39: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-SLOW-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-SLOW-NEXT: je .LBB6_41 -; X86-SLOW-NEXT: # %bb.40: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-SLOW-NEXT: .LBB6_41: -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-SLOW-NEXT: movl %esi, 12(%eax) -; X86-SLOW-NEXT: movl %edi, 8(%eax) +; X86-SLOW-NEXT: movl %ebp, 8(%eax) +; X86-SLOW-NEXT: movl %edx, 4(%eax) ; X86-SLOW-NEXT: movl %ebx, (%eax) -; X86-SLOW-NEXT: movl %ebp, 4(%eax) -; X86-SLOW-NEXT: addl $76, %esp +; X86-SLOW-NEXT: addl $4, %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx @@ -830,65 +401,39 @@ ; ; X64-FAST-LABEL: var_shift_i128: ; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movq %r8, %r9 -; X64-FAST-NEXT: movq %rcx, %r8 -; X64-FAST-NEXT: movl %r9d, %ecx -; X64-FAST-NEXT: shldq %cl, %rdi, %rsi -; X64-FAST-NEXT: shrdq $1, %r8, %rdx -; X64-FAST-NEXT: shrq %r8 -; X64-FAST-NEXT: notb %cl -; X64-FAST-NEXT: shrdq %cl, %r8, %rdx -; X64-FAST-NEXT: shrq %cl, %r8 -; X64-FAST-NEXT: xorl %eax, %eax -; X64-FAST-NEXT: testb $64, %cl -; X64-FAST-NEXT: cmovneq %r8, %rdx -; X64-FAST-NEXT: cmovneq %rax, %r8 -; X64-FAST-NEXT: movl %r9d, %ecx -; X64-FAST-NEXT: shlq %cl, %rdi -; X64-FAST-NEXT: testb $64, %r9b +; X64-FAST-NEXT: testb $64, %r8b ; X64-FAST-NEXT: cmovneq %rdi, %rsi -; X64-FAST-NEXT: cmoveq %rdi, %rax -; X64-FAST-NEXT: orq %rdx, %rax -; X64-FAST-NEXT: orq %rsi, %r8 -; X64-FAST-NEXT: movq %r8, %rdx +; X64-FAST-NEXT: cmoveq %rcx, %rdx +; X64-FAST-NEXT: cmovneq %rcx, %rdi +; X64-FAST-NEXT: movq %rdi, %rax +; X64-FAST-NEXT: movl %r8d, %ecx +; X64-FAST-NEXT: shldq %cl, %rdx, %rax +; X64-FAST-NEXT: shldq %cl, %rdi, %rsi +; X64-FAST-NEXT: movq %rsi, %rdx ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: var_shift_i128: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movq %rcx, %r11 -; X64-SLOW-NEXT: movq %rdx, %r9 +; X64-SLOW-NEXT: testb $64, %r8b +; X64-SLOW-NEXT: cmovneq %rdi, %rsi +; X64-SLOW-NEXT: cmoveq %rcx, %rdx +; X64-SLOW-NEXT: cmovneq %rcx, %rdi +; X64-SLOW-NEXT: movq %rdi, %rax ; X64-SLOW-NEXT: movl %r8d, %ecx -; X64-SLOW-NEXT: shlq %cl, %rsi -; X64-SLOW-NEXT: movq %rdi, %rdx +; X64-SLOW-NEXT: shlq %cl, %rax ; X64-SLOW-NEXT: shrq %rdx -; X64-SLOW-NEXT: movl %r8d, %r10d -; X64-SLOW-NEXT: notb %r10b -; X64-SLOW-NEXT: movl %r10d, %ecx +; X64-SLOW-NEXT: movl %r8d, %r9d +; X64-SLOW-NEXT: notb %r9b +; 
X64-SLOW-NEXT: movl %r9d, %ecx ; X64-SLOW-NEXT: shrq %cl, %rdx -; X64-SLOW-NEXT: orq %rsi, %rdx -; X64-SLOW-NEXT: shrq %r9 -; X64-SLOW-NEXT: movq %r11, %rax -; X64-SLOW-NEXT: shlq $63, %rax -; X64-SLOW-NEXT: orq %r9, %rax -; X64-SLOW-NEXT: shrq %cl, %rax -; X64-SLOW-NEXT: shrq %r11 -; X64-SLOW-NEXT: leaq (%r11,%r11), %rsi +; X64-SLOW-NEXT: orq %rdx, %rax ; X64-SLOW-NEXT: movl %r8d, %ecx ; X64-SLOW-NEXT: shlq %cl, %rsi -; X64-SLOW-NEXT: orq %rax, %rsi -; X64-SLOW-NEXT: movl %r10d, %ecx -; X64-SLOW-NEXT: shrq %cl, %r11 -; X64-SLOW-NEXT: xorl %eax, %eax -; X64-SLOW-NEXT: testb $64, %r10b -; X64-SLOW-NEXT: cmovneq %r11, %rsi -; X64-SLOW-NEXT: cmovneq %rax, %r11 -; X64-SLOW-NEXT: movl %r8d, %ecx -; X64-SLOW-NEXT: shlq %cl, %rdi -; X64-SLOW-NEXT: testb $64, %r8b -; X64-SLOW-NEXT: cmovneq %rdi, %rdx -; X64-SLOW-NEXT: cmoveq %rdi, %rax -; X64-SLOW-NEXT: orq %rsi, %rax -; X64-SLOW-NEXT: orq %r11, %rdx +; X64-SLOW-NEXT: shrq %rdi +; X64-SLOW-NEXT: movl %r9d, %ecx +; X64-SLOW-NEXT: shrq %cl, %rdi +; X64-SLOW-NEXT: orq %rsi, %rdi +; X64-SLOW-NEXT: movq %rdi, %rdx ; X64-SLOW-NEXT: retq %tmp = tail call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) ret i128 %tmp diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -176,106 +176,60 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-FAST-LABEL: var_shift_i64: ; X86-FAST: # %bb.0: -; X86-FAST-NEXT: pushl %ebp -; X86-FAST-NEXT: pushl %ebx -; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %bl -; X86-FAST-NEXT: movb %bl, %ch -; X86-FAST-NEXT: notb %ch -; X86-FAST-NEXT: shldl $1, %eax, %edx -; X86-FAST-NEXT: addl %eax, %eax -; X86-FAST-NEXT: movb %ch, %cl -; X86-FAST-NEXT: shldl %cl, %eax, %edx -; X86-FAST-NEXT: movl %ebp, %edi -; X86-FAST-NEXT: movb %bl, %cl -; X86-FAST-NEXT: shrl %cl, %edi -; X86-FAST-NEXT: shrdl %cl, %ebp, %esi -; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: je .LBB5_2 -; X86-FAST-NEXT: # %bb.1: -; X86-FAST-NEXT: movl %edi, %esi -; X86-FAST-NEXT: xorl %edi, %edi -; X86-FAST-NEXT: .LBB5_2: -; X86-FAST-NEXT: movb %ch, %cl -; X86-FAST-NEXT: shll %cl, %eax -; X86-FAST-NEXT: testb $32, %ch -; X86-FAST-NEXT: je .LBB5_4 -; X86-FAST-NEXT: # %bb.3: +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-NEXT: testb $32, %cl +; X86-FAST-NEXT: je .LBB5_1 +; X86-FAST-NEXT: # %bb.2: +; X86-FAST-NEXT: movl %esi, %edx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: jmp .LBB5_3 +; X86-FAST-NEXT: .LBB5_1: ; X86-FAST-NEXT: movl %eax, %edx -; X86-FAST-NEXT: xorl %eax, %eax -; X86-FAST-NEXT: .LBB5_4: -; X86-FAST-NEXT: orl %edi, %edx -; X86-FAST-NEXT: orl %esi, %eax +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: .LBB5_3: +; X86-FAST-NEXT: shrdl %cl, %edx, %eax +; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-FAST-NEXT: shrdl %cl, %esi, %edx ; X86-FAST-NEXT: popl %esi -; X86-FAST-NEXT: popl %edi -; X86-FAST-NEXT: popl %ebx -; X86-FAST-NEXT: popl %ebp ; X86-FAST-NEXT: retl ; ; X86-SLOW-LABEL: var_shift_i64: ; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: pushl %ebp ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: pushl %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; 
X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %bl ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: movl %eax, %edi -; X86-SLOW-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shrl $31, %ecx -; X86-SLOW-NEXT: leal (%ecx,%edx,2), %edx -; X86-SLOW-NEXT: movb %bl, %ch -; X86-SLOW-NEXT: notb %ch -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: movb %bl, %cl -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: leal (%esi,%esi), %ebp -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: movb %bl, %cl -; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: jne .LBB5_1 +; X86-SLOW-NEXT: je .LBB5_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: orl (%esp), %ebp # 4-byte Folded Reload +; X86-SLOW-NEXT: movl %edx, %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: jmp .LBB5_3 ; X86-SLOW-NEXT: .LBB5_1: -; X86-SLOW-NEXT: movl %esi, %ebp -; X86-SLOW-NEXT: xorl %esi, %esi +; X86-SLOW-NEXT: movl %eax, %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: .LBB5_3: -; X86-SLOW-NEXT: addl %eax, %eax +; X86-SLOW-NEXT: leal (%esi,%esi), %edi +; X86-SLOW-NEXT: movb %bl, %ch +; X86-SLOW-NEXT: notb %ch ; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: testb $32, %ch -; X86-SLOW-NEXT: jne .LBB5_4 -; X86-SLOW-NEXT: # %bb.5: -; X86-SLOW-NEXT: orl %edi, %edx -; X86-SLOW-NEXT: jmp .LBB5_6 -; X86-SLOW-NEXT: .LBB5_4: -; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: xorl %eax, %eax -; X86-SLOW-NEXT: .LBB5_6: +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: movb %bl, %cl +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: orl %edi, %eax +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: addl %edx, %edx +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: shll %cl, %edx ; X86-SLOW-NEXT: orl %esi, %edx -; X86-SLOW-NEXT: orl %ebp, %eax -; X86-SLOW-NEXT: addl $4, %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx -; X86-SLOW-NEXT: popl %ebp ; X86-SLOW-NEXT: retl ; ; X64-FAST-LABEL: var_shift_i64: @@ -307,243 +261,48 @@ ; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: subl $76, %esp -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: pushl %eax ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-FAST-NEXT: movl %ebx, %ecx -; X86-FAST-NEXT: andl $127, %ecx -; X86-FAST-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movb %cl, %ch -; X86-FAST-NEXT: movb $64, %cl -; X86-FAST-NEXT: subb %ch, %cl -; X86-FAST-NEXT: shll %cl, %edi -; X86-FAST-NEXT: movb %cl, (%esp) # 1-byte Spill -; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: movl $0, %esi -; X86-FAST-NEXT: jne .LBB6_2 -; X86-FAST-NEXT: # %bb.1: -; X86-FAST-NEXT: movl %edi, %esi -; X86-FAST-NEXT: .LBB6_2: -; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-FAST-NEXT: 
movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %eax, %edi -; X86-FAST-NEXT: movl %ebp, %eax -; X86-FAST-NEXT: shldl $1, %ebp, %edi -; X86-FAST-NEXT: addl %ebp, %eax -; X86-FAST-NEXT: notl %ebx -; X86-FAST-NEXT: andl $127, %ebx -; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movb %bl, %cl -; X86-FAST-NEXT: shldl %cl, %eax, %edi -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: shll %cl, %eax -; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: movl %eax, %esi -; X86-FAST-NEXT: jne .LBB6_4 -; X86-FAST-NEXT: # %bb.3: -; X86-FAST-NEXT: movl %edi, %esi -; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movb %ch, %cl -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: shrdl %cl, %edi, %esi -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %edi, %esi -; X86-FAST-NEXT: shrl %cl, %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: shrl %cl, %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-NEXT: testb $64, %cl +; X86-FAST-NEXT: je .LBB6_1 +; X86-FAST-NEXT: # %bb.2: +; X86-FAST-NEXT: movl %edi, %ebp +; X86-FAST-NEXT: movl %ebx, %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl %edx, %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: shrdl %cl, %edx, %ebp -; X86-FAST-NEXT: testb $32, %ch -; X86-FAST-NEXT: jne .LBB6_5 -; X86-FAST-NEXT: # %bb.6: -; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: jmp .LBB6_7 -; X86-FAST-NEXT: .LBB6_5: -; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: xorl %edi, %edi -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: .LBB6_7: -; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: movl $0, %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: jne .LBB6_9 -; X86-FAST-NEXT: # %bb.8: -; X86-FAST-NEXT: movl %eax, %esi -; X86-FAST-NEXT: .LBB6_9: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-FAST-NEXT: jb .LBB6_11 -; X86-FAST-NEXT: # %bb.10: -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: .LBB6_11: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: shrdl $31, %edi, %eax -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movb %bl, %cl -; X86-FAST-NEXT: shll %cl, %eax -; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: movl $0, %edi -; X86-FAST-NEXT: jne .LBB6_13 -; X86-FAST-NEXT: # %bb.12: -; X86-FAST-NEXT: movl %eax, %edi -; X86-FAST-NEXT: .LBB6_13: -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movb (%esp), %cl # 1-byte Reload -; X86-FAST-NEXT: shldl %cl, %ebp, %eax ; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: jne .LBB6_15 -; X86-FAST-NEXT: # %bb.14: -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-FAST-NEXT: .LBB6_15: -; X86-FAST-NEXT: movb %bl, %dh -; X86-FAST-NEXT: addb $-64, %dh -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-FAST-NEXT: movb %dh, %cl -; X86-FAST-NEXT: shll %cl, %eax -; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-FAST-NEXT: testb $32, %dh -; X86-FAST-NEXT: movl $0, %eax -; X86-FAST-NEXT: jne .LBB6_17 -; X86-FAST-NEXT: # %bb.16: -; X86-FAST-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-FAST-NEXT: .LBB6_17: -; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-FAST-NEXT: jb .LBB6_19 -; X86-FAST-NEXT: # %bb.18: -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: .LBB6_19: -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: cmpl $64, %ebx -; X86-FAST-NEXT: jb .LBB6_21 -; X86-FAST-NEXT: # %bb.20: -; X86-FAST-NEXT: xorl %esi, %esi -; X86-FAST-NEXT: .LBB6_21: -; X86-FAST-NEXT: addb $-64, %ch -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movb %ch, %cl -; X86-FAST-NEXT: shrl %cl, %eax -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: testb $32, %ch -; X86-FAST-NEXT: movl $0, %eax -; X86-FAST-NEXT: jne .LBB6_23 -; X86-FAST-NEXT: # %bb.22: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-FAST-NEXT: .LBB6_23: -; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-FAST-NEXT: jae .LBB6_25 -; X86-FAST-NEXT: # %bb.24: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-FAST-NEXT: .LBB6_25: -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-FAST-NEXT: movb %ch, %cl -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: shrdl %cl, %eax, %ebp -; X86-FAST-NEXT: testb $32, %ch -; X86-FAST-NEXT: jne .LBB6_27 -; X86-FAST-NEXT: # %bb.26: -; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_27: -; X86-FAST-NEXT: cmpl $64, %ebx -; X86-FAST-NEXT: jb .LBB6_29 -; X86-FAST-NEXT: # %bb.28: -; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-FAST-NEXT: .LBB6_29: -; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: je .LBB6_4 +; X86-FAST-NEXT: jmp .LBB6_5 +; X86-FAST-NEXT: .LBB6_1: +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: jae .LBB6_31 -; X86-FAST-NEXT: # %bb.30: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_31: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: shldl $1, %eax, %ebp -; X86-FAST-NEXT: movl %ebp, %eax -; X86-FAST-NEXT: movl %ebx, %ecx -; X86-FAST-NEXT: shldl %cl, %edi, %eax -; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: jne .LBB6_33 -; X86-FAST-NEXT: # %bb.32: -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_33: -; X86-FAST-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-FAST-NEXT: movb %dh, %cl -; X86-FAST-NEXT: shldl %cl, %esi, %eax -; X86-FAST-NEXT: testb $32, %dh -; X86-FAST-NEXT: jne .LBB6_35 -; X86-FAST-NEXT: # %bb.34: -; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_35: -; X86-FAST-NEXT: movb $64, %cl -; X86-FAST-NEXT: subb %bl, %cl -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-FAST-NEXT: shrdl %cl, %eax, %esi -; X86-FAST-NEXT: shrl %cl, %eax ; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: je .LBB6_37 -; X86-FAST-NEXT: # %bb.36: -; X86-FAST-NEXT: movl %eax, %esi -; X86-FAST-NEXT: xorl %eax, %eax -; X86-FAST-NEXT: .LBB6_37: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-FAST-NEXT: cmpl $64, %ebx -; X86-FAST-NEXT: jae .LBB6_39 -; X86-FAST-NEXT: # %bb.38: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-FAST-NEXT: orl %eax, %ecx -; X86-FAST-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_39: -; X86-FAST-NEXT: cmpl $64, %ebx -; X86-FAST-NEXT: jae .LBB6_41 -; X86-FAST-NEXT: # %bb.40: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-FAST-NEXT: orl %esi, %eax -; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_41: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: je .LBB6_43 -; X86-FAST-NEXT: # %bb.42: +; X86-FAST-NEXT: jne .LBB6_5 +; X86-FAST-NEXT: .LBB6_4: +; X86-FAST-NEXT: movl %edx, %ebx +; X86-FAST-NEXT: movl %edi, %edx +; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: movl %ebp, %esi ; X86-FAST-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-FAST-NEXT: .LBB6_43: -; X86-FAST-NEXT: orl %edx, %ebp -; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-FAST-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-FAST-NEXT: je .LBB6_45 -; X86-FAST-NEXT: # %bb.44: -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-FAST-NEXT: .LBB6_45: -; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-FAST-NEXT: movl %ecx, 4(%eax) -; X86-FAST-NEXT: movl %esi, (%eax) -; X86-FAST-NEXT: movl %ebp, 12(%eax) +; X86-FAST-NEXT: .LBB6_5: +; X86-FAST-NEXT: shrdl %cl, %esi, %ebp +; X86-FAST-NEXT: shrdl %cl, %edi, %esi +; X86-FAST-NEXT: shrdl %cl, %edx, %edi +; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-FAST-NEXT: shrdl %cl, %ebx, %edx +; X86-FAST-NEXT: movl %edx, 12(%eax) ; X86-FAST-NEXT: movl %edi, 8(%eax) -; X86-FAST-NEXT: addl $76, %esp +; X86-FAST-NEXT: movl %esi, 4(%eax) +; X86-FAST-NEXT: movl %ebp, (%eax) +; X86-FAST-NEXT: addl $4, %esp ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi ; X86-FAST-NEXT: popl %ebx @@ -556,281 +315,76 @@ ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: subl $72, %esp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: subl $8, %esp ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; 
X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: andl $127, %eax -; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: leal (%edi,%edi), %ebp -; X86-SLOW-NEXT: notb %al -; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %ebp ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: leal (%esi,%esi), %ebx -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: testb $32, %dl -; X86-SLOW-NEXT: jne .LBB6_1 +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: testb $64, %cl +; X86-SLOW-NEXT: je .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-SLOW-NEXT: orl %edi, %ebx -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: jmp .LBB6_3 -; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl %eax, %ebp -; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-SLOW-NEXT: .LBB6_3: -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, %edx +; X86-SLOW-NEXT: movl %edi, %ebx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: movl %ebp, %eax +; X86-SLOW-NEXT: movl %esi, %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: testb $32, %cl +; X86-SLOW-NEXT: jne .LBB6_5 +; X86-SLOW-NEXT: .LBB6_4: +; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebp, %esi +; X86-SLOW-NEXT: movl %edx, %ebp +; X86-SLOW-NEXT: movl %eax, %edx +; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: .LBB6_1: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-SLOW-NEXT: jb .LBB6_5 -; X86-SLOW-NEXT: # %bb.4: -; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: testb $32, %cl +; X86-SLOW-NEXT: je .LBB6_4 ; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: leal (%ecx,%ecx), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: notl %ebx -; X86-SLOW-NEXT: andl $127, %ebx -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, %esi +; X86-SLOW-NEXT: .LBB6_6: +; X86-SLOW-NEXT: shrl %cl, %edx +; X86-SLOW-NEXT: movl %ecx, %ebx +; X86-SLOW-NEXT: notb %bl +; X86-SLOW-NEXT: leal (%ebp,%ebp), %eax ; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shrl $31, %ecx -; X86-SLOW-NEXT: leal (%ecx,%edi,2), %ecx -; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-SLOW-NEXT: movl %ecx, %edi +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: orl %edx, %eax +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shrl %cl, %ebp +; X86-SLOW-NEXT: leal (%esi,%esi), %edx ; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: movl $0, %edi -; X86-SLOW-NEXT: movl $0, %ecx -; X86-SLOW-NEXT: jne .LBB6_7 -; X86-SLOW-NEXT: # %bb.6: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-SLOW-NEXT: movl %esi, %ecx -; X86-SLOW-NEXT: .LBB6_7: -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl %edi, %ecx -; X86-SLOW-NEXT: shrl $31, %ecx -; X86-SLOW-NEXT: leal (%ecx,%eax,2), %esi +; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: orl %ebp, %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shrl %cl, %esi ; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-SLOW-NEXT: leal (%esi,%esi), %ebp ; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: notb %cl -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: jne .LBB6_9 -; X86-SLOW-NEXT: # %bb.8: -; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_9: -; X86-SLOW-NEXT: movb %bl, %dh -; X86-SLOW-NEXT: addb $-64, %dh -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SLOW-NEXT: movb %dh, %cl -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: testb $32, %dh -; X86-SLOW-NEXT: movl $0, %ecx -; X86-SLOW-NEXT: jne .LBB6_11 -; X86-SLOW-NEXT: # %bb.10: -; X86-SLOW-NEXT: movl %esi, %ecx -; X86-SLOW-NEXT: .LBB6_11: -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: jb .LBB6_13 -; X86-SLOW-NEXT: # %bb.12: -; X86-SLOW-NEXT: xorl %ebp, %ebp -; X86-SLOW-NEXT: .LBB6_13: -; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movb $64, %ch -; X86-SLOW-NEXT: movb $64, %ah -; X86-SLOW-NEXT: subb %dl, %ah -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movb %ah, %cl -; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: notb %cl -; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movb %ah, %cl -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: testb $32, %ah -; X86-SLOW-NEXT: jne .LBB6_14 -; X86-SLOW-NEXT: # %bb.15: -; X86-SLOW-NEXT: orl %edi, %ebp -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebp, %esi -; X86-SLOW-NEXT: jmp .LBB6_16 -; X86-SLOW-NEXT: .LBB6_14: -; X86-SLOW-NEXT: movl $0, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-SLOW-NEXT: .LBB6_16: -; X86-SLOW-NEXT: addb $-64, %dl -; X86-SLOW-NEXT: movb %dl, %cl -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: notb %cl -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: movb %dl, %cl -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: testb $32, %dl -; X86-SLOW-NEXT: jne .LBB6_17 -; X86-SLOW-NEXT: # %bb.18: -; X86-SLOW-NEXT: orl %eax, %ebp -; X86-SLOW-NEXT: cmpl $64, %ebx -; X86-SLOW-NEXT: jae .LBB6_20 -; X86-SLOW-NEXT: jmp .LBB6_21 -; X86-SLOW-NEXT: .LBB6_17: -; X86-SLOW-NEXT: movl %edi, %ebp -; X86-SLOW-NEXT: xorl %edi, %edi -; X86-SLOW-NEXT: cmpl $64, %ebx -; X86-SLOW-NEXT: jb .LBB6_21 -; X86-SLOW-NEXT: .LBB6_20: -; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-SLOW-NEXT: .LBB6_21: -; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-SLOW-NEXT: jae .LBB6_23 -; X86-SLOW-NEXT: # %bb.22: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: .LBB6_23: -; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: jb .LBB6_24 -; X86-SLOW-NEXT: # %bb.25: -; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: jmp .LBB6_26 -; X86-SLOW-NEXT: .LBB6_24: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_26: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: shrl $31, %eax -; X86-SLOW-NEXT: leal (%eax,%esi,2), %esi -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movb %bl, %cl -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: jne .LBB6_28 -; X86-SLOW-NEXT: # %bb.27: -; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_28: -; X86-SLOW-NEXT: movl %ebp, %eax -; X86-SLOW-NEXT: movb %dh, %cl -; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: notb %cl -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: testb $32, %dh -; X86-SLOW-NEXT: jne .LBB6_30 -; X86-SLOW-NEXT: # %bb.29: -; X86-SLOW-NEXT: orl %esi, %eax -; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_30: -; X86-SLOW-NEXT: subb %bl, %ch -; X86-SLOW-NEXT: movl %ebp, %eax -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: addl %ebp, %ebp -; X86-SLOW-NEXT: notb %cl -; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: movb 
%ch, %cl -; X86-SLOW-NEXT: movl %ebp, %esi -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: testb $32, %ch +; X86-SLOW-NEXT: addl %edi, %edi +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: orl %esi, %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-SLOW-NEXT: jne .LBB6_31 -; X86-SLOW-NEXT: # %bb.32: -; X86-SLOW-NEXT: orl %ebp, %esi -; X86-SLOW-NEXT: movl %esi, %ebp -; X86-SLOW-NEXT: cmpl $64, %ebx -; X86-SLOW-NEXT: jb .LBB6_34 -; X86-SLOW-NEXT: jmp .LBB6_35 -; X86-SLOW-NEXT: .LBB6_31: -; X86-SLOW-NEXT: movl %eax, %ebp -; X86-SLOW-NEXT: xorl %eax, %eax -; X86-SLOW-NEXT: cmpl $64, %ebx -; X86-SLOW-NEXT: jae .LBB6_35 -; X86-SLOW-NEXT: .LBB6_34: -; X86-SLOW-NEXT: movl %ebp, %esi -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-SLOW-NEXT: orl %eax, %ebp -; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %esi, %ebp -; X86-SLOW-NEXT: .LBB6_35: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SLOW-NEXT: cmpl $64, %ebx -; X86-SLOW-NEXT: jae .LBB6_37 -; X86-SLOW-NEXT: # %bb.36: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SLOW-NEXT: orl %ebp, %eax -; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_37: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: je .LBB6_39 -; X86-SLOW-NEXT: # %bb.38: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-SLOW-NEXT: .LBB6_39: -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-SLOW-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-SLOW-NEXT: je .LBB6_41 -; X86-SLOW-NEXT: # %bb.40: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-SLOW-NEXT: .LBB6_41: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-SLOW-NEXT: orl %ecx, %ebx -; X86-SLOW-NEXT: orl %ebp, %edx -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-SLOW-NEXT: movl %ebx, (%eax) -; X86-SLOW-NEXT: movl %esi, 12(%eax) -; X86-SLOW-NEXT: movl %edx, 4(%eax) -; X86-SLOW-NEXT: movl %edi, 8(%eax) -; X86-SLOW-NEXT: addl $72, %esp +; X86-SLOW-NEXT: movl %edi, 12(%ecx) +; X86-SLOW-NEXT: movl %ebp, 8(%ecx) +; X86-SLOW-NEXT: movl %edx, 4(%ecx) +; X86-SLOW-NEXT: movl %eax, (%ecx) +; X86-SLOW-NEXT: movl %ecx, %eax +; X86-SLOW-NEXT: addl $8, %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx @@ -839,65 +393,37 @@ ; ; X64-FAST-LABEL: var_shift_i128: ; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movq %r8, %r9 -; X64-FAST-NEXT: movq %rcx, %r8 -; X64-FAST-NEXT: movl %r9d, %ecx -; X64-FAST-NEXT: shrdq %cl, %r8, %rdx -; X64-FAST-NEXT: shrq %cl, %r8 -; X64-FAST-NEXT: xorl %eax, %eax -; X64-FAST-NEXT: testb $64, %r9b -; X64-FAST-NEXT: cmovneq %r8, %rdx -; X64-FAST-NEXT: cmovneq %rax, %r8 -; X64-FAST-NEXT: shldq $1, %rdi, %rsi -; X64-FAST-NEXT: addq %rdi, %rdi -; X64-FAST-NEXT: notb %r9b -; X64-FAST-NEXT: movl %r9d, %ecx -; X64-FAST-NEXT: shldq %cl, %rdi, %rsi -; X64-FAST-NEXT: shlq %cl, %rdi -; 
X64-FAST-NEXT: testb $64, %r9b -; X64-FAST-NEXT: cmovneq %rdi, %rsi -; X64-FAST-NEXT: cmoveq %rdi, %rax -; X64-FAST-NEXT: orq %rdx, %rax -; X64-FAST-NEXT: orq %rsi, %r8 -; X64-FAST-NEXT: movq %r8, %rdx +; X64-FAST-NEXT: movq %rdx, %rax +; X64-FAST-NEXT: testb $64, %r8b +; X64-FAST-NEXT: cmoveq %rdi, %rsi +; X64-FAST-NEXT: cmoveq %rcx, %rdi +; X64-FAST-NEXT: cmovneq %rcx, %rax +; X64-FAST-NEXT: movl %r8d, %ecx +; X64-FAST-NEXT: shrdq %cl, %rdi, %rax +; X64-FAST-NEXT: shrdq %cl, %rsi, %rdi +; X64-FAST-NEXT: movq %rdi, %rdx ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: var_shift_i128: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movq %rcx, %r9 -; X64-SLOW-NEXT: movq %rdx, %r10 -; X64-SLOW-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF -; X64-SLOW-NEXT: andq %rdi, %rax -; X64-SLOW-NEXT: movl %r8d, %ecx -; X64-SLOW-NEXT: shrq %cl, %rax -; X64-SLOW-NEXT: movq %rdi, %rcx -; X64-SLOW-NEXT: shrq $63, %rcx -; X64-SLOW-NEXT: leaq (%rcx,%rsi,2), %rdx -; X64-SLOW-NEXT: movl %r8d, %r11d -; X64-SLOW-NEXT: notb %r11b -; X64-SLOW-NEXT: movl %r11d, %ecx -; X64-SLOW-NEXT: shlq %cl, %rdx -; X64-SLOW-NEXT: orq %rax, %rdx +; X64-SLOW-NEXT: testb $64, %r8b +; X64-SLOW-NEXT: cmoveq %rdi, %rsi +; X64-SLOW-NEXT: cmoveq %rcx, %rdi +; X64-SLOW-NEXT: cmovneq %rcx, %rdx ; X64-SLOW-NEXT: movl %r8d, %ecx -; X64-SLOW-NEXT: shrq %cl, %r10 -; X64-SLOW-NEXT: leaq (%r9,%r9), %rsi -; X64-SLOW-NEXT: movl %r11d, %ecx -; X64-SLOW-NEXT: shlq %cl, %rsi -; X64-SLOW-NEXT: orq %r10, %rsi +; X64-SLOW-NEXT: shrq %cl, %rdx +; X64-SLOW-NEXT: leaq (%rdi,%rdi), %rax +; X64-SLOW-NEXT: movl %r8d, %r9d +; X64-SLOW-NEXT: notb %r9b +; X64-SLOW-NEXT: movl %r9d, %ecx +; X64-SLOW-NEXT: shlq %cl, %rax +; X64-SLOW-NEXT: orq %rdx, %rax ; X64-SLOW-NEXT: movl %r8d, %ecx -; X64-SLOW-NEXT: shrq %cl, %r9 -; X64-SLOW-NEXT: xorl %eax, %eax -; X64-SLOW-NEXT: testb $64, %r8b -; X64-SLOW-NEXT: cmovneq %r9, %rsi -; X64-SLOW-NEXT: cmovneq %rax, %r9 -; X64-SLOW-NEXT: addq %rdi, %rdi -; X64-SLOW-NEXT: movl %r11d, %ecx -; X64-SLOW-NEXT: shlq %cl, %rdi -; X64-SLOW-NEXT: testb $64, %r11b -; X64-SLOW-NEXT: cmovneq %rdi, %rdx -; X64-SLOW-NEXT: cmoveq %rdi, %rax -; X64-SLOW-NEXT: orq %rsi, %rax -; X64-SLOW-NEXT: orq %r9, %rdx +; X64-SLOW-NEXT: shrq %cl, %rdi +; X64-SLOW-NEXT: leaq (%rsi,%rsi), %rdx +; X64-SLOW-NEXT: movl %r9d, %ecx +; X64-SLOW-NEXT: shlq %cl, %rdx +; X64-SLOW-NEXT: orq %rdi, %rdx ; X64-SLOW-NEXT: retq %tmp = tail call i128 @llvm.fshr.i128(i128 %x, i128 %y, i128 %z) ret i128 %tmp @@ -1004,9 +530,9 @@ define i64 @const_shift_i64(i64 %x, i64 %y) nounwind { ; X86-FAST-LABEL: const_shift_i64: ; X86-FAST: # %bb.0: +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: shldl $25, %ecx, %edx ; X86-FAST-NEXT: shrdl $7, %ecx, %eax ; X86-FAST-NEXT: retl diff --git a/llvm/test/CodeGen/X86/funnel-shift-rot.ll b/llvm/test/CodeGen/X86/funnel-shift-rot.ll --- a/llvm/test/CodeGen/X86/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/X86/funnel-shift-rot.ll @@ -276,34 +276,19 @@ define i64 @rotr_i64(i64 %x, i64 %z) nounwind { ; X32-SSE2-LABEL: rotr_i64: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pushl %ebp -; X32-SSE2-NEXT: pushl %ebx -; X32-SSE2-NEXT: pushl %edi ; X32-SSE2-NEXT: pushl %esi -; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-SSE2-NEXT: movl %edx, %esi -; X32-SSE2-NEXT: shrl %cl, %esi -; X32-SSE2-NEXT: movl %ebx, %edi -; 
X32-SSE2-NEXT: shrdl %cl, %edx, %edi -; X32-SSE2-NEXT: xorl %ebp, %ebp -; X32-SSE2-NEXT: testb $32, %cl -; X32-SSE2-NEXT: cmovnel %esi, %edi -; X32-SSE2-NEXT: cmovnel %ebp, %esi -; X32-SSE2-NEXT: negb %cl -; X32-SSE2-NEXT: movl %ebx, %eax -; X32-SSE2-NEXT: shll %cl, %eax -; X32-SSE2-NEXT: shldl %cl, %ebx, %edx +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-SSE2-NEXT: testb $32, %cl -; X32-SSE2-NEXT: cmovnel %eax, %edx -; X32-SSE2-NEXT: cmovnel %ebp, %eax -; X32-SSE2-NEXT: orl %edi, %eax -; X32-SSE2-NEXT: orl %esi, %edx +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: cmovel %esi, %edx +; X32-SSE2-NEXT: cmovel %eax, %esi +; X32-SSE2-NEXT: movl %esi, %eax +; X32-SSE2-NEXT: shrdl %cl, %edx, %eax +; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X32-SSE2-NEXT: shrdl %cl, %esi, %edx ; X32-SSE2-NEXT: popl %esi -; X32-SSE2-NEXT: popl %edi -; X32-SSE2-NEXT: popl %ebx -; X32-SSE2-NEXT: popl %ebp ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: rotr_i64: diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -40,38 +40,22 @@ define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X32-SSE2-LABEL: fshl_i64: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pushl %ebp -; X32-SSE2-NEXT: pushl %ebx ; X32-SSE2-NEXT: pushl %edi ; X32-SSE2-NEXT: pushl %esi -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %ch -; X32-SSE2-NEXT: movb %ch, %cl -; X32-SSE2-NEXT: notb %cl -; X32-SSE2-NEXT: shrdl $1, %ebx, %esi -; X32-SSE2-NEXT: shrl %ebx -; X32-SSE2-NEXT: shrdl %cl, %ebx, %esi -; X32-SSE2-NEXT: shrl %cl, %ebx -; X32-SSE2-NEXT: xorl %ebp, %ebp +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-SSE2-NEXT: testb $32, %cl -; X32-SSE2-NEXT: cmovnel %ebx, %esi -; X32-SSE2-NEXT: cmovnel %ebp, %ebx +; X32-SSE2-NEXT: movl %edx, %edi +; X32-SSE2-NEXT: cmovnel %esi, %edi +; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %edx +; X32-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %esi ; X32-SSE2-NEXT: movl %edi, %eax -; X32-SSE2-NEXT: movb %ch, %cl -; X32-SSE2-NEXT: shll %cl, %eax +; X32-SSE2-NEXT: shldl %cl, %esi, %eax +; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx ; X32-SSE2-NEXT: shldl %cl, %edi, %edx -; X32-SSE2-NEXT: testb $32, %ch -; X32-SSE2-NEXT: cmovnel %eax, %edx -; X32-SSE2-NEXT: cmovnel %ebp, %eax -; X32-SSE2-NEXT: orl %esi, %eax -; X32-SSE2-NEXT: orl %ebx, %edx ; X32-SSE2-NEXT: popl %esi ; X32-SSE2-NEXT: popl %edi -; X32-SSE2-NEXT: popl %ebx -; X32-SSE2-NEXT: popl %ebp ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshl_i64: @@ -92,169 +76,40 @@ ; X32-SSE2-NEXT: pushl %ebx ; X32-SSE2-NEXT: pushl %edi ; X32-SSE2-NEXT: pushl %esi -; X32-SSE2-NEXT: subl $64, %esp ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-SSE2-NEXT: movl %esi, %edi -; X32-SSE2-NEXT: shldl $31, %ecx, %edi -; X32-SSE2-NEXT: notl %ebx -; X32-SSE2-NEXT: andl $127, %ebx -; X32-SSE2-NEXT: movb $64, %cl -; X32-SSE2-NEXT: subb %bl, %cl -; X32-SSE2-NEXT: shrl %esi -; X32-SSE2-NEXT: movl %esi, (%esp) # 4-byte Spill -; X32-SSE2-NEXT: shldl %cl, %edi, %esi -; X32-SSE2-NEXT: movl %edi, %ebp -; X32-SSE2-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: shll %cl, %ebp -; X32-SSE2-NEXT: xorl %eax, %eax -; X32-SSE2-NEXT: testb $32, %cl -; X32-SSE2-NEXT: cmovnel %ebp, %esi -; X32-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: cmovnel %eax, %ebp -; X32-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: andl $127, %eax -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shldl %cl, %ebp, %edx -; X32-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl %ebx, %ecx -; X32-SSE2-NEXT: addb $-64, %cl -; X32-SSE2-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-SSE2-NEXT: shrdl %cl, %esi, %edi -; X32-SSE2-NEXT: shrl %cl, %esi -; X32-SSE2-NEXT: testb $32, %cl -; X32-SSE2-NEXT: cmovnel %esi, %edi -; X32-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl $0, %ecx -; X32-SSE2-NEXT: cmovnel %ecx, %esi -; X32-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shldl %cl, %edi, %esi -; X32-SSE2-NEXT: movl %edi, %edx -; X32-SSE2-NEXT: shll %cl, %edx -; X32-SSE2-NEXT: shll %cl, %ebp -; X32-SSE2-NEXT: testb $32, %al -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-SSE2-NEXT: cmovnel %ebp, %eax -; X32-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: cmovnel %edx, %esi -; X32-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl $0, %eax -; X32-SSE2-NEXT: cmovnel %eax, %ebp -; X32-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: cmovnel %eax, %edx -; X32-SSE2-NEXT: xorl %eax, %eax -; X32-SSE2-NEXT: cmpl $64, %ecx -; X32-SSE2-NEXT: cmovael %eax, %edx -; X32-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-SSE2-NEXT: shldl $31, %eax, %ebp -; X32-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-SSE2-NEXT: shrdl $1, %eax, %esi -; X32-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl %ebx, %ecx -; X32-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload -; X32-SSE2-NEXT: shrdl %cl, %edx, %eax -; X32-SSE2-NEXT: shrl %cl, %edx -; X32-SSE2-NEXT: movl %esi, %ebx -; X32-SSE2-NEXT: shrdl %cl, %ebp, %ebx -; X32-SSE2-NEXT: movl %ebp, %esi -; X32-SSE2-NEXT: shrl %cl, %esi -; X32-SSE2-NEXT: testb $32, %cl -; X32-SSE2-NEXT: cmovnel %esi, %ebx -; X32-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl %edx, %ecx -; X32-SSE2-NEXT: cmovnel %edx, %eax -; X32-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movl $0, %eax -; X32-SSE2-NEXT: cmovnel %eax, %esi -; X32-SSE2-NEXT: cmovnel %eax, %ecx -; X32-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-SSE2-NEXT: cmpl $64, %ebx -; X32-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload -; X32-SSE2-NEXT: cmovael %eax, %ecx -; X32-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: xorl %ebp, %ebp -; X32-SSE2-NEXT: movb $64, %ch -; X32-SSE2-NEXT: subb %bl, %ch ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-SSE2-NEXT: movb %ch, %cl -; X32-SSE2-NEXT: shrl %cl, %edx -; X32-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: testb $32, %ch -; X32-SSE2-NEXT: cmovnel %ebp, %edx -; X32-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-SSE2-NEXT: movb %bl, %cl -; X32-SSE2-NEXT: addb $-64, %cl -; X32-SSE2-NEXT: movl %edi, %ebp -; X32-SSE2-NEXT: shll %cl, %ebp -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: shldl %cl, %edi, %eax -; X32-SSE2-NEXT: testb $32, %cl -; X32-SSE2-NEXT: cmovnel %ebp, %eax -; X32-SSE2-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-SSE2-NEXT: movl (%esp), %ebx # 4-byte Reload -; X32-SSE2-NEXT: movl $0, %edi -; X32-SSE2-NEXT: cmovael %edi, %ebx -; X32-SSE2-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X32-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-SSE2-NEXT: cmpl $64, %ebx -; X32-SSE2-NEXT: cmovbl %edx, %eax -; X32-SSE2-NEXT: testb $32, %cl -; X32-SSE2-NEXT: movl $0, %edi +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-SSE2-NEXT: testb $64, %cl +; X32-SSE2-NEXT: movl %esi, %eax +; X32-SSE2-NEXT: cmovnel %ebx, %eax +; X32-SSE2-NEXT: movl %edx, %ebp ; X32-SSE2-NEXT: cmovnel %edi, %ebp -; X32-SSE2-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-SSE2-NEXT: cmovael %edi, %edx -; X32-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-SSE2-NEXT: movb %ch, %cl -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-SSE2-NEXT: shrdl %cl, %edx, %edi -; X32-SSE2-NEXT: testb $32, %ch -; X32-SSE2-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-SSE2-NEXT: cmpl $64, %ebx -; X32-SSE2-NEXT: cmovael %ebp, %edi -; X32-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-SSE2-NEXT: cmpl $64, %edx -; X32-SSE2-NEXT: cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-SSE2-NEXT: cmpl $64, %edx -; X32-SSE2-NEXT: cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-SSE2-NEXT: testl %edx, %edx -; X32-SSE2-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-SSE2-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-SSE2-NEXT: movl %ecx, %edx -; X32-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-SSE2-NEXT: testl %ebx, %ebx -; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %edi -; X32-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-SSE2-NEXT: orl (%esp), %eax # 4-byte Folded Reload +; X32-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %edi +; X32-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %ebx +; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %edx +; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %esi +; X32-SSE2-NEXT: 
testb $32, %cl +; X32-SSE2-NEXT: cmovnel %esi, %edx +; X32-SSE2-NEXT: cmovnel %ebp, %esi +; X32-SSE2-NEXT: cmovnel %eax, %ebp +; X32-SSE2-NEXT: cmovel %edi, %ebx +; X32-SSE2-NEXT: cmovel %eax, %edi +; X32-SSE2-NEXT: movl %edi, %eax +; X32-SSE2-NEXT: shldl %cl, %ebx, %eax +; X32-SSE2-NEXT: movl %ebp, %ebx +; X32-SSE2-NEXT: shldl %cl, %edi, %ebx +; X32-SSE2-NEXT: movl %esi, %edi +; X32-SSE2-NEXT: shldl %cl, %ebp, %edi +; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X32-SSE2-NEXT: shldl %cl, %esi, %edx ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: movl %eax, 12(%ecx) +; X32-SSE2-NEXT: movl %edx, 12(%ecx) ; X32-SSE2-NEXT: movl %edi, 8(%ecx) -; X32-SSE2-NEXT: movl %esi, 4(%ecx) -; X32-SSE2-NEXT: movl %edx, (%ecx) +; X32-SSE2-NEXT: movl %ebx, 4(%ecx) +; X32-SSE2-NEXT: movl %eax, (%ecx) ; X32-SSE2-NEXT: movl %ecx, %eax -; X32-SSE2-NEXT: addl $64, %esp ; X32-SSE2-NEXT: popl %esi ; X32-SSE2-NEXT: popl %edi ; X32-SSE2-NEXT: popl %ebx @@ -263,27 +118,15 @@ ; ; X64-AVX2-LABEL: fshl_i128: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: movq %r8, %r9 -; X64-AVX2-NEXT: movq %rcx, %r8 -; X64-AVX2-NEXT: movl %r9d, %ecx -; X64-AVX2-NEXT: shldq %cl, %rdi, %rsi -; X64-AVX2-NEXT: shrdq $1, %r8, %rdx -; X64-AVX2-NEXT: shrq %r8 -; X64-AVX2-NEXT: notb %cl -; X64-AVX2-NEXT: shrdq %cl, %r8, %rdx -; X64-AVX2-NEXT: shrq %cl, %r8 -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: testb $64, %cl -; X64-AVX2-NEXT: cmovneq %r8, %rdx -; X64-AVX2-NEXT: cmovneq %rax, %r8 -; X64-AVX2-NEXT: movl %r9d, %ecx -; X64-AVX2-NEXT: shlq %cl, %rdi -; X64-AVX2-NEXT: testb $64, %r9b +; X64-AVX2-NEXT: testb $64, %r8b ; X64-AVX2-NEXT: cmovneq %rdi, %rsi -; X64-AVX2-NEXT: cmoveq %rdi, %rax -; X64-AVX2-NEXT: orq %rdx, %rax -; X64-AVX2-NEXT: orq %rsi, %r8 -; X64-AVX2-NEXT: movq %r8, %rdx +; X64-AVX2-NEXT: cmoveq %rcx, %rdx +; X64-AVX2-NEXT: cmovneq %rcx, %rdi +; X64-AVX2-NEXT: movq %rdi, %rax +; X64-AVX2-NEXT: movl %r8d, %ecx +; X64-AVX2-NEXT: shldq %cl, %rdx, %rax +; X64-AVX2-NEXT: shldq %cl, %rdi, %rsi +; X64-AVX2-NEXT: movq %rsi, %rdx ; X64-AVX2-NEXT: retq %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) ret i128 %f @@ -294,7 +137,6 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind { ; X32-SSE2-LABEL: fshl_i37: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pushl %ebp ; X32-SSE2-NEXT: pushl %ebx ; X32-SSE2-NEXT: pushl %edi ; X32-SSE2-NEXT: pushl %esi @@ -302,40 +144,31 @@ ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-SSE2-NEXT: shldl $27, %ebx, %edi -; X32-SSE2-NEXT: shll $27, %ebx -; X32-SSE2-NEXT: shrdl $1, %edi, %ebx -; X32-SSE2-NEXT: shrl %edi ; X32-SSE2-NEXT: pushl $0 ; X32-SSE2-NEXT: pushl $37 ; X32-SSE2-NEXT: pushl {{[0-9]+}}(%esp) ; X32-SSE2-NEXT: pushl {{[0-9]+}}(%esp) ; X32-SSE2-NEXT: calll __umoddi3 ; X32-SSE2-NEXT: addl $16, %esp -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: movl %edx, %ecx -; X32-SSE2-NEXT: notb %cl -; X32-SSE2-NEXT: shrdl %cl, %edi, %ebx -; X32-SSE2-NEXT: shrl %cl, %edi -; X32-SSE2-NEXT: xorl %eax, %eax +; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: testb $32, %cl -; X32-SSE2-NEXT: cmovnel %edi, %ebx -; X32-SSE2-NEXT: cmovnel %eax, %edi -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl %edx, %ecx -; X32-SSE2-NEXT: shll %cl, %eax -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-SSE2-NEXT: shldl %cl, %ebp, %esi -; X32-SSE2-NEXT: testb $32, %dl -; X32-SSE2-NEXT: cmovnel %eax, %esi -; X32-SSE2-NEXT: movl $0, %ecx -; X32-SSE2-NEXT: cmovnel %ecx, %eax -; X32-SSE2-NEXT: orl %ebx, %eax -; X32-SSE2-NEXT: 
orl %edi, %esi +; X32-SSE2-NEXT: jne .LBB3_1 +; X32-SSE2-NEXT: # %bb.2: +; X32-SSE2-NEXT: movl %edi, %ebx +; X32-SSE2-NEXT: movl %esi, %edi +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-SSE2-NEXT: jmp .LBB3_3 +; X32-SSE2-NEXT: .LBB3_1: +; X32-SSE2-NEXT: shll $27, %ebx +; X32-SSE2-NEXT: .LBB3_3: +; X32-SSE2-NEXT: movl %edi, %eax +; X32-SSE2-NEXT: shldl %cl, %ebx, %eax +; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X32-SSE2-NEXT: shldl %cl, %edi, %esi ; X32-SSE2-NEXT: movl %esi, %edx ; X32-SSE2-NEXT: popl %esi ; X32-SSE2-NEXT: popl %edi ; X32-SSE2-NEXT: popl %ebx -; X32-SSE2-NEXT: popl %ebp ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshl_i37: @@ -468,51 +301,39 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind { ; X32-SSE2-LABEL: fshr_i37: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pushl %ebp ; X32-SSE2-NEXT: pushl %ebx ; X32-SSE2-NEXT: pushl %edi ; X32-SSE2-NEXT: pushl %esi -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-SSE2-NEXT: shldl $1, %edi, %esi -; X32-SSE2-NEXT: addl %edi, %edi +; X32-SSE2-NEXT: shldl $27, %ebx, %esi ; X32-SSE2-NEXT: pushl $0 ; X32-SSE2-NEXT: pushl $37 ; X32-SSE2-NEXT: pushl {{[0-9]+}}(%esp) ; X32-SSE2-NEXT: pushl {{[0-9]+}}(%esp) ; X32-SSE2-NEXT: calll __umoddi3 ; X32-SSE2-NEXT: addl $16, %esp -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: addb $27, %dl -; X32-SSE2-NEXT: movl %edx, %eax -; X32-SSE2-NEXT: notb %al -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shldl %cl, %edi, %esi -; X32-SSE2-NEXT: shldl $27, %ebp, %ebx -; X32-SSE2-NEXT: shll $27, %ebp -; X32-SSE2-NEXT: movl %edx, %ecx -; X32-SSE2-NEXT: shrdl %cl, %ebx, %ebp -; X32-SSE2-NEXT: shrl %cl, %ebx -; X32-SSE2-NEXT: xorl %ecx, %ecx -; X32-SSE2-NEXT: testb $32, %dl -; X32-SSE2-NEXT: cmovnel %ebx, %ebp -; X32-SSE2-NEXT: cmovnel %ecx, %ebx -; X32-SSE2-NEXT: xorl %edx, %edx ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll %cl, %edi -; X32-SSE2-NEXT: testb $32, %al -; X32-SSE2-NEXT: cmovnel %edi, %esi -; X32-SSE2-NEXT: cmovnel %edx, %edi -; X32-SSE2-NEXT: orl %ebp, %edi -; X32-SSE2-NEXT: orl %ebx, %esi -; X32-SSE2-NEXT: movl %edi, %eax +; X32-SSE2-NEXT: addl $27, %ecx +; X32-SSE2-NEXT: testb $32, %cl +; X32-SSE2-NEXT: je .LBB10_1 +; X32-SSE2-NEXT: # %bb.2: +; X32-SSE2-NEXT: movl %edi, %edx +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-SSE2-NEXT: jmp .LBB10_3 +; X32-SSE2-NEXT: .LBB10_1: +; X32-SSE2-NEXT: shll $27, %ebx ; X32-SSE2-NEXT: movl %esi, %edx +; X32-SSE2-NEXT: movl %ebx, %esi +; X32-SSE2-NEXT: .LBB10_3: +; X32-SSE2-NEXT: shrdl %cl, %edx, %esi +; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X32-SSE2-NEXT: shrdl %cl, %edi, %edx +; X32-SSE2-NEXT: movl %esi, %eax ; X32-SSE2-NEXT: popl %esi ; X32-SSE2-NEXT: popl %edi ; X32-SSE2-NEXT: popl %ebx -; X32-SSE2-NEXT: popl %ebp ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshr_i37: @@ -1070,9 +891,9 @@ define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) nounwind { ; X32-SSE2-LABEL: fshr_i64_const_overshift: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE2-NEXT: shrdl $9, %ecx, %eax ; X32-SSE2-NEXT: shldl $23, %ecx, %edx ; X32-SSE2-NEXT: retl