Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7204,6 +7204,72 @@
   return SDValue();
 }
 
+/// If we have a shift-by-constant of a bitwise logic op that itself has a
+/// shift-by-constant operand with identical opcode, we may be able to convert
+/// that into 2 independent shifts followed by the logic op. This is a
+/// throughput improvement.
+static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) {
+  // Match a one-use bitwise logic op.
+  SDValue LogicOp = Shift->getOperand(0);
+  if (!LogicOp.hasOneUse())
+    return SDValue();
+
+  unsigned LogicOpcode = LogicOp.getOpcode();
+  if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR &&
+      LogicOpcode != ISD::XOR)
+    return SDValue();
+
+  // Find a matching one-use shift by constant.
+  unsigned ShiftOpcode = Shift->getOpcode();
+  SDValue C1 = Shift->getOperand(1);
+  ConstantSDNode *C1Node = isConstOrConstSplat(C1);
+  assert(C1Node && "Expected a shift with constant operand");
+  const APInt &C1Val = C1Node->getAPIntValue();
+  auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp,
+                             const APInt *&ShiftAmtVal) {
+    if (V.getOpcode() != ShiftOpcode || !V.hasOneUse())
+      return false;
+
+    ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1));
+    if (!ShiftCNode)
+      return false;
+
+    // Capture the shifted operand and shift amount value.
+    ShiftOp = V.getOperand(0);
+    ShiftAmtVal = &ShiftCNode->getAPIntValue();
+
+    // Shift amount types do not have to match their operand type, so check that
+    // the constants are the same width.
+    if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth())
+      return false;
+
+    // The fold is not valid if the sum of the shift values exceeds bitwidth.
+    if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits()))
+      return false;
+
+    return true;
+  };
+
+  // Logic ops are commutative, so check each operand for a match.
+  SDValue X, Y;
+  const APInt *C0Val;
+  if (matchFirstShift(LogicOp.getOperand(0), X, C0Val))
+    Y = LogicOp.getOperand(1);
+  else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val))
+    Y = LogicOp.getOperand(0);
+  else
+    return SDValue();
+
+  // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
+  SDLoc DL(Shift);
+  EVT VT = Shift->getValueType(0);
+  EVT ShiftAmtVT = Shift->getOperand(1).getValueType();
+  SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT);
+  SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC);
+  SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1);
+  return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
+}
+
 /// Handle transforms common to the three shifts, when the shift amount is a
 /// constant.
 /// We are looking for: (shift being one of shl/sra/srl)
@@ -7222,6 +7288,14 @@
   if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
     return SDValue();
 
+  // TODO: This is limited to early combining because it may reveal regressions
+  // otherwise. But since we just checked a target hook to see if this is
+  // desirable, that should have filtered out cases where this interferes
+  // with some other pattern matching.
+  if (!LegalTypes)
+    if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
+      return R;
+
   // We want to pull some binops through shifts, so that we have (and (shift))
   // instead of (shift (and)), likewise for add, or, xor, etc.  This sort of
   // thing happens with address calculations, so it's important to canonicalize
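To make the new fold concrete, here is a minimal LLVM IR sketch of the rewrite performed by combineShiftOfShiftedLogic (the function and value names below are illustrative and not part of the patch; the pattern mirrors the shl_and tests updated below). Early DAG combining now turns the first form into the second, where the two shifts are independent of each other:

define i8 @shifted_logic_before(i8 %x, i8 %y) {
  ; shift (logic (shift X, C0), Y), C1 with C0 = 3, C1 = 2
  %sh0 = shl i8 %x, 3
  %logic = and i8 %sh0, %y
  %sh1 = shl i8 %logic, 2
  ret i8 %sh1
}

define i8 @shifted_logic_after(i8 %x, i8 %y) {
  ; logic (shift X, C0+C1), (shift Y, C1)
  %shx = shl i8 %x, 5
  %shy = shl i8 %y, 2
  %logic = and i8 %shx, %shy
  ret i8 %logic
}

The uge check above rejects, for example, C0 = 3 and C1 = 6 on i8: the combined shift amount of 9 would be at least the 8-bit scalar width, so the fold would not be valid.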
Index: llvm/trunk/test/CodeGen/AArch64/bitfield-insert.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/bitfield-insert.ll
+++ llvm/trunk/test/CodeGen/AArch64/bitfield-insert.ll
@@ -265,12 +265,12 @@
 define i32 @test_nouseful_bits(i8 %a, i32 %b) {
 ; CHECK-LABEL: test_nouseful_bits:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    bfi w8, w8, #8, #24
-; CHECK-NEXT:    mov w9, w0
-; CHECK-NEXT:    bfi w9, w8, #8, #24
-; CHECK-NEXT:    bfi w0, w9, #8, #24
-; CHECK-NEXT:    lsl w0, w0, #8
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    lsl w8, w8, #8
+; CHECK-NEXT:    mov w9, w8
+; CHECK-NEXT:    bfxil w9, w0, #0, #8
+; CHECK-NEXT:    bfi w8, w9, #16, #16
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %conv = zext i8 %a to i32     ; 0 0 0 A
   %shl = shl i32 %b, 8          ; B2 B1 B0 0
Index: llvm/trunk/test/CodeGen/AArch64/shift-logic.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/shift-logic.ll
+++ llvm/trunk/test/CodeGen/AArch64/shift-logic.ll
@@ -4,8 +4,8 @@
 define i8 @shl_and(i8 %x, i8 %y) nounwind {
 ; CHECK-LABEL: shl_and:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, w0, lsl #3
-; CHECK-NEXT:    lsl w0, w8, #2
+; CHECK-NEXT:    lsl w8, w0, #5
+; CHECK-NEXT:    and w0, w8, w1, lsl #2
 ; CHECK-NEXT:    ret
   %sh0 = shl i8 %x, 3
   %r = and i8 %sh0, %y
@@ -16,8 +16,8 @@
 define i16 @shl_or(i16 %x, i16 %y) nounwind {
 ; CHECK-LABEL: shl_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr w8, w1, w0, lsl #5
-; CHECK-NEXT:    lsl w0, w8, #7
+; CHECK-NEXT:    lsl w8, w0, #12
+; CHECK-NEXT:    orr w0, w8, w1, lsl #7
 ; CHECK-NEXT:    ret
   %sh0 = shl i16 %x, 5
   %r = or i16 %y, %sh0
@@ -28,8 +28,8 @@
 define i32 @shl_xor(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: shl_xor:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor w8, w1, w0, lsl #5
-; CHECK-NEXT:    lsl w0, w8, #7
+; CHECK-NEXT:    lsl w8, w0, #12
+; CHECK-NEXT:    eor w0, w8, w1, lsl #7
 ; CHECK-NEXT:    ret
   %sh0 = shl i32 %x, 5
   %r = xor i32 %sh0, %y
@@ -40,8 +40,8 @@
 define i64 @lshr_and(i64 %x, i64 %y) nounwind {
 ; CHECK-LABEL: lshr_and:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x1, x0, lsr #5
-; CHECK-NEXT:    lsr x0, x8, #7
+; CHECK-NEXT:    lsr x8, x0, #12
+; CHECK-NEXT:    and x0, x8, x1, lsr #7
 ; CHECK-NEXT:    ret
   %sh0 = lshr i64 %x, 5
   %r = and i64 %y, %sh0
@@ -52,9 +52,9 @@
 define <4 x i32> @lshr_or(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: lshr_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #5
+; CHECK-NEXT:    ushr v1.4s, v1.4s, #7
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #12
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #7
 ; CHECK-NEXT:    ret
   %sh0 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %r = or <4 x i32> %sh0, %y
@@ -65,9 +65,9 @@
 define <8 x i16> @lshr_xor(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-LABEL: lshr_xor:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushr v0.8h, v0.8h, #5
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ushr v0.8h, v0.8h, #7
+; CHECK-NEXT:    ushr v1.8h, v1.8h, #7
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #12
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %sh0 = lshr <8 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
   %r = xor <8 x i16> %y, %sh0
@@ -79,9 +79,9 @@
 define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-LABEL: ashr_and:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshr v0.16b, v0.16b, #3
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    sshr v0.16b, v0.16b, #2
+; CHECK-NEXT:    sshr v1.16b, v1.16b, #2
+; CHECK-NEXT:    sshr v0.16b, v0.16b, #5
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %sh0 = ashr <16 x i8> %x, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   %r = and <16 x i8> %y, %sh0
@@ -92,9 +92,9 @@
 define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: ashr_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshr v0.2d, v0.2d, #5
+; CHECK-NEXT:    sshr v1.2d, v1.2d, #7
+; CHECK-NEXT:    sshr v0.2d, v0.2d, #12
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    sshr v0.2d, v0.2d, #7
 ; CHECK-NEXT:    ret
   %sh0 = ashr <2 x i64> %x, <i64 5, i64 5>
   %r = or <2 x i64> %sh0, %y
@@ -105,8 +105,8 @@
 define i32 @ashr_xor(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: ashr_xor:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor w8, w1, w0, asr #5
-; CHECK-NEXT:    asr w0, w8, #7
+; CHECK-NEXT:    asr w8, w0, #12
+; CHECK-NEXT:    eor w0, w8, w1, asr #7
 ; CHECK-NEXT:    ret
   %sh0 = ashr i32 %x, 5
   %r = xor i32 %y, %sh0
Index: llvm/trunk/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll
+++ llvm/trunk/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll
@@ -15,10 +15,10 @@
 
 ; Make sure the cmp is not scheduled before the InlineAsm that clobbers cc.
 ; CHECK: bl _f2
-; CHECK: cmp {{r[0-9]+}}, #0
-; CHECK-NEXT: it eq
-; CHECK-NEXT: addeq {{r[0-9]+}}, #1
-; CHECK-NEXT: lsls
+; CHECK: clz {{r[0-9]+}}
+; CHECK-DAG: lsrs {{r[0-9]+}}
+; CHECK-DAG: lsls {{r[0-9]+}}
+; CHECK-NEXT: orr.w {{r[0-9]+}}
 ; CHECK-NEXT: InlineAsm Start
 define void @test(%s1* %this, i32 %format, i32 %w, i32 %h, i32 %levels, i32* %s, i8* %data, i32* nocapture %rowbytes, void (i8*, i8*)* %release, i8* %info) nounwind {
 entry:
Index: llvm/trunk/test/CodeGen/X86/shift-logic.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/shift-logic.ll
+++ llvm/trunk/test/CodeGen/X86/shift-logic.ll
@@ -4,10 +4,10 @@
 define i8 @shl_and(i8 %x, i8 %y) nounwind {
 ; CHECK-LABEL: shl_and:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal (,%rdi,8), %eax
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $2, %sil
+; CHECK-NEXT:    shlb $5, %al
 ; CHECK-NEXT:    andb %sil, %al
-; CHECK-NEXT:    shlb $2, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %sh0 = shl i8 %x, 3
@@ -20,9 +20,9 @@
 ; CHECK-LABEL: shl_or:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll $5, %eax
+; CHECK-NEXT:    shll $7, %esi
+; CHECK-NEXT:    shll $12, %eax
 ; CHECK-NEXT:    orl %esi, %eax
-; CHECK-NEXT:    shll $7, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %sh0 = shl i16 %x, 5
@@ -35,9 +35,9 @@
 ; CHECK-LABEL: shl_xor:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll $5, %eax
+; CHECK-NEXT:    shll $7, %esi
+; CHECK-NEXT:    shll $12, %eax
 ; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    shll $7, %eax
 ; CHECK-NEXT:    retq
   %sh0 = shl i32 %x, 5
   %r = xor i32 %sh0, %y
@@ -49,9 +49,9 @@
 ; CHECK-LABEL: lshr_and:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    shrq $5, %rax
+; CHECK-NEXT:    shrq $7, %rsi
+; CHECK-NEXT:    shrq $12, %rax
 ; CHECK-NEXT:    andq %rsi, %rax
-; CHECK-NEXT:    shrq $7, %rax
 ; CHECK-NEXT:    retq
   %sh0 = lshr i64 %x, 5
   %r = and i64 %y, %sh0
@@ -62,9 +62,9 @@
 define <4 x i32> @lshr_or(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: lshr_or:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    psrld $5, %xmm0
+; CHECK-NEXT:    psrld $7, %xmm1
+; CHECK-NEXT:    psrld $12, %xmm0
 ; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    psrld $7, %xmm0
 ; CHECK-NEXT:    retq
   %sh0 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %r = or <4 x i32> %sh0, %y
@@ -75,9 +75,9 @@
 define <8 x i16> @lshr_xor(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-LABEL: lshr_xor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    psrlw $5, %xmm0
+; CHECK-NEXT:    psrlw $7, %xmm1
+; CHECK-NEXT:    psrlw $12, %xmm0
 ; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    psrlw $7, %xmm0
 ; CHECK-NEXT:    retq
   %sh0 = lshr <8 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
   %r = xor <8 x i16> %y, %sh0
@@ -89,17 +89,17 @@
 define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-LABEL: ashr_and:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    psrlw $3, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; CHECK-NEXT:    pxor %xmm2, %xmm1
+; CHECK-NEXT:    psubb %xmm2, %xmm1
+; CHECK-NEXT:    psrlw $5, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; CHECK-NEXT:    pxor %xmm2, %xmm0
 ; CHECK-NEXT:    psubb %xmm2, %xmm0
 ; CHECK-NEXT:    pand %xmm1, %xmm0
-; CHECK-NEXT:    psrlw $2, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    psubb %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %sh0 = ashr <16 x i8> %x, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   %r = and <16 x i8> %y, %sh0
@@ -110,19 +110,19 @@
 define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: ashr_or:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    psrad $7, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-NEXT:    psrlq $7, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NEXT:    psrad $5, %xmm2
+; CHECK-NEXT:    psrad $12, %xmm2
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-NEXT:    psrlq $5, %xmm0
+; CHECK-NEXT:    psrlq $12, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrad $7, %xmm1
-; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-NEXT:    psrlq $7, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retq
   %sh0 = ashr <2 x i64> %x, <i64 5, i64 5>
   %r = or <2 x i64> %sh0, %y
@@ -134,9 +134,9 @@
 ; CHECK-LABEL: ashr_xor:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    sarl $5, %eax
+; CHECK-NEXT:    sarl $7, %esi
+; CHECK-NEXT:    sarl $12, %eax
 ; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    sarl $7, %eax
 ; CHECK-NEXT:    retq
   %sh0 = ashr i32 %x, 5
   %r = xor i32 %y, %sh0
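The vector tests above exercise the splat-constant path (isConstOrConstSplat in the new combine): the same C0 + C1 rule applies per element when both shift amounts are splats. A sketch mirroring the lshr_or test, with an illustrative function name and the trailing shift that the hunk context does not show reconstructed from the CHECK lines:

define <4 x i32> @lshr_or_sketch(<4 x i32> %x, <4 x i32> %y) {
  %sh0 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %r = or <4 x i32> %sh0, %y
  %sh1 = lshr <4 x i32> %r, <i32 7, i32 7, i32 7, i32 7>
  ; expected to combine to (lshr %x, 12) | (lshr %y, 7), matching the
  ; psrld $12 / psrld $7 and ushr #12 / ushr #7 lines checked above
  ret <4 x i32> %sh1
}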