Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -447,6 +447,7 @@
     SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
     SDValue MatchLoadCombine(SDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
+    SDValue foldRedundantShiftedMasks(SDNode *N);
    SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
@@ -4104,6 +4105,108 @@
   return false;
 }
 
+// Fold expressions where the same value is used both masked and
+// shifted-then-masked, for example:
+//   x1 = (and x, 0x00FF)
+//   x2 = (and (shl x, 8), 0xFF00)
+// into
+//   x2 = (shl x1, 8)   ; reuse the computation of x1
+SDValue DAGCombiner::foldRedundantShiftedMasks(SDNode *AND) {
+  const SDValue &SHIFT = AND->getOperand(0);
+  if (SHIFT.getNumOperands() != 2 || !SHIFT.hasOneUse())
+    return SDValue();
+
+  const ConstantSDNode *ShiftAmount =
+      dyn_cast<ConstantSDNode>(SHIFT.getOperand(1));
+  if (!ShiftAmount)
+    return SDValue();
+
+  const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(AND->getOperand(1));
+  if (!Mask)
+    return SDValue();
+
+  SDValue MASKED = SHIFT.getOperand(0);
+  SDNode *MaskedValue = MASKED.getNode();
+  unsigned N0Opcode = SHIFT.getOpcode();
+  for (SDNode *OtherUser : MaskedValue->uses()) {
+    // Look for another user that masks the same value with a constant,
+    // without shifting it first.
+    if (OtherUser == SHIFT.getNode() || OtherUser->getOpcode() != ISD::AND)
+      continue;
+
+    ConstantSDNode *OtherMask =
+        dyn_cast<ConstantSDNode>(OtherUser->getOperand(1));
+    if (!OtherMask)
+      continue;
+
+    bool CanReduce = false;
+
+    const APInt &MaskValue = Mask->getAPIntValue();
+    const APInt &ShiftValue = ShiftAmount->getAPIntValue();
+    const APInt &OtherMaskValue = OtherMask->getAPIntValue();
+
+    KnownBits MaskedValueBits;
+    DAG.computeKnownBits(MASKED, MaskedValueBits);
+    KnownBits ShiftedValueBits;
+    DAG.computeKnownBits(SHIFT, ShiftedValueBits);
+
+    // Drop mask bits that are already known to be zero.
+    const APInt EffectiveOtherMask = OtherMaskValue & ~MaskedValueBits.Zero;
+    const APInt EffectiveMask = MaskValue & ~ShiftedValueBits.Zero;
+
+    LLVM_DEBUG(
+        dbgs() << "\tValue being masked and shift-masked: "; MASKED.dump();
+        dbgs() << "\t\tValue zero bits: 0x"
+               << MaskedValueBits.Zero.toString(16, false)
+               << "\n\n\t\tApplied mask: 0x"
+               << OtherMaskValue.toString(16, false) << " : ";
+        OtherUser->dump();
+        dbgs() << "\t\tEffective mask: 0x"
+               << EffectiveOtherMask.toString(16, false)
+               << "\n\n\tShifted by: " << ShiftValue.getZExtValue() << " : ";
+        SHIFT.dump();
+        dbgs() << "\t\tAnd masked by: 0x" << MaskValue.toString(16, false)
+               << " : ";
+        AND->dump();
+        dbgs() << "\t\tEffective mask to shifted value: 0x"
+               << EffectiveMask.toString(16, false) << '\n';);
+
+    switch (N0Opcode) {
+    case ISD::SHL:
+      CanReduce = (EffectiveOtherMask.shl(ShiftValue) == EffectiveMask) ||
+                  (EffectiveMask.lshr(ShiftValue) == EffectiveOtherMask);
+      break;
+    case ISD::SRA:
+      if (!MaskedValueBits.Zero.isSignBitSet()) {
+        CanReduce = (EffectiveOtherMask.ashr(ShiftValue) == EffectiveMask);
+        break;
+      }
+      // The sign bit is known zero, so SRA behaves like SRL here.
+      N0Opcode = ISD::SRL;
+      LLVM_FALLTHROUGH;
+    case ISD::SRL:
+      CanReduce = (EffectiveOtherMask.lshr(ShiftValue) == EffectiveMask) ||
+                  (EffectiveMask.shl(ShiftValue) == EffectiveOtherMask);
+      break;
+    case ISD::ROTR:
+      CanReduce = (EffectiveOtherMask.rotr(ShiftValue) == EffectiveMask);
+      break;
+    case ISD::ROTL:
+      // ROTL is not handled yet:
+      // CanReduce = (EffectiveOtherMask.rotl(ShiftValue) == EffectiveMask);
+      // break;
+    default:
+      return SDValue();
+    }
+    if (CanReduce) {
+      LLVM_DEBUG(dbgs() << "\tCan just shift the masked value\n");
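+      // Rebuild the shift on top of the other AND's result, so the value is
+      // masked only once and this AND of the shifted value becomes redundant.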
+      SDValue ShiftTheAND(OtherUser, 0);
+      const SDLoc DL(SHIFT);
+      EVT VT = AND->getValueType(0);
+      SDValue NewShift =
+          DAG.getNode(N0Opcode, DL, VT, ShiftTheAND, SHIFT.getOperand(1));
+      AddToWorklist(OtherUser);
+      return NewShift;
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4316,6 +4419,9 @@
     }
   }
 
+  if (SDValue r = foldRedundantShiftedMasks(N))
+    return r;
+
   if (Level >= AfterLegalizeTypes) {
     // Attempt to propagate the AND back up to the leaves which, if they're
     // loads, can be combined to narrow loads and the AND node can be removed.
Index: test/CodeGen/ARM/2018_05_29_FoldRedundantMask.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/2018_05_29_FoldRedundantMask.ll
@@ -0,0 +1,177 @@
+; RUN: llc -O3 -march=arm %s -o - | FileCheck %s
+
+define i16 @LSR8(i16* %a){
+entry:
+  %data16 = getelementptr inbounds i16, i16* %a, i64 0
+  %0 = load i16, i16* %data16, align 2
+  %and = and i16 %0, 65280
+  %1 = lshr i16 %0, 8
+  %and3 = and i16 %1, 255
+  %or = or i16 %and3, %and
+  ret i16 %or
+}
+;LSR8:
+; CHECK-LABEL: LSR8
+; ldrb r0, [r0, #1]
+; CHECK: ldrb [[R:r[0-9]+]], {{\[}}r{{[0-9]+}}, #1{{\]}}
+; CHECK-NEXT: orr {{r[0-9]+}}, [[R]], [[R]], lsl #8
+; orr r0, r0, r0, lsl #8
+; mov pc, lr
+
+define i16 @LSR12A(i16* %a){
+entry:
+  %data16 = getelementptr inbounds i16, i16* %a, i64 0
+  %0 = load i16, i16* %data16, align 2
+  %and = and i16 %0, 61440
+  %1 = lshr i16 %0, 12
+  %and3 = and i16 %1, 15
+  %or = or i16 %and3, %and
+  ret i16 %or
+}
+;LSR12A:
+; CHECK-LABEL: LSR12A
+; ldrh r0, [r0]
+; CHECK: ldrh [[R:r[0-9]+]], [r{{[0-9]+}}]
+; and r0, r0, #61440
+; CHECK-NEXT: and [[R1:r[0-9]+]], [[R]], #61440
+; orr r0, r0, r0, lsr #12
+; CHECK-NEXT: orr r{{[0-9]+}}, [[R1]], [[R1]], lsr #12
+; mov pc, lr
+
+define i16 @LSR12B(i16* %a){
+entry:
+  %data16 = getelementptr inbounds i16, i16* %a, i64 0
+  %0 = load i16, i16* %data16, align 2
+  %and = and i16 %0, 61440
+  %1 = lshr i16 %0, 12
+  %and3 = and i16 %1, 255
+  %or = or i16 %and3, %and
+  ret i16 %or
+}
+;LSR12B:
+; CHECK-LABEL: LSR12B
+; ldrh r0, [r0]
+; CHECK: ldrh [[R:r[0-9]+]], [r{{[0-9]+}}]
+; and r0, r0, #61440
+; CHECK-NEXT: and [[R1:r[0-9]+]], [[R]], #61440
+; orr r0, r0, r0, lsr #12
+; CHECK-NEXT: orr r{{[0-9]+}}, [[R1]], [[R1]], lsr #12
+; mov pc, lr
+; CHECK-NEXT: mov pc, lr
+
+define i16 @LSR12C(i16* %a){
+entry:
+  %data16 = getelementptr inbounds i16, i16* %a, i64 0
+  %0 = load i16, i16* %data16, align 2
+  %and = and i16 %0, 65280
+  %1 = lshr i16 %0, 12
+  %and3 = and i16 %1, 15
+  %or = or i16 %and3, %and
+  ret i16 %or
+}
+;LSR12C:
+; CHECK-LABEL: LSR12C
+; ldrh r0, [r0]
+; CHECK: ldrh [[R:r[0-9]+]], [r{{[0-9]+}}]
+; and r1, r0, #65280
+; CHECK-NEXT: and [[R1:r[0-9]+]], [[R]], #65280
+; orr r0, r1, r0, lsr #12
+; CHECK-NEXT: orr r{{[0-9]+}}, [[R1]], r{{.*}} lsr #12
+; mov pc, lr
+; CHECK-NEXT: mov pc, lr
+
+define i32 @ASR(i16* %a){
+entry:
+  %data16 = getelementptr inbounds i16, i16* %a, i64 0
+  %l = load i16, i16* %data16, align 2
+  %0 = zext i16 %l to i32
+  %and = and i32 %0, 64512
+  %1 = ashr i32 %0, 8
+  %and3 = and i32 %1, 65532
+  %or = or i32 %and3, %and
+  ret i32 %or
+}
+;ASR:
+; CHECK-LABEL: ASR
+; ldrh r0, [r0]
+; CHECK: ldrh [[R:r[0-9]+]], [r{{[0-9]+}}]
+; and r0, r0, #64512
+; CHECK-NEXT: and [[R1:r[0-9]+]], [[R]], #64512
+; orr r0, r0, r0, lsr #8
+; CHECK-NEXT: orr r{{[0-9]+}}, [[R1]], r{{.*}} lsr #8
+; mov pc, lr
+; CHECK-NEXT: mov pc, lr
+
+define i16 @ASR2(i16* %a){
+entry:
+  %data16 = getelementptr inbounds i16, i16* %a, i64 0
+  %l = load i16, i16* %data16, align 2
+  %0 = and i16 %l, 64512
+  %1 = ashr i16 %l, 8
+  %and3 = and i16 %1, 65532
+  %or = or i16 %and3, %0
+  ret i16 %or
+}
+;ASR2:
+; CHECK-LABEL: ASR2
+; ldrh r0, [r0]
+; CHECK: ldrh [[R:r[0-9]+]], [r{{[0-9]+}}]
+; ldr r1, .LCPI5_0
+; CHECK-NEXT: ldr [[R1:r[0-9]+]], [[C:\.[A-Za-z0-9_]+]]
+; and r0, r0, r1
+; CHECK-NEXT: and [[R2:r[0-9]+]], [[R]], [[R1]]
+; lsl r1, r0, #16
+; CHECK-NEXT: lsl [[R3:r[0-9]+]], [[R2]], #16
+; orr r0, r0, r1, asr #24
+; CHECK-NEXT: orr [[R4:r[0-9]+]], [[R2]], [[R3]], asr #24
+;.LCPI5_0:
+; CHECK: [[C]]
+; .long 4294966272 @ 0xfffffc00
+; CHECK-NEXT: .long 4294966272 @ 0xfffffc00
+
+define i32 @main(i32 %ar, i8** nocapture readonly %v){
+entry:
+  %0 = bitcast i8** %v to i16*
+  %idxprom = sext i32 %ar to i64
+  %arrayidx = getelementptr inbounds i16, i16* %0, i64 %idxprom
+  %1 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %1 to i32
+  %and = and i32 %conv, 65280
+  %2 = lshr i32 %conv, 8
+  %and4 = and i32 %2, 255
+  %or = or i32 %and4, %and
+  ret i32 %or
+}
+;main:
+; CHECK-LABEL: main
+; add r0, r1, r0, lsl #1
+; CHECK: add r0, r1, r0, lsl #1
+; ldrb r0, [r0, #1]
+; CHECK-NEXT: ldrb r0, [r0, #1]
+; orr r0, r0, r0, lsl #8
+; CHECK-NEXT: orr r0, r0, r0, lsl #8
+; mov pc, lr
+; CHECK-NEXT: mov pc, lr
+
+define i32 @ror(i32 %a) {
+entry:
+  %m2 = and i32 %a, 3855
+  %shl = shl i32 %a, 24
+  %shr = lshr i32 %a, 8
+  %or = or i32 %shl, %shr
+  %m1 = and i32 %or, 251658255
+  %or2 = or i32 %m1, %m2
+  ret i32 %or2
+}
+;ror:
+; CHECK-LABEL: ror
+; mov r1, #15
+; CHECK: mov r1, #15
+; orr r1, r1, #3840
+; CHECK-NEXT: orr r1, r1, #3840
+; and r0, r0, r1
+; CHECK-NEXT: and r0, r0, r1
+; orr r0, r0, r0, ror #8
+; CHECK-NEXT: orr r0, r0, r0, ror #8
+; mov pc, lr
+; CHECK-NEXT: mov pc, lr
Index: test/CodeGen/X86/pr32329.ll
===================================================================
--- test/CodeGen/X86/pr32329.ll
+++ test/CodeGen/X86/pr32329.ll
@@ -29,18 +29,18 @@
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl obj, %edx
 ; X86-NEXT:    movsbl var_27, %eax
 ; X86-NEXT:    movzwl var_2, %esi
 ; X86-NEXT:    movl var_310, %ecx
 ; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    addl var_24, %ecx
-; X86-NEXT:    andl $4194303, %edx # imm = 0x3FFFFF
-; X86-NEXT:    leal (%edx,%edx), %ebx
-; X86-NEXT:    subl %eax, %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    subl %esi, %edi
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    movl $4194303, %edi # imm = 0x3FFFFF
+; X86-NEXT:    andl obj, %edi
+; X86-NEXT:    leal (%edi,%edi), %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    imull %ebx, %ecx
 ; X86-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
 ; X86-NEXT:    movl $9, %esi
 ; X86-NEXT:    xorl %ebp, %ebp
@@ -50,12 +50,12 @@
 ; X86-NEXT:    cmovnel %esi, %ebp
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    cmovnel %ecx, %esi
-; X86-NEXT:    cmpl %edx, %edi
+; X86-NEXT:    cmpl %edi, %ebx
 ; X86-NEXT:    movl %ebp, var_50+4
 ; X86-NEXT:    movl %esi, var_50
 ; X86-NEXT:    setge var_205
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    movb %bl, var_218
+; X86-NEXT:    imull %eax, %edx
+; X86-NEXT:    movb %dl, var_218
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
@@ -68,27 +68,27 @@
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl {{.*}}(%rip), %eax
-; X64-NEXT:    movsbl {{.*}}(%rip), %r9d
-; X64-NEXT:    movzwl {{.*}}(%rip), %r8d
-; X64-NEXT:    movl {{.*}}(%rip), %ecx
+; X64-NEXT:    movsbl var_27(%rip), %r9d
+; X64-NEXT:    movzwl var_2(%rip), %r8d
+; X64-NEXT:    movl var_310(%rip), %ecx
 ; X64-NEXT:    imull %r9d, %ecx
-; X64-NEXT:    addl {{.*}}(%rip), %ecx
-; X64-NEXT:    andl $4194303, %eax # imm = 0x3FFFFF
-; X64-NEXT:    leal (%rax,%rax), %edi
+; X64-NEXT:    addl var_24(%rip), %ecx
+; X64-NEXT:    movl $4194303, %esi # imm = 0x3FFFFF
+; X64-NEXT:    andl obj(%rip), %esi
+; X64-NEXT:    leal (%rsi,%rsi), %edi
 ; X64-NEXT:    subl %r9d, %edi
-; X64-NEXT:    movl %edi, %esi
-; X64-NEXT:    subl %r8d, %esi
-; X64-NEXT:    imull %esi, %ecx
+; X64-NEXT:    movl %edi, %edx
+; X64-NEXT:    subl %r8d, %edx
+; X64-NEXT:    imull %edx, %ecx
 ; X64-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
-; X64-NEXT:    movl $9, %edx
+; X64-NEXT:    movl $9, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq %rdx, {{.*}}(%rip)
-; X64-NEXT:    cmpl %eax, %esi
-; X64-NEXT:    setge {{.*}}(%rip)
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    movq %rax, var_50(%rip)
+; X64-NEXT:    cmpl %esi, %edx
+; X64-NEXT:    setge var_205(%rip)
 ; X64-NEXT:    imull %r9d, %edi
-; X64-NEXT:    movb %dil, {{.*}}(%rip)
+; X64-NEXT:    movb %dil, var_218(%rip)
 ; X64-NEXT:    retq
 entry:
   %bf.load = load i32, i32* bitcast (%struct.AA* @obj to i32*), align 8