Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -10098,6 +10098,97 @@
   return SDValue();
 }
 
+static SDValue PerformSHLSimplify(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const ARMSubtarget *ST) {
+  // Allow the generic combiner to identify potential bswaps.
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  // No shifted operands for 16-bit instructions.
+  if (ST->isThumb() && ST->isThumb1Only())
+    return SDValue();
+
+  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
+      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (N->getOperand(0).getOpcode() != ISD::SHL)
+    return SDValue();
+
+  SDValue SHL = N->getOperand(0);
+
+  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+  if (!C1ShlC2 || !C2)
+    return SDValue();
+
+  DEBUG(dbgs() << "Trying to simplify shl: "; N->dump());
+  // DAG combiner will fold:
+  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+  // (shl (or x, c1), c2)  -> (or (shl x, c2), c1 << c2)
+  // Other code patterns that can also be modified have the following form:
+  // b + ((a << 1) | 510)
+  // b + ((a << 1) & 510)
+  // b + ((a << 1) ^ 510)
+  // b + ((a << 1) + 510)
+
+  // Many instructions can perform the shift for free, but it requires both
+  // operands to be registers. If c1 << c2 is too large, a mov immediate
+  // instruction will be needed. So, unfold back to the original pattern if:
+  // - c1 and c2 are small enough that they don't require mov imms.
+  // - the user(s) of the node can perform an shl.
+
+  APInt C2Int = C2->getAPIntValue();
+  APInt C1Int = C1ShlC2->getAPIntValue();
+
+  // Check that performing a lshr will not lose any information.
+  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
+                                     C2Int.getBitWidth() - C2->getZExtValue());
+  if ((C1Int & Mask) != C1Int)
+    return SDValue();
+
+  // Shift the first constant.
+  C1Int.lshrInPlace(C2Int);
+
+  // The immediates are encoded as an 8-bit value that can be rotated.
+  unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
+  if (C1Int.getBitWidth() - Zeros > 8)
+    return SDValue();
+
+  Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
+  if (C2Int.getBitWidth() - Zeros > 8)
+    return SDValue();
+
+  // Check that all the users could perform the shl themselves.
+  SDValue BinOp = SDValue(N, 0);
+  for (auto U : N->uses()) {
+    switch(U->getOpcode()) {
+    default:
+      return SDValue();
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::SETCC:
+    case ARMISD::CMP:
+      break;
+    }
+  }
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue X = SHL.getOperand(0);
+  BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
+                      DAG.getConstant(C1Int, dl, MVT::i32));
+  // Shift left to compensate for the lshr of C1Int.
+  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
+
+  DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
+  return SDValue(N, 0);
+}
+
 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
 ///
 static SDValue PerformADDCombine(SDNode *N,
@@ -10106,6 +10197,10 @@
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // Only works one way, because it needs an immediate operand.
+  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+    return Result;
+
   // First try with the default operand order.
   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
     return Result;
@@ -10294,6 +10389,9 @@
     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
      return Result;
+
+    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+      return Result;
   }
 
   return SDValue();
@@ -10519,17 +10617,19 @@
       return Result;
   }
 
-  // The code below optimizes (or (and X, Y), Z).
-  // The AND operand needs to have a single user to make these optimizations
-  // profitable.
   SDValue N0 = N->getOperand(0);
-  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
-    return SDValue();
   SDValue N1 = N->getOperand(1);
 
   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
   if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+
+    // The code below optimizes (or (and X, Y), Z).
+    // The AND operand needs to have a single user to make these optimizations
+    // profitable.
+    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+      return SDValue();
+
     APInt SplatUndef;
     unsigned SplatBitSize;
     bool HasAnyUndefs;
@@ -10567,8 +10667,13 @@
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
     return SDValue();
 
-  if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
-    return Res;
+  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
+      return Res;
+  }
+
+  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+    return Result;
 
   return SDValue();
 }
@@ -10586,6 +10691,9 @@
     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
       return Result;
+
+    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+      return Result;
   }
 
   return SDValue();
Index: test/CodeGen/ARM/load-combine-big-endian.ll
===================================================================
--- test/CodeGen/ARM/load-combine-big-endian.ll
+++ test/CodeGen/ARM/load-combine-big-endian.ll
@@ -38,12 +38,8 @@
 ; BSWAP is not supported by 32 bit target
 ; CHECK-LABEL: load_i32_by_i8_bswap:
 ; CHECK: ldr r0, [r0]
-; CHECK: and
-; CHECK-NEXT: and
-; CHECK-NEXT: orr
-; CHECK-NEXT: orr
-; CHECK-NEXT: orr
-; CHECK-NEXT: mov pc, lr
+; CHECK-NOT: rev
+; CHECK: mov pc, lr
 
 ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
 ; CHECK-ARMv6: ldr r0, [r0]
@@ -159,16 +155,7 @@
 ; CHECK-LABEL: load_i64_by_i8_bswap:
 ; CHECK: ldr{{.*}}r0
 ; CHECK: ldr{{.*}}r0
-; CHECK: and
-; CHECK-NEXT: and
-; CHECK-NEXT: orr
-; CHECK-NEXT: orr
-; CHECK-NEXT: and
-; CHECK-NEXT: orr
-; CHECK-NEXT: and
-; CHECK-NEXT: orr
-; CHECK-NEXT: orr
-; CHECK-NEXT: orr
+; CHECK-NOT: rev
 ; CHECK: mov pc, lr
 
 ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -276,13 +263,13 @@
 ; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
 ; CHECK: ldr r0, [r0, #1]
 ; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
 ; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
 ; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
 ; CHECK-NEXT: orr r0, r0, r1
-; CHECK-NEXT: mov pc, lr
+; CHECK: mov pc, lr
 
 ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
 ; CHECK-ARMv6: ldr r0, [r0, #1]
@@ -317,11 +304,11 @@
 ; CHECK-LABEL: load_i32_by_i8_neg_offset:
 ; CHECK: ldr r0, [r0, #-4]
 ; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
 ; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
 ; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
 ; CHECK-NEXT: orr r0, r0, r1
 ; CHECK-NEXT: mov pc, lr
@@ -424,13 +411,13 @@
 ; (i32) bswap(p[0]) | (i32) bswap(p[1] << 16)
 define i32 @load_i32_by_bswap_i16(i32* %arg) {
 ; CHECK-LABEL: load_i32_by_bswap_i16:
-; CHECK: ldr r0, [r0]
+; CHECK: ldr r0, [r0]
 ; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
 ; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
 ; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
 ; CHECK-NEXT: orr r0, r0, r1
 ; CHECK-NEXT: mov pc, lr
@@ -480,12 +467,12 @@
 ; CHECK-LABEL: load_i32_by_i8_base_offset_index:
 ; CHECK: add r0, r0, r1
 ; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
 ; CHECK-NEXT: ldr r0, [r0, #12]
+; CHECK-NEXT: and r2, r0, #65280
 ; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
 ; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
 ; CHECK-NEXT: orr r0, r0, r1
 ; CHECK-NEXT: mov pc, lr
 ;
@@ -530,12 +517,12 @@
 ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
 ; CHECK: add r0, r0, r1
 ; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
 ; CHECK-NEXT: ldr r0, [r0, #13]
+; CHECK-NEXT: and r2, r0, #65280
 ; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
 ; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
 ; CHECK-NEXT: orr r0, r0, r1
 ; CHECK-NEXT: mov pc, lr
 ;
Index: test/CodeGen/ARM/load-combine.ll
===================================================================
--- test/CodeGen/ARM/load-combine.ll
+++ test/CodeGen/ARM/load-combine.ll
@@ -79,11 +79,7 @@
 ; BSWAP is not supported by 32 bit target
 ; CHECK-LABEL: load_i32_by_i8_bswap:
 ; CHECK: ldr r0, [r0]
-; CHECK: and
-; CHECK-NEXT: and
-; CHECK-NEXT: orr
-; CHECK-NEXT: orr
-; CHECK-NEXT: orr
+; CHECK-NOT: rev
 ; CHECK: mov pc, lr
 
 ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
@@ -170,16 +166,7 @@
 ; CHECK-LABEL: load_i64_by_i8_bswap:
 ; CHECK: ldr{{.*}}r0
 ; CHECK: ldr{{.*}}r0
-; CHECK: and
-; CHECK-NEXT: and
-; CHECK-NEXT: orr
-; CHECK-NEXT: orr
-; CHECK-NEXT: and
-; CHECK-NEXT: orr
-; CHECK-NEXT: and
-; CHECK-NEXT: orr
-; CHECK-NEXT: orr
-; CHECK-NEXT: orr
+; CHECK-NOT: rev
 ; CHECK: mov pc, lr
 
 ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -300,11 +287,11 @@
 ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
 ; CHECK: ldr r0, [r0, #1]
 ; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
 ; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
 ; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
 ; CHECK-NEXT: orr r0, r0, r1
 ; CHECK-NEXT: mov pc, lr
@@ -341,13 +328,13 @@
 ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
 ; CHECK: ldr r0, [r0, #-4]
 ; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
 ; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
 ; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
 ; CHECK-NEXT: orr r0, r0, r1
-; CHECK-NEXT: mov pc, lr
+; CHECK: mov pc, lr
 
 ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
 ; CHECK-ARMv6: ldr r0, [r0, #-4]
@@ -384,11 +371,11 @@
 ; CHECK-LABEL: load_i32_by_bswap_i16:
 ; CHECK: ldr r0, [r0]
 ; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
 ; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
 ; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
 ; CHECK-NEXT: orr r0, r0, r1
 ; CHECK-NEXT: mov pc, lr
Index: test/CodeGen/ARM/unfold-shifts.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/unfold-shifts.ll
@@ -0,0 +1,173 @@
+; RUN: llc -mtriple armv6t2 %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv6t2 %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple armv7 %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv7 %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple thumbv8m.main %s -o - | FileCheck %s --check-prefix=CHECK-T2
+
+; CHECK-LABEL: unfold1
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #255
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #255
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold1(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 1
+  %shl = or i32 %or, 510
+  %add = add nsw i32 %shl, %b
+  ret i32 %add
+}
+
+; CHECK-LABEL: unfold2
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #4080
+; CHECK: sub r0, r1, r0, lsl #2
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #4080
+; CHECK-T2: sub.w r0, r1, r0, lsl #2
+define arm_aapcscc i32 @unfold2(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 2
+  %shl = or i32 %or, 16320
+  %sub = sub nsw i32 %b, %shl
+  ret i32 %sub
+}
+
+; CHECK-LABEL: unfold3
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #65280
+; CHECK: and r0, r1, r0, lsl #4
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #65280
+; CHECK-T2: and.w r0, r1, r0, lsl #4
+define arm_aapcscc i32 @unfold3(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 4
+  %shl = or i32 %or, 1044480
+  %and = and i32 %shl, %b
+  ret i32 %and
+}
+
+; CHECK-LABEL: unfold4
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #1044480
+; CHECK: eor r0, r1, r0, lsl #5
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #1044480
+; CHECK-T2: eor.w r0, r1, r0, lsl #5
+define arm_aapcscc i32 @unfold4(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 5
+  %shl = or i32 %or, 33423360
+  %xor = xor i32 %shl, %b
+  ret i32 %xor
+}
+
+; CHECK-LABEL: unfold5
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #496
+; CHECK: orr r0, r1, r0, lsl #6
+; CHECK-T2: add.w r0, r0, #496
+; CHECK-T2: orr.w r0, r1, r0, lsl #6
+define arm_aapcscc i32 @unfold5(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 6
+  %shl = add i32 %add, 31744
+  %or = or i32 %shl, %b
+  ret i32 %or
+}
+
+; CHECK-LABEL: unfold6
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #7936
+; CHECK: and r0, r1, r0, lsl #8
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #7936
+; CHECK-T2: and.w r0, r1, r0, lsl #8
+define arm_aapcscc i32 @unfold6(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 8
+  %shl = add i32 %add, 2031616
+  %and = and i32 %shl, %b
+  ret i32 %and
+}
+
+; CHECK-LABEL: unfold7
+; CHECK-NOT: mov
+; CHECK: and r0, r0, #256
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: and r0, r0, #256
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold7(i32 %a, i32 %b) {
+entry:
+  %shl = shl i32 %a, 1
+  %and = and i32 %shl, 512
+  %add = add nsw i32 %and, %b
+  ret i32 %add
+}
+
+; CHECK-LABEL: unfold8
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #126976
+; CHECK: eor r0, r1, r0, lsl #9
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #126976
+; CHECK-T2: eor.w r0, r1, r0, lsl #9
+define arm_aapcscc i32 @unfold8(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 9
+  %shl = add i32 %add, 65011712
+  %xor = xor i32 %shl, %b
+  ret i32 %xor
+}
+
+; CHECK-LABEL: unfold9
+; CHECK-NOT: mov
+; CHECK: eor r0, r0, #255
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: eor r0, r0, #255
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold9(i32 %a, i32 %b) {
+entry:
+  %shl = shl i32 %a, 1
+  %xor = xor i32 %shl, 510
+  %add = add nsw i32 %xor, %b
+  ret i32 %add
+}
+
+; CHECK-LABEL: unfold10
+; CHECK-NOT: mov r2
+; CHECK: orr r2, r0, #4080
+; CHECK: cmp r1, r2, lsl #10
+; CHECK-T2-NOT: mov.w r2
+; CHECK-T2: orr r2, r0, #4080
+; CHECK-T2: cmp.w r1, r2, lsl #10
+define arm_aapcscc i32 @unfold10(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 10
+  %shl = or i32 %or, 4177920
+  %cmp = icmp sgt i32 %shl, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: unfold11
+; CHECK-NOT: mov r2
+; CHECK: add r2, r0, #7936
+; CHECK: cmp r1, r2, lsl #11
+; CHECK-T2-NOT: mov.w r2
+; CHECK-T2: add.w r2, r0, #7936
+; CHECK-T2: cmp.w r1, r2, lsl #11
+define arm_aapcscc i32 @unfold11(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 11
+  %shl = add i32 %add, 16252928
+  %cmp = icmp sgt i32 %shl, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
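
A minimal standalone sketch of the pattern PerformSHLSimplify targets, for reference only and not part of the patch (it mirrors the unfold1 test above). Assuming an llc built with this change, `llc -mtriple armv7` on the IR below should fold the shift into the add as a shifted operand instead of materializing 510 with a separate mov:

; The DAG combiner first folds (add (shl %a, 1), ...) patterns into
; (add (or (shl %a, 1), 510), %b); PerformSHLSimplify unfolds this back to
; (add (shl (or %a, 255), 1), %b) because both 255 and the shift amount are
; encodable, so the expected ARM output is roughly:
;   orr r0, r0, #255
;   add r0, r1, r0, lsl #1
define i32 @shl_unfold_sketch(i32 %a, i32 %b) {
entry:
  %shl = shl i32 %a, 1
  %or = or i32 %shl, 510
  %add = add i32 %or, %b
  ret i32 %add
}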