diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5728,6 +5728,32 @@
   if (SDValue V = combineShiftAnd1ToBitTest(N, DAG))
     return V;
 
+  // Recognize the following pattern:
+  //
+  //   AndVT = (and (sign_extend NarrowVT to AndVT) #bitmask)
+  //
+  // where bitmask is a mask that clears all the upper bits of AndVT
+  // outside the bits of NarrowVT.
+  auto IsAndZeroExtMask = [this](SDValue LHS, SDValue RHS) {
+    if (LHS->getOpcode() != ISD::SIGN_EXTEND)
+      return false;
+
+    auto *C = dyn_cast<ConstantSDNode>(RHS);
+    if (!C || !C->getAPIntValue().isMask())
+      return false;
+
+    EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(),
+                                     C->getAPIntValue().countTrailingOnes());
+    if (NarrowVT != LHS.getOperand(0).getValueType())
+      return false;
+
+    return true;
+  };
+
+  // Replace (and (sign_extend ...) #bitmask) with (zero_extend ...).
+  if (IsAndZeroExtMask(N0, N1))
+    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));
+
   return SDValue();
 }
 
@@ -11335,16 +11361,38 @@
     }
   }
 
+  // Find the pattern that implements the in-register zero extension
+  // of illegal values, which is rendered as an and with a bit mask.
+  // For example, the node zero extending the load of an i8 value
+  // into an i32 value is rendered as:
+  //
+  //   i32 = (and (load i8) 0xff)
+  auto IsZeroExtInReg = [this](SDNode *N) -> bool {
+    if (N->getOpcode() != ISD::AND)
+      return false;
+
+    auto *AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    auto *LoadN = dyn_cast<LoadSDNode>(N->getOperand(0));
+    if (!AndC || !LoadN)
+      return false;
+
+    EVT LoadResultTy = LoadN->getMemoryVT();
+    EVT ExtVT;
+
+    return isAndLoadExtLoad(AndC, LoadN, LoadResultTy, ExtVT);
+  };
+
   // fold (sext_inreg (extload x)) -> (sextload x)
-  // If sextload is not supported by target, we can only do the combine when
-  // load has one use. Doing otherwise can block folding the extload with other
-  // extends that the target does support.
-  if (ISD::isEXTLoad(N0.getNode()) &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
+  // If sextload is not supported by the target, we can only do the combine
+  // when the load has one use. Doing otherwise can block folding the extload
+  // with other extends that the target does support. The folding does not
+  // happen if the load is used in a zero extension.
+  if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
       ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
       ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() && N0.hasOneUse()) ||
-       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) &&
+      !llvm::any_of(N0->uses(), IsZeroExtInReg)) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5885,9 +5885,17 @@
       // Don't combine AND since emitComparison converts the AND to an ANDS
       // (a.k.a. TST) and the test in the test bit and branch instruction
      // becomes redundant. This would also increase register pressure.
-      uint64_t Mask = LHS.getValueSizeInBits() - 1;
+      uint64_t SignBitPos = LHS.getValueSizeInBits() - 1;
+      // If LHS is a sext_inreg, we can check the sign bit of the
+      // original unextended data.
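+      // For instance, if LHS is (sext_inreg x, i8), every bit of LHS
+      // above bit 7 is a copy of bit 7 of x, so testing bit 7 of x is
+      // equivalent to testing the sign bit of LHS.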
+      if (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+        SignBitPos =
+            cast<VTSDNode>(LHS.getOperand(1))->getVT().getFixedSizeInBits() -
+            1;
+        LHS = LHS.getOperand(0);
+      }
       return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
-                         DAG.getConstant(Mask, dl, MVT::i64), Dest);
+                         DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
     }
   }
   if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
@@ -5895,9 +5903,16 @@
       // Don't combine AND since emitComparison converts the AND to an ANDS
       // (a.k.a. TST) and the test in the test bit and branch instruction
       // becomes redundant. This would also increase register pressure.
-      uint64_t Mask = LHS.getValueSizeInBits() - 1;
+      uint64_t SignBitPos = LHS.getValueSizeInBits() - 1;
+      // If LHS is a sext_inreg, we can check the sign bit of the
+      // original unextended data.
+      if (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+        SignBitPos =
+            cast<VTSDNode>(LHS.getOperand(1))->getVT().getFixedSizeInBits() - 1;
+        LHS = LHS.getOperand(0);
+      }
       return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
-                         DAG.getConstant(Mask, dl, MVT::i64), Dest);
+                         DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
     }
 
     SDValue CCVal;
diff --git a/llvm/test/CodeGen/AArch64/zext-and-signed-compare.ll b/llvm/test/CodeGen/AArch64/zext-and-signed-compare.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/zext-and-signed-compare.ll
@@ -0,0 +1,107 @@
+; RUN: llc -mtriple aarch64-linux-gnu -o - -asm-verbose=0 < %s | FileCheck %s
+
+; The purpose of the tests `f_*` and `g_*` is to make sure that the
+; zero extension of the load caused by the `zext` instruction is
+; preferred over the sign extension caused by the signed comparison
+; "greater than -1". The effect of prioritizing the zero extension
+; is to avoid generating a sign extension of the data being loaded.
+; This is done by making sure that the sign bit of the original
+; unextended data is checked instead of the sign bit of the
+; sign-extended value.
+;
+; The `f_*` and `g_*` tests differ slightly in their structure to make
+; sure that all the cases that compute the position of the sign bit in
+; AArch64ISelLowering.cpp (LowerBR_CC) are covered.
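+;
+; As a rough illustration (not checked by FileCheck; the registers and
+; labels below are made up), without this preference the byte loaded
+; in `f_i32_i8` would be sign extended just so that bit #31 of the
+; extended value could be tested, roughly:
+;
+;   ldrsb w8, [x0]
+;   and   w9, w8, #0xff
+;   tbnz  w8, #31, <taken>
+;
+; whereas the expected code tests bit #7 of the zero-extended load
+; directly:
+;
+;   ldrb w8, [x0]
+;   tbnz w8, #7, <taken>
+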
+ +define i32 @f_i32_i8(i8* %p) nounwind { +; CHECK-LABEL: f_i32_i8: +; CHECK-NEXT: ldrb w[[N:[0-9]+]], [x0] +; CHECK-NEXT: tbnz w[[N]], #7, .LBB[[BB:.*]] +; CHECK-NEXT: add w0, w[[N]], w[[N]] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB[[BB]] +; CHECK-NEXT: mul w0, w[[N]], w[[N]] +; CHECK-NEXT: ret +entry: + %0 = load i8, i8* %p + %conv = zext i8 %0 to i32 + %cmp = icmp sgt i8 %0, -1 + br i1 %cmp, label %A, label %B + +A: + %retval2 = add i32 %conv, %conv + ret i32 %retval2 + +B: + %retval1 = mul i32 %conv, %conv + ret i32 %retval1 +} + +define i32 @f_i32_i16(i16* %p) nounwind { +; CHECK-LABEL: f_i32_i16: +; CHECK-NEXT: ldrh w[[N:[0-9]+]], [x0] +; CHECK-NEXT: tbnz w[[N]], #15, .LBB[[BB:.*]] +; CHECK-NEXT: add w0, w[[N]], w[[N]] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB[[BB]] +; CHECK-NEXT: mul w0, w[[N]], w[[N]] +; CHECK-NEXT: ret +entry: + %0 = load i16, i16* %p + %conv = zext i16 %0 to i32 + %cmp = icmp sgt i16 %0, -1 + br i1 %cmp, label %A, label %B + +A: + %retval2 = add i32 %conv, %conv + ret i32 %retval2 + +B: + %retval1 = mul i32 %conv, %conv + ret i32 %retval1 +} + +define i32 @g_i32_i8(i8* %p) nounwind { +; CHECK-LABEL: g_i32_i8: +; CHECK-NEXT: ldrb w0, [x0] +; CHECK-NEXT: tbnz w0, #7, .LBB[[BB:.*]] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB[[BB]] +; CHECK-NEXT: lsl w0, w0, #1 +; CHECK-NEXT: ret +entry: + %0 = load i8, i8* %p, align 1 + %conv = zext i8 %0 to i32 + %cmp1 = icmp sgt i8 %0, -1 + br i1 %cmp1, label %return, label %B + +B: ; preds = %entry + %add = shl nuw nsw i32 %conv, 1 + ret i32 %add + +return: ; preds = %entry + ret i32 %conv +} + +define i32 @g_i32_i16(i16* %p) nounwind { +; CHECK-LABEL: g_i32_i16: +; CHECK-NEXT: ldrh w0, [x0] +; CHECK-NEXT: tbnz w0, #15, .LBB[[BB:.*]] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB[[BB]] +; CHECK-NEXT: lsl w0, w0, #1 +; CHECK-NEXT: ret +entry: + %0 = load i16, i16* %p, align 1 + %conv = zext i16 %0 to i32 + %cmp1 = icmp sgt i16 %0, -1 + br i1 %cmp1, label %return, label %B + +B: ; preds = %entry + %add = shl nuw nsw i32 %conv, 1 + ret i32 %add + +return: ; preds = %entry + ret i32 %conv +} + diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll @@ -343,8 +343,8 @@ ; ; CHECK-BE-LABEL: and_user: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-BE-NEXT: .save {r4, r5, r6, lr} +; CHECK-BE-NEXT: push {r4, r5, r6, lr} ; CHECK-BE-NEXT: cmp r0, #1 ; CHECK-BE-NEXT: blt .LBB3_4 ; CHECK-BE-NEXT: @ %bb.1: @ %for.body.preheader @@ -355,24 +355,23 @@ ; CHECK-BE-NEXT: .p2align 2 ; CHECK-BE-NEXT: .LBB3_2: @ %for.body ; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ldrsh lr, [r3, #2]! -; CHECK-BE-NEXT: ldrsh r5, [r2, #2]! -; CHECK-BE-NEXT: ldrsh.w r4, [r3, #2] -; CHECK-BE-NEXT: ldrsh.w r7, [r2, #2] -; CHECK-BE-NEXT: uxth.w r6, lr -; CHECK-BE-NEXT: smlabb r5, r5, lr, r12 -; CHECK-BE-NEXT: smlabb r12, r7, r4, r5 +; CHECK-BE-NEXT: ldrh lr, [r3, #2]! +; CHECK-BE-NEXT: ldrsh r4, [r2, #2]! 
+; CHECK-BE-NEXT: ldrsh.w r5, [r3, #2]
+; CHECK-BE-NEXT: ldrsh.w r6, [r2, #2]
+; CHECK-BE-NEXT: smlabb r4, r4, lr, r12
+; CHECK-BE-NEXT: smlabb r12, r6, r5, r4
 ; CHECK-BE-NEXT: subs r0, #1
-; CHECK-BE-NEXT: mul r1, r6, r1
+; CHECK-BE-NEXT: mul r1, lr, r1
 ; CHECK-BE-NEXT: bne .LBB3_2
 ; CHECK-BE-NEXT: @ %bb.3: @ %for.cond.cleanup
 ; CHECK-BE-NEXT: add.w r0, r12, r1
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, r6, pc}
 ; CHECK-BE-NEXT: .LBB3_4:
 ; CHECK-BE-NEXT: mov.w r12, #0
 ; CHECK-BE-NEXT: movs r1, #0
 ; CHECK-BE-NEXT: add.w r0, r12, r1
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, r6, pc}
 entry:
   %cmp24 = icmp sgt i32 %arg, 0
   br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/CodeGen/ARM/and-sext-combine.ll b/llvm/test/CodeGen/ARM/and-sext-combine.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/and-sext-combine.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mtriple=arm-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - -O3 \
+; RUN:   -asm-verbose=0 | FileCheck %s
+
+; This test exercises the folding of `VT = (and (sign_extend NarrowVT to
+; VT) #bitmask)` into `VT = (zero_extend NarrowVT to VT)`, where the
+; #bitmask value is the all-ones mask that selects the bits of the
+; NarrowVT value inside the value of type VT. The folding is
+; implemented in `DAGCombiner::visitAND`.
+
+; With this folding, the `and` of the sign-extended load of `%b` in
+; `f_i16_i32` is rendered as a zero-extended load.
+
+; CHECK-LABEL: f_i16_i32:
+; CHECK-NEXT: .fnstart
+; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: ldrsh r0, [r0]
+; CHECK-NEXT: smulbb r0, r0, r1
+; CHECK-NEXT: mul r0, r0, r1
+; CHECK-NEXT: bx lr
+define i32 @f_i16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b) local_unnamed_addr #0 {
+  %1 = load i16, i16* %a, align 2
+  %sext.1 = sext i16 %1 to i32
+  %2 = load i16, i16* %b, align 2
+  %sext.2 = sext i16 %2 to i32
+  %masked = and i32 %sext.2, 65535
+  %mul = mul nsw i32 %sext.2, %sext.1
+  %count.next = mul i32 %mul, %masked
+  ret i32 %count.next
+}
+
diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
--- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
+++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
@@ -82,19 +82,19 @@
 ; ENABLE-NEXT: bhi .LBB0_7
 ; ENABLE-NEXT: @ %bb.14: @ %while.body24.preheader
 ; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1
-; ENABLE-NEXT: sub r3, r3, #2
+; ENABLE-NEXT: sub lr, r3, #2
 ; ENABLE-NEXT: .LBB0_15: @ %while.body24
 ; ENABLE-NEXT: @ Parent Loop BB0_7 Depth=1
 ; ENABLE-NEXT: @ => This Inner Loop Header: Depth=2
-; ENABLE-NEXT: mov r0, r3
-; ENABLE-NEXT: cmp r3, r2
+; ENABLE-NEXT: mov r0, lr
+; ENABLE-NEXT: cmp lr, r2
 ; ENABLE-NEXT: bls .LBB0_7
 ; ENABLE-NEXT: @ %bb.16: @ %while.body24.land.rhs14_crit_edge
 ; ENABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2
-; ENABLE-NEXT: mov r3, r0
-; ENABLE-NEXT: ldrsb lr, [r3], #-1
-; ENABLE-NEXT: cmn lr, #1
-; ENABLE-NEXT: uxtb r12, lr
+; ENABLE-NEXT: mov lr, r0
+; ENABLE-NEXT: ldrb r12, [lr], #-1
+; ENABLE-NEXT: sxtb r3, r12
+; ENABLE-NEXT: cmn r3, #1
 ; ENABLE-NEXT: bgt .LBB0_7
 ; ENABLE-NEXT: @ %bb.17: @ %while.body24.land.rhs14_crit_edge
 ; ENABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2
@@ -172,19 +172,19 @@
 ; DISABLE-NEXT: bhi .LBB0_7
 ; DISABLE-NEXT: @ %bb.14: @ %while.body24.preheader
 ; DISABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1
-; DISABLE-NEXT: sub r3, r3, #2
+; DISABLE-NEXT: sub lr, r3, #2
 ; DISABLE-NEXT: .LBB0_15: @ %while.body24
 ; DISABLE-NEXT: @ Parent 
Loop BB0_7 Depth=1 ; DISABLE-NEXT: @ => This Inner Loop Header: Depth=2 -; DISABLE-NEXT: mov r0, r3 -; DISABLE-NEXT: cmp r3, r2 +; DISABLE-NEXT: mov r0, lr +; DISABLE-NEXT: cmp lr, r2 ; DISABLE-NEXT: bls .LBB0_7 ; DISABLE-NEXT: @ %bb.16: @ %while.body24.land.rhs14_crit_edge ; DISABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2 -; DISABLE-NEXT: mov r3, r0 -; DISABLE-NEXT: ldrsb lr, [r3], #-1 -; DISABLE-NEXT: cmn lr, #1 -; DISABLE-NEXT: uxtb r12, lr +; DISABLE-NEXT: mov lr, r0 +; DISABLE-NEXT: ldrb r12, [lr], #-1 +; DISABLE-NEXT: sxtb r3, r12 +; DISABLE-NEXT: cmn r3, #1 ; DISABLE-NEXT: bgt .LBB0_7 ; DISABLE-NEXT: @ %bb.17: @ %while.body24.land.rhs14_crit_edge ; DISABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2 diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll --- a/llvm/test/CodeGen/ARM/select-imm.ll +++ b/llvm/test/CodeGen/ARM/select-imm.ll @@ -218,38 +218,65 @@ ; ARM scheduler emits icmp/zext before both calls, so isn't relevant ; ARMT2-LABEL: t9: -; ARMT2: bl f -; ARMT2: uxtb r0, r4 -; ARMT2: cmp r0, r0 -; ARMT2: add r1, r4, #1 -; ARMT2: mov r2, r0 -; ARMT2: add r2, r2, #1 -; ARMT2: add r1, r1, #1 -; ARMT2: uxtb r3, r2 -; ARMT2: cmp r3, r0 +; ARMT2: .save {r4, lr} +; ARMT2: push {r4, lr} +; ARMT2: ldrb r4, [r0] +; ARMT2: mov r0, #1 +; ARMT2: bl f +; ARMT2: cmp r4, r4 +; ARMT2: popne {r4, pc} +; ARMT2: .LBB8_1: +; ARMT2: sxtb r0, r4 +; ARMT2: add r0, r0, #1 +; ARMT2: mov r1, r4 +; ARMT2: .LBB8_2: +; ARMT2: add r1, r1, #1 +; ARMT2: add r0, r0, #1 +; ARMT2: uxtb r2, r1 +; ARMT2: cmp r2, r4 +; ARMT2: blt .LBB8_2 +; ARMT2: pop {r4, pc} ; THUMB1-LABEL: t9: -; THUMB1: bl f -; THUMB1: sxtb r1, r4 -; THUMB1: uxtb r0, r1 -; THUMB1: cmp r0, r0 -; THUMB1: adds r1, r1, #1 -; THUMB1: mov r2, r0 -; THUMB1: adds r1, r1, #1 -; THUMB1: adds r2, r2, #1 -; THUMB1: uxtb r3, r2 -; THUMB1: cmp r3, r0 +; THUMB1: .save {r4, lr} +; THUMB1: push {r4, lr} +; THUMB1: ldrb r4, [r0] +; THUMB1: movs r0, #1 +; THUMB1: bl f +; THUMB1: cmp r4, r4 +; THUMB1: bne .LBB8_3 +; THUMB1: sxtb r0, r4 +; THUMB1: adds r0, r0, #1 +; THUMB1: mov r1, r4 +; THUMB1: .LBB8_2: +; THUMB1: adds r0, r0, #1 +; THUMB1: adds r1, r1, #1 +; THUMB1: uxtb r2, r1 +; THUMB1: cmp r2, r4 +; THUMB1: blt .LBB8_2 +; THUMB1: .LBB8_3: +; THUMB1: pop {r4, pc} ; THUMB2-LABEL: t9: -; THUMB2: bl f -; THUMB2: uxtb r0, r4 -; THUMB2: cmp r0, r0 -; THUMB2: adds r1, r4, #1 -; THUMB2: mov r2, r0 -; THUMB2: adds r2, #1 -; THUMB2: adds r1, #1 -; THUMB2: uxtb r3, r2 -; THUMB2: cmp r3, r0 +; THUMB2: .save {r4, lr} +; THUMB2: push {r4, lr} +; THUMB2: ldrb r4, [r0] +; THUMB2: movs r0, #1 +; THUMB2: bl f +; THUMB2: cmp r4, r4 +; THUMB2: it ne +; THUMB2: popne {r4, pc} +; THUMB2: .LBB8_1: +; THUMB2: sxtb r0, r4 +; THUMB2: adds r0, #1 +; THUMB2: mov r1, r4 +; THUMB2: .LBB8_2: +; THUMB2: adds r1, #1 +; THUMB2: adds r0, #1 +; THUMB2: uxtb r2, r1 +; THUMB2: cmp r2, r4 +; THUMB2: blt .LBB8_2 +; THUMB2: pop {r4, pc} %0 = load i8, i8* %a %conv = sext i8 %0 to i32