Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -574,6 +574,7 @@ SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); SDValue CombineExtLoad(SDNode *N); SDValue CombineZExtLogicopShiftLoad(SDNode *N); + SDValue CombineZExtLogicopDoubleExtLoad(SDNode *N); SDValue combineRepeatedFPDivisors(SDNode *N); SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex); SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); @@ -6505,6 +6506,11 @@ } } + // fold (and (ext (and/or/xor (extload x, extload x))) C) -> + // (zext (and/or/xor (zextload x, zextload x))) + if (SDValue R = CombineZExtLogicopDoubleExtLoad(N)) + return R; + // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) // fold (and (sra)) -> (and (srl)) when possible. if (SimplifyDemandedBits(SDValue(N, 0))) @@ -11971,6 +11977,63 @@ return SDValue(N,0); // Return N so it doesn't get rechecked! 
} +// fold (and (ext (and/or/xor (extload x, extload x))) C) -> +// (zext (and/or/xor (zextload x, zextload x))) +SDValue DAGCombiner::CombineZExtLogicopDoubleExtLoad(SDNode *N) { + assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + if (!N0.hasOneUse() || N0.getOpcode() != ISD::ANY_EXTEND || + N1.getOpcode() != ISD::Constant) + return SDValue(); + + EVT VT = N0.getNode()->getValueType(0); + EVT OrigVT = N0.getOperand(0).getValueType(); + + // one-use of and/or/xor + SDValue Logicop = N0.getOperand(0); + if ((Logicop.getOpcode() != ISD::AND && Logicop.getOpcode() != ISD::OR && + Logicop.getOpcode() != ISD::XOR) || + !Logicop.hasOneUse()) + return SDValue(); + + // loads + if (!isa<LoadSDNode>(Logicop->getOperand(0)) || + !isa<LoadSDNode>(Logicop->getOperand(1))) + return SDValue(); + + LoadSDNode *Load0 = cast<LoadSDNode>(Logicop->getOperand(0)); + LoadSDNode *Load1 = cast<LoadSDNode>(Logicop->getOperand(1)); + EVT MemVT0 = Load0->getMemoryVT(); + EVT MemVT1 = Load1->getMemoryVT(); + unsigned ExtBitSize = N1.getScalarValueSizeInBits(); + unsigned MemBitSize = MemVT0.getScalarSizeInBits(); + APInt Mask = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize); + APInt N1Val(ExtBitSize, + (unsigned)cast<ConstantSDNode>(N1)->getZExtValue() + 1); + if (!Load0->hasOneUse() || !Load1->hasOneUse() || MemVT0 != MemVT1 || + !DAG.MaskedValueIsZero(N1, Mask) || !N1Val.isPowerOf2()) + return SDValue(); + + // Ensure a ZEXTLOAD is actually legal. + if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, OrigVT, MemVT0) || + Load0->getExtensionType() != ISD::EXTLOAD || + Load1->getExtensionType() != ISD::EXTLOAD) + return SDValue(); + + // Actually do the transformation. 
+ SDValue ExtLoad0 = + DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load0), OrigVT, Load0->getChain(), + Load0->getBasePtr(), MemVT0, Load0->getMemOperand()); + SDValue ExtLoad1 = + DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load1), OrigVT, Load1->getChain(), + Load1->getBasePtr(), MemVT1, Load1->getMemOperand()); + + SDValue Logic = + DAG.getNode(Logicop.getOpcode(), SDLoc(N), OrigVT, ExtLoad0, ExtLoad1); + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Logic); +} + /// If we're narrowing or widening the result of a vector select and the final /// size is the same size as a setcc (compare) feeding the select, then try to /// apply the cast operation to the select's operands because matching vector Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8564,6 +8564,10 @@ if (Num == MaxXors) return false; + // Skip the one-use zext + if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse()) + N = N->getOperand(0); + // The leaf node must be XOR if (N->getOpcode() == ISD::XOR) { WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1))); Index: llvm/test/CodeGen/AArch64/bcmp.ll =================================================================== --- llvm/test/CodeGen/AArch64/bcmp.ll +++ llvm/test/CodeGen/AArch64/bcmp.ll @@ -133,19 +133,16 @@ ret i1 %r } -; TODO: or (xor a, b), (and (xor c, d), C2) +; or (xor a, b), (and (xor c, d), C2) define i1 @bcmp9(ptr %a, ptr %b) { ; CHECK-LABEL: bcmp9: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w9, [x0, #8] -; CHECK-NEXT: ldrb w10, [x1, #8] ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ldr x11, [x1] -; CHECK-NEXT: eor w9, w9, w10 -; CHECK-NEXT: and x9, x9, #0xff -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: ldr x9, [x1] +; CHECK-NEXT: ldrb w10, [x0, #8] +; CHECK-NEXT: ldrb w11, [x1, #8] +; CHECK-NEXT: cmp x8, 
x9 +; CHECK-NEXT: ccmp w10, w11, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 9) @@ -156,15 +153,12 @@ define i1 @bcmp10(ptr %a, ptr %b) { ; CHECK-LABEL: bcmp10: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w9, [x0, #8] -; CHECK-NEXT: ldrh w10, [x1, #8] ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ldr x11, [x1] -; CHECK-NEXT: eor w9, w9, w10 -; CHECK-NEXT: and x9, x9, #0xffff -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: ldr x9, [x1] +; CHECK-NEXT: ldrh w10, [x0, #8] +; CHECK-NEXT: ldrh w11, [x1, #8] +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: ccmp w10, w11, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 10) @@ -195,10 +189,8 @@ ; CHECK-NEXT: ldr x9, [x1] ; CHECK-NEXT: ldr w10, [x0, #8] ; CHECK-NEXT: ldr w11, [x1, #8] -; CHECK-NEXT: eor x8, x8, x9 -; CHECK-NEXT: eor w9, w10, w11 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: ccmp w10, w11, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 12) @@ -274,13 +266,10 @@ ; CHECK-NEXT: ldp x8, x9, [x0] ; CHECK-NEXT: ldp x10, x11, [x1] ; CHECK-NEXT: ldr w12, [x0, #16] -; CHECK-NEXT: ldr w13, [x1, #16] -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x11 -; CHECK-NEXT: eor w10, w12, w13 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: orr x8, x8, x10 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ldr w8, [x1, #16] +; CHECK-NEXT: ccmp x9, x11, #0, eq +; CHECK-NEXT: ccmp w12, w8, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 20) @@ -311,17 +300,13 @@ ; CHECK-NEXT: ldp x8, x9, [x0] ; CHECK-NEXT: ldp x10, x11, [x1] ; CHECK-NEXT: ldr x12, [x0, #16] -; CHECK-NEXT: ldr x13, [x1, #16] -; CHECK-NEXT: ldr w14, [x0, #24] -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: ldr w15, [x1, #24] -; CHECK-NEXT: eor x9, x9, x11 -; CHECK-NEXT: eor x10, 
x12, x13 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: eor w11, w14, w15 -; CHECK-NEXT: orr x9, x10, x11 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ldr x8, [x1, #16] +; CHECK-NEXT: ccmp x9, x11, #0, eq +; CHECK-NEXT: ldr w9, [x0, #24] +; CHECK-NEXT: ldr w10, [x1, #24] +; CHECK-NEXT: ccmp x12, x8, #0, eq +; CHECK-NEXT: ccmp w9, w10, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 28) @@ -334,21 +319,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp x8, x9, [x0] ; CHECK-NEXT: ldp x10, x11, [x1] -; CHECK-NEXT: ldp x12, x13, [x0, #16] -; CHECK-NEXT: ldp x14, x15, [x1, #16] -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x11 -; CHECK-NEXT: ldrb w16, [x0, #32] -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: ldrb w17, [x1, #32] -; CHECK-NEXT: eor x10, x12, x14 -; CHECK-NEXT: eor x11, x13, x15 -; CHECK-NEXT: eor w12, w16, w17 -; CHECK-NEXT: orr x9, x10, x11 -; CHECK-NEXT: and x10, x12, #0xff -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: orr x8, x8, x10 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x11, #0, eq +; CHECK-NEXT: ldrb w11, [x1, #32] +; CHECK-NEXT: ldp x8, x9, [x0, #16] +; CHECK-NEXT: ldp x12, x10, [x1, #16] +; CHECK-NEXT: ccmp x8, x12, #0, eq +; CHECK-NEXT: ldrb w8, [x0, #32] +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: ccmp w8, w11, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 33)