Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -500,6 +500,9 @@
     /// the load would need to be narrowed in order to match.
     bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
                           EVT LoadResultTy, EVT &ExtVT, bool &NarrowLoad);
+    bool SearchForNarrowLoads(SDNode *N, SmallPtrSetImpl<LoadSDNode*> &Loads,
+                              ConstantSDNode *Mask);
+    bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG);
 
     /// Helper function for MergeConsecutiveStores which merges the
     /// component store chains.
@@ -3730,6 +3733,92 @@
   return true;
 }
 
+bool DAGCombiner::SearchForNarrowLoads(SDNode *N,
+                                       SmallPtrSetImpl<LoadSDNode*> &Loads,
+                                       ConstantSDNode *Mask) {
+  // Recursively search the operands, looking for loads which can be
+  // narrowed.
+  for (unsigned i = 0; i < N->getNumOperands(); ++i) {
+    SDNode *Op = N->getOperand(i).getNode();
+
+    // Constants should already be fixed up...
+    if (isa<ConstantSDNode>(Op))
+      continue;
+
+    if (!Op->hasOneUse() || Op->getValueType(0).isVector())
+      return false;
+
+    if (auto *Load = dyn_cast<LoadSDNode>(Op)) {
+      EVT ExtVT;
+      bool NarrowLoad = false;
+      if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT,
+                           NarrowLoad)) {
+        if (!NarrowLoad)
+          return false;
+        Loads.insert(Load);
+        continue;
+      } else
+        return false;
+    }
+
+    switch (Op->getOpcode()) {
+    default:
+      return false;
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::AND:
+      SearchForNarrowLoads(Op, Loads, Mask);
+      break;
+    }
+  }
+  return true;
+}
+
+bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
+  // The search function will check whether this constant is a suitable mask.
+  auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!Mask)
+    return false;
+
+  // No need to do anything if the AND directly uses a load.
+  if (isa<LoadSDNode>(N->getOperand(0)))
+    return false;
+
+  SmallPtrSet<LoadSDNode*, 8> Loads;
+  if (SearchForNarrowLoads(N, Loads, Mask)) {
+    if (Loads.size() == 0)
+      return false;
+
+    for (auto *Load : Loads) {
+      assert(Load->hasOneUse() && "Can't transform this load");
+      SDNode::use_iterator Use = Load->use_begin();
+      SDNode *User = *Use;
+
+      // Create the AND which will enable the narrowing.
+      SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), N->getValueType(0),
+                                SDValue(Load, 0), N->getOperand(1));
+
+      SDValue OtherOp = Use.getOperandNo() == 0 ? User->getOperand(1) :
+                                                  User->getOperand(0);
+      // Supported opcodes are commutable.
+      SDValue NewUser = DAG.getNode(User->getOpcode(), SDLoc(User),
+                                    User->getValueType(0), And, OtherOp);
+      DAG.ReplaceAllUsesWith(User, NewUser.getNode());
+      DAG.RemoveDeadNode(User);
+
+      // Reduce the load width; this needs to happen once the original user
+      // is removed because the load can only have one use.
+      SDValue NewLoad = ReduceLoadWidth(And.getNode());
+      assert(NewLoad &&
+             "Shouldn't be masking the load if it can't be narrowed");
+      CombineTo(Load, NewLoad, NewLoad.getValue(1));
+    }
+    DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
+    return true;
+  }
+  return false;
+}
+
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -3928,6 +4017,16 @@
     }
   }
 
+  if (Level >= AfterLegalizeTypes) {
+    // Attempt to propagate the AND back up to the leaves which, if they're
+    // loads, can be combined to narrow loads and the AND node can be removed.
+    // Perform after legalization so that extend nodes will already be
+    // combined into the loads.
+    if (BackwardsPropagateMask(N, DAG)) {
+      return SDValue(N, 0);
+    }
+  }
+
   if (SDValue Combined = visitANDLike(N0, N1, N))
     return Combined;
 
Index: test/CodeGen/ARM/and-load-combine.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/and-load-combine.ll
@@ -0,0 +1,199 @@
+; RUN: llc -mtriple=armv7 %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv7eb %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv6m %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s
+
+; CHECK-LABEL: cmp_xor8_short_short
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_xor8_short_short(i16* nocapture readonly %a,
+                                                    i16* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %1 = load i16, i16* %b, align 2
+  %xor2 = xor i16 %1, %0
+  %2 = and i16 %xor2, 255
+  %cmp = icmp eq i16 %2, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_xor8_short_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_xor8_short_int(i16* nocapture readonly %a,
+                                                  i32* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %conv = zext i16 %0 to i32
+  %1 = load i32, i32* %b, align 4
+  %xor = xor i32 %1, %conv
+  %and = and i32 %xor, 255
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_xor8_int_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_xor8_int_int(i32* nocapture readonly %a,
+                                                i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %xor = xor i32 %1, %0
+  %and = and i32 %xor, 255
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_xor16
+; CHECK: ldrh
+; CHECK: ldrh
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_xor16(i32* nocapture readonly %a,
+                                         i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %xor = xor i32 %1, %0
+  %and = and i32 %xor, 65535
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_or8_short_short
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_or8_short_short(i16* nocapture readonly %a,
+                                                   i16* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %1 = load i16, i16* %b, align 2
+  %or2 = or i16 %1, %0
+  %2 = and i16 %or2, 255
+  %cmp = icmp eq i16 %2, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_or8_short_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_or8_short_int(i16* nocapture readonly %a,
+                                                 i32* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %conv = zext i16 %0 to i32
+  %1 = load i32, i32* %b, align 4
+  %or = or i32 %1, %conv
+  %and = and i32 %or, 255
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_or8_int_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_or8_int_int(i32* nocapture readonly %a,
+                                               i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %or = or i32 %1, %0
+  %and = and i32 %or, 255
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_or16
+; CHECK: ldrh
+; CHECK: ldrh
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_or16(i32* nocapture readonly %a,
+                                        i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %or = or i32 %1, %0
+  %and = and i32 %or, 65535
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_and8_short_short
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_and8_short_short(i16* nocapture readonly %a,
+                                                    i16* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %1 = load i16, i16* %b, align 2
+  %and3 = and i16 %0, 255
+  %2 = and i16 %and3, %1
+  %cmp = icmp eq i16 %2, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_and8_short_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_and8_short_int(i16* nocapture readonly %a,
+                                                  i32* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %1 = load i32, i32* %b, align 4
+  %2 = and i16 %0, 255
+  %and = zext i16 %2 to i32
+  %and1 = and i32 %1, %and
+  %cmp = icmp eq i32 %and1, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_and8_int_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_and8_int_int(i32* nocapture readonly %a,
+                                                i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %and = and i32 %0, 255
+  %and1 = and i32 %and, %1
+  %cmp = icmp eq i32 %and1, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_and16
+; CHECK: ldrh
+; CHECK: ldrh
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_and16(i32* nocapture readonly %a,
+                                         i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %and = and i32 %0, 65535
+  %and1 = and i32 %and, %1
+  %cmp = icmp eq i32 %and1, 0
+  ret i1 %cmp
+}
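
Note (not part of the patch): a minimal C++ sketch of the source-level pattern the new combine targets; the function and parameter names below are illustrative only. The tests above check the equivalent IR, where the AND mask propagated back to the leaves lets both loads be selected as ldrb instead of a full-width load plus shift/mask.

// Illustrative sketch only. Only the low byte of the xor is observed, so the
// BackwardsPropagateMask combine lets both i16 loads be narrowed to i8 loads
// (ldrb on ARM), matching the cmp_xor8_short_short test above.
bool cmp_xor8(const unsigned short *a, const unsigned short *b) {
  return ((*a ^ *b) & 0xff) == 0;
}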