Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -449,6 +449,7 @@
     SDValue MatchLoadCombine(SDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue foldRedudantShiftedMasks(SDNode *N);
+    SDNode *shrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *Store);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
@@ -12998,6 +12999,83 @@
                 St->getPointerInfo().getWithOffset(StOffset), NewAlign)
       .getNode();
 }
+
+/// Replaces patterns such as:
+///   store 2 (or (load 1 from M), (shl (load 1 from M), 8)) into M
+/// by:
+///   store 1 (load 1 from M) into M+1
+SDNode *
+DAGCombiner::shrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *Store) {
+  SDValue OR = Store->getValue();
+  assert(OR.getOpcode() == ISD::OR &&
+         "Expected to be called with an OR operand.");
+
+  SDValue LoadSD = OR.getOperand(0);
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(LoadSD);
+  SDNode *OtherORop = OR.getOperand(1).getNode();
+  if (!Load) {
+    OtherORop = OR.getOperand(0).getNode();
+    LoadSD = OR.getOperand(1);
+    Load = dyn_cast<LoadSDNode>(LoadSD);
+    if (!Load)
+      return nullptr;
+  }
+  LLVM_DEBUG(dbgs() << "\tGot load: "; Load->dump());
+
+  unsigned BytesShift = 0;
+  if (OtherORop->getOpcode() == ISD::SHL &&
+      OtherORop->getOperand(0).getNode() == Load &&
+      isa<ConstantSDNode>(OtherORop->getOperand(1)))
+    BytesShift = cast<ConstantSDNode>(OtherORop->getOperand(1))
+                     ->getAPIntValue()
+                     .getSExtValue() /
+                 8;
+  LLVM_DEBUG(dbgs() << "\tThat is shifted by: "; OtherORop->dump());
+  LLVM_DEBUG(dbgs() << "\tAnd stored by: "; Store->dump());
+  // TODO: Accept other shift operations such as srl and sra, e.g. by using a
+  // negative value for BytesShift.
+
+  unsigned StoreMemSz = Store->getMemoryVT().getStoreSize();
+  // For now we only accept chains that move one half of the loaded value into
+  // the other half.
+  if (2 * BytesShift != StoreMemSz)
+    return nullptr;
+
+  const SDValue LoadPtr = Load->getBasePtr();
+  SDValue Ptr = Store->getBasePtr();
+  // TODO: Also handle the case where the load and store addresses are both
+  // ADDs of a common base address with a known constant difference.
+  bool SamePtr = LoadPtr == Ptr;
+  unsigned LoadByteOffset = 0;
+  if (!SamePtr) {
+    if (LoadPtr.getOpcode() != ISD::ADD)
+      return nullptr;
+
+    // Detect if we are moving M[A+k] to M[A]:
+    if (LoadPtr.getOperand(0) != Ptr && LoadPtr.getOperand(1) != Ptr)
+      return nullptr;
+
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(LoadPtr.getOperand(1));
+    if (!Offset)
+      Offset = dyn_cast<ConstantSDNode>(LoadPtr.getOperand(0));
+    if (!Offset)
+      return nullptr;
+
+    LoadByteOffset = Offset->getAPIntValue().getZExtValue();
+  }
+
+  unsigned LoadMemSz = Load->getMemoryVT().getStoreSize();
+  bool UpperHalfLoad =
+      LoadByteOffset == LoadMemSz && 2 * LoadMemSz == StoreMemSz;
+  if (!UpperHalfLoad && !SamePtr)
+    return nullptr;
+
+  // TODO: In the SamePtr case, reduce the load width when the load feeds only
+  // this or/shl/store chain (LoadMemSz == StoreMemSz and two uses), and handle
+  // moving the lower half into the upper half.
+  LLVM_DEBUG(dbgs() << "\tReduce store width to half width.\n");
+  return ShrinkLoadReplaceStoreWithStore({LoadMemSz, 0}, LoadSD, Store, this);
+}
 
 /// Look for sequence of load / op / store where op is one of 'or', 'xor', and
 /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
@@ -13013,11 +13091,19 @@
   SDValue Ptr = ST->getBasePtr();
   EVT VT = Value.getValueType();
 
-  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+  if (VT.isVector() || !Value.hasOneUse())
     return SDValue();
   unsigned Opc = Value.getOpcode();
+  if (Opc == ISD::OR && VT.isScalarInteger()) {
+    if (SDNode *NewSt = shrinkLoadShiftOrStoreWithLoadNewStore(ST))
+      return SDValue(NewSt, 0);
+  }
+
+  if (ST->isTruncatingStore())
+    return SDValue();
+
   // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
   // is a byte mask indicating a consecutive number of bytes, check to see if
   // Y is known to provide just those bytes. If so, we try to replace the
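For reference, a minimal C sketch of a source pattern that can lower to the DAG
shape targeted by the new combine (the function name and the use of memcpy are
illustrative only; byte positions assume a little-endian target such as the ARM
configuration in the test below):

    #include <string.h>

    /* Duplicates the byte at p into both bytes of the 16-bit slot at p.
       The stored value is (b | (b << 8)) with b = load 1 from p, so the low
       byte written back equals what memory already holds; the combine can
       shrink the 2-byte store to a single byte store at p + 1, i.e. it
       becomes p[1] = p[0]. */
    void dup_low_byte(unsigned char *p) {
      unsigned b = p[0];
      unsigned short w = (unsigned short)(b | (b << 8));
      memcpy(p, &w, sizeof w);
    }
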
Index: test/CodeGen/ARM/2018_05_30_FoldMaskedMoves.ll
===================================================================
--- test/CodeGen/ARM/2018_05_30_FoldMaskedMoves.ll
+++ test/CodeGen/ARM/2018_05_30_FoldMaskedMoves.ll
@@ -0,0 +1,67 @@
+; RUN: llc -O3 -march=arm %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+
+define void @foo1(i16* %b) {
+entry:
+  %0 = load i16, i16* %b, align 2
+  %conv = sext i16 %0 to i32
+  %and = and i32 %conv, 65280
+  %1 = lshr i32 %conv, 8
+  %and3 = and i32 %1, 255
+  %or = or i32 %and3, %and
+  %conv4 = trunc i32 %or to i16
+  store i16 %conv4, i16* %b, align 2
+  ret void
+}
+; CHECK-LABEL: foo1:
+; CHECK: ldrb r1, [r0, #1]
+; CHECK-NEXT: strb r1, [r0]
+
+define void @foo2(i32* %b) {
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = lshr i32 %0, 16
+  %and2 = and i32 %1, 65535
+  %and = and i32 %0, 4294901760
+  %or = or i32 %and2, %and
+  store i32 %or, i32* %b, align 4
+  ret void
+}
+; CHECK-LABEL: foo2:
+; CHECK: ldrh r1, [r0, #2]
+; CHECK-NEXT: strh r1, [r0]
+
+define void @test_1x4p1(i32* %M, i32 %I) {
+entry:
+  %0 = getelementptr inbounds i32, i32* %M, i32 %I
+  %1 = load i32, i32* %0, align 4
+  %2 = and i32 %1, 65280
+  %3 = lshr i32 %1, 8
+  %4 = and i32 %3, 255
+  %5 = or i32 %2, %4
+  store i32 %5, i32* %0, align 4
+  ret void
+}
+; CHECK-LABEL: test_1x4p1:
+; CHECK: ldrb{{.*}}
+; CHECK-NEXT: orr{{.*}}
+; CHECK-NEXT: str{{.*}}
+
+define void @test_1x4p1_shl(i32* %M, i32 %I) {
+entry:
+  %0 = getelementptr inbounds i32, i32* %M, i32 %I
+  %1 = load i32, i32* %0, align 4
+  %2 = and i32 %1, 65280
+  %3 = shl i32 %1, 8
+  %4 = and i32 %3, 16711680
+  %5 = or i32 %2, %4
+  store i32 %5, i32* %0, align 4
+  ret void
+}
+; CHECK-LABEL: test_1x4p1_shl:
+; CHECK: ldrb{{.*}}
+; CHECK-NEXT: lsl{{.*}}
+; CHECK-NEXT: orr{{.*}}
+; CHECK-NEXT: str{{.*}}
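
To watch the new combine fire on these tests, an asserts-enabled build can
print the LLVM_DEBUG output added above (DAGCombiner.cpp uses the "dagcombine"
debug type); a plausible invocation, assuming such a build:

    llc -O3 -march=arm -debug-only=dagcombine 2018_05_30_FoldMaskedMoves.ll -o -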