Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -448,6 +448,7 @@
     SDValue MatchLoadCombine(SDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue foldRedundantShiftedMasks(SDNode *N);
+    SDNode *shrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *Store);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
@@ -13042,6 +13043,108 @@
                 St->getPointerInfo().getWithOffset(StOffset), NewAlign)
       .getNode();
 }
+/// Replaces patterns that copy half of a memory element to the other half. Ex:
+//   store i16 (or (load i8 from M), (shl (load i8 from M), 8)) into M
+//   can be replaced by store i8 (load i8 from M) into M+1
+SDNode *
+DAGCombiner::shrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *Store) {
+  SDValue OR = Store->getValue();
+  assert(OR.getOpcode() == ISD::OR && "Expecting ISD::OR");
+  if (OR.getOpcode() != ISD::OR)
+    return nullptr;
+
+  SDValue LoadSD = OR.getOperand(0);
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(LoadSD);
+  SDNode *OtherORop = OR->getOperand(1).getNode();
+  if (!Load) {
+    OtherORop = OR->getOperand(0).getNode();
+    LoadSD = OR->getOperand(1);
+    Load = dyn_cast<LoadSDNode>(LoadSD);
+    if (!Load)
+      return nullptr;
+  }
+
+  unsigned BytesShift = 0;
+
+  if ((OtherORop->getOpcode() == ISD::SHL) &&
+      (OtherORop->getOperand(0).getNode() == Load) &&
+      isa<ConstantSDNode>(OtherORop->getOperand(1)))
+    BytesShift = cast<ConstantSDNode>(OtherORop->getOperand(1).getNode())
+                     ->getAPIntValue()
+                     .getSExtValue() /
+                 8;
+  // TODO: Accept other shifting operations such as srl, sra. Could
+  // use negative values for BytesShift.
+  LLVM_DEBUG(dbgs() << "\tGot load: "; Load->dump());
+  LLVM_DEBUG(dbgs() << "\tThat is shifted by: "; OtherORop->dump());
+  LLVM_DEBUG(dbgs() << "\tOR combined by: "; OR->dump());
+  LLVM_DEBUG(dbgs() << "\tAnd stored by: "; Store->dump());
+
+  unsigned StoreMemSz = Store->getMemoryVT().getStoreSize();
+  // For now we only accept chains that move half of the loaded value to the
+  // other half.
+  if (2 * BytesShift != StoreMemSz)
+    return nullptr;
+
+  const SDValue LoadPtr = Load->getBasePtr();
+  SDValue Ptr = Store->getBasePtr();
+  unsigned LoadMemSz = Load->getMemoryVT().getStoreSize();
+  // TODO: Detect when both the LOAD and STORE memory addresses are ADD
+  // instructions on a common base address, with a known constant difference.
+  // Ex: load i8 [M+3] and store i16 [M+2]
+  if (LoadPtr == Ptr) {
+    if ((LoadMemSz < StoreMemSz) && Load->getExtensionType() != ISD::ZEXTLOAD)
+      return nullptr;
+
+    if (LoadMemSz == BytesShift) {
+      // Replace something like
+      //   store i16 (or (ld i8 [M], zext to i16), (shl (ld i8 [M]), 8)), [M]
+      // by
+      //   store i8 (ld i8 [M]), [M+1]
+      LLVM_DEBUG(dbgs() << "\tNot writing the lower half of the store\n");
+      return ShrinkLoadReplaceStoreWithStore({LoadMemSz, 0}, LoadSD, Store,
+                                             this);
+    }
+    return nullptr;
+  }
+
+  if (LoadPtr.getOpcode() != ISD::ADD)
+    return nullptr;
+
+  // Detect if we are moving M[A+k] to M[A]:
+  if (!((LoadPtr.getOperand(0) == Ptr) || (LoadPtr.getOperand(1) == Ptr)))
+    return nullptr;
+
+  ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(LoadPtr.getOperand(1));
+  if (!Offset)
+    Offset = dyn_cast<ConstantSDNode>(LoadPtr.getOperand(0));
+
+  if (!Offset)
+    return nullptr;
+
+  unsigned LoadByteOffset = Offset->getAPIntValue().getZExtValue();
+  if ((LoadByteOffset == LoadMemSz) && (2 * LoadMemSz == StoreMemSz) &&
+      Load->getExtensionType() == ISD::ZEXTLOAD) {
+    // Replace something like
+    //   store i16 (or (ld i8 [M+1]), (shl (ld i8 [M+1]), 8)), [M]
+    // by
+    //   store i8 (ld i8 [M+1]), [M]. The ld must be zext.
+    MVT VT = MVT::getIntegerVT(LoadMemSz * 8);
+    if (!isTypeLegal(VT))
+      return nullptr;
+
+    // Truncate down to the new size.
+    SDValue IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(OR), VT, LoadSD);
+
+    ++OpsNarrowed;
+    LLVM_DEBUG(dbgs() << "\tNot writing the upper half of the store\n");
+    return DAG
+        .getStore(Store->getChain(), SDLoc(Store), IVal, Ptr,
+                  Store->getPointerInfo(), LoadMemSz)
+        .getNode();
+  }
+  return nullptr;
+}
 
 /// Look for sequence of load / op / store where op is one of 'or', 'xor', and
 /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
@@ -13057,11 +13160,19 @@
   SDValue Ptr = ST->getBasePtr();
   EVT VT = Value.getValueType();
 
-  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+  if (VT.isVector() || !Value.hasOneUse())
     return SDValue();
 
   unsigned Opc = Value.getOpcode();
 
+  if ((Opc == ISD::OR) && VT.isScalarInteger()) {
+    if (SDNode *NewSt = shrinkLoadShiftOrStoreWithLoadNewStore(ST))
+      return SDValue(NewSt, 0);
+  }
+
+  if (ST->isTruncatingStore())
+    return SDValue();
+
   // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
   // is a byte mask indicating a consecutive number of bytes, check to see if
   // Y is known to provide just those bytes.  If so, we try to replace the
Index: lib/Transforms/Utils/SimplifyIndVar.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyIndVar.cpp
+++ lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -784,7 +784,7 @@
     for (unsigned N = 0; IVOperand; ++N) {
       assert(N <= Simplified.size() && "runaway iteration");
 
-      Value *NewOper = foldIVUser(UseInst, IVOperand);
+      Value *NewOper = foldIVUser(UseOper.first, IVOperand);
       if (!NewOper)
         break; // done folding
       IVOperand = dyn_cast<Instruction>(NewOper);
@@ -792,12 +792,12 @@
     if (!IVOperand)
       continue;
 
-    if (eliminateIVUser(UseInst, IVOperand)) {
+    if (eliminateIVUser(UseOper.first, IVOperand)) {
       pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
       continue;
     }
 
-    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseInst)) {
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseOper.first)) {
       if ((isa<OverflowingBinaryOperator>(BO) &&
            strengthenOverflowingOperation(BO, IVOperand)) ||
           (isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
@@ -807,13 +807,13 @@
       }
     }
 
-    CastInst *Cast = dyn_cast<CastInst>(UseInst);
+    CastInst *Cast = dyn_cast<CastInst>(UseOper.first);
     if (V && Cast) {
       V->visitCast(Cast);
       continue;
     }
-    if (isSimpleIVUser(UseInst, L, SE)) {
-      pushIVUsers(UseInst, L, Simplified, SimpleIVUsers);
+    if (isSimpleIVUser(UseOper.first, L, SE)) {
+      pushIVUsers(UseOper.first, L, Simplified, SimpleIVUsers);
     }
   }
 }
Index: test/CodeGen/ARM/2018_05_30_FoldMakedMoves.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/2018_05_30_FoldMakedMoves.ll
@@ -0,0 +1,67 @@
+; RUN: llc -O3 -march=arm %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+define void @foo1(i16* %b){
+entry:
+  %0 = load i16, i16* %b, align 2
+  %conv = sext i16 %0 to i32
+  %and = and i32 %conv, 65280
+  %1 = lshr i32 %conv, 8
+  %and3 = and i32 %1, 255
+  %or = or i32 %and3, %and
+  %conv4 = trunc i32 %or to i16
+  store i16 %conv4, i16* %b, align 2
+  ret void
+}
+; CHECK-LABEL: foo1
+; CHECK: ldrb r1, [r0, #1]
+; CHECK-NEXT: strb r1, [r0]
+
+define void @foo2(i32* %b){
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = lshr i32 %0, 16
+  %and2 = and i32 %1, 65535
+  %and = and i32 %0, 4294901760
+  %or = or i32 %and2, %and
+  store i32 %or, i32* %b, align 4
+  ret void
+}
+; CHECK-LABEL: foo2
+; CHECK: ldrh r1, [r0, #2]
+; CHECK-NEXT: strh r1, [r0]
+
+define void @test_1x4p1(i32* %M, i32 %I) {
+entry:
+  %0 = getelementptr inbounds i32, i32* %M, i32 %I
+  %1 = load i32, i32* %0, align 4
+  %2 = and i32 %1, 65280
+  %3 = lshr i32 %1, 8
+  %4 = and i32 %3, 255
+  %5 = or i32 %2, %4
+  store i32 %5, i32* %0, align 4
+  ret void
+}
+; CHECK-LABEL: test_1x4p1:
+; CHECK: ldrb{{.*}}
+; CHECK-NEXT: orr{{.*}}
+; CHECK-NEXT: str{{.*}}
+
+
+define void @test_1x4p1_shl(i32* %M, i32 %I) {
+entry:
+  %0 = getelementptr inbounds i32, i32* %M, i32 %I
+  %1 = load i32, i32* %0, align 4
+  %2 = and i32 %1, 65280
+  %3 = shl i32 %1, 8
+  %4 = and i32 %3, 16711680
+  %5 = or i32 %2, %4
+  store i32 %5, i32* %0, align 4
+  ret void
+}
+; CHECK-LABEL: test_1x4p1_shl:
+; CHECK: ldrb{{.*}}
+; CHECK-NEXT: lsl{{.*}}
+; CHECK-NEXT: orr{{.*}}
+; CHECK-NEXT: str{{.*}}
+
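For reference, a minimal LLVM IR sketch of the two shapes described in the comments of shrinkLoadShiftOrStoreWithLoadNewStore. This is illustrative only and not part of the patch or its tests; the function names @half_copy_sketch and @upper_half_sketch are hypothetical, and the intended results (a single strb at M+1, respectively at M, on little-endian ARM) are the behavior the comments describe, not checked output.

; Lower-half case: "store i16 (or (load i8 from M), (shl (load i8 from M), 8)) into M".
; The low byte written back equals the byte already at M, so only the high
; byte (M+1) needs to be stored.
define void @half_copy_sketch(i16* %M) {
entry:
  %p = bitcast i16* %M to i8*
  %b = load i8, i8* %p, align 2
  %z = zext i8 %b to i16
  %hi = shl i16 %z, 8
  %or = or i16 %z, %hi
  store i16 %or, i16* %M, align 2
  ret void
}

; Upper-half case: the loaded byte comes from M+1, so the high byte of the
; i16 store is unchanged and only the low byte at M needs to be stored.
define void @upper_half_sketch(i16* %M) {
entry:
  %p0 = bitcast i16* %M to i8*
  %p1 = getelementptr inbounds i8, i8* %p0, i32 1
  %b = load i8, i8* %p1, align 1
  %z = zext i8 %b to i16
  %hi = shl i16 %z, 8
  %or = or i16 %z, %hi
  store i16 %or, i16* %M, align 2
  ret void
}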