Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -524,6 +524,7 @@
                               const SDLoc &DL);
     SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
     SDValue MatchLoadCombine(SDNode *N);
+    SDValue MatchStoreCombine(StoreSDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
@@ -6257,6 +6258,188 @@
   return BigEndian;
 }
 
+/// Look through any chain of TRUNCATE, ZERO_EXTEND, SIGN_EXTEND and ANY_EXTEND
+/// nodes and return the first operand that is none of those.
+static SDValue stripTruncAndExt(SDValue Value) {
+  switch (Value.getOpcode()) {
+  case ISD::TRUNCATE:
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ANY_EXTEND:
+    return stripTruncAndExt(Value.getOperand(0));
+  }
+  return Value;
+}
+
+/// Match a pattern where a wide type scalar value is stored by several narrow
+/// stores. Fold it into a single store or a BSWAP and a store if the target
+/// supports it.
+///
+/// Assuming little endian target:
+///  i8 *p = ...
+///  i32 val = ...
+///  p[0] = (val >> 0) & 0xFF;
+///  p[1] = (val >> 8) & 0xFF;
+///  p[2] = (val >> 16) & 0xFF;
+///  p[3] = (val >> 24) & 0xFF;
+/// =>
+///  *((i32 *)p) = val;
+///
+///  i8 *p = ...
+///  i32 val = ...
+///  p[0] = (val >> 24) & 0xFF;
+///  p[1] = (val >> 16) & 0xFF;
+///  p[2] = (val >> 8) & 0xFF;
+///  p[3] = (val >> 0) & 0xFF;
+/// =>
+///  *((i32 *)p) = BSWAP(val);
+///
+/// \returns the new wide store, or an empty SDValue if the stores reachable
+/// from \p N cannot be combined.
+SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {
+  // Collect all the stores in the chain. Walk up from N while the chain
+  // operand is itself a store; on exit, Chain is the chain operand of the
+  // last store collected.
+  SDValue Chain;
+  SmallVector<StoreSDNode *, 8> Stores;
+  for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
+    if (Store->getMemoryVT() != MVT::i8 ||
+        Store->isVolatile() || Store->isIndexed())
+      return SDValue();
+    Stores.push_back(Store);
+    Chain = Store->getChain();
+  }
+  // Only handle a combined width of 16, 32 or 64 bits.
+  unsigned Width = Stores.size();
+  EVT VT = EVT::getIntegerVT(
+      *DAG.getContext(), Width * N->getMemoryVT().getSizeInBits());
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (LegalOperations && !TLI.isOperationLegal(ISD::STORE, VT))
+    return SDValue();
+
+  // Check if all the bytes of the combined value we are looking at are stored
+  // to the same base address. Collect byte offsets from Base address into
+  // ByteOffsets.
+  SDValue CombinedValue;
+  SmallVector<int64_t, 4> ByteOffsets(Width, INT64_MAX);
+  int64_t FirstOffset = INT64_MAX;
+  StoreSDNode *FirstStore = nullptr;
+  Optional<BaseIndexOffset> Base;
+  for (StoreSDNode *Store : Stores) {
+    // Each store must write a different byte of the CombinedValue. A truncate
+    // is required to get that byte value.
+    SDValue Trunc = Store->getValue();
+    if (Trunc.getOpcode() != ISD::TRUNCATE)
+      return SDValue();
+    // A shift operation is required to get the right byte offset, except the
+    // first byte.
+    int64_t Offset = 0;
+    SDValue Value = Trunc.getOperand(0);
+    if (Value.getOpcode() == ISD::SRL ||
+        Value.getOpcode() == ISD::SRA) {
+      ConstantSDNode *ShiftOffset =
+          dyn_cast<ConstantSDNode>(Value.getOperand(1));
+      // Trying to match the following pattern. The shift offset must be
+      // a constant and a multiple of 8. It is the byte offset in "y".
+      //
+      // x = srl y, offset
+      // i8 z = trunc x
+      // store z, ...
+      if (!ShiftOffset || (ShiftOffset->getSExtValue() % 8))
+        return SDValue();
+
+      Offset = ShiftOffset->getSExtValue() / 8;
+      Value = Value.getOperand(0);
+    }
+
+    // Stores must share the same combined value with different offsets.
+    if (!CombinedValue)
+      CombinedValue = Value;
+    else if (stripTruncAndExt(CombinedValue) != stripTruncAndExt(Value))
+      return SDValue();
+
+    // The trunc and all the extend operations should be stripped to get the
+    // real value being stored.
+    else if (CombinedValue.getValueType() != VT) {
+      if (Value.getValueType() == VT ||
+          Value.getValueSizeInBits() > CombinedValue.getValueSizeInBits())
+        CombinedValue = Value;
+      // Give up if the combined value type is smaller than the store size.
+      if (CombinedValue.getValueSizeInBits() < VT.getSizeInBits())
+        return SDValue();
+    }
+
+    // Stores must share the same base address
+    BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG);
+    int64_t ByteOffsetFromBase = 0;
+    if (!Base)
+      Base = Ptr;
+    else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
+      return SDValue();
+
+    // Remember the first byte store
+    if (ByteOffsetFromBase < FirstOffset) {
+      FirstStore = Store;
+      FirstOffset = ByteOffsetFromBase;
+    }
+    // Map the offset in the store and the offset in the combined value, and
+    // early return if it has been set before.
+    if (Offset < 0 || Offset >= Width || ByteOffsets[Offset] != INT64_MAX)
+      return SDValue();
+    ByteOffsets[Offset] = ByteOffsetFromBase;
+  }
+
+  assert(FirstOffset != INT64_MAX && "First byte offset must be set");
+  assert(FirstStore && "First store must be set");
+
+  // Check if the bytes of the combined value we are looking at match with
+  // either big or little endian value store.
+  Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset);
+  if (!IsBigEndian.hasValue())
+    return SDValue();
+
+  // The node we are looking at matches with the pattern, check if we can
+  // replace it with a single bswap if needed and store.
+
+  // If the store needs byte swap check if the target supports it
+  bool NeedsBswap = DAG.getDataLayout().isBigEndian() != *IsBigEndian;
+
+  // Before legalize we can introduce illegal bswaps which will be later
+  // converted to an explicit bswap sequence. This way we end up with a single
+  // store and byte shuffling instead of several stores and byte shuffling.
+  if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
+    return SDValue();
+
+  // Check that a store of the wide type is both allowed and fast on the target
+  bool Fast = false;
+  bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+                                        VT, FirstStore->getAddressSpace(),
+                                        FirstStore->getAlignment(), &Fast);
+  if (!Allowed || !Fast)
+    return SDValue();
+
+  if (VT != CombinedValue.getValueType()) {
+    assert(CombinedValue.getValueType().getSizeInBits() > VT.getSizeInBits() &&
+           "Get unexpected store value to combine");
+    CombinedValue = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
+                                CombinedValue);
+  }
+
+  if (NeedsBswap)
+    CombinedValue = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, CombinedValue);
+
+  SDValue NewStore =
+      DAG.getStore(Chain, SDLoc(N), CombinedValue, FirstStore->getBasePtr(),
+                   FirstStore->getPointerInfo(), FirstStore->getAlignment());
+
+  // Rely on other DAG combine rules to remove the other individual stores.
+  DAG.ReplaceAllUsesWith(N, NewStore.getNode());
+  return NewStore;
+}
+
 /// Match a pattern where a wide type scalar value is loaded by several narrow
 /// loads and combined by shifts and ors. Fold it into a single load or a load
 /// and a BSWAP if the targets supports it.
@@ -15791,6 +15974,10 @@
   if (SDValue NewST = TransformFPLoadStorePair(N))
     return NewST;
 
+  // Try transforming several stores into STORE (BSWAP).
+  if (SDValue Store = MatchStoreCombine(ST))
+    return Store;
+
   if (ST->isUnindexed()) {
     // Walk up chain skipping non-aliasing memory nodes, on this store and any
     // adjacent stores.
Index: llvm/trunk/test/CodeGen/PowerPC/store-combine.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/store-combine.ll +++ llvm/trunk/test/CodeGen/PowerPC/store-combine.ll @@ -10,24 +10,12 @@ define void @store_i32_by_i8(i32 signext %m, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 -; CHECK-PPC64LE-NEXT: stb 3, 0(4) -; CHECK-PPC64LE-NEXT: stb 5, 1(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 -; CHECK-PPC64LE-NEXT: srwi 3, 3, 24 -; CHECK-PPC64LE-NEXT: stb 5, 2(4) -; CHECK-PPC64LE-NEXT: stb 3, 3(4) +; CHECK-PPC64LE-NEXT: stw 3, 0(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: srwi 5, 3, 8 -; CHECK-PPC64-NEXT: stb 3, 0(4) -; CHECK-PPC64-NEXT: stb 5, 1(4) -; CHECK-PPC64-NEXT: srwi 5, 3, 16 -; CHECK-PPC64-NEXT: srwi 3, 3, 24 -; CHECK-PPC64-NEXT: stb 5, 2(4) -; CHECK-PPC64-NEXT: stb 3, 3(4) +; CHECK-PPC64-NEXT: stwbrx 3, 0, 4 ; CHECK-PPC64-NEXT: blr entry: %conv = trunc i32 %m to i8 @@ -55,24 +43,12 @@ define void @store_i32_by_i8_bswap(i32 signext %m, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: srwi 5, 3, 24 -; CHECK-PPC64LE-NEXT: stb 5, 0(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 -; CHECK-PPC64LE-NEXT: stb 5, 1(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 -; CHECK-PPC64LE-NEXT: stb 5, 2(4) -; CHECK-PPC64LE-NEXT: stb 3, 3(4) +; CHECK-PPC64LE-NEXT: stwbrx 3, 0, 4 ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: srwi 5, 3, 24 -; CHECK-PPC64-NEXT: srwi 6, 3, 16 -; CHECK-PPC64-NEXT: stb 5, 0(4) -; CHECK-PPC64-NEXT: srwi 5, 3, 8 -; CHECK-PPC64-NEXT: stb 6, 1(4) -; CHECK-PPC64-NEXT: stb 5, 2(4) -; CHECK-PPC64-NEXT: stb 3, 3(4) +; CHECK-PPC64-NEXT: stw 3, 0(4) ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i32 %m, 24 @@ -104,40 +80,12 @@ define 
void @store_i64_by_i8(i64 %m, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i64_by_i8: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 56, 8 -; CHECK-PPC64LE-NEXT: stb 3, 0(4) -; CHECK-PPC64LE-NEXT: stb 5, 1(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 48, 16 -; CHECK-PPC64LE-NEXT: stb 5, 2(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 40, 24 -; CHECK-PPC64LE-NEXT: stb 5, 3(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 32, 32 -; CHECK-PPC64LE-NEXT: stb 5, 4(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 24, 40 -; CHECK-PPC64LE-NEXT: stb 5, 5(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 16, 48 -; CHECK-PPC64LE-NEXT: rldicl 3, 3, 8, 56 -; CHECK-PPC64LE-NEXT: stb 5, 6(4) -; CHECK-PPC64LE-NEXT: stb 3, 7(4) +; CHECK-PPC64LE-NEXT: stdx 3, 0, 4 ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i64_by_i8: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: rldicl 5, 3, 56, 8 -; CHECK-PPC64-NEXT: rldicl 6, 3, 48, 16 -; CHECK-PPC64-NEXT: stb 5, 1(4) -; CHECK-PPC64-NEXT: rldicl 5, 3, 40, 24 -; CHECK-PPC64-NEXT: stb 6, 2(4) -; CHECK-PPC64-NEXT: rldicl 6, 3, 32, 32 -; CHECK-PPC64-NEXT: stb 5, 3(4) -; CHECK-PPC64-NEXT: rldicl 5, 3, 24, 40 -; CHECK-PPC64-NEXT: stb 6, 4(4) -; CHECK-PPC64-NEXT: stb 3, 0(4) -; CHECK-PPC64-NEXT: stb 5, 5(4) -; CHECK-PPC64-NEXT: rldicl 5, 3, 16, 48 -; CHECK-PPC64-NEXT: rldicl 3, 3, 8, 56 -; CHECK-PPC64-NEXT: stb 5, 6(4) -; CHECK-PPC64-NEXT: stb 3, 7(4) +; CHECK-PPC64-NEXT: stdbrx 3, 0, 4 ; CHECK-PPC64-NEXT: blr entry: %conv = trunc i64 %m to i8 @@ -185,40 +133,12 @@ define void @store_i64_by_i8_bswap(i64 %m, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i64_by_i8_bswap: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 56, 8 -; CHECK-PPC64LE-NEXT: stb 3, 7(4) -; CHECK-PPC64LE-NEXT: stb 5, 6(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 48, 16 -; CHECK-PPC64LE-NEXT: stb 5, 5(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 40, 24 -; CHECK-PPC64LE-NEXT: stb 5, 4(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 32, 32 -; CHECK-PPC64LE-NEXT: stb 5, 3(4) -; CHECK-PPC64LE-NEXT: 
rldicl 5, 3, 24, 40 -; CHECK-PPC64LE-NEXT: stb 5, 2(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 16, 48 -; CHECK-PPC64LE-NEXT: rldicl 3, 3, 8, 56 -; CHECK-PPC64LE-NEXT: stb 5, 1(4) -; CHECK-PPC64LE-NEXT: stb 3, 0(4) +; CHECK-PPC64LE-NEXT: stdbrx 3, 0, 4 ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i64_by_i8_bswap: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: rldicl 5, 3, 56, 8 -; CHECK-PPC64-NEXT: rldicl 6, 3, 48, 16 -; CHECK-PPC64-NEXT: stb 5, 6(4) -; CHECK-PPC64-NEXT: rldicl 5, 3, 40, 24 -; CHECK-PPC64-NEXT: stb 6, 5(4) -; CHECK-PPC64-NEXT: rldicl 6, 3, 32, 32 -; CHECK-PPC64-NEXT: stb 5, 4(4) -; CHECK-PPC64-NEXT: rldicl 5, 3, 24, 40 -; CHECK-PPC64-NEXT: stb 6, 3(4) -; CHECK-PPC64-NEXT: stb 3, 7(4) -; CHECK-PPC64-NEXT: stb 5, 2(4) -; CHECK-PPC64-NEXT: rldicl 5, 3, 16, 48 -; CHECK-PPC64-NEXT: rldicl 3, 3, 8, 56 -; CHECK-PPC64-NEXT: stb 5, 1(4) -; CHECK-PPC64-NEXT: stb 3, 0(4) +; CHECK-PPC64-NEXT: stdx 3, 0, 4 ; CHECK-PPC64-NEXT: blr entry: %conv = trunc i64 %m to i8 @@ -267,46 +187,18 @@ define void @store_i64_by_i8_bswap_uses(i32 signext %t, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i64_by_i8_bswap_uses: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: slwi 5, 3, 3 -; CHECK-PPC64LE-NEXT: subf 3, 3, 5 -; CHECK-PPC64LE-NEXT: extsw 3, 3 -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 56, 8 -; CHECK-PPC64LE-NEXT: stb 3, 7(4) -; CHECK-PPC64LE-NEXT: stb 5, 6(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 48, 16 -; CHECK-PPC64LE-NEXT: stb 5, 5(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 40, 24 -; CHECK-PPC64LE-NEXT: stb 5, 4(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 32, 32 -; CHECK-PPC64LE-NEXT: stb 5, 3(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 24, 40 -; CHECK-PPC64LE-NEXT: stb 5, 2(4) -; CHECK-PPC64LE-NEXT: rldicl 5, 3, 16, 48 -; CHECK-PPC64LE-NEXT: rldicl 3, 3, 8, 56 -; CHECK-PPC64LE-NEXT: stb 5, 1(4) -; CHECK-PPC64LE-NEXT: stb 3, 0(4) +; CHECK-PPC64LE-NEXT: slwi [[REG:[0-9]+]], 3, 3 +; CHECK-PPC64LE-NEXT: subf [[REG1:[0-9]+]], 3, [[REG]] +; CHECK-PPC64LE-NEXT: extsw 
[[REG2:[0-9]+]], [[REG1]] +; CHECK-PPC64LE-NEXT: stdbrx [[REG2]], 0, 4 ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i64_by_i8_bswap_uses: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: slwi 5, 3, 3 -; CHECK-PPC64-NEXT: subf 3, 3, 5 -; CHECK-PPC64-NEXT: extsw 3, 3 -; CHECK-PPC64-NEXT: rldicl 5, 3, 56, 8 -; CHECK-PPC64-NEXT: rldicl 6, 3, 48, 16 -; CHECK-PPC64-NEXT: stb 5, 6(4) -; CHECK-PPC64-NEXT: rldicl 5, 3, 40, 24 -; CHECK-PPC64-NEXT: stb 6, 5(4) -; CHECK-PPC64-NEXT: rldicl 6, 3, 32, 32 -; CHECK-PPC64-NEXT: stb 5, 4(4) -; CHECK-PPC64-NEXT: rldicl 5, 3, 24, 40 -; CHECK-PPC64-NEXT: stb 6, 3(4) -; CHECK-PPC64-NEXT: stb 3, 7(4) -; CHECK-PPC64-NEXT: stb 5, 2(4) -; CHECK-PPC64-NEXT: rldicl 5, 3, 16, 48 -; CHECK-PPC64-NEXT: rldicl 3, 3, 8, 56 -; CHECK-PPC64-NEXT: stb 5, 1(4) -; CHECK-PPC64-NEXT: stb 3, 0(4) +; CHECK-PPC64-NEXT: slwi [[REG:[0-9]+]], 3, 3 +; CHECK-PPC64-NEXT: subf [[REG1:[0-9]+]], 3, [[REG]] +; CHECK-PPC64-NEXT: extsw [[REG2:[0-9]+]], [[REG1]] +; CHECK-PPC64-NEXT: stdx [[REG2]], 0, 4 ; CHECK-PPC64-NEXT: blr entry: %mul = mul nsw i32 %t, 7 @@ -356,25 +248,11 @@ define void @store_i32_by_i8_bswap_volatile(i32 signext %m, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_volatile: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 -; CHECK-PPC64LE-NEXT: stb 3, 3(4) -; CHECK-PPC64LE-NEXT: stb 5, 2(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 -; CHECK-PPC64LE-NEXT: srwi 3, 3, 24 -; CHECK-PPC64LE-NEXT: stb 5, 1(4) -; CHECK-PPC64LE-NEXT: stb 3, 0(4) -; CHECK-PPC64LE-NEXT: blr +; CHECK-PPC64LE-NOT: stwbrx ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_volatile: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: srwi 5, 3, 8 -; CHECK-PPC64-NEXT: stb 3, 3(4) -; CHECK-PPC64-NEXT: stb 5, 2(4) -; CHECK-PPC64-NEXT: srwi 5, 3, 16 -; CHECK-PPC64-NEXT: srwi 3, 3, 24 -; CHECK-PPC64-NEXT: stb 5, 1(4) -; CHECK-PPC64-NEXT: stb 3, 0(4) -; CHECK-PPC64-NEXT: blr +; CHECK-PPC64-NOT: stw entry: %conv = trunc i32 %m to i8 %arrayidx = 
getelementptr inbounds i8, i8* %p, i64 3 @@ -403,29 +281,11 @@ define void @store_i32_by_i8_bswap_store_in_between(i32 signext %m, i8* %p, i8* %q) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_store_in_between: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: srwi 6, 3, 8 -; CHECK-PPC64LE-NEXT: stb 3, 3(4) -; CHECK-PPC64LE-NEXT: stb 6, 2(4) -; CHECK-PPC64LE-NEXT: li 6, 3 -; CHECK-PPC64LE-NEXT: stb 6, 0(5) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 -; CHECK-PPC64LE-NEXT: srwi 3, 3, 24 -; CHECK-PPC64LE-NEXT: stb 5, 1(4) -; CHECK-PPC64LE-NEXT: stb 3, 0(4) -; CHECK-PPC64LE-NEXT: blr +; CHECK-PPC64LE-NOT: stwbrx ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_store_in_between: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: li 6, 3 -; CHECK-PPC64-NEXT: srwi 7, 3, 8 -; CHECK-PPC64-NEXT: stb 7, 2(4) -; CHECK-PPC64-NEXT: stb 3, 3(4) -; CHECK-PPC64-NEXT: stb 6, 0(5) -; CHECK-PPC64-NEXT: srwi 5, 3, 16 -; CHECK-PPC64-NEXT: srwi 3, 3, 24 -; CHECK-PPC64-NEXT: stb 5, 1(4) -; CHECK-PPC64-NEXT: stb 3, 0(4) -; CHECK-PPC64-NEXT: blr +; CHECK-PPC64-NOT: stw entry: %conv = trunc i32 %m to i8 %arrayidx = getelementptr inbounds i8, i8* %p, i64 3 @@ -448,25 +308,11 @@ define void @store_i32_by_i8_bswap_unrelated_store(i32 signext %m, i8* %p, i8* %q) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_unrelated_store: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: srwi 6, 3, 8 -; CHECK-PPC64LE-NEXT: stb 3, 3(4) -; CHECK-PPC64LE-NEXT: stb 6, 2(5) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 -; CHECK-PPC64LE-NEXT: srwi 3, 3, 24 -; CHECK-PPC64LE-NEXT: stb 5, 1(4) -; CHECK-PPC64LE-NEXT: stb 3, 0(4) -; CHECK-PPC64LE-NEXT: blr +; CHECK-PPC64LE-NOT: stwbrx ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_unrelated_store: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: srwi 6, 3, 8 -; CHECK-PPC64-NEXT: stb 3, 3(4) -; CHECK-PPC64-NEXT: stb 6, 2(5) -; CHECK-PPC64-NEXT: srwi 5, 3, 16 -; CHECK-PPC64-NEXT: srwi 3, 3, 24 -; CHECK-PPC64-NEXT: stb 5, 1(4) -; CHECK-PPC64-NEXT: stb 3, 0(4) -; 
CHECK-PPC64-NEXT: blr +; CHECK-PPC64-NOT: stw entry: %conv = trunc i32 %m to i8 %arrayidx = getelementptr inbounds i8, i8* %p, i64 3 @@ -493,24 +339,13 @@ define void @store_i32_by_i8_bswap_nonzero_offset(i32 signext %m, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_nonzero_offset: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 -; CHECK-PPC64LE-NEXT: stb 5, 3(4) -; CHECK-PPC64LE-NEXT: stb 3, 4(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 -; CHECK-PPC64LE-NEXT: srwi 3, 3, 24 -; CHECK-PPC64LE-NEXT: stb 5, 2(4) -; CHECK-PPC64LE-NEXT: stb 3, 1(4) +; CHECK-PPC64LE-NEXT: addi [[REG1:[0-9]+]], 4, 1 +; CHECK-PPC64LE-NEXT: stwbrx 3, 0, [[REG1]] ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_nonzero_offset: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: srwi 5, 3, 8 -; CHECK-PPC64-NEXT: stb 3, 4(4) -; CHECK-PPC64-NEXT: stb 5, 3(4) -; CHECK-PPC64-NEXT: srwi 5, 3, 16 -; CHECK-PPC64-NEXT: srwi 3, 3, 24 -; CHECK-PPC64-NEXT: stb 5, 2(4) -; CHECK-PPC64-NEXT: stb 3, 1(4) +; CHECK-PPC64-NEXT: stw 3, 1(4) ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i32 %m, 8 @@ -539,24 +374,13 @@ define void @store_i32_by_i8_neg_offset(i32 signext %m, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_neg_offset: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 -; CHECK-PPC64LE-NEXT: stb 5, -3(4) -; CHECK-PPC64LE-NEXT: stb 3, -4(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 -; CHECK-PPC64LE-NEXT: srwi 3, 3, 24 -; CHECK-PPC64LE-NEXT: stb 5, -2(4) -; CHECK-PPC64LE-NEXT: stb 3, -1(4) +; CHECK-PPC64LE-NEXT: stw 3, -4(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_neg_offset: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: srwi 5, 3, 8 -; CHECK-PPC64-NEXT: stb 3, -4(4) -; CHECK-PPC64-NEXT: stb 5, -3(4) -; CHECK-PPC64-NEXT: srwi 5, 3, 16 -; CHECK-PPC64-NEXT: srwi 3, 3, 24 -; CHECK-PPC64-NEXT: stb 5, -2(4) -; CHECK-PPC64-NEXT: stb 3, -1(4) +; CHECK-PPC64-NEXT: addi [[REG1:[0-9]+]], 4, -4 +; CHECK-PPC64-NEXT: 
stwbrx 3, 0, [[REG1]] ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i32 %m, 8 @@ -585,24 +409,13 @@ define void @store_i32_by_i8_bswap_neg_offset(i32 signext %m, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_neg_offset: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 -; CHECK-PPC64LE-NEXT: stb 5, -3(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 24 -; CHECK-PPC64LE-NEXT: stb 5, -4(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 -; CHECK-PPC64LE-NEXT: stb 5, -2(4) -; CHECK-PPC64LE-NEXT: stb 3, -1(4) +; CHECK-PPC64LE-NEXT: addi [[REG1:[0-9]+]], 4, -4 +; CHECK-PPC64LE-NEXT: stwbrx 3, 0, [[REG1]] ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_neg_offset: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: srwi 5, 3, 16 -; CHECK-PPC64-NEXT: srwi 6, 3, 24 -; CHECK-PPC64-NEXT: stb 5, -3(4) -; CHECK-PPC64-NEXT: srwi 5, 3, 8 -; CHECK-PPC64-NEXT: stb 6, -4(4) -; CHECK-PPC64-NEXT: stb 5, -2(4) -; CHECK-PPC64-NEXT: stb 3, -1(4) +; CHECK-PPC64-NEXT: stw 3, -4(4) ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i32 %m, 16 @@ -631,28 +444,17 @@ define void @store_i32_by_i8_bswap_base_index_offset(i32 %m, i32 %i, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_base_index_offset: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: extsw 4, 4 -; CHECK-PPC64LE-NEXT: srwi 6, 3, 16 -; CHECK-PPC64LE-NEXT: add 4, 5, 4 -; CHECK-PPC64LE-NEXT: srwi 5, 3, 24 -; CHECK-PPC64LE-NEXT: stb 6, -3(4) -; CHECK-PPC64LE-NEXT: stb 5, -4(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 -; CHECK-PPC64LE-NEXT: stb 5, -2(4) -; CHECK-PPC64LE-NEXT: stb 3, -1(4) +; CHECK-PPC64LE-NEXT: extsw [[REG1:[0-9]+]], 4 +; CHECK-PPC64LE-NEXT: add [[REG2:[0-9]+]], 5, [[REG1]] +; CHECK-PPC64LE-NEXT: addi [[REG3:[0-9]+]], [[REG2]], -4 +; CHECK-PPC64LE-NEXT: stwbrx 3, 0, [[REG3]] ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_base_index_offset: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: extsw 4, 4 -; CHECK-PPC64-NEXT: srwi 6, 3, 16 -; CHECK-PPC64-NEXT: add 4, 
5, 4 -; CHECK-PPC64-NEXT: srwi 5, 3, 24 -; CHECK-PPC64-NEXT: stb 6, -3(4) -; CHECK-PPC64-NEXT: srwi 6, 3, 8 -; CHECK-PPC64-NEXT: stb 5, -4(4) -; CHECK-PPC64-NEXT: stb 6, -2(4) -; CHECK-PPC64-NEXT: stb 3, -1(4) +; CHECK-PPC64-NEXT: extsw [[REG1:[0-9]+]], 4 +; CHECK-PPC64-NEXT: add [[REG2:[0-9]+]], 5, [[REG1]] +; CHECK-PPC64-NEXT: stw 3, -4([[REG2]]) ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i32 %m, 16 @@ -694,28 +496,17 @@ define void @store_i32_by_i8_bswap_complicated(i32 %m, i32 %i, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_complicated: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: extsw 4, 4 -; CHECK-PPC64LE-NEXT: add 4, 5, 4 -; CHECK-PPC64LE-NEXT: srwi 5, 3, 24 -; CHECK-PPC64LE-NEXT: stb 5, 3(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 -; CHECK-PPC64LE-NEXT: stb 5, 4(4) -; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 -; CHECK-PPC64LE-NEXT: stb 5, 5(4) -; CHECK-PPC64LE-NEXT: stb 3, 6(4) +; CHECK-PPC64LE-NEXT: extsw [[REG1:[0-9]+]], 4 +; CHECK-PPC64LE-NEXT: add [[REG2:[0-9]+]], 5, [[REG1]] +; CHECK-PPC64LE-NEXT: addi [[REG3:[0-9]+]], [[REG2]], 3 +; CHECK-PPC64LE-NEXT: stwbrx 3, 0, [[REG3]] ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_complicated: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: extsw 4, 4 -; CHECK-PPC64-NEXT: srwi 6, 3, 24 -; CHECK-PPC64-NEXT: add 4, 5, 4 -; CHECK-PPC64-NEXT: srwi 5, 3, 16 -; CHECK-PPC64-NEXT: stb 6, 3(4) -; CHECK-PPC64-NEXT: stb 5, 4(4) -; CHECK-PPC64-NEXT: srwi 5, 3, 8 -; CHECK-PPC64-NEXT: stb 5, 5(4) -; CHECK-PPC64-NEXT: stb 3, 6(4) +; CHECK-PPC64-NEXT: extsw [[REG1:[0-9]+]], 4 +; CHECK-PPC64-NEXT: add [[REG2:[0-9]+]], 5, [[REG1]] +; CHECK-PPC64-NEXT: stw 3, 3([[REG2]]) ; CHECK-PPC64-NEXT: blr entry: %idx.ext = sext i32 %i to i64 @@ -745,16 +536,12 @@ define void @store_i16_by_i8_bswap(i16 %m, i8* %p) { ; CHECK-PPC64LE-LABEL: store_i16_by_i8_bswap: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 -; CHECK-PPC64LE-NEXT: stb 5, 0(4) -; CHECK-PPC64LE-NEXT: stb 3, 1(4) +; 
CHECK-PPC64LE-NEXT: sthbrx 3, 0, 4 ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i16_by_i8_bswap: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: srwi 5, 3, 8 -; CHECK-PPC64-NEXT: stb 5, 0(4) -; CHECK-PPC64-NEXT: stb 3, 1(4) +; CHECK-PPC64-NEXT: sth 3, 0(4) ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i16 %m, 8 @@ -771,16 +558,12 @@ define void @store_16_by_i8(i16 %m, i8* %p) { ; CHECK-PPC64LE-LABEL: store_16_by_i8: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: stb 3, 0(4) -; CHECK-PPC64LE-NEXT: srwi 3, 3, 8 -; CHECK-PPC64LE-NEXT: stb 3, 1(4) +; CHECK-PPC64LE-NEXT: sth 3, 0(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_16_by_i8: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: srwi 5, 3, 8 -; CHECK-PPC64-NEXT: stb 3, 0(4) -; CHECK-PPC64-NEXT: stb 5, 1(4) +; CHECK-PPC64-NEXT: sthbrx 3, 0, 4 ; CHECK-PPC64-NEXT: blr entry: %conv1 = trunc i16 %m to i8 @@ -791,3 +574,23 @@ store i8 %conv5, i8* %arrayidx6, align 1 ret void } +; This was found when testing Hexagon in the test suite +; i8* p; i8 v; +; p[0] = v; +; p[1] = v; +define void @store_same_value_to_consecutive_mem(i8* %p, i8 zeroext %v) { +; CHECK-PPC64LE-LABEL: store_same_value_to_consecutive_mem +; CHECK-PPC64LE: # %bb.0: # %entry +; CHECK-PPC64LE-NEXT: stb 4, 0(3) +; CHECK-PPC64LE-NEXT: stb 4, 1(3) +; +; CHECK-PPC64-LABEL: store_same_value_to_consecutive_mem +; CHECK-PPC64: # %bb.0: # %entry +; CHECK-PPC64-NEXT: stb 4, 0(3) +; CHECK-PPC64-NEXT: stb 4, 1(3) +entry: + store i8 %v, i8* %p, align 1 + %arrayidx1 = getelementptr inbounds i8, i8* %p, i64 1 + store i8 %v, i8* %arrayidx1, align 1 + ret void +}