Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12854,20 +12854,24 @@
   BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
   BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
   int64_t Offset;
+  if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
+    return SDValue();
+
+  // Normalize for Endianness. After this Offset=0 will denote that the least
+  // significant bit in the loaded value maps to the least significant bit in
+  // the stored value). With Offset=n (for n > 0) the loaded value starts at the
+  // n:th least significant byte of the stored value.
+  if (DAG.getDataLayout().isBigEndian())
+    Offset = (STMemType.getStoreSizeInBits() -
+              LDMemType.getStoreSizeInBits()) / 8 - Offset;
+
   // Check that the stored value cover all bits that are loaded.
   bool STCoversLD =
-      BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset) && (Offset >= 0) &&
-      (Offset * 8 <= LDMemType.getSizeInBits()) &&
+      (Offset >= 0) &&
       (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
-
   if (!STCoversLD)
     return SDValue();
 
-  // Normalize for Endianness.
-  if (DAG.getDataLayout().isBigEndian())
-    Offset =
-        (STMemType.getSizeInBits() - LDMemType.getSizeInBits()) / 8 - Offset;
-
   // Memory as copy space (potentially masked).
   if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
     // Simple case: Direct non-truncating forwarding
@@ -12899,7 +12903,7 @@
       continue;
     if (STMemType != LDMemType) {
       // TODO: Support vectors? This requires extract_subvector/bitcast.
-      if (!STMemType.isVector() && !LDMemType.isVector() && 
+      if (!STMemType.isVector() && !LDMemType.isVector() &&
           STMemType.isInteger() && LDMemType.isInteger())
         Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
       else
Index: llvm/trunk/test/CodeGen/AArch64/load-store-forwarding.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/load-store-forwarding.ll
+++ llvm/trunk/test/CodeGen/AArch64/load-store-forwarding.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64_be -o - %s | FileCheck %s --check-prefix CHECK-BE
+; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s --check-prefix CHECK-LE
+
+define i8 @test1(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test1:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test1:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 0
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
+
+define i8 @test2(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test2:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1, #1]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test2:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    ubfx w0, w0, #8, #8
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 1
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
+
+define i8 @test3(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test3:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1, #2]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test3:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    ubfx w0, w0, #16, #8
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 2
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
+
+define i8 @test4(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test4:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test4:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    lsr w0, w0, #24
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 3
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
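
Note: the following is a minimal standalone sketch of what the normalized Offset in the patch computes, written as plain C++ rather than LLVM code. The names (forwardedValue, StVal, StSize, LdSize) are illustrative only; sizes are in bytes and the stored value is modeled as an ordinary integer.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Forward a load of LdSize bytes at byte offset Offset out of a store of
// StSize bytes whose value is StVal. Mirrors the patch: on big-endian the
// memory offset is flipped so that Offset counts bytes from the least
// significant end of the stored value.
uint64_t forwardedValue(uint64_t StVal, int StSize, int LdSize,
                        int64_t Offset, bool BigEndian) {
  if (BigEndian)
    Offset = (StSize - LdSize) - Offset; // the patch's formula, in bytes
  // The "STCoversLD" condition: the load must lie inside the store.
  assert(Offset >= 0 && Offset + LdSize <= StSize);
  uint64_t Mask = (LdSize >= 8) ? ~0ULL : ((1ULL << (LdSize * 8)) - 1);
  return (StVal >> (Offset * 8)) & Mask;
}

int main() {
  // The @test3 pattern: store i32 0xAABBCCDD, load i8 at byte offset 2.
  // Little-endian memory is DD CC BB AA, so byte 2 is 0xBB (w0 >> 16).
  // Big-endian memory is AA BB CC DD, so byte 2 is 0xCC; the normalized
  // offset is (4 - 1) - 2 = 1, i.e. (w0 >> 8) & 0xff.
  printf("LE: %#llx\n",
         (unsigned long long)forwardedValue(0xAABBCCDDu, 4, 1, 2, false));
  printf("BE: %#llx\n",
         (unsigned long long)forwardedValue(0xAABBCCDDu, 4, 1, 2, true));
  return 0;
}

The same arithmetic is visible in the CHECK lines above: on little-endian, the i8 load at byte offset n becomes an extract of bits 8n..8n+7 of w0 (ubfx in test2/test3, lsr in test4), while test4 on big-endian is the one case whose normalized offset is 0, so the low byte of w0 is already the loaded value and no extra instruction is needed.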