diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12197,7 +12197,8 @@
     // accessing any of the loaded bytes. If the load was a zextload/extload
     // then the result of the shift+trunc is zero/undef (handled elsewhere).
     ShAmt = SRL1C->getZExtValue();
-    if (ShAmt >= LN->getMemoryVT().getSizeInBits())
+    uint64_t MemoryWidth = LN->getMemoryVT().getSizeInBits();
+    if (ShAmt >= MemoryWidth)
       return SDValue();
 
     // Because a SRL must be assumed to *need* to zero-extend the high bits
@@ -12206,13 +12207,19 @@
     if (LN->getExtensionType() == ISD::SEXTLOAD)
       return SDValue();
 
-    unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
-    // Is the shift amount a multiple of size of ExtVT?
-    if ((ShAmt & (ExtVTBits-1)) != 0)
-      return SDValue();
-    // Is the load width a multiple of size of ExtVT?
-    if ((SRL.getScalarValueSizeInBits() & (ExtVTBits - 1)) != 0)
-      return SDValue();
+    // Avoid reading outside the memory accessed by the original load (could
+    // happen if we only adjust the load base pointer by ShAmt). Instead we try
+    // to narrow the load even further. The typical scenario here is:
+    //   (i64 (truncate (i96 (srl (load x), 64)))) ->
+    //     (i64 (truncate (i96 (zextload (load i32 + offset) from i32))))
+    if (ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt) {
+      // Don't replace a sextload by a zextload.
+      if (ExtType == ISD::SEXTLOAD)
+        return SDValue();
+      // Narrow the load.
+      ExtType = ISD::ZEXTLOAD;
+      ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
+    }
 
     // If the SRL is only used by a masking AND, we may be able to adjust
     // the ExtVT to make the AND redundant.
@@ -12224,7 +12231,7 @@
       EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
                                        ShiftMask.countTrailingOnes());
       // If the mask is smaller, recompute the type.
-      if ((ExtVTBits > MaskedVT.getScalarSizeInBits()) &&
+      if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
          TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
        ExtVT = MaskedVT;
     }
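As a sketch of the new narrowing path, consider the following standalone IR (a hypothetical reproducer, not taken from the patch or its tests): the truncate demands 16 bits, but only the top 8 bits of the 64-bit load survive the shift, so ExtVT.getScalarSizeInBits() (16) exceeds MemoryWidth - ShAmt (8) and the combine now rewrites the access as an 8-bit zextload at the adjusted offset instead of bailing out:

define i16 @narrow_high_byte(i64* %p) {
entry:
  %v = load i64, i64* %p, align 8
  %sh = lshr i64 %v, 56      ; only bits 56..63 of the load remain
  %t = trunc i64 %sh to i16  ; demands 16 bits, but just 8 are available
  ret i16 %t                 ; expected to become (i16 (zextload i8 from %p+7)) on LE
}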
diff --git a/llvm/test/CodeGen/ARM/shift-combine.ll b/llvm/test/CodeGen/ARM/shift-combine.ll
--- a/llvm/test/CodeGen/ARM/shift-combine.ll
+++ b/llvm/test/CodeGen/ARM/shift-combine.ll
@@ -302,9 +302,7 @@
 ;
 ; CHECK-BE-LABEL: test_lshr_load64_4_unaligned:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldr r1, [r0]
-; CHECK-BE-NEXT:    ldrh r0, [r0, #4]
-; CHECK-BE-NEXT:    orr r0, r0, r1, lsl #16
+; CHECK-BE-NEXT:    ldr r0, [r0, #2]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load64_4_unaligned:
@@ -341,9 +339,7 @@
 ;
 ; CHECK-BE-LABEL: test_lshr_load64_1_lsb:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldr r1, [r0]
-; CHECK-BE-NEXT:    ldrb r0, [r0, #4]
-; CHECK-BE-NEXT:    orr r0, r0, r1, lsl #8
+; CHECK-BE-NEXT:    ldr r0, [r0, #1]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load64_1_lsb:
@@ -441,23 +437,17 @@
 define arm_aapcscc i32 @test_lshr_load4_fail(i64* %a) {
 ; CHECK-ARM-LABEL: test_lshr_load4_fail:
 ; CHECK-ARM:       @ %bb.0: @ %entry
-; CHECK-ARM-NEXT:    ldrd r0, r1, [r0]
-; CHECK-ARM-NEXT:    lsr r0, r0, #8
-; CHECK-ARM-NEXT:    orr r0, r0, r1, lsl #24
+; CHECK-ARM-NEXT:    ldr r0, [r0, #1]
 ; CHECK-ARM-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: test_lshr_load4_fail:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldrd r0, r1, [r0]
-; CHECK-BE-NEXT:    lsr r1, r1, #8
-; CHECK-BE-NEXT:    orr r0, r1, r0, lsl #24
+; CHECK-BE-NEXT:    ldr r0, [r0, #3]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load4_fail:
 ; CHECK-THUMB:       @ %bb.0: @ %entry
-; CHECK-THUMB-NEXT:    ldrd r0, r1, [r0]
-; CHECK-THUMB-NEXT:    lsrs r0, r0, #8
-; CHECK-THUMB-NEXT:    orr.w r0, r0, r1, lsl #24
+; CHECK-THUMB-NEXT:    ldr.w r0, [r0, #1]
 ; CHECK-THUMB-NEXT:    bx lr
 ;
 ; CHECK-ALIGN-LABEL: test_lshr_load4_fail:
diff --git a/llvm/test/CodeGen/PowerPC/pr39478.ll b/llvm/test/CodeGen/PowerPC/pr39478.ll
--- a/llvm/test/CodeGen/PowerPC/pr39478.ll
+++ b/llvm/test/CodeGen/PowerPC/pr39478.ll
@@ -3,16 +3,21 @@
 ; RUN: llc < %s -mtriple=powerpc64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=CHECKBE
 ; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff -verify-machineinstrs | FileCheck %s --check-prefix=CHECKBE
 
+; FIXME: It should be possible to fold the addi+lwbrx into a single lbz, given
+; that only 8 bits are demanded. Maybe that can already be detected when
+; selecting the PPCISD::LWBRX.
 define void @pr39478(i64* %p64, i32* %p32) {
 ; CHECKLE-LABEL: pr39478:
 ; CHECKLE:       # %bb.0: # %entry
-; CHECKLE-NEXT:    lbz 3, 4(3)
+; CHECKLE-NEXT:    addi 3, 3, 1
+; CHECKLE-NEXT:    lwbrx 3, 0, 3
 ; CHECKLE-NEXT:    stb 3, 0(4)
 ; CHECKLE-NEXT:    blr
 ;
 ; CHECKBE-LABEL: pr39478:
 ; CHECKBE:       # %bb.0: # %entry
-; CHECKBE-NEXT:    lbz 3, 3(3)
+; CHECKBE-NEXT:    addi 3, 3, 3
+; CHECKBE-NEXT:    lwbrx 3, 0, 3
 ; CHECKBE-NEXT:    stb 3, 3(4)
 ; CHECKBE-NEXT:    blr
 entry:
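To illustrate the FIXME above, a hedged sketch of the problematic shape (the actual pr39478 body is truncated here, so this is only an approximation with made-up names): a byte-reversed load, selected as lwbrx on PowerPC, whose result has only its low byte demanded, which in principle could be shrunk to a single lbz of the corresponding byte:

declare i32 @llvm.bswap.i32(i32)

define void @low_byte_of_bswapped_load(i32* %p, i8* %q) {
entry:
  %v = load i32, i32* %p, align 4
  %r = call i32 @llvm.bswap.i32(i32 %v) ; (bswap (load)) -> lwbrx
  %b = trunc i32 %r to i8               ; only 8 bits of %r are demanded
  store i8 %b, i8* %q, align 1
  ret void
}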