diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12214,7 +12214,8 @@
     // accessing any of the loaded bytes. If the load was a zextload/extload
     // then the result of the shift+trunc is zero/undef (handled elsewhere).
     ShAmt = SRL1C->getZExtValue();
-    if (ShAmt >= LN->getMemoryVT().getSizeInBits())
+    uint64_t MemoryWidth = LN->getMemoryVT().getSizeInBits();
+    if (ShAmt >= MemoryWidth)
       return SDValue();
 
     // Because a SRL must be assumed to *need* to zero-extend the high bits
@@ -12223,13 +12224,19 @@
     if (LN->getExtensionType() == ISD::SEXTLOAD)
       return SDValue();
 
-    unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
-    // Is the shift amount a multiple of size of ExtVT?
-    if ((ShAmt & (ExtVTBits - 1)) != 0)
-      return SDValue();
-    // Is the load width a multiple of size of ExtVT?
-    if ((SRL.getScalarValueSizeInBits() & (ExtVTBits - 1)) != 0)
-      return SDValue();
+    // Avoid reading outside the memory accessed by the original load (could
+    // happen if we only adjust the load base pointer by ShAmt). Instead we
+    // try to narrow the load even further. The typical scenario here is:
+    //   (i64 (truncate (i96 (srl (load x), 64)))) ->
+    //     (i64 (truncate (i96 (zextload (load i32 + offset) from i32))))
+    if (ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt) {
+      // Don't replace sextload by zextload.
+      if (ExtType == ISD::SEXTLOAD)
+        return SDValue();
+      // Narrow the load.
+      ExtType = ISD::ZEXTLOAD;
+      ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
+    }
 
     // If the SRL is only used by a masking AND, we may be able to adjust
     // the ExtVT to make the AND redundant.
@@ -12241,7 +12248,7 @@
         EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
                                          ShiftMask.countTrailingOnes());
         // If the mask is smaller, recompute the type.
-        if ((ExtVTBits > MaskedVT.getScalarSizeInBits()) &&
+        if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
             TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
           ExtVT = MaskedVT;
       }
diff --git a/llvm/test/CodeGen/ARM/shift-combine.ll b/llvm/test/CodeGen/ARM/shift-combine.ll
--- a/llvm/test/CodeGen/ARM/shift-combine.ll
+++ b/llvm/test/CodeGen/ARM/shift-combine.ll
@@ -302,9 +302,7 @@
 ;
 ; CHECK-BE-LABEL: test_lshr_load64_4_unaligned:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldr r1, [r0]
-; CHECK-BE-NEXT:    ldrh r0, [r0, #4]
-; CHECK-BE-NEXT:    orr r0, r0, r1, lsl #16
+; CHECK-BE-NEXT:    ldr r0, [r0, #2]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load64_4_unaligned:
@@ -341,9 +339,7 @@
 ;
 ; CHECK-BE-LABEL: test_lshr_load64_1_lsb:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldr r1, [r0]
-; CHECK-BE-NEXT:    ldrb r0, [r0, #4]
-; CHECK-BE-NEXT:    orr r0, r0, r1, lsl #8
+; CHECK-BE-NEXT:    ldr r0, [r0, #1]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load64_1_lsb:
@@ -441,23 +437,17 @@
 define arm_aapcscc i32 @test_lshr_load4_fail(i64* %a) {
 ; CHECK-ARM-LABEL: test_lshr_load4_fail:
 ; CHECK-ARM:       @ %bb.0: @ %entry
-; CHECK-ARM-NEXT:    ldrd r0, r1, [r0]
-; CHECK-ARM-NEXT:    lsr r0, r0, #8
-; CHECK-ARM-NEXT:    orr r0, r0, r1, lsl #24
+; CHECK-ARM-NEXT:    ldr r0, [r0, #1]
 ; CHECK-ARM-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: test_lshr_load4_fail:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldrd r0, r1, [r0]
-; CHECK-BE-NEXT:    lsr r1, r1, #8
-; CHECK-BE-NEXT:    orr r0, r1, r0, lsl #24
+; CHECK-BE-NEXT:    ldr r0, [r0, #3]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load4_fail:
 ; CHECK-THUMB:       @ %bb.0: @ %entry
-; CHECK-THUMB-NEXT:    ldrd r0, r1, [r0]
-; CHECK-THUMB-NEXT:    lsrs r0, r0, #8
-; CHECK-THUMB-NEXT:    orr.w r0, r0, r1, lsl #24
+; CHECK-THUMB-NEXT:    ldr.w r0, [r0, #1]
 ; CHECK-THUMB-NEXT:    bx lr
 ;
 ; CHECK-ALIGN-LABEL: test_lshr_load4_fail:
diff --git a/llvm/test/CodeGen/X86/shift-folding.ll b/llvm/test/CodeGen/X86/shift-folding.ll
--- a/llvm/test/CodeGen/X86/shift-folding.ll
+++ b/llvm/test/CodeGen/X86/shift-folding.ll
@@ -88,9 +88,7 @@
 ; CHECK-LABEL: srl_load_narrowing1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl (%eax), %eax
-; CHECK-NEXT:    shrl $8, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    movzwl 1(%eax), %eax
 ; CHECK-NEXT:    retl
   %tmp1 = load i32, i32* %arg, align 1
   %tmp2 = lshr i32 %tmp1, 8
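
For illustration only (this note and test are not part of the patch): the
scenario in the new DAGCombiner comment corresponds to IR like the sketch
below. The function name is hypothetical; the types mirror the comment's own
i96/i64 example.

; Before this change the combine gave up on this pattern because the 96-bit
; load width is not a multiple of the 64-bit result width. With the change it
; narrows the access instead: ExtVT becomes i32 (the MemoryWidth - ShAmt = 32
; bits that remain valid after the shift), and the load is rebuilt as a
; zextload of those four bytes, at byte offset 8 on a little-endian target.
define i64 @lshr_trunc_i96_load(i96* %p) {
  %val = load i96, i96* %p, align 1
  %shifted = lshr i96 %val, 64
  %res = trunc i96 %shifted to i64
  ret i64 %res
}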