diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12206,7 +12206,8 @@
     // accessing any of the loaded bytes. If the load was a zextload/extload
     // then the result of the shift+trunc is zero/undef (handled elsewhere).
     ShAmt = SRL1C->getZExtValue();
-    if (ShAmt >= LN->getMemoryVT().getSizeInBits())
+    uint64_t MemoryWidth = LN->getMemoryVT().getSizeInBits();
+    if (ShAmt >= MemoryWidth)
       return SDValue();
 
     // Because a SRL must be assumed to *need* to zero-extend the high bits
@@ -12215,13 +12216,17 @@
     if (LN->getExtensionType() == ISD::SEXTLOAD)
       return SDValue();
 
-    unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
-    // Is the shift amount a multiple of size of ExtVT?
-    if ((ShAmt & (ExtVTBits - 1)) != 0)
-      return SDValue();
-    // Is the load width a multiple of size of ExtVT?
-    if ((SRL.getScalarValueSizeInBits() & (ExtVTBits - 1)) != 0)
+    // Avoid reading outside the memory accessed by the original load (could
+    // happen if we only adjust the load base pointer by ShAmt). Instead we try
+    // to narrow the load even further. The typical scenario here is:
+    //   (i64 (truncate (i96 (srl (load x), 64)))) ->
+    //     (i64 (truncate (i96 (zextload (load i32 + offset) from i32))))
+    if (ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt) {
+      assert(ExtType != ISD::SEXTLOAD && "Don't replace sextload by zextload.");
+      ExtType = ISD::ZEXTLOAD;
+      ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
       return SDValue();
+    }
 
     // If the SRL is only used by a masking AND, we may be able to adjust
     // the ExtVT to make the AND redundant.
@@ -12233,7 +12238,7 @@
         EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
                                          ShiftMask.countTrailingOnes());
         // If the mask is smaller, recompute the type.
-        if ((ExtVTBits > MaskedVT.getScalarSizeInBits()) &&
+        if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
            TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
          ExtVT = MaskedVT;
      }
diff --git a/llvm/test/CodeGen/ARM/shift-combine.ll b/llvm/test/CodeGen/ARM/shift-combine.ll
--- a/llvm/test/CodeGen/ARM/shift-combine.ll
+++ b/llvm/test/CodeGen/ARM/shift-combine.ll
@@ -302,9 +302,7 @@
 ;
 ; CHECK-BE-LABEL: test_lshr_load64_4_unaligned:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldr r1, [r0]
-; CHECK-BE-NEXT:    ldrh r0, [r0, #4]
-; CHECK-BE-NEXT:    orr r0, r0, r1, lsl #16
+; CHECK-BE-NEXT:    ldr r0, [r0, #2]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load64_4_unaligned:
@@ -341,9 +339,7 @@
 ;
 ; CHECK-BE-LABEL: test_lshr_load64_1_lsb:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldr r1, [r0]
-; CHECK-BE-NEXT:    ldrb r0, [r0, #4]
-; CHECK-BE-NEXT:    orr r0, r0, r1, lsl #8
+; CHECK-BE-NEXT:    ldr r0, [r0, #1]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load64_1_lsb:
@@ -441,23 +437,17 @@
 define arm_aapcscc i32 @test_lshr_load4_fail(i64* %a) {
 ; CHECK-ARM-LABEL: test_lshr_load4_fail:
 ; CHECK-ARM:       @ %bb.0: @ %entry
-; CHECK-ARM-NEXT:    ldrd r0, r1, [r0]
-; CHECK-ARM-NEXT:    lsr r0, r0, #8
-; CHECK-ARM-NEXT:    orr r0, r0, r1, lsl #24
+; CHECK-ARM-NEXT:    ldr r0, [r0, #1]
 ; CHECK-ARM-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: test_lshr_load4_fail:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldrd r0, r1, [r0]
-; CHECK-BE-NEXT:    lsr r1, r1, #8
-; CHECK-BE-NEXT:    orr r0, r1, r0, lsl #24
+; CHECK-BE-NEXT:    ldr r0, [r0, #3]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load4_fail:
 ; CHECK-THUMB:       @ %bb.0: @ %entry
-; CHECK-THUMB-NEXT:    ldrd r0, r1, [r0]
-; CHECK-THUMB-NEXT:    lsrs r0, r0, #8
-; CHECK-THUMB-NEXT:    orr.w r0, r0, r1, lsl #24
+; CHECK-THUMB-NEXT:    ldr.w r0, [r0, #1]
 ; CHECK-THUMB-NEXT:    bx lr
 ;
 ; CHECK-ALIGN-LABEL: test_lshr_load4_fail:
diff --git a/llvm/test/CodeGen/X86/combine-srl-load.ll b/llvm/test/CodeGen/X86/combine-srl-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-srl-load.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK
+
+define i16 @load_srl_combine(i32* %arg) {
+; CHECK-LABEL: load_srl_combine:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl 1(%rdi), %eax
+; CHECK-NEXT:    retq
+  %tmp1 = load i32, i32* %arg, align 1
+  %tmp2 = lshr i32 %tmp1, 8
+  %tmp3 = trunc i32 %tmp2 to i16
+  ret i16 %tmp3
+}
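For context, here is a minimal IR sketch of the i96 scenario described by the new comment in reduceLoadWidth. It is not part of this patch's test changes, and the function name load_srl_i96 is made up; it simply shows an input where ExtVT (i64) is wider than MemoryWidth - ShAmt (96 - 64 = 32 bits), i.e. the case the new `ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt` check is guarding against:

; Hypothetical reproducer (not in the patch): after the 64-bit shift only 32
; of the 96 loaded bits remain valid, so simply bumping the load base pointer
; by ShAmt/8 would read past the original access.
define i64 @load_srl_i96(i96* %arg) {
  %tmp1 = load i96, i96* %arg, align 1
  %tmp2 = lshr i96 %tmp1, 64
  %tmp3 = trunc i96 %tmp2 to i64
  ret i64 %tmp3
}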