diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6750,12 +6750,6 @@ return SDValue(); unsigned ByteWidth = VT.getSizeInBits() / 8; - // Before legalize we can introduce too wide illegal loads which will be later - // split into legal sized loads. This enables us to combine i64 load by i8 - // patterns to a couple of i32 loads on 32 bit targets. - if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) - return SDValue(); - bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); auto MemoryByteOffset = [&] (ByteProvider P) { assert(P.isMemory() && "Must be a memory byte provider"); @@ -6778,11 +6772,21 @@ // Check if all the bytes of the OR we are looking at are loaded from the same // base address. Collect bytes offsets from Base address in ByteOffsets. SmallVector ByteOffsets(ByteWidth); - for (unsigned i = 0; i < ByteWidth; i++) { + unsigned ZeroExtendedBytes = 0; + for (int i = ByteWidth - 1; i >= 0; --i) { auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); - if (!P || !P->isMemory()) // All the bytes must be loaded from memory + if (!P) return SDValue(); + if (P->isConstantZero()) { + // It's OK for the N most significant bytes to be 0, we can just + // zero-extend the load. + if (++ZeroExtendedBytes != (ByteWidth - static_cast(i))) + return SDValue(); + continue; + } + assert(P->isMemory() && "provenance should either be memory or zero"); + LoadSDNode *L = P->Load; assert(L->hasNUsesOfValue(1, 0) && L->isSimple() && !L->isIndexed() && @@ -6821,9 +6825,23 @@ assert(Base && "Base address of the accessed memory location must be set"); assert(FirstOffset != INT64_MAX && "First byte offset must be set"); + bool NeedsZext = ZeroExtendedBytes > 0; + + EVT MemVT = + EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8); + + // Before legalize we can introduce too wide illegal loads which will be later + // split into legal sized loads. This enables us to combine i64 load by i8 + // patterns to a couple of i32 loads on 32 bit targets. + if (LegalOperations && + !TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, + MemVT)) + return SDValue(); + // Check if the bytes of the OR we are looking at match with either big or // little endian value load - Optional IsBigEndian = isBigEndian(ByteOffsets, FirstOffset); + Optional IsBigEndian = isBigEndian( + makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset); if (!IsBigEndian.hasValue()) return SDValue(); @@ -6836,7 +6854,8 @@ LoadSDNode *FirstLoad = FirstByteProvider->Load; // The node we are looking at matches with the pattern, check if we can - // replace it with a single load and bswap if needed. + // replace it with a single (possibly zero-extended) load and bswap + shift if + // needed. // If the load needs byte swap check if the target supports it bool NeedsBswap = IsBigEndianTarget != *IsBigEndian; @@ -6844,25 +6863,45 @@ // Before legalize we can introduce illegal bswaps which will be later // converted to an explicit bswap sequence. This way we end up with a single // load and byte shuffling instead of several loads and byte shuffling. - if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) + // We do not introduce illegal bswaps when zero-extending as this tends to + // introduce too many arithmetic instructions. + if (NeedsBswap && (LegalOperations || NeedsZext) && + !TLI.isOperationLegal(ISD::BSWAP, VT)) + return SDValue(); + + // If we need to bswap and zero extend, we have to insert a shift. Check that + // it is legal. + if (NeedsBswap && NeedsZext && LegalOperations && + !TLI.isOperationLegal(ISD::SHL, VT)) return SDValue(); // Check that a load of the wide type is both allowed and fast on the target bool Fast = false; - bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - VT, *FirstLoad->getMemOperand(), &Fast); + bool Allowed = + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, + *FirstLoad->getMemOperand(), &Fast); if (!Allowed || !Fast) return SDValue(); - SDValue NewLoad = - DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), FirstLoad->getAlignment()); + SDValue NewLoad = DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, + SDLoc(N), VT, Chain, FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), MemVT, + FirstLoad->getAlignment()); // Transfer chain users from old loads to the new load. for (LoadSDNode *L : Loads) DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); - return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad; + if (!NeedsBswap) + return NewLoad; + + SDValue ShiftedLoad = + NeedsZext + ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad, + DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT, + SDLoc(N), LegalOperations)) + : NewLoad; + return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad); } // If the target has andn, bsl, or a similar bit-select instruction, diff --git a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll --- a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll @@ -445,10 +445,9 @@ define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: rev w0, w8 ; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* @@ -515,10 +514,7 @@ define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #1] -; CHECK-NEXT: ldrb w9, [x0] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ldrh w0, [x0] ; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll --- a/llvm/test/CodeGen/AArch64/load-combine.ll +++ b/llvm/test/CodeGen/AArch64/load-combine.ll @@ -431,10 +431,7 @@ define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ldrh w0, [x0] ; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* @@ -501,10 +498,9 @@ define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #1] -; CHECK-NEXT: ldrb w9, [x0] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: rev w0, w8 ; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -824,25 +824,23 @@ ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-ARMv6-NEXT: ldrh r0, [r0] +; CHECK-ARMv6-NEXT: lsl r0, r0, #16 +; CHECK-ARMv6-NEXT: rev r0, r0 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: zext_load_i32_by_i8: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldrb r1, [r0] -; CHECK-THUMBv6-NEXT: ldrb r0, [r0, #1] -; CHECK-THUMBv6-NEXT: lsls r0, r0, #8 -; CHECK-THUMBv6-NEXT: adds r0, r0, r1 +; CHECK-THUMBv6-NEXT: ldrh r0, [r0] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #16 +; CHECK-THUMBv6-NEXT: rev r0, r0 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: zext_load_i32_by_i8: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldrb r1, [r0] -; CHECK-THUMBv7-NEXT: ldrb r0, [r0, #1] -; CHECK-THUMBv7-NEXT: lsls r0, r0, #8 -; CHECK-THUMBv7-NEXT: adds r0, r0, r1 +; CHECK-THUMBv7-NEXT: ldrh r0, [r0] +; CHECK-THUMBv7-NEXT: lsls r0, r0, #16 +; CHECK-THUMBv7-NEXT: rev r0, r0 ; CHECK-THUMBv7-NEXT: bx lr %tmp = bitcast i32* %arg to i8* @@ -962,32 +960,22 @@ define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: ; CHECK: @ %bb.0: -; CHECK-NEXT: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-ARMv6-NEXT: ldrh r0, [r0] ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldrb r1, [r0, #1] -; CHECK-THUMBv6-NEXT: ldrb r0, [r0] -; CHECK-THUMBv6-NEXT: lsls r0, r0, #8 -; CHECK-THUMBv6-NEXT: adds r0, r0, r1 +; CHECK-THUMBv6-NEXT: ldrh r0, [r0] ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldrb r1, [r0, #1] -; CHECK-THUMBv7-NEXT: ldrb r0, [r0] -; CHECK-THUMBv7-NEXT: lsls r0, r0, #8 -; CHECK-THUMBv7-NEXT: adds r0, r0, r1 +; CHECK-THUMBv7-NEXT: ldrh r0, [r0] ; CHECK-THUMBv7-NEXT: bx lr %tmp = bitcast i32* %arg to i8* diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -734,31 +734,22 @@ define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: ; CHECK: @ %bb.0: -; CHECK-NEXT: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-ARMv6-NEXT: ldrh r0, [r0] ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: zext_load_i32_by_i8: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldrb r1, [r0] -; CHECK-THUMBv6-NEXT: ldrb r0, [r0, #1] -; CHECK-THUMBv6-NEXT: lsls r0, r0, #8 -; CHECK-THUMBv6-NEXT: adds r0, r0, r1 +; CHECK-THUMBv6-NEXT: ldrh r0, [r0] ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: zext_load_i32_by_i8: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldrb r1, [r0] -; CHECK-THUMBv7-NEXT: ldrb r0, [r0, #1] -; CHECK-THUMBv7-NEXT: orr.w r0, r1, r0, lsl #8 +; CHECK-THUMBv7-NEXT: ldrh r0, [r0] ; CHECK-THUMBv7-NEXT: bx lr %tmp = bitcast i32* %arg to i8* @@ -883,24 +874,23 @@ ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-ARMv6-NEXT: ldrh r0, [r0] +; CHECK-ARMv6-NEXT: lsl r0, r0, #16 +; CHECK-ARMv6-NEXT: rev r0, r0 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldrb r1, [r0, #1] -; CHECK-THUMBv6-NEXT: ldrb r0, [r0] -; CHECK-THUMBv6-NEXT: lsls r0, r0, #8 -; CHECK-THUMBv6-NEXT: adds r0, r0, r1 +; CHECK-THUMBv6-NEXT: ldrh r0, [r0] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #16 +; CHECK-THUMBv6-NEXT: rev r0, r0 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldrb r1, [r0] -; CHECK-THUMBv7-NEXT: ldrb r0, [r0, #1] -; CHECK-THUMBv7-NEXT: orr.w r0, r0, r1, lsl #8 +; CHECK-THUMBv7-NEXT: ldrh r0, [r0] +; CHECK-THUMBv7-NEXT: lsls r0, r0, #16 +; CHECK-THUMBv7-NEXT: rev r0, r0 ; CHECK-THUMBv7-NEXT: bx lr %tmp = bitcast i32* %arg to i8* diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll --- a/llvm/test/CodeGen/X86/load-combine.ll +++ b/llvm/test/CodeGen/X86/load-combine.ll @@ -1119,18 +1119,12 @@ ; CHECK-LABEL: zext_load_i32_by_i8: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl (%eax), %ecx -; CHECK-NEXT: movzbl 1(%eax), %eax -; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movzwl (%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: zext_load_i32_by_i8: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movzbl (%rdi), %ecx -; CHECK64-NEXT: movzbl 1(%rdi), %eax -; CHECK64-NEXT: shll $8, %eax -; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: movzwl (%rdi), %eax ; CHECK64-NEXT: retq %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -1218,18 +1212,16 @@ ; CHECK-LABEL: zext_load_i32_by_i8_bswap: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl 1(%eax), %ecx -; CHECK-NEXT: movzbl (%eax), %eax -; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movzwl (%eax), %eax +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: bswapl %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: zext_load_i32_by_i8_bswap: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movzbl 1(%rdi), %ecx -; CHECK64-NEXT: movzbl (%rdi), %eax -; CHECK64-NEXT: shll $8, %eax -; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: movzwl (%rdi), %eax +; CHECK64-NEXT: shll $16, %eax +; CHECK64-NEXT: bswapl %eax ; CHECK64-NEXT: retq %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1