Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1721,6 +1721,14 @@
     return false;
   }
 
+  /// Return true if a load/store of the non-power-of-2 type \p VT should be
+  /// legalized with misaligned (overlapping) memory accesses.
+  virtual bool allowMisalignedMemForNonPow2Type(
+      EVT, unsigned AddrSpace = 0, Align Alignment = Align(1),
+      MachineMemOperand::Flags Flags = MachineMemOperand::MONone) const {
+    return false;
+  }
+
   /// This function returns true if the memory access is aligned or if the
   /// target allows this specific unaligned memory access. If the access is
   /// allowed, the optional final parameter returns if the access is also fast
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -794,12 +794,25 @@
                         LD->getPointerInfo(), RoundVT, LD->getOriginalAlign(),
                         MMOFlags, AAInfo);
 
+    Align ExtraLoadAlign = LD->getOriginalAlign();
+    unsigned IncSizeBits = RoundWidth;
+    // If the target allows it, fetch the trailing bits with one misaligned
+    // RoundWidth-wide load that overlaps the Lo load, instead of splitting
+    // ExtraWidth into further power-of-2 pieces.
+    if (TLI.allowMisalignedMemForNonPow2Type(SrcVT, LD->getAddressSpace(),
+                                             ExtraLoadAlign, MMOFlags)) {
+      IncSizeBits = ExtraWidth;
+      ExtraVT = RoundVT;
+      ExtraWidth = RoundWidth;
+      ExtraLoadAlign = Align(1);
+    }
+    IncrementSize = IncSizeBits / 8;
+
     // Load the remaining ExtraWidth bits.
-    IncrementSize = RoundWidth / 8;
     Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
     Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
                         LD->getPointerInfo().getWithOffset(IncrementSize),
-                        ExtraVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
+                        ExtraVT, ExtraLoadAlign, MMOFlags, AAInfo);
 
     // Build a factor node to remember that this load is independent of
     // the other one.
@@ -809,7 +822,7 @@
     // Move the top bits to the right place.
     Hi = DAG.getNode(
         ISD::SHL, dl, Hi.getValueType(), Hi,
-        DAG.getConstant(RoundWidth, dl,
+        DAG.getConstant(IncSizeBits, dl,
                         TLI.getShiftAmountTy(Hi.getValueType(), DL)));
 
     // Join the hi and lo parts.
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3572,17 +3572,37 @@
         N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
     EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits);
 
-    // Increment the pointer to the other half.
-    unsigned IncrementSize = NVT.getSizeInBits()/8;
-    Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
-    Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
-                        N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
-                        N->getOriginalAlign(), MMOFlags, AAInfo);
-
-    // Build a factor node to remember that this load is independent of the
-    // other one.
-    Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
-                     Hi.getValue(1));
+    if (TLI.allowMisalignedMemForNonPow2Type(
+            N->getMemoryVT(), N->getAddressSpace(), N->getAlign(), MMOFlags)) {
+      // Fetch the high part with one misaligned NVT-wide load that overlaps
+      // the Lo half, then shift the re-loaded low bits back out, instead of
+      // splitting ExcessBits into further power-of-2 pieces.
+      unsigned IncrementSize = NEVT.getSizeInBits() / 8;
+      Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
+      // The loaded value is a full NVT, so no extension is involved; use a
+      // plain load rather than getExtLoad with MemVT == NVT.
+      Hi = DAG.getLoad(NVT, dl, Ch, Ptr,
+                       N->getPointerInfo().getWithOffset(IncrementSize),
+                       Align(1), MMOFlags, AAInfo);
+      // SRA keeps the correct top bits for sign-extending loads.
+      unsigned Opcode = ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL;
+      unsigned ShiftCnt = NVT.getSizeInBits() - ExcessBits;
+      Hi = DAG.getNode(Opcode, dl, NVT, Hi,
+                       DAG.getConstant(ShiftCnt, dl, NVT));
+    } else {
+      // Increment the pointer to the other half.
+      unsigned IncrementSize = NVT.getSizeInBits() / 8;
+      Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
+      Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
+                          N->getPointerInfo().getWithOffset(IncrementSize),
+                          NEVT, N->getOriginalAlign(), MMOFlags, AAInfo);
+    }
+
+    // Build a factor node to remember that this load is independent of the
+    // other one. This must cover both paths above so the Hi load's chain
+    // result is not dropped.
+    Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                     Hi.getValue(1));
   } else {
     // Big-endian - high bits are at low addresses.  Favor aligned loads at
     // the cost of some bit-fiddling.
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -531,6 +531,12 @@
                                       MachineMemOperand::Flags Flags,
                                       bool *Fast = nullptr) const override;
 
+  /// Return true if a load/store of the non-power-of-2 type \p VT should be
+  /// legalized with misaligned (overlapping) memory accesses.
+  bool allowMisalignedMemForNonPow2Type(
+      EVT, unsigned AddrSpace, Align Alignment,
+      MachineMemOperand::Flags Flags) const override;
+
   /// Provide custom lowering hooks for some operations.
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2042,6 +2042,25 @@
   return true;
 }
 
+bool AArch64TargetLowering::allowMisalignedMemForNonPow2Type(
+    EVT VT, unsigned AddrSpace, Align Alignment,
+    MachineMemOperand::Flags Flags) const {
+  unsigned WidthBits = VT.getSizeInBits().getFixedSize();
+  // With at most two set bits the access already splits into at most two
+  // power-of-2 pieces, so overlapping accesses buy nothing.
+  if (countPopulation(WidthBits) <= 2)
+    return false;
+  // Exact multiples of 64 bits expand into whole registers with no remainder.
+  if ((WidthBits & 63) == 0)
+    return false;
+
+  bool Fast = false;
+  if (!allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, &Fast))
+    return false;
+
+  return Fast;
+}
+
 FastISel *
 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                       const TargetLibraryInfo *libInfo) const {
Index: llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
+++ llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
@@ -15,11 +15,9 @@
 define i56 @ldi56(ptr %p) nounwind {
 ; CHECK-LABEL: ldi56:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #6]
-; CHECK-NEXT:    ldrh w9, [x0, #4]
-; CHECK-NEXT:    ldr w0, [x0]
-; CHECK-NEXT:    bfi w9, w8, #16, #16
-; CHECK-NEXT:    bfi x0, x9, #32, #32
+; CHECK-NEXT:    ldur w8, [x0, #3]
+; CHECK-NEXT:    ldr w9, [x0]
+; CHECK-NEXT:    orr x0, x9, x8, lsl #24
 ; CHECK-NEXT:    ret
   %r = load i56, i56* %p
   ret i56 %r
@@ -39,12 +37,9 @@
 define i120 @ldi120(ptr %p) nounwind {
 ; CHECK-LABEL: ldi120:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #14]
-; CHECK-NEXT:    ldrh w9, [x0, #12]
-; CHECK-NEXT:    ldr w1, [x0, #8]
+; CHECK-NEXT:    ldur x8, [x0, #7]
 ; CHECK-NEXT:    ldr x0, [x0]
-; CHECK-NEXT:    bfi w9, w8, #16, #16
-; CHECK-NEXT:    bfi x1, x9, #32, #32
+; CHECK-NEXT:    lsr x1, x8, #8
 ; CHECK-NEXT:    ret
   %r = load i120, i120* %p
   ret i120 %r
@@ -54,11 +49,10 @@
 ; CHECK-LABEL: ldi280:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x8, x1, [x0]
-; CHECK-NEXT:    ldrb w9, [x0, #34]
-; CHECK-NEXT:    ldrh w4, [x0, #32]
+; CHECK-NEXT:    ldur x9, [x0, #27]
 ; CHECK-NEXT:    ldp x2, x3, [x0, #16]
 ; CHECK-NEXT:    mov x0, x8
-; CHECK-NEXT:    bfi x4, x9, #16, #8
+; CHECK-NEXT:    lsr x4, x9, #40
 ; CHECK-NEXT:    ret
   %r = load i280, i280* %p
   ret i280 %r