Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1721,6 +1721,12 @@
     return false;
   }
 
+  /// Return true if unaligned memory accesses may be used for a non-power-of-2 type.
+  bool allowMisalignedMemForNonPow2Type(
+      LLVMContext &Context, unsigned SrcBits, EVT ExtraVT,
+      unsigned AddrSpace = 0, Align Alignment = Align(1),
+      MachineMemOperand::Flags Flags = MachineMemOperand::MONone) const;
+
   /// This function returns true if the memory access is aligned or if the
   /// target allows this specific unaligned memory access. If the access is
   /// allowed, the optional final parameter returns if the access is also fast
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -781,8 +781,9 @@
     assert(ExtraWidth < RoundWidth);
     assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
            "Load size not an integral number of bytes!");
-    EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
-    EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+    LLVMContext &Context = *DAG.getContext();
+    EVT RoundVT = EVT::getIntegerVT(Context, RoundWidth);
+    EVT ExtraVT = EVT::getIntegerVT(Context, ExtraWidth);
     SDValue Lo, Hi, Ch;
     unsigned IncrementSize;
     auto &DL = DAG.getDataLayout();
@@ -794,8 +795,20 @@
                           LD->getPointerInfo(), RoundVT, LD->getOriginalAlign(),
                           MMOFlags, AAInfo);
 
+      Align ExtraAlign = Align(1ull << countTrailingZeros(ExtraWidth / 8));
+      unsigned IncSizeBits = RoundWidth;
+      if (TLI.allowMisalignedMemForNonPow2Type(Context, SrcWidthBits, RoundVT,
+                                               LD->getAddressSpace(),
+                                               ExtraAlign, MMOFlags)) {
+        IncSizeBits = ExtraWidth;
+        ExtraVT = RoundVT;
+        ExtraWidth = RoundWidth;
+      } else {
+        ExtraAlign = LD->getOriginalAlign();
+      }
+
       // Load the remaining ExtraWidth bits.
-      IncrementSize = RoundWidth / 8;
+      IncrementSize = IncSizeBits / 8;
       Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
       Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
                           LD->getPointerInfo().getWithOffset(IncrementSize),
@@ -809,7 +822,7 @@
       // Move the top bits to the right place.
       Hi = DAG.getNode(
           ISD::SHL, dl, Hi.getValueType(), Hi,
-          DAG.getConstant(RoundWidth, dl,
+          DAG.getConstant(IncSizeBits, dl,
                           TLI.getShiftAmountTy(Hi.getValueType(), DL)));
 
       // Join the hi and lo parts.
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3564,25 +3564,48 @@
       Hi = DAG.getUNDEF(NVT);
     }
   } else if (DAG.getDataLayout().isLittleEndian()) {
+    LLVMContext &Context = *DAG.getContext();
+    unsigned SrcBits = N->getMemoryVT().getSizeInBits();
+    unsigned RoundBits = NVT.getSizeInBits();
     // Little-endian - low bits are at low addresses.
    Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(),
                      N->getOriginalAlign(), MMOFlags, AAInfo);
 
-    unsigned ExcessBits =
-        N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
-    EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits);
-
-    // Increment the pointer to the other half.
-    unsigned IncrementSize = NVT.getSizeInBits()/8;
-    Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
-    Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
-                        N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
-                        N->getOriginalAlign(), MMOFlags, AAInfo);
-
-    // Build a factor node to remember that this load is independent of the
-    // other one.
-    Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
-                     Hi.getValue(1));
+    unsigned ExcessBits = SrcBits - RoundBits;
+    EVT NEVT = EVT::getIntegerVT(Context, ExcessBits);
+
+    EVT HiVT = NEVT.getRoundIntegerType(Context);
+    unsigned OverlappedBits = HiVT.getSizeInBits() - ExcessBits;
+    unsigned IncBits = RoundBits - OverlappedBits;
+    Align ExtraAlign = Align(1ull << countTrailingZeros((IncBits + 7) / 8));
+    // Only use the unaligned load when the extra part is smaller than the
+    // largest integer type the target supports, i.e. when
+    // "HiVT == getTypeToTransformTo(HiVT)".
+    if (HiVT == TLI.getTypeToTransformTo(Context, HiVT) &&
+        TLI.allowMisalignedMemForNonPow2Type(Context, SrcBits, HiVT,
+                                             N->getAddressSpace(), ExtraAlign,
+                                             MMOFlags)) {
+      // Use an unaligned, overlapping load to simplify the non-power-of-2 type.
+      unsigned IncrementSize = IncBits / 8;
+      Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
+      Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
+                          N->getPointerInfo().getWithOffset(IncrementSize),
+                          HiVT, N->getOriginalAlign(), MMOFlags, AAInfo);
+      unsigned Opcode = ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL;
+      Hi = DAG.getNode(Opcode, dl, NVT, Hi,
+                       DAG.getConstant(OverlappedBits, dl, NVT));
+    } else {
+      // Increment the pointer to the other half.
+      unsigned IncrementSize = RoundBits / 8;
+      Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
+      Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
+                          N->getPointerInfo().getWithOffset(IncrementSize),
+                          NEVT, N->getOriginalAlign(), MMOFlags, AAInfo);
+      // Build a factor node to remember that this load is independent of the
+      // other one.
+      Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                       Hi.getValue(1));
+    }
   } else {
     // Big-endian - high bits are at low addresses. Favor aligned loads at
     // the cost of some bit-fiddling.
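Note: the expansion above replaces the narrow high-part load with a wider load that deliberately overlaps the low part, then shifts the duplicated bits away (SRL, or SRA for sign-extending loads). A minimal C++ sketch of the equivalent transformation for an i120 load, assuming a little-endian, byte-addressable target that tolerates misaligned accesses; the helper name load_i120 is hypothetical, for illustration only:

    #include <cstdint>
    #include <cstring>

    // An in-memory i120 occupies 15 bytes. Instead of splitting the high
    // 56 bits into 4-, 2-, and 1-byte loads, load 8 bytes starting at
    // offset 7 so the access overlaps the low half by one byte, then
    // shift the 8 duplicated bits away (OverlappedBits == 8 here).
    void load_i120(const uint8_t *P, uint64_t Out[2]) {
      uint64_t Lo, Hi;
      std::memcpy(&Lo, P, 8);     // bits [0, 64)
      std::memcpy(&Hi, P + 7, 8); // bits [56, 120), overlaps Lo by 8 bits
      Out[0] = Lo;
      Out[1] = Hi >> 8;           // drop the 8 overlapping bits
    }

This corresponds to the new ldi120 output in the AArch64 test below: ldur x8, [x0, #7] / ldr x0, [x0] / lsr x1, x8, #8.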
Index: llvm/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1760,6 +1760,26 @@
                                     MMO.getFlags(), Fast);
 }
 
+bool TargetLoweringBase::allowMisalignedMemForNonPow2Type(
+    LLVMContext &Context, unsigned SrcBits, EVT ExtraVT, unsigned AddrSpace,
+    Align Alignment, MachineMemOperand::Flags Flags) const {
+  // If the popcount of SrcBits is at most 2, the value splits into at most
+  // two naturally aligned parts, so an unaligned load/store gains nothing.
+  if (countPopulation(SrcBits) <= 2)
+    return false;
+
+  // If the source width is not a whole number of bytes, don't use unaligned accesses.
+  if ((SrcBits & 7) != 0)
+    return false;
+
+  bool Fast;
+  if (!allowsMisalignedMemoryAccesses(ExtraVT, AddrSpace, Alignment, Flags,
+                                      &Fast))
+    return false;
+
+  return Fast;
+}
+
 //===----------------------------------------------------------------------===//
 //  TargetTransformInfo Helpers
 //===----------------------------------------------------------------------===//
Index: llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
+++ llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
@@ -15,11 +15,9 @@
 define i56 @ldi56(ptr %p) nounwind {
 ; CHECK-LABEL: ldi56:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #6]
-; CHECK-NEXT:    ldrh w9, [x0, #4]
-; CHECK-NEXT:    ldr w0, [x0]
-; CHECK-NEXT:    bfi w9, w8, #16, #16
-; CHECK-NEXT:    bfi x0, x9, #32, #32
+; CHECK-NEXT:    ldur w8, [x0, #3]
+; CHECK-NEXT:    ldr w9, [x0]
+; CHECK-NEXT:    orr x0, x9, x8, lsl #24
 ; CHECK-NEXT:    ret
   %r = load i56, i56* %p
   ret i56 %r
@@ -39,12 +37,9 @@
 define i120 @ldi120(ptr %p) nounwind {
 ; CHECK-LABEL: ldi120:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #14]
-; CHECK-NEXT:    ldrh w9, [x0, #12]
-; CHECK-NEXT:    ldr w1, [x0, #8]
+; CHECK-NEXT:    ldur x8, [x0, #7]
 ; CHECK-NEXT:    ldr x0, [x0]
-; CHECK-NEXT:    bfi w9, w8, #16, #16
-; CHECK-NEXT:    bfi x1, x9, #32, #32
+; CHECK-NEXT:    lsr x1, x8, #8
 ; CHECK-NEXT:    ret
   %r = load i120, i120* %p
   ret i120 %r
@@ -54,11 +49,10 @@
 ; CHECK-LABEL: ldi280:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x8, x1, [x0]
-; CHECK-NEXT:    ldrb w9, [x0, #34]
-; CHECK-NEXT:    ldrh w4, [x0, #32]
+; CHECK-NEXT:    ldur w9, [x0, #31]
 ; CHECK-NEXT:    ldp x2, x3, [x0, #16]
 ; CHECK-NEXT:    mov x0, x8
-; CHECK-NEXT:    bfi x4, x9, #16, #8
+; CHECK-NEXT:    ubfx x4, x9, #8, #24
 ; CHECK-NEXT:    ret
   %r = load i280, i280* %p
   ret i280 %r
@@ -128,15 +122,15 @@
 define void @i56_or(ptr %a) {
 ; CHECK-LABEL: i56_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    ldur w8, [x0, #3]
 ; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrh w10, [x8, #4]!
-; CHECK-NEXT:    ldrb w11, [x8, #2]
+; CHECK-NEXT:    lsr x10, x8, #8
+; CHECK-NEXT:    orr w9, w9, w8, lsl #24
+; CHECK-NEXT:    lsr x8, x8, #24
 ; CHECK-NEXT:    orr w9, w9, #0x180
-; CHECK-NEXT:    bfi w10, w11, #16, #16
+; CHECK-NEXT:    strh w10, [x0, #4]
+; CHECK-NEXT:    strb w8, [x0, #6]
 ; CHECK-NEXT:    str w9, [x0]
-; CHECK-NEXT:    strb w11, [x8, #2]
-; CHECK-NEXT:    strh w10, [x8]
 ; CHECK-NEXT:    ret
   %aa = load i56, ptr %a, align 1
   %b = or i56 %aa, 384
@@ -147,16 +141,16 @@
 define void @i56_and_or(ptr %a) {
 ; CHECK-LABEL: i56_and_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, x0
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrh w10, [x8, #4]!
-; CHECK-NEXT:    ldrb w11, [x8, #2]
-; CHECK-NEXT:    orr w9, w9, #0x180
-; CHECK-NEXT:    and w9, w9, #0xffffff80
-; CHECK-NEXT:    bfi w10, w11, #16, #16
-; CHECK-NEXT:    strb w11, [x8, #2]
-; CHECK-NEXT:    str w9, [x0]
-; CHECK-NEXT:    strh w10, [x8]
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    ldur w9, [x0, #3]
+; CHECK-NEXT:    orr w8, w8, w9, lsl #24
+; CHECK-NEXT:    lsr x10, x9, #8
+; CHECK-NEXT:    orr w8, w8, #0x180
+; CHECK-NEXT:    lsr x9, x9, #24
+; CHECK-NEXT:    and w8, w8, #0xffffff80
+; CHECK-NEXT:    strh w10, [x0, #4]
+; CHECK-NEXT:    strb w9, [x0, #6]
+; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   %b = load i56, ptr %a, align 1
   %c = and i56 %b, -128
@@ -168,17 +162,16 @@
 define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
 ; CHECK-LABEL: i56_insert_bit:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, x0
-; CHECK-NEXT:    ldr w11, [x0]
-; CHECK-NEXT:    ldrh w9, [x8, #4]!
-; CHECK-NEXT:    ldrb w10, [x8, #2]
-; CHECK-NEXT:    bfi w9, w10, #16, #8
-; CHECK-NEXT:    strb w10, [x8, #2]
-; CHECK-NEXT:    bfi x11, x9, #32, #24
-; CHECK-NEXT:    strh w9, [x8]
-; CHECK-NEXT:    and x11, x11, #0xffffffffffffdfff
-; CHECK-NEXT:    orr w11, w11, w1, lsl #13
-; CHECK-NEXT:    str w11, [x0]
+; CHECK-NEXT:    ldur w8, [x0, #3]
+; CHECK-NEXT:    ldr w9, [x0]
+; CHECK-NEXT:    orr x8, x9, x8, lsl #24
+; CHECK-NEXT:    and x8, x8, #0xffffffffffffdfff
+; CHECK-NEXT:    lsr x9, x8, #32
+; CHECK-NEXT:    lsr x10, x8, #48
+; CHECK-NEXT:    orr w8, w8, w1, lsl #13
+; CHECK-NEXT:    strh w9, [x0, #4]
+; CHECK-NEXT:    strb w10, [x0, #6]
+; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   %extbit = zext i1 %bit to i56
   %b = load i56, ptr %a, align 1
@@ -193,8 +186,15 @@
 ; CHECK-LABEL: i120_or:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    ldur x9, [x0, #7]
 ; CHECK-NEXT:    orr x8, x8, #0x180
+; CHECK-NEXT:    lsr x10, x9, #56
+; CHECK-NEXT:    lsr x11, x9, #40
+; CHECK-NEXT:    lsr x9, x9, #8
 ; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    strb w10, [x0, #14]
+; CHECK-NEXT:    strh w11, [x0, #12]
+; CHECK-NEXT:    str w9, [x0, #8]
 ; CHECK-NEXT:    ret
   %aa = load i120, ptr %a, align 1
   %b = or i120 %aa, 384
@@ -219,8 +219,17 @@
 ; CHECK-LABEL: i248_or:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    ldur x9, [x0, #23]
+; CHECK-NEXT:    ldr x10, [x0, #16]
 ; CHECK-NEXT:    orr x8, x8, #0x180
+; CHECK-NEXT:    lsr x11, x9, #56
+; CHECK-NEXT:    str x10, [x0, #16]
+; CHECK-NEXT:    lsr x10, x9, #40
+; CHECK-NEXT:    lsr x9, x9, #8
 ; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    strb w11, [x0, #30]
+; CHECK-NEXT:    strh w10, [x0, #28]
+; CHECK-NEXT:    str w9, [x0, #24]
 ; CHECK-NEXT:    ret
   %aa = load i248, ptr %a, align 1
   %b = or i248 %aa, 384
@@ -231,9 +240,18 @@
 define void @i304_or(ptr %a) {
 ; CHECK-LABEL: i304_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    orr x8, x8, #0x180
-; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ldur x8, [x0, #30]
+; CHECK-NEXT:    ldr x9, [x0]
+; CHECK-NEXT:    ldr x10, [x0, #24]
+; CHECK-NEXT:    ldur q0, [x0, #8]
+; CHECK-NEXT:    orr x9, x9, #0x180
+; CHECK-NEXT:    str x10, [x0, #24]
+; CHECK-NEXT:    lsr x10, x8, #48
+; CHECK-NEXT:    lsr x8, x8, #16
+; CHECK-NEXT:    stur q0, [x0, #8]
+; CHECK-NEXT:    str x9, [x0]
+; CHECK-NEXT:    strh w10, [x0, #36]
+; CHECK-NEXT:    str w8, [x0, #32]
 ; CHECK-NEXT:    ret
   %aa = load i304, ptr %a, align 1
   %b = or i304 %aa, 384
Index: llvm/test/CodeGen/X86/funnel-shift.ll
===================================================================
--- llvm/test/CodeGen/X86/funnel-shift.ll
+++ llvm/test/CodeGen/X86/funnel-shift.ll
@@ -983,21 +983,19 @@
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    leal (%eax,%eax,2), %edx
-; X86-SSE2-NEXT:    movzwl 8(%ecx,%edx,4), %esi
-; X86-SSE2-NEXT:    movl 4(%ecx,%edx,4), %edi
-; X86-SSE2-NEXT:    shrdl $8, %esi, %edi
-; X86-SSE2-NEXT:    xorl %eax, %edi
-; X86-SSE2-NEXT:    sarl $31, %eax
-; X86-SSE2-NEXT:    movzbl 10(%ecx,%edx,4), %ecx
-; X86-SSE2-NEXT:    shll $16, %ecx
-; X86-SSE2-NEXT:    orl %esi, %ecx
-; X86-SSE2-NEXT:    shll $8, %ecx
+; X86-SSE2-NEXT:    movl 4(%ecx,%edx,4), %esi
+; X86-SSE2-NEXT:    movl 7(%ecx,%edx,4), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, %edx
-; X86-SSE2-NEXT:    sarl $8, %edx
-; X86-SSE2-NEXT:    sarl $31, %ecx
-; X86-SSE2-NEXT:    shldl $24, %edx, %ecx
+; X86-SSE2-NEXT:    movl %ecx, %edi
+; X86-SSE2-NEXT:    shrl $8, %ecx
+; X86-SSE2-NEXT:    shldl $24, %esi, %ecx
 ; X86-SSE2-NEXT:    xorl %eax, %ecx
-; X86-SSE2-NEXT:    orl %ecx, %edi
+; X86-SSE2-NEXT:    sarl $31, %eax
+; X86-SSE2-NEXT:    sarl $8, %edx
+; X86-SSE2-NEXT:    sarl $31, %edi
+; X86-SSE2-NEXT:    shldl $24, %edx, %edi
+; X86-SSE2-NEXT:    xorl %eax, %edi
+; X86-SSE2-NEXT:    orl %edi, %ecx
 ; X86-SSE2-NEXT:    jne .LBB46_1
 ; X86-SSE2-NEXT:  # %bb.2:
 ; X86-SSE2-NEXT:    popl %esi
@@ -1012,12 +1010,10 @@
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    movslq %edi, %rax
 ; X64-AVX2-NEXT:    leaq (%rax,%rax,2), %rcx
-; X64-AVX2-NEXT:    movsbq 10(%rsi,%rcx,4), %rdx
-; X64-AVX2-NEXT:    shlq $16, %rdx
-; X64-AVX2-NEXT:    movzwl 8(%rsi,%rcx,4), %edi
-; X64-AVX2-NEXT:    orq %rdx, %rdi
-; X64-AVX2-NEXT:    movq (%rsi,%rcx,4), %rcx
-; X64-AVX2-NEXT:    shrdq $40, %rdi, %rcx
+; X64-AVX2-NEXT:    movq (%rsi,%rcx,4), %rdx
+; X64-AVX2-NEXT:    movslq 7(%rsi,%rcx,4), %rcx
+; X64-AVX2-NEXT:    shrq $8, %rcx
+; X64-AVX2-NEXT:    shldq $24, %rdx, %rcx
 ; X64-AVX2-NEXT:    cmpq %rax, %rcx
 ; X64-AVX2-NEXT:    jne .LBB46_1
 ; X64-AVX2-NEXT:  # %bb.2:
Index: llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
===================================================================
--- llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -108,19 +108,29 @@
 define void @i56_or(ptr %a) {
 ; X86-LABEL: i56_or:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl $384, (%eax) # imm = 0x180
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 3(%ecx), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shrl $8, %edx
+; X86-NEXT:    movw %dx, 4(%ecx)
+; X86-NEXT:    shrl $24, %eax
+; X86-NEXT:    movb %al, 6(%ecx)
+; X86-NEXT:    orl $384, (%ecx) # imm = 0x180
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i56_or:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzbl 6(%rdi), %eax
-; X64-NEXT:    shll $16, %eax
-; X64-NEXT:    movzwl 4(%rdi), %ecx
-; X64-NEXT:    movw %cx, 4(%rdi)
-; X64-NEXT:    shrq $16, %rax
+; X64-NEXT:    movl 3(%rdi), %eax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shlq $24, %rcx
+; X64-NEXT:    movl (%rdi), %edx
+; X64-NEXT:    orl %ecx, %edx
+; X64-NEXT:    orl $384, %edx # imm = 0x180
+; X64-NEXT:    shrq $24, %rax
 ; X64-NEXT:    movb %al, 6(%rdi)
-; X64-NEXT:    orl $384, (%rdi) # imm = 0x180
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    movw %cx, 4(%rdi)
+; X64-NEXT:    movl %edx, (%rdi)
 ; X64-NEXT:    retq
   %aa = load i56, ptr %a, align 1
   %b = or i56 %aa, 384
@@ -131,30 +141,39 @@
 define void @i56_and_or(ptr %a) {
 ; X86-LABEL: i56_and_or:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $384, %ecx # imm = 0x180
-; X86-NEXT:    orl (%eax), %ecx
-; X86-NEXT:    andl $-128, %ecx
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl 3(%eax), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrl $8, %edx
+; X86-NEXT:    movl $384, %esi # imm = 0x180
+; X86-NEXT:    orl (%eax), %esi
+; X86-NEXT:    andl $-128, %esi
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movw %dx, 4(%eax)
+; X86-NEXT:    shrl $24, %ecx
+; X86-NEXT:    movb %cl, 6(%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i56_and_or:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzwl 4(%rdi), %eax
-; X64-NEXT:    movzbl 6(%rdi), %ecx
-; X64-NEXT:    movb %cl, 6(%rdi)
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    shll $16, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    shlq $32, %rcx
 ; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    orq %rcx, %rax
-; X64-NEXT:    orq $384, %rax # imm = 0x180
-; X64-NEXT:    movabsq $72057594037927808, %rcx # imm = 0xFFFFFFFFFFFF80
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    movl %ecx, (%rdi)
-; X64-NEXT:    shrq $32, %rcx
-; X64-NEXT:    movw %cx, 4(%rdi)
+; X64-NEXT:    movl 3(%rdi), %ecx
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    shlq $24, %rdx
+; X64-NEXT:    orq %rax, %rdx
+; X64-NEXT:    orq $384, %rdx # imm = 0x180
+; X64-NEXT:    movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80
+; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    shrq $24, %rcx
+; X64-NEXT:    movb %cl, 6(%rdi)
+; X64-NEXT:    movl %eax, (%rdi)
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movw %ax, 4(%rdi)
 ; X64-NEXT:    retq
   %b = load i56, ptr %a, align 1
   %c = and i56 %b, -128
@@ -166,32 +185,46 @@
 define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
 ; X86-LABEL: i56_insert_bit:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $13, %ecx
-; X86-NEXT:    movl $-8193, %edx # imm = 0xDFFF
-; X86-NEXT:    andl (%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 3(%eax), %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    shrl $8, %esi
+; X86-NEXT:    shll $13, %edx
+; X86-NEXT:    movl $-8193, %edi # imm = 0xDFFF
+; X86-NEXT:    andl (%eax), %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movw %si, 4(%eax)
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    shrl $24, %ecx
+; X86-NEXT:    movb %cl, 6(%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i56_insert_bit:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzwl 4(%rdi), %eax
-; X64-NEXT:    movzbl 6(%rdi), %ecx
-; X64-NEXT:    movb %cl, 6(%rdi)
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    shll $16, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    shlq $32, %rcx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    movl 3(%rdi), %eax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shlq $24, %rcx
+; X64-NEXT:    movl (%rdi), %edx
+; X64-NEXT:    orl %ecx, %edx
 ; X64-NEXT:    shll $13, %esi
-; X64-NEXT:    andq $-8193, %rax # imm = 0xDFFF
-; X64-NEXT:    orl %eax, %esi
-; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    movw %ax, 4(%rdi)
-; X64-NEXT:    movl %esi, (%rdi)
+; X64-NEXT:    andl $-8193, %edx # imm = 0xDFFF
+; X64-NEXT:    orl %esi, %edx
+; X64-NEXT:    shrq $24, %rax
+; X64-NEXT:    movb %al, 6(%rdi)
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    movw %cx, 4(%rdi)
+; X64-NEXT:    movl %edx, (%rdi)
 ; X64-NEXT:    retq
   %extbit = zext i1 %bit to i56
   %b = load i56, ptr %a, align 1
@@ -205,12 +238,34 @@
 define void @i120_or(ptr %a) {
 ; X86-LABEL: i120_or:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl $384, (%eax) # imm = 0x180
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 8(%ecx), %edx
+; X86-NEXT:    movl 11(%ecx), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shrl $8, %esi
+; X86-NEXT:    movl %edx, 8(%ecx)
+; X86-NEXT:    movw %si, 12(%ecx)
+; X86-NEXT:    shrl $24, %eax
+; X86-NEXT:    movb %al, 14(%ecx)
+; X86-NEXT:    orl $384, (%ecx) # imm = 0x180
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i120_or:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq 7(%rdi), %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shrq $8, %rcx
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    shrq $56, %rdx
+; X64-NEXT:    movb %dl, 14(%rdi)
+; X64-NEXT:    shrq $40, %rax
+; X64-NEXT:    movw %ax, 12(%rdi)
+; X64-NEXT:    movl %ecx, 8(%rdi)
 ; X64-NEXT:    orq $384, (%rdi) # imm = 0x180
 ; X64-NEXT:    retq
   %aa = load i120, ptr %a, align 1
@@ -239,12 +294,36 @@
 define void @i248_or(ptr %a) {
 ; X86-LABEL: i248_or:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl $384, (%eax) # imm = 0x180
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 24(%ecx), %edx
+; X86-NEXT:    movl 27(%ecx), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shrl $8, %esi
+; X86-NEXT:    movl %edx, 24(%ecx)
+; X86-NEXT:    movw %si, 28(%ecx)
+; X86-NEXT:    shrl $24, %eax
+; X86-NEXT:    movb %al, 30(%ecx)
+; X86-NEXT:    orl $384, (%ecx) # imm = 0x180
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i248_or:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq 16(%rdi), %rax
+; X64-NEXT:    movq 23(%rdi), %rcx
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    shrq $8, %rdx
+; X64-NEXT:    movq %rax, 16(%rdi)
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    shrq $56, %rax
+; X64-NEXT:    movb %al, 30(%rdi)
+; X64-NEXT:    shrq $40, %rcx
+; X64-NEXT:    movw %cx, 28(%rdi)
+; X64-NEXT:    movl %edx, 24(%rdi)
 ; X64-NEXT:    orq $384, (%rdi) # imm = 0x180
 ; X64-NEXT:    retq
   %aa = load i248, ptr %a, align 1
@@ -262,6 +341,18 @@
 ;
 ; X64-LABEL: i304_or:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq 30(%rdi), %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shrq $16, %rcx
+; X64-NEXT:    movq 8(%rdi), %r8
+; X64-NEXT:    movq 16(%rdi), %rsi
+; X64-NEXT:    movq 24(%rdi), %rdx
+; X64-NEXT:    movq %rdx, 24(%rdi)
+; X64-NEXT:    movq %rsi, 16(%rdi)
+; X64-NEXT:    movq %r8, 8(%rdi)
+; X64-NEXT:    shrq $48, %rax
+; X64-NEXT:    movw %ax, 36(%rdi)
+; X64-NEXT:    movl %ecx, 32(%rdi)
 ; X64-NEXT:    orq $384, (%rdi) # imm = 0x180
 ; X64-NEXT:    retq
   %aa = load i304, ptr %a, align 1
Index: llvm/test/CodeGen/X86/shrink-compare-pgso.ll
===================================================================
--- llvm/test/CodeGen/X86/shrink-compare-pgso.ll
+++ llvm/test/CodeGen/X86/shrink-compare-pgso.ll
@@ -98,11 +98,9 @@
 define dso_local void @test5(i32 %X) nounwind !prof !14 {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movzbl x+6(%rip), %eax
-; CHECK-NEXT:    shll $16, %eax
-; CHECK-NEXT:    movzwl x+4(%rip), %ecx
-; CHECK-NEXT:    orl %eax, %ecx
-; CHECK-NEXT:    cmpl $1, %ecx
+; CHECK-NEXT:    movl x+3(%rip), %eax
+; CHECK-NEXT:    shrq $8, %rax
+; CHECK-NEXT:    cmpl $1, %eax
 ; CHECK-NEXT:    jne bar # TAILCALL
 ; CHECK-NEXT:  # %bb.1: # %if.end
 ; CHECK-NEXT:    retq
Index: llvm/test/CodeGen/X86/shrink-compare.ll
===================================================================
--- llvm/test/CodeGen/X86/shrink-compare.ll
+++ llvm/test/CodeGen/X86/shrink-compare.ll
@@ -98,11 +98,9 @@
 define dso_local void @test5(i32 %X) nounwind minsize {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movzbl x+6(%rip), %eax
-; CHECK-NEXT:    shll $16, %eax
-; CHECK-NEXT:    movzwl x+4(%rip), %ecx
-; CHECK-NEXT:    orl %eax, %ecx
-; CHECK-NEXT:    cmpl $1, %ecx
+; CHECK-NEXT:    movl x+3(%rip), %eax
+; CHECK-NEXT:    shrq $8, %rax
+; CHECK-NEXT:    cmpl $1, %eax
 ; CHECK-NEXT:    jne bar # TAILCALL
 ; CHECK-NEXT:  # %bb.1: # %if.end
 ; CHECK-NEXT:    retq
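For the i56 cases throughout these tests, the same trick turns the old 4+2+1-byte load sequence into two 32-bit loads that overlap by one byte. A minimal C++ sketch, assuming a little-endian, byte-addressable target where misaligned 32-bit loads are legal and fast; the helper name load_i56 is hypothetical, for illustration only:

    #include <cstdint>
    #include <cstring>

    // Load a 7-byte little-endian integer with two overlapping 32-bit
    // loads: bytes [0,4) and bytes [3,7). The bottom byte of Hi
    // duplicates the top byte of Lo, so ORing (Hi << 24) into Lo is
    // safe: the overlapping bits are identical.
    uint64_t load_i56(const uint8_t *P) {
      uint32_t Lo, Hi;
      std::memcpy(&Lo, P, 4);     // bits [0, 32)
      std::memcpy(&Hi, P + 3, 4); // bits [24, 56)
      return uint64_t(Lo) | (uint64_t(Hi) << 24);
    }

This matches the new ldi56 output in the AArch64 test: ldur w8, [x0, #3] / ldr w9, [x0] / orr x0, x9, x8, lsl #24.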