Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4127,11 +4127,12 @@ // cost model for unaligned load / store. bool Fast; unsigned AS = 0; - if (NumMemOps && AllowOverlap && - VTSize >= 8 && NewVTSize < Size && - TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign, &Fast) && Fast) + if (NumMemOps && AllowOverlap && VTSize >= 8 && NewVTSize < Size && + // Specify 1-byte alignment because we know this access must + // be unaligned: it overlaps with the previous aligned access. + TLI.allowsMisalignedMemoryAccesses(VT, AS, 1, &Fast) && Fast) { VTSize = Size; - else { + } else { VT = NewVT; VTSize = NewVTSize; } Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1795,12 +1795,17 @@ } bool -X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned, - unsigned, +X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddressSpace, + unsigned Alignment, bool *Fast) const { - if (Fast) - *Fast = Subtarget->isUnalignedMemAccessFast(); + if (Fast) { + if (Alignment == 0 || Alignment >= VT.getSizeInBits() / 8) + *Fast = true; + else if (VT.getSizeInBits() == 256) + *Fast = !Subtarget->isUnalignedMem32Slow(); + else + *Fast = Subtarget->isUnalignedMemAccessFast(); + } return true; } @@ -10932,14 +10937,24 @@ // --> load32 addr if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.is256BitVector() && SubVecVT.is128BitVector() && - !Subtarget->isUnalignedMem32Slow()) { - SDValue SubVec2 = Vec.getOperand(1); - if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) { - if (Idx2->getZExtValue() == 0) { - SDValue Ops[] = { SubVec2, SubVec }; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) - return Ld; 
+ OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through a bitcast to get to the load. + if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST) + SubVec2 = SubVec2.getOperand(0); + if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget->getTargetLowering(); + if (TLI->allowsMisalignedMemoryAccesses(OpVT, AS, Alignment, &Fast) && + Fast) { + SDValue Ops[] = { SubVec2, SubVec }; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; + } } } } Index: test/CodeGen/X86/unaligned-32-byte-memops.ll =================================================================== --- test/CodeGen/X86/unaligned-32-byte-memops.ll +++ test/CodeGen/X86/unaligned-32-byte-memops.ll @@ -75,12 +75,12 @@ ret <8 x float> %v3 } +; If the first load is 32-byte aligned, then the loads should be merged in all cases. + define <8 x float> @combine_16_byte_loads_aligned(<4 x float>* %ptr) { -;; FIXME: The first load is 32-byte aligned, so the second load should get merged. ; AVXSLOW-LABEL: combine_16_byte_loads_aligned: ; AVXSLOW: # BB#0: -; AVXSLOW-NEXT: vmovaps 48(%rdi), %xmm0 -; AVXSLOW-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 +; AVXSLOW-NEXT: vmovaps 48(%rdi), %ymm0 ; AVXSLOW-NEXT: retq ; ; AVXFAST-LABEL: combine_16_byte_loads_aligned: