diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18658,32 +18658,35 @@
   if (!VecEltVT.isByteSized())
     return SDValue();
 
-  Align Alignment = OriginalLoad->getAlign();
-  Align NewAlign = DAG.getDataLayout().getABITypeAlign(
-      VecEltVT.getTypeForEVT(*DAG.getContext()));
-
-  if (NewAlign > Alignment ||
-      !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
+  ISD::LoadExtType ExtTy =
+      ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
+  if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
+      !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
     return SDValue();
 
-  ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
-                           ISD::NON_EXTLOAD : ISD::EXTLOAD;
-  if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
-    return SDValue();
-
-  Alignment = NewAlign;
-
+  Align Alignment = OriginalLoad->getAlign();
   MachinePointerInfo MPI;
   SDLoc DL(EVE);
   if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
     int Elt = ConstEltNo->getZExtValue();
     unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
     MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
+    Alignment = commonAlignment(Alignment, PtrOff);
   } else {
     // Discard the pointer info except the address space because the memory
     // operand can't represent this new access since the offset is variable.
     MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
+    Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
   }
+
+  bool IsFast = false;
+  if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
+                              OriginalLoad->getAddressSpace(), Alignment,
+                              OriginalLoad->getMemOperand()->getFlags(),
+                              &IsFast) ||
+      !IsFast)
+    return SDValue();
+
   SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(),
                                                InVecVT, EltNo);
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -9143,18 +9143,12 @@
 define i32 @load_single_extract_variable_index_v3i32_small_align(<3 x i32>* %A, i32 %idx) {
 ; CHECK-LABEL: load_single_extract_variable_index_v3i32_small_align:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    add x8, x0, #8
 ; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    ld1.s { v0 }[2], [x8]
-; CHECK-NEXT:    and x8, x1, #0x3
-; CHECK-NEXT:    bfi x9, x8, #2, #2
-; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    ldr w0, [x9]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    sxtw x8, w1
+; CHECK-NEXT:    cmp x8, #2
+; CHECK-NEXT:    mov w9, #2
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    ldr w0, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ret
   %lv = load <3 x i32>, <3 x i32>* %A, align 2
   %e = extractelement <3 x i32> %lv, i32 %idx
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
--- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -332,14 +332,14 @@
 ; X86-SSE4A:       # %bb.0:
 ; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE4A-NEXT:    movups (%ecx), %xmm0
-; X86-SSE4A-NEXT:    movups 16(%ecx), %xmm1
+; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
-; X86-SSE4A-NEXT:    movntsd %xmm2, 8(%eax)
 ; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm1, 8(%eax)
 ; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
-; X86-SSE4A-NEXT:    movntsd %xmm1, 16(%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm2, 16(%eax)
 ; X86-SSE4A-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
@@ -360,14 +360,14 @@
 ;
 ; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
 ; X64-SSE4A:       # %bb.0:
-; X64-SSE4A-NEXT:    movups (%rdi), %xmm0
-; X64-SSE4A-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
-; X64-SSE4A-NEXT:    movntsd %xmm2, 8(%rsi)
 ; X64-SSE4A-NEXT:    movntsd %xmm0, (%rsi)
+; X64-SSE4A-NEXT:    movntsd %xmm1, 8(%rsi)
 ; X64-SSE4A-NEXT:    movntsd %xmm3, 24(%rsi)
-; X64-SSE4A-NEXT:    movntsd %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT:    movntsd %xmm2, 16(%rsi)
 ; X64-SSE4A-NEXT:    retq
 ;
 ; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
@@ -445,14 +445,14 @@
 ; X86-SSE4A:       # %bb.0:
 ; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE4A-NEXT:    movups (%ecx), %xmm0
-; X86-SSE4A-NEXT:    movups 16(%ecx), %xmm1
+; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
-; X86-SSE4A-NEXT:    movntsd %xmm2, 8(%eax)
 ; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm1, 8(%eax)
 ; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
-; X86-SSE4A-NEXT:    movntsd %xmm1, 16(%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm2, 16(%eax)
 ; X86-SSE4A-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: merge_2_v4f32_align1:
@@ -473,14 +473,14 @@
 ;
 ; X64-SSE4A-LABEL: merge_2_v4f32_align1:
 ; X64-SSE4A:       # %bb.0:
-; X64-SSE4A-NEXT:    movups (%rdi), %xmm0
-; X64-SSE4A-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
-; X64-SSE4A-NEXT:    movntsd %xmm2, 8(%rsi)
 ; X64-SSE4A-NEXT:    movntsd %xmm0, (%rsi)
+; X64-SSE4A-NEXT:    movntsd %xmm1, 8(%rsi)
 ; X64-SSE4A-NEXT:    movntsd %xmm3, 24(%rsi)
-; X64-SSE4A-NEXT:    movntsd %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT:    movntsd %xmm2, 16(%rsi)
 ; X64-SSE4A-NEXT:    retq
 ;
 ; X64-SSE41-LABEL: merge_2_v4f32_align1: