diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18658,32 +18658,35 @@
   if (!VecEltVT.isByteSized())
     return SDValue();
 
-  Align Alignment = OriginalLoad->getAlign();
-  Align NewAlign = DAG.getDataLayout().getABITypeAlign(
-      VecEltVT.getTypeForEVT(*DAG.getContext()));
-
-  if (NewAlign > Alignment ||
-      !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
-    return SDValue();
-
-  ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
-    ISD::NON_EXTLOAD : ISD::EXTLOAD;
-  if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
+  ISD::LoadExtType ExtTy =
+      ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
+  if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
+      !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
     return SDValue();
 
-  Alignment = NewAlign;
-
+  Align Alignment;
   MachinePointerInfo MPI;
   SDLoc DL(EVE);
   if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
     int Elt = ConstEltNo->getZExtValue();
     unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
     MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
+    Alignment = commonAlignment(OriginalLoad->getAlign(), PtrOff);
   } else {
     // Discard the pointer info except the address space because the memory
     // operand can't represent this new access since the offset is variable.
     MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
+    Alignment = Align();
   }
+
+  bool IsFast = false;
+  if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
+                              OriginalLoad->getAddressSpace(), Alignment,
+                              OriginalLoad->getMemOperand()->getFlags(),
+                              &IsFast) ||
+      !IsFast)
+    return SDValue();
+
   SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(),
                                                InVecVT, EltNo);
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -9143,18 +9143,12 @@
 define i32 @load_single_extract_variable_index_v3i32_small_align(<3 x i32>* %A, i32 %idx) {
 ; CHECK-LABEL: load_single_extract_variable_index_v3i32_small_align:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    add x8, x0, #8
 ; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    ld1.s { v0 }[2], [x8]
-; CHECK-NEXT:    and x8, x1, #0x3
-; CHECK-NEXT:    bfi x9, x8, #2, #2
-; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    ldr w0, [x9]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    sxtw x8, w1
+; CHECK-NEXT:    cmp x8, #2
+; CHECK-NEXT:    mov w9, #2
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    ldr w0, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ret
   %lv = load <3 x i32>, <3 x i32>* %A, align 2
   %e = extractelement <3 x i32> %lv, i32 %idx
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
--- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -332,14 +332,14 @@
 ; X86-SSE4A:       # %bb.0:
 ; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE4A-NEXT:    movups (%ecx), %xmm0
-; X86-SSE4A-NEXT:    movups 16(%ecx), %xmm1
+; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
-; X86-SSE4A-NEXT:    movntsd %xmm2, 8(%eax)
 ; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm1, 8(%eax)
 ; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
-; X86-SSE4A-NEXT:    movntsd %xmm1, 16(%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm2, 16(%eax)
 ; X86-SSE4A-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
@@ -360,14 +360,14 @@
 ;
 ; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
 ; X64-SSE4A:       # %bb.0:
-; X64-SSE4A-NEXT:    movups (%rdi), %xmm0
-; X64-SSE4A-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
-; X64-SSE4A-NEXT:    movntsd %xmm2, 8(%rsi)
 ; X64-SSE4A-NEXT:    movntsd %xmm0, (%rsi)
+; X64-SSE4A-NEXT:    movntsd %xmm1, 8(%rsi)
 ; X64-SSE4A-NEXT:    movntsd %xmm3, 24(%rsi)
-; X64-SSE4A-NEXT:    movntsd %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT:    movntsd %xmm2, 16(%rsi)
 ; X64-SSE4A-NEXT:    retq
 ;
 ; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
@@ -445,14 +445,14 @@
 ; X86-SSE4A:       # %bb.0:
 ; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE4A-NEXT:    movups (%ecx), %xmm0
-; X86-SSE4A-NEXT:    movups 16(%ecx), %xmm1
+; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
-; X86-SSE4A-NEXT:    movntsd %xmm2, 8(%eax)
 ; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm1, 8(%eax)
 ; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
-; X86-SSE4A-NEXT:    movntsd %xmm1, 16(%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm2, 16(%eax)
 ; X86-SSE4A-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: merge_2_v4f32_align1:
@@ -473,14 +473,14 @@
 ;
 ; X64-SSE4A-LABEL: merge_2_v4f32_align1:
 ; X64-SSE4A:       # %bb.0:
-; X64-SSE4A-NEXT:    movups (%rdi), %xmm0
-; X64-SSE4A-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
-; X64-SSE4A-NEXT:    movntsd %xmm2, 8(%rsi)
 ; X64-SSE4A-NEXT:    movntsd %xmm0, (%rsi)
+; X64-SSE4A-NEXT:    movntsd %xmm1, 8(%rsi)
 ; X64-SSE4A-NEXT:    movntsd %xmm3, 24(%rsi)
-; X64-SSE4A-NEXT:    movntsd %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT:    movntsd %xmm2, 16(%rsi)
 ; X64-SSE4A-NEXT:    retq
 ;
 ; X64-SSE41-LABEL: merge_2_v4f32_align1:
diff --git a/llvm/test/CodeGen/X86/vecloadextract.ll b/llvm/test/CodeGen/X86/vecloadextract.ll
--- a/llvm/test/CodeGen/X86/vecloadextract.ll
+++ b/llvm/test/CodeGen/X86/vecloadextract.ll
@@ -22,7 +22,7 @@
 ; CHECK: [[INDEX:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0)
 ; CHECK: [[MASKED_INDEX:%[0-9]+]]:gr32_nosp = AND32ri8 [[INDEX]], 7, implicit-def dead $eflags
 ; CHECK: [[POINTER:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1)
-; CHECK: [[LOAD:%[0-9]+]]:gr32 = MOV32rm killed [[POINTER]], 4, killed [[MASKED_INDEX]], 0, $noreg :: (load (s32))
+; CHECK: [[LOAD:%[0-9]+]]:gr32 = MOV32rm killed [[POINTER]], 4, killed [[MASKED_INDEX]], 0, $noreg :: (load (s32), align 1)
 ; CHECK: $eax = COPY [[LOAD]]
 ; CHECK: RET 0, $eax
 define i32 @variable_index(<8 x i32>* %v, i32 %i) {
@@ -36,7 +36,7 @@
 ; CHECK: [[INDEX:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0)
 ; CHECK: [[MASKED_INDEX:%[0-9]+]]:gr32_nosp = AND32ri8 [[INDEX]], 7, implicit-def dead $eflags
 ; CHECK: [[POINTER:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1)
-; CHECK: [[LOAD:%[0-9]+]]:gr32 = MOV32rm killed [[POINTER]], 4, killed [[MASKED_INDEX]], 0, $noreg :: (load (s32), addrspace 1)
+; CHECK: [[LOAD:%[0-9]+]]:gr32 = MOV32rm killed [[POINTER]], 4, killed [[MASKED_INDEX]], 0, $noreg :: (load (s32), align 1, addrspace 1)
 ; CHECK: $eax = COPY [[LOAD]]
 ; CHECK: RET 0, $eax
 define i32 @variable_index_with_addrspace(<8 x i32> addrspace(1)* %v, i32 %i) {