Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4810,6 +4810,27 @@
   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+
+  // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
+  // those uses are extracted directly into a store, then the extract + store
+  // can be store-folded. Therefore, it's probably not worth splitting the load.
+  EVT VT = Load->getValueType(0);
+  if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
+    for (SDNode *U : Load->uses()) {
+      // Skip uses of the chain value. Result 0 of the node is the load value.
+      SDValue LoadVal = SDValue(Load, 0);
+      if (!LoadVal.isOperandOf(U))
+        continue;
+
+      // If this use is not an extract + store, it's probably worth splitting.
+      if (U->getOpcode() != ISD::EXTRACT_SUBVECTOR || !U->hasOneUse() ||
+          U->use_begin()->getOpcode() != ISD::STORE)
+        return true;
+    }
+    // All non-chain uses are extract + store.
+    return false;
+  }
+
   return true;
 }
 
Index: llvm/test/CodeGen/X86/sandybridge-loads.ll
===================================================================
--- llvm/test/CodeGen/X86/sandybridge-loads.ll
+++ llvm/test/CodeGen/X86/sandybridge-loads.ll
@@ -30,10 +30,9 @@
 ; CHECK-LABEL: widestores:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vmovaps (%rsi), %xmm1
-; CHECK-NEXT:    vmovaps 16(%rsi), %xmm2
+; CHECK-NEXT:    vmovaps (%rsi), %ymm1
 ; CHECK-NEXT:    vmovaps %ymm0, (%rsi)
-; CHECK-NEXT:    vmovaps %xmm2, 16(%rdi)
+; CHECK-NEXT:    vextractf128 $1, %ymm1, 16(%rdi)
 ; CHECK-NEXT:    vmovaps %xmm1, (%rdi)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
Index: llvm/test/CodeGen/X86/widen_load-3.ll
===================================================================
--- llvm/test/CodeGen/X86/widen_load-3.ll
+++ llvm/test/CodeGen/X86/widen_load-3.ll
@@ -146,10 +146,10 @@
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%edx), %xmm1
+; X86-AVX-NEXT:    vmovups (%edx), %ymm0
 ; X86-AVX-NEXT:    vmovups %xmm0, (%ecx)
-; X86-AVX-NEXT:    vmovups %xmm1, (%eax)
+; X86-AVX-NEXT:    vextractf128 $1, %ymm0, (%eax)
+; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: load_split:
@@ -162,10 +162,10 @@
 ;
 ; X64-AVX-LABEL: load_split:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovups (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovups 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX-NEXT:    vmovups %xmm0, (%rsi)
-; X64-AVX-NEXT:    vmovups %xmm1, (%rdx)
+; X64-AVX-NEXT:    vextractf128 $1, %ymm0, (%rdx)
+; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
   %t256 = load <8 x float>, <8 x float>* %ld, align 1
   %b128 = shufflevector <8 x float> %t256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -178,39 +178,35 @@
 define void @load_split_more(float* %src, i32* %idx, float* %dst) nounwind {
 ; X86-SSE-LABEL: load_split_more:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl (%edx), %esi
-; X86-SSE-NEXT:    movups (%ecx), %xmm0
-; X86-SSE-NEXT:    movups 16(%ecx), %xmm1
-; X86-SSE-NEXT:    movups %xmm0, (%eax,%esi,4)
-; X86-SSE-NEXT:    movl 4(%edx), %ecx
+; X86-SSE-NEXT:    movups (%edx), %xmm0
+; X86-SSE-NEXT:    movups 16(%edx), %xmm1
+; X86-SSE-NEXT:    movl (%ecx), %edx
+; X86-SSE-NEXT:    movups %xmm0, (%eax,%edx,4)
+; X86-SSE-NEXT:    movl 4(%ecx), %ecx
 ; X86-SSE-NEXT:    movups %xmm1, (%eax,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: load_split_more:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl (%edx), %esi
-; X86-AVX-NEXT:    vmovups (%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%ecx), %xmm1
-; X86-AVX-NEXT:    vmovups %xmm0, (%eax,%esi,4)
-; X86-AVX-NEXT:    movl 4(%edx), %ecx
-; X86-AVX-NEXT:    vmovups %xmm1, (%eax,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    vmovups (%edx), %ymm0
+; X86-AVX-NEXT:    movl (%ecx), %edx
+; X86-AVX-NEXT:    vmovups %xmm0, (%eax,%edx,4)
+; X86-AVX-NEXT:    movl 4(%ecx), %ecx
+; X86-AVX-NEXT:    vextractf128 $1, %ymm0, (%eax,%ecx,4)
+; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: load_split_more:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movslq (%rsi), %rax
 ; X64-SSE-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE-NEXT:    movslq (%rsi), %rax
 ; X64-SSE-NEXT:    movups %xmm0, (%rdx,%rax,4)
 ; X64-SSE-NEXT:    movslq 4(%rsi), %rax
 ; X64-SSE-NEXT:    movups %xmm1, (%rdx,%rax,4)
@@ -218,12 +214,12 @@
 ;
 ; X64-AVX-LABEL: load_split_more:
 ; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX-NEXT:    movslq (%rsi), %rax
-; X64-AVX-NEXT:    vmovups (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovups 16(%rdi), %xmm1
 ; X64-AVX-NEXT:    vmovups %xmm0, (%rdx,%rax,4)
 ; X64-AVX-NEXT:    movslq 4(%rsi), %rax
-; X64-AVX-NEXT:    vmovups %xmm1, (%rdx,%rax,4)
+; X64-AVX-NEXT:    vextractf128 $1, %ymm0, (%rdx,%rax,4)
+; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
   %v.i = bitcast float* %src to <8 x float>*
   %tmp = load <8 x float>, <8 x float>* %v.i, align 1