Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -410,8 +410,9 @@
     return false;
   }
 
-  /// Should we merge stores after Legalization (generally
-  /// better quality) or before (simpler)
+  /// Allow store merging after legalization in addition to before
+  /// legalization. This may catch stores that do not exist earlier
+  /// (e.g., stores created from intrinsics).
   virtual bool mergeStoresAfterLegalization() const { return false; }
 
   /// Returns if it's reasonable to merge stores to MemVT size.
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13565,10 +13565,10 @@
                              Ptr, ST->getMemoryVT(), ST->getMemOperand());
   }
 
-  // Only perform this optimization before the types are legal, because we
-  // don't want to perform this optimization on every DAGCombine invocation.
-  if ((TLI.mergeStoresAfterLegalization()) ? Level == AfterLegalizeDAG
-                                           : !LegalTypes) {
+  // Always perform this optimization before types are legal. If the target
+  // prefers, also try this after legalization to catch stores that were
+  // created by intrinsics or other nodes.
+  if (!LegalTypes || TLI.mergeStoresAfterLegalization()) {
     for (;;) {
       // There can be multiple store sequences on the same chain.
       // Keep trying to merge store sequences until we are unable to do so
Index: test/CodeGen/X86/clear_upper_vector_element_bits.ll
===================================================================
--- test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -1046,20 +1046,21 @@
 ; AVX1-NEXT: pushq %r13
 ; AVX1-NEXT: pushq %r12
 ; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
 ; AVX1-NEXT: movq %rcx, %r8
 ; AVX1-NEXT: movq %rcx, %r9
 ; AVX1-NEXT: movq %rcx, %r10
 ; AVX1-NEXT: movq %rcx, %r11
 ; AVX1-NEXT: movq %rcx, %r14
 ; AVX1-NEXT: movq %rcx, %r15
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
 ; AVX1-NEXT: movq %rdx, %r12
 ; AVX1-NEXT: movq %rdx, %r13
-; AVX1-NEXT: movq %rdx, %rbx
-; AVX1-NEXT: movq %rdx, %rax
 ; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %rax
 ; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %rbx
 ; AVX1-NEXT: movq %rdx, %rbp
 ; AVX1-NEXT: andb $15, %dl
 ; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
@@ -1069,18 +1070,18 @@
 ; AVX1-NEXT: shrq $56, %rbp
 ; AVX1-NEXT: andb $15, %bpl
 ; AVX1-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $48, %rsi
+; AVX1-NEXT: shrq $48, %rbx
+; AVX1-NEXT: andb $15, %bl
+; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $40, %rsi
 ; AVX1-NEXT: andb $15, %sil
 ; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $40, %rdi
-; AVX1-NEXT: andb $15, %dil
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX1-NEXT: shrq $32, %rax
 ; AVX1-NEXT: andb $15, %al
 ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $24, %rbx
-; AVX1-NEXT: andb $15, %bl
-; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $24, %rdi
+; AVX1-NEXT: andb $15, %dil
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX1-NEXT: shrq $16, %r13
 ; AVX1-NEXT: andb $15, %r13b
 ; AVX1-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
@@ -1216,20 +1217,21 @@
 ; AVX2-NEXT: pushq %r13
 ; AVX2-NEXT: pushq %r12
 ; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
 ; AVX2-NEXT: movq %rcx, %r8
 ; AVX2-NEXT: movq %rcx, %r9
 ; AVX2-NEXT: movq %rcx, %r10
 ; AVX2-NEXT: movq %rcx, %r11
 ; AVX2-NEXT: movq %rcx, %r14
 ; AVX2-NEXT: movq %rcx, %r15
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
 ; AVX2-NEXT: movq %rdx, %r12
 ; AVX2-NEXT: movq %rdx, %r13
-; AVX2-NEXT: movq %rdx, %rbx
-; AVX2-NEXT: movq %rdx, %rax
 ; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %rax
 ; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %rbx
 ; AVX2-NEXT: movq %rdx, %rbp
 ; AVX2-NEXT: andb $15, %dl
 ; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
@@ -1239,18 +1241,18 @@
 ; AVX2-NEXT: shrq $56, %rbp
 ; AVX2-NEXT: andb $15, %bpl
 ; AVX2-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $48, %rsi
+; AVX2-NEXT: shrq $48, %rbx
+; AVX2-NEXT: andb $15, %bl
+; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $40, %rsi
 ; AVX2-NEXT: andb $15, %sil
 ; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $40, %rdi
-; AVX2-NEXT: andb $15, %dil
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX2-NEXT: shrq $32, %rax
 ; AVX2-NEXT: andb $15, %al
 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $24, %rbx
-; AVX2-NEXT: andb $15, %bl
-; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $24, %rdi
+; AVX2-NEXT: andb $15, %dil
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX2-NEXT: shrq $16, %r13
 ; AVX2-NEXT: andb $15, %r13b
 ; AVX2-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
Index: test/CodeGen/X86/merge-consecutive-loads-128.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -544,8 +544,8 @@
 ; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx
 ; X32-SSE1-NEXT: movw %di, 10(%eax)
 ; X32-SSE1-NEXT: movw %cx, 14(%eax)
-; X32-SSE1-NEXT: movl %edx, (%eax)
 ; X32-SSE1-NEXT: movl %esi, 6(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
 ; X32-SSE1-NEXT: popl %esi
 ; X32-SSE1-NEXT: popl %edi
 ; X32-SSE1-NEXT: retl $4
@@ -626,8 +626,8 @@
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-SSE1-NEXT: movl 8(%ecx), %edx
 ; X32-SSE1-NEXT: movzwl 14(%ecx), %ecx
-; X32-SSE1-NEXT: movl %edx, (%eax)
 ; X32-SSE1-NEXT: movw %cx, 6(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
 ; X32-SSE1-NEXT: movl $0, 12(%eax)
 ; X32-SSE1-NEXT: movl $0, 8(%eax)
 ; X32-SSE1-NEXT: retl $4
@@ -698,8 +698,8 @@
 ; X32-SSE1-NEXT: movb %cl, 15(%eax)
 ; X32-SSE1-NEXT: movw %bx, 11(%eax)
 ; X32-SSE1-NEXT: movl %edi, 7(%eax)
-; X32-SSE1-NEXT: movw %bp, (%eax)
 ; X32-SSE1-NEXT: movl %esi, 3(%eax)
+; X32-SSE1-NEXT: movw %bp, (%eax)
 ; X32-SSE1-NEXT: popl %esi
 ; X32-SSE1-NEXT: popl %edi
 ; X32-SSE1-NEXT: popl %ebx
@@ -773,8 +773,8 @@
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-SSE1-NEXT: movzwl (%ecx), %edx
 ; X32-SSE1-NEXT: movb 3(%ecx), %cl
-; X32-SSE1-NEXT: movw %dx, (%eax)
 ; X32-SSE1-NEXT: movb %cl, 3(%eax)
+; X32-SSE1-NEXT: movw %dx, (%eax)
 ; X32-SSE1-NEXT: movb $0, 15(%eax)
 ; X32-SSE1-NEXT: movw $0, 13(%eax)
 ; X32-SSE1-NEXT: movw $0, 6(%eax)
Index: test/CodeGen/X86/stores-merging.ll
===================================================================
--- test/CodeGen/X86/stores-merging.ll
+++ test/CodeGen/X86/stores-merging.ll
@@ -13,8 +13,9 @@
 define void @redundant_stores_merging() {
 ; CHECK-LABEL: redundant_stores_merging:
 ; CHECK: # BB#0:
-; CHECK-NEXT: movabsq $1958505086977, %rax # imm = 0x1C800000001
+; CHECK-NEXT: movabsq $528280977409, %rax # imm = 0x7B00000001
 ; CHECK-NEXT: movq %rax, e+{{.*}}(%rip)
+; CHECK-NEXT: movl $456, e+{{.*}}(%rip) # imm = 0x1C8
 ; CHECK-NEXT: retq
   store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
   store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
@@ -26,8 +27,9 @@
 define void @redundant_stores_merging_reverse() {
 ; CHECK-LABEL: redundant_stores_merging_reverse:
 ; CHECK: # BB#0:
-; CHECK-NEXT: movabsq $1958505086977, %rax # imm = 0x1C800000001
+; CHECK-NEXT: movabsq $528280977409, %rax # imm = 0x7B00000001
 ; CHECK-NEXT: movq %rax, e+{{.*}}(%rip)
+; CHECK-NEXT: movl $456, e+{{.*}}(%rip) # imm = 0x1C8
 ; CHECK-NEXT: retq
   store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
   store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
@@ -57,22 +59,7 @@
 define void @extract_vector_store_16_consecutive_bytes(<2 x i64> %v, i8* %ptr) #0 {
 ; CHECK-LABEL: extract_vector_store_16_consecutive_bytes:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpextrb $0, %xmm0, (%rdi)
-; CHECK-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; CHECK-NEXT: vpextrb $2, %xmm0, 2(%rdi)
-; CHECK-NEXT: vpextrb $3, %xmm0, 3(%rdi)
-; CHECK-NEXT: vpextrb $4, %xmm0, 4(%rdi)
-; CHECK-NEXT: vpextrb $5, %xmm0, 5(%rdi)
-; CHECK-NEXT: vpextrb $6, %xmm0, 6(%rdi)
-; CHECK-NEXT: vpextrb $7, %xmm0, 7(%rdi)
-; CHECK-NEXT: vpextrb $8, %xmm0, 8(%rdi)
-; CHECK-NEXT: vpextrb $9, %xmm0, 9(%rdi)
-; CHECK-NEXT: vpextrb $10, %xmm0, 10(%rdi)
-; CHECK-NEXT: vpextrb $11, %xmm0, 11(%rdi)
-; CHECK-NEXT: vpextrb $12, %xmm0, 12(%rdi)
-; CHECK-NEXT: vpextrb $13, %xmm0, 13(%rdi)
-; CHECK-NEXT: vpextrb $14, %xmm0, 14(%rdi)
-; CHECK-NEXT: vpextrb $15, %xmm0, 15(%rdi)
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
 ; CHECK-NEXT: retq
   %bc = bitcast <2 x i64> %v to <16 x i8>
   %ext00 = extractelement <16 x i8> %bc, i32 0
@@ -131,39 +118,7 @@
 define void @extract_vector_store_32_consecutive_bytes(<4 x i64> %v, i8* %ptr) #0 {
 ; CHECK-LABEL: extract_vector_store_32_consecutive_bytes:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpextrb $0, %xmm0, (%rdi)
-; CHECK-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; CHECK-NEXT: vpextrb $2, %xmm0, 2(%rdi)
-; CHECK-NEXT: vpextrb $3, %xmm0, 3(%rdi)
-; CHECK-NEXT: vpextrb $4, %xmm0, 4(%rdi)
-; CHECK-NEXT: vpextrb $5, %xmm0, 5(%rdi)
-; CHECK-NEXT: vpextrb $6, %xmm0, 6(%rdi)
-; CHECK-NEXT: vpextrb $7, %xmm0, 7(%rdi)
-; CHECK-NEXT: vpextrb $8, %xmm0, 8(%rdi)
-; CHECK-NEXT: vpextrb $9, %xmm0, 9(%rdi)
-; CHECK-NEXT: vpextrb $10, %xmm0, 10(%rdi)
-; CHECK-NEXT: vpextrb $11, %xmm0, 11(%rdi)
-; CHECK-NEXT: vpextrb $12, %xmm0, 12(%rdi)
-; CHECK-NEXT: vpextrb $13, %xmm0, 13(%rdi)
-; CHECK-NEXT: vpextrb $14, %xmm0, 14(%rdi)
-; CHECK-NEXT: vpextrb $15, %xmm0, 15(%rdi)
-; CHECK-NEXT: vpextrb $0, %xmm1, 16(%rdi)
-; CHECK-NEXT: vpextrb $1, %xmm1, 17(%rdi)
-; CHECK-NEXT: vpextrb $2, %xmm1, 18(%rdi)
-; CHECK-NEXT: vpextrb $3, %xmm1, 19(%rdi)
-; CHECK-NEXT: vpextrb $4, %xmm1, 20(%rdi)
-; CHECK-NEXT: vpextrb $5, %xmm1, 21(%rdi)
-; CHECK-NEXT: vpextrb $6, %xmm1, 22(%rdi)
-; CHECK-NEXT: vpextrb $7, %xmm1, 23(%rdi)
-; CHECK-NEXT: vpextrb $8, %xmm1, 24(%rdi)
-; CHECK-NEXT: vpextrb $9, %xmm1, 25(%rdi)
-; CHECK-NEXT: vpextrb $10, %xmm1, 26(%rdi)
-; CHECK-NEXT: vpextrb $11, %xmm1, 27(%rdi)
-; CHECK-NEXT: vpextrb $12, %xmm1, 28(%rdi)
-; CHECK-NEXT: vpextrb $13, %xmm1, 29(%rdi)
-; CHECK-NEXT: vpextrb $14, %xmm1, 30(%rdi)
-; CHECK-NEXT: vpextrb $15, %xmm1, 31(%rdi)
+; CHECK-NEXT: vmovups %ymm0, (%rdi)
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %bc = bitcast <4 x i64> %v to <32 x i8>
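
Note: a target opts into the post-legalization run by overriding the hook this patch adds to TargetLowering. Below is a minimal sketch of such an override; MyTargetLowering is a hypothetical target class used only for illustration, and only mergeStoresAfterLegalization() itself comes from this patch.

#include "llvm/Target/TargetLowering.h"

namespace llvm {

// Hypothetical target lowering that requests store merging after
// legalization as well as before it.
class MyTargetLowering : public TargetLowering {
public:
  explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {}

  // Returning true makes DAGCombiner retry MergeConsecutiveStores after
  // legalization, picking up stores that only appear late (for example,
  // stores produced when intrinsics are lowered).
  bool mergeStoresAfterLegalization() const override { return true; }
};

} // end namespace llvm

With the default (false), behavior is unchanged and merging still runs only before types are legal, so targets pay the extra DAGCombine cost only if they ask for it.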