Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -6540,14 +6540,20 @@
     }
   }
 
-  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
+  SmallVector<LoadSDNode *, 8> Loads;
+  for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
+    if (LoadMask[i])
+      Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
+
+  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
     auto MMOFlags = LDBase->getMemOperand()->getFlags();
     assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
            "Cannot merge volatile loads.");
     SDValue NewLd =
         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                     LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
-    DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
+    for (auto *LD : Loads)
+      DAG.makeEquivalentMemoryOrdering(LD, NewLd);
     return NewLd;
   };
 
@@ -6612,7 +6618,8 @@
                                   LDBase->getAlignment(),
                                   false/*isVolatile*/, true/*ReadMem*/,
                                   false/*WriteMem*/);
-      DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
+      for (auto *LD : Loads)
+        DAG.makeEquivalentMemoryOrdering(LD, ResNode);
       return DAG.getBitcast(VT, ResNode);
     }
   }
Index: llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -409,6 +409,124 @@
   ret <4 x i32> %res3
 }
 
+define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_23u5_inc2:
+; SSE:       # BB#0:
+; SSE-NEXT:    movups 8(%rdi), %xmm0
+; SSE-NEXT:    incl 8(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: merge_4i32_i32_23u5_inc2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovups 8(%rdi), %xmm0
+; AVX-NEXT:    incl 8(%rdi)
+; AVX-NEXT:    retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Lcfi6:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Lcfi7:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Lcfi8:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -12
+; X32-SSE1-NEXT:  .Lcfi9:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 8(%ecx), %edx
+; X32-SSE1-NEXT:    movl 12(%ecx), %esi
+; X32-SSE1-NEXT:    leal 1(%edx), %edi
+; X32-SSE1-NEXT:    movl %edi, 8(%ecx)
+; X32-SSE1-NEXT:    movl 20(%ecx), %ecx
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    movl %ecx, 12(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movups 8(%eax), %xmm0
+; X32-SSE41-NEXT:    incl 8(%eax)
+; X32-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
+  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
+  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
+  %val0 = load i32, i32* %ptr0
+  %inc = add i32 %val0, 1
+  store i32 %inc, i32* %ptr0
+  %val1 = load i32, i32* %ptr1
+  %val3 = load i32, i32* %ptr3
+  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
+  ret <4 x i32> %res3
+}
+
+define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_23u5_inc3:
+; SSE:       # BB#0:
+; SSE-NEXT:    movups 8(%rdi), %xmm0
+; SSE-NEXT:    incl 12(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: merge_4i32_i32_23u5_inc3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovups 8(%rdi), %xmm0
+; AVX-NEXT:    incl 12(%rdi)
+; AVX-NEXT:    retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Lcfi10:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Lcfi11:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Lcfi12:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -12
+; X32-SSE1-NEXT:  .Lcfi13:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 8(%ecx), %edx
+; X32-SSE1-NEXT:    movl 12(%ecx), %esi
+; X32-SSE1-NEXT:    leal 1(%esi), %edi
+; X32-SSE1-NEXT:    movl %edi, 12(%ecx)
+; X32-SSE1-NEXT:    movl 20(%ecx), %ecx
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    movl %ecx, 12(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movups 8(%eax), %xmm0
+; X32-SSE41-NEXT:    incl 12(%eax)
+; X32-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
+  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
+  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
+  %val0 = load i32, i32* %ptr0
+  %val1 = load i32, i32* %ptr1
+  %inc = add i32 %val1, 1
+  store i32 %inc, i32* %ptr1
+  %val3 = load i32, i32* %ptr3
+  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
+  ret <4 x i32> %res3
+}
+
 define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_4i32_i32_3zuu:
 ; SSE:       # BB#0:
@@ -513,6 +631,118 @@
   ret <4 x i32> %res1
 }
 
+define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_45zz_inc4:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    incl 16(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: merge_4i32_i32_45zz_inc4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    incl 16(%rdi)
+; AVX-NEXT:    retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Lcfi14:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Lcfi15:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Lcfi16:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -12
+; X32-SSE1-NEXT:  .Lcfi17:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 16(%ecx), %edx
+; X32-SSE1-NEXT:    movl 20(%ecx), %esi
+; X32-SSE1-NEXT:    leal 1(%edx), %edi
+; X32-SSE1-NEXT:    movl %edi, 16(%ecx)
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    movl $0, 12(%eax)
+; X32-SSE1-NEXT:    movl $0, 8(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    incl 16(%eax)
+; X32-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
+  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
+  %val0 = load i32, i32* %ptr0
+  %inc = add i32 %val0, 1
+  store i32 %inc, i32* %ptr0
+  %val1 = load i32, i32* %ptr1
+  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+  ret <4 x i32> %res1
+}
+
+define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_45zz_inc5:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    incl 20(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: merge_4i32_i32_45zz_inc5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    incl 20(%rdi)
+; AVX-NEXT:    retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Lcfi18:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Lcfi19:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Lcfi20:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -12
+; X32-SSE1-NEXT:  .Lcfi21:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 16(%ecx), %edx
+; X32-SSE1-NEXT:    movl 20(%ecx), %esi
+; X32-SSE1-NEXT:    leal 1(%esi), %edi
+; X32-SSE1-NEXT:    movl %edi, 20(%ecx)
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    movl $0, 12(%eax)
+; X32-SSE1-NEXT:    movl $0, 8(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    incl 20(%eax)
+; X32-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
+  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
+  %val0 = load i32, i32* %ptr0
+  %val1 = load i32, i32* %ptr1
+  %inc = add i32 %val1, 1
+  store i32 %inc, i32* %ptr1
+  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+  ret <4 x i32> %res1
+}
+
 define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_8i16_i16_23u567u9:
 ; SSE:       # BB#0:
@@ -527,14 +757,14 @@
 ; X32-SSE1-LABEL: merge_8i16_i16_23u567u9:
 ; X32-SSE1:       # BB#0:
 ; X32-SSE1-NEXT:    pushl %edi
-; X32-SSE1-NEXT:  .Lcfi6:
+; X32-SSE1-NEXT:  .Lcfi22:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
 ; X32-SSE1-NEXT:    pushl %esi
-; X32-SSE1-NEXT:  .Lcfi7:
+; X32-SSE1-NEXT:  .Lcfi23:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT:  .Lcfi8:
+; X32-SSE1-NEXT:  .Lcfi24:
 ; X32-SSE1-NEXT:    .cfi_offset %esi, -12
-; X32-SSE1-NEXT:  .Lcfi9:
+; X32-SSE1-NEXT:  .Lcfi25:
 ; X32-SSE1-NEXT:    .cfi_offset %edi, -8
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -667,24 +897,24 @@
 ; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
 ; X32-SSE1:       # BB#0:
 ; X32-SSE1-NEXT:    pushl %ebp
-; X32-SSE1-NEXT:  .Lcfi10:
+; X32-SSE1-NEXT:  .Lcfi26:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
 ; X32-SSE1-NEXT:    pushl %ebx
-; X32-SSE1-NEXT:  .Lcfi11:
+; X32-SSE1-NEXT:  .Lcfi27:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
 ; X32-SSE1-NEXT:    pushl %edi
-; X32-SSE1-NEXT:  .Lcfi12:
+; X32-SSE1-NEXT:  .Lcfi28:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 16
 ; X32-SSE1-NEXT:    pushl %esi
-; X32-SSE1-NEXT:  .Lcfi13:
+; X32-SSE1-NEXT:  .Lcfi29:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 20
-; X32-SSE1-NEXT:  .Lcfi14:
+; X32-SSE1-NEXT:  .Lcfi30:
 ; X32-SSE1-NEXT:    .cfi_offset %esi, -20
-; X32-SSE1-NEXT:  .Lcfi15:
+; X32-SSE1-NEXT:  .Lcfi31:
 ; X32-SSE1-NEXT:    .cfi_offset %edi, -16
-; X32-SSE1-NEXT:  .Lcfi16:
+; X32-SSE1-NEXT:  .Lcfi32:
 ; X32-SSE1-NEXT:    .cfi_offset %ebx, -12
-; X32-SSE1-NEXT:  .Lcfi17:
+; X32-SSE1-NEXT:  .Lcfi33:
 ; X32-SSE1-NEXT:    .cfi_offset %ebp, -8
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -917,14 +1147,14 @@
 ; X32-SSE1-LABEL: merge_2i64_i64_12_volatile:
 ; X32-SSE1:       # BB#0:
 ; X32-SSE1-NEXT:    pushl %edi
-; X32-SSE1-NEXT:  .Lcfi18:
+; X32-SSE1-NEXT:  .Lcfi34:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
 ; X32-SSE1-NEXT:    pushl %esi
-; X32-SSE1-NEXT:  .Lcfi19:
+; X32-SSE1-NEXT:  .Lcfi35:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT:  .Lcfi20:
+; X32-SSE1-NEXT:  .Lcfi36:
 ; X32-SSE1-NEXT:    .cfi_offset %esi, -12
-; X32-SSE1-NEXT:  .Lcfi21:
+; X32-SSE1-NEXT:  .Lcfi37:
 ; X32-SSE1-NEXT:    .cfi_offset %edi, -8
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
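
Note (not part of the patch): the point of tracking every constituent load above is that, once the element loads are folded into one wide load, any store chained after one of those element loads must also stay ordered after the new wide load, not only stores chained after the base load. A minimal illustrative sketch of that hazard pattern, mirroring the new tests (the function name @merge_2i32_store_between is made up for illustration):

; Hypothetical reduced case: %v0 and %v1 may be merged into a single 64-bit
; load, and the store of %inc through %p1 must remain ordered after that
; merged load. That is only guaranteed if the chain of the %v1 load (not just
; the base %v0 load) is made equivalent to the new load's chain.
define <2 x i32> @merge_2i32_store_between(i32* %p) {
  %p0 = getelementptr inbounds i32, i32* %p, i64 0
  %p1 = getelementptr inbounds i32, i32* %p, i64 1
  %v0 = load i32, i32* %p0
  %v1 = load i32, i32* %p1
  %inc = add i32 %v1, 1
  store i32 %inc, i32* %p1
  %r0 = insertelement <2 x i32> undef, i32 %v0, i32 0
  %r1 = insertelement <2 x i32> %r0, i32 %v1, i32 1
  ret <2 x i32> %r1
}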