This is an archive of the discontinued LLVM Phabricator instance.

[X86] Fix chains update when lowering BUILD_VECTOR to a vector load
ClosedPublic

Authored by apilipenko on Oct 4 2017, 9:44 AM.

Download Raw Diff

Details

Reviewers

RKSimon
craig.topper
spatel
niravd

Commits

rG7b15254c8fe0: [X86] Fix chains update when lowering BUILD_VECTOR to a vector load
rL314988: [X86] Fix chains update when lowering BUILD_VECTOR to a vector load

Summary

The code which lowers BUILD_VECTOR of consecutive loads into a single vector load doesn't update chains properly. As a result the vector load can be reordered with the store to the same location.

Consider the test case.

define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline ssp {
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %inc = add i32 %val1, 1
  store i32 %inc, i32* %ptr1
  %val3 = load i32, i32* %ptr3
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
  ret <4 x i32> %res3
}

llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 -debug-only=isel

Type-legalized selection DAG: BB#0 'merge_4i32_i32_23u5_inc3:'

SelectionDAG has 22 nodes:
  t0: ch = EntryToken
  t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0
  t6: i64 = add t2, Constant:i64<12>
  t11: i32,ch = load<LD4[%ptr1]> t0, t6, undef:i64
      t13: i32 = add t11, Constant:i32<1>
    t14: ch = store<ST4[%ptr1]> t11:1, t13, t6, undef:i64
        t4: i64 = add t2, Constant:i64<8>
      t35: i32,ch = load<LD4[%ptr0]> t0, t4, undef:i64
        t8: i64 = add t2, Constant:i64<20>
      t33: i32,ch = load<LD4[%ptr3]> t0, t8, undef:i64
    t32: v4i32 = BUILD_VECTOR t35, t11, undef:i32, t33
  t27: ch,glue = CopyToReg t14, Register:v4i32 %XMM0, t32
  t28: ch = X86ISD::RET_FLAG t27, TargetConstant:i32<0>, Register:v4i32 %XMM0, t27:1

Legalized selection DAG: BB#0 'merge_4i32_i32_23u5_inc3:'

SelectionDAG has 18 nodes:
  t0: ch = EntryToken
  t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0
  t6: i64 = add t2, Constant:i64<12>
  t11: i32,ch = load<LD4[%ptr1]> t0, t6, undef:i64
      t13: i32 = add t11, Constant:i32<1>
    t14: ch = store<ST4[%ptr1]> t11:1, t13, t6, undef:i64
        t4: i64 = add t2, Constant:i64<8>
      t38: v2i64,ch = load<LD16[%ptr0](align=4)> t0, t4, undef:i64
    t39: v4i32 = bitcast t38
  t27: ch,glue = CopyToReg t14, Register:v4i32 %XMM0, t39
  t28: ch = X86ISD::RET_FLAG t27, TargetConstant:i32<0>, Register:v4i32 %XMM0, t27:1

In the last DAG there are two chains:
t0 - t11 (scalar load) - t14 (scalar store)
t0 - t38 (vector load)

There is no ordering constraint between t38 (vector load) and t14 (scalar store), so they can be reordered. That's what happens in the final assembly:

# BB#0:
  incl  12(%rdi)
  movups  8(%rdi), %xmm0
  retq

Here the scalar store happens before the vector load. The vector load therefore reads the already-incremented value at location 12(%rdi), whereas in the original program it uses the pre-increment value.

The current code in EltsFromConsecutiveLoads tries to update the chains but fails because it only updates the chain following the first load. Had the incremented location been the first load, we wouldn't miscompile. The fix is to update the chains following all the loads comprising the vector.

Diff Detail

Repository: rL LLVM

Event Timeline

apilipenko created this revision. · Oct 4 2017, 9:44 AM

Is this PR10114 ?

RKSimon added a reviewer: niravd. · Oct 4 2017, 9:48 AM

In D38547#888359, @RKSimon wrote:

Is this PR10114 ?

The problem was found in our internal testing. But it looks like PR10114 describes the same problem.

This is the same fix as in D18663 (which is now way out of date) but with a test case! This should be all of PR10114 and we can finally mark it fixed.

LGTM.

This revision is now accepted and ready to land. · Oct 4 2017, 10:42 AM

Closed by commit rL314988: [X86] Fix chains update when lowering BUILD_VECTOR to a vector load (authored by apilipenko). · Explain Why · Oct 5 2017, 9:30 AM

This revision was automatically updated to reflect the committed changes.

RKSimon mentioned this in D18663: Cleanup Chain Handling in X86ISelLowering. · Oct 6 2017, 2:22 AM

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

13 lines

test/

CodeGen/

X86/

merge-consecutive-loads-128.ll

262 lines

Diff 117839

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,534 Lines • ▼ Show 20 Lines	if (LoadMask[i]) {
IsConsecutiveLoadWithZeros = false;		IsConsecutiveLoadWithZeros = false;
break;		break;
}		}
} else if (ZeroMask[i]) {		} else if (ZeroMask[i]) {
IsConsecutiveLoad = false;		IsConsecutiveLoad = false;
}		}
}		}

auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {		SmallVector<LoadSDNode *, 8> Loads;
		for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
		if (LoadMask[i])
		Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));

		auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();		auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&		assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
"Cannot merge volatile loads.");		"Cannot merge volatile loads.");
SDValue NewLd =		SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),		DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);		LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);		for (auto *LD : Loads)
		DAG.makeEquivalentMemoryOrdering(LD, NewLd);
return NewLd;		return NewLd;
};		};

// LOAD - all consecutive load/undefs (must start/end with a load).		// LOAD - all consecutive load/undefs (must start/end with a load).
// If we have found an entire vector of loads and undefs, then return a large		// If we have found an entire vector of loads and undefs, then return a large
// load of the entire vector width starting at the base pointer.		// load of the entire vector width starting at the base pointer.
// If the vector contains zeros, then attempt to shuffle those elements.		// If the vector contains zeros, then attempt to shuffle those elements.
if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&		if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines	if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);		SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };		SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =		SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,		DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
LDBase->getPointerInfo(),		LDBase->getPointerInfo(),
LDBase->getAlignment(),		LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,		false/*isVolatile*/, true/*ReadMem*/,
false/*WriteMem*/);		false/*WriteMem*/);
DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);		for (auto *LD : Loads)
		DAG.makeEquivalentMemoryOrdering(LD, ResNode);
return DAG.getBitcast(VT, ResNode);		return DAG.getBitcast(VT, ResNode);
}		}
}		}

return SDValue();		return SDValue();
}		}

static Constant *getConstantVector(MVT VT, const APInt &SplatValue,		static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
▲ Show 20 Lines • Show All 30,774 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll

Show First 20 Lines • Show All 403 Lines • ▼ Show 20 Lines	; X32-SSE41-NEXT: retl
%val1 = load i32, i32* %ptr1		%val1 = load i32, i32* %ptr1
%val3 = load i32, i32* %ptr3		%val3 = load i32, i32* %ptr3
%res0 = insertelement <4 x i32> undef, i32 %val0, i32 0		%res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
%res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1		%res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
%res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3		%res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
ret <4 x i32> %res3		ret <4 x i32> %res3
}		}

		define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline ssp {
		; SSE-LABEL: merge_4i32_i32_23u5_inc2:
		; SSE: # BB#0:
		; SSE-NEXT: movups 8(%rdi), %xmm0
		; SSE-NEXT: incl 8(%rdi)
		; SSE-NEXT: retq
		;
		; AVX-LABEL: merge_4i32_i32_23u5_inc2:
		; AVX: # BB#0:
		; AVX-NEXT: vmovups 8(%rdi), %xmm0
		; AVX-NEXT: incl 8(%rdi)
		; AVX-NEXT: retq
		;
		; X32-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
		; X32-SSE1: # BB#0:
		; X32-SSE1-NEXT: pushl %edi
		; X32-SSE1-NEXT: .Lcfi6:
		; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
		; X32-SSE1-NEXT: pushl %esi
		; X32-SSE1-NEXT: .Lcfi7:
		; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
		; X32-SSE1-NEXT: .Lcfi8:
		; X32-SSE1-NEXT: .cfi_offset %esi, -12
		; X32-SSE1-NEXT: .Lcfi9:
		; X32-SSE1-NEXT: .cfi_offset %edi, -8
		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
		; X32-SSE1-NEXT: movl 8(%ecx), %edx
		; X32-SSE1-NEXT: movl 12(%ecx), %esi
		; X32-SSE1-NEXT: leal 1(%edx), %edi
		; X32-SSE1-NEXT: movl %edi, 8(%ecx)
		; X32-SSE1-NEXT: movl 20(%ecx), %ecx
		; X32-SSE1-NEXT: movl %esi, 4(%eax)
		; X32-SSE1-NEXT: movl %edx, (%eax)
		; X32-SSE1-NEXT: movl %ecx, 12(%eax)
		; X32-SSE1-NEXT: popl %esi
		; X32-SSE1-NEXT: popl %edi
		; X32-SSE1-NEXT: retl $4
		;
		; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
		; X32-SSE41: # BB#0:
		; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
		; X32-SSE41-NEXT: movups 8(%eax), %xmm0
		; X32-SSE41-NEXT: incl 8(%eax)
		; X32-SSE41-NEXT: retl
		%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
		%ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
		%ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
		%val0 = load i32, i32* %ptr0
		%inc = add i32 %val0, 1
		store i32 %inc, i32* %ptr0
		%val1 = load i32, i32* %ptr1
		%val3 = load i32, i32* %ptr3
		%res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
		%res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
		%res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
		ret <4 x i32> %res3
		}

		define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline ssp {
		; SSE-LABEL: merge_4i32_i32_23u5_inc3:
		; SSE: # BB#0:
		; SSE-NEXT: movups 8(%rdi), %xmm0
		; SSE-NEXT: incl 12(%rdi)
		; SSE-NEXT: retq
		;
		; AVX-LABEL: merge_4i32_i32_23u5_inc3:
		; AVX: # BB#0:
		; AVX-NEXT: vmovups 8(%rdi), %xmm0
		; AVX-NEXT: incl 12(%rdi)
		; AVX-NEXT: retq
		;
		; X32-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
		; X32-SSE1: # BB#0:
		; X32-SSE1-NEXT: pushl %edi
		; X32-SSE1-NEXT: .Lcfi10:
		; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
		; X32-SSE1-NEXT: pushl %esi
		; X32-SSE1-NEXT: .Lcfi11:
		; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
		; X32-SSE1-NEXT: .Lcfi12:
		; X32-SSE1-NEXT: .cfi_offset %esi, -12
		; X32-SSE1-NEXT: .Lcfi13:
		; X32-SSE1-NEXT: .cfi_offset %edi, -8
		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
		; X32-SSE1-NEXT: movl 8(%ecx), %edx
		; X32-SSE1-NEXT: movl 12(%ecx), %esi
		; X32-SSE1-NEXT: leal 1(%esi), %edi
		; X32-SSE1-NEXT: movl %edi, 12(%ecx)
		; X32-SSE1-NEXT: movl 20(%ecx), %ecx
		; X32-SSE1-NEXT: movl %esi, 4(%eax)
		; X32-SSE1-NEXT: movl %edx, (%eax)
		; X32-SSE1-NEXT: movl %ecx, 12(%eax)
		; X32-SSE1-NEXT: popl %esi
		; X32-SSE1-NEXT: popl %edi
		; X32-SSE1-NEXT: retl $4
		;
		; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
		; X32-SSE41: # BB#0:
		; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
		; X32-SSE41-NEXT: movups 8(%eax), %xmm0
		; X32-SSE41-NEXT: incl 12(%eax)
		; X32-SSE41-NEXT: retl
		%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
		%ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
		%ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
		%val0 = load i32, i32* %ptr0
		%val1 = load i32, i32* %ptr1
		%inc = add i32 %val1, 1
		store i32 %inc, i32* %ptr1
		%val3 = load i32, i32* %ptr3
		%res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
		%res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
		%res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
		ret <4 x i32> %res3
		}

define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {		define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_3zuu:		; SSE-LABEL: merge_4i32_i32_3zuu:
; SSE: # BB#0:		; SSE: # BB#0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero		; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX-LABEL: merge_4i32_i32_3zuu:		; AVX-LABEL: merge_4i32_i32_3zuu:
; AVX: # BB#0:		; AVX: # BB#0:
▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines	; X32-SSE41-NEXT: retl
%ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5		%ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
%val0 = load i32, i32* %ptr0		%val0 = load i32, i32* %ptr0
%val1 = load i32, i32* %ptr1		%val1 = load i32, i32* %ptr1
%res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0		%res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
%res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1		%res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
ret <4 x i32> %res1		ret <4 x i32> %res1
}		}

		define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline ssp {
		; SSE-LABEL: merge_4i32_i32_45zz_inc4:
		; SSE: # BB#0:
		; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
		; SSE-NEXT: incl 16(%rdi)
		; SSE-NEXT: retq
		;
		; AVX-LABEL: merge_4i32_i32_45zz_inc4:
		; AVX: # BB#0:
		; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
		; AVX-NEXT: incl 16(%rdi)
		; AVX-NEXT: retq
		;
		; X32-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
		; X32-SSE1: # BB#0:
		; X32-SSE1-NEXT: pushl %edi
		; X32-SSE1-NEXT: .Lcfi14:
		; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
		; X32-SSE1-NEXT: pushl %esi
		; X32-SSE1-NEXT: .Lcfi15:
		; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
		; X32-SSE1-NEXT: .Lcfi16:
		; X32-SSE1-NEXT: .cfi_offset %esi, -12
		; X32-SSE1-NEXT: .Lcfi17:
		; X32-SSE1-NEXT: .cfi_offset %edi, -8
		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
		; X32-SSE1-NEXT: movl 16(%ecx), %edx
		; X32-SSE1-NEXT: movl 20(%ecx), %esi
		; X32-SSE1-NEXT: leal 1(%edx), %edi
		; X32-SSE1-NEXT: movl %edi, 16(%ecx)
		; X32-SSE1-NEXT: movl %esi, 4(%eax)
		; X32-SSE1-NEXT: movl %edx, (%eax)
		; X32-SSE1-NEXT: movl $0, 12(%eax)
		; X32-SSE1-NEXT: movl $0, 8(%eax)
		; X32-SSE1-NEXT: popl %esi
		; X32-SSE1-NEXT: popl %edi
		; X32-SSE1-NEXT: retl $4
		;
		; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
		; X32-SSE41: # BB#0:
		; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
		; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
		; X32-SSE41-NEXT: incl 16(%eax)
		; X32-SSE41-NEXT: retl
		%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
		%ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
		%val0 = load i32, i32* %ptr0
		%inc = add i32 %val0, 1
		store i32 %inc, i32* %ptr0
		%val1 = load i32, i32* %ptr1
		%res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
		%res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
		ret <4 x i32> %res1
		}

		define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline ssp {
		; SSE-LABEL: merge_4i32_i32_45zz_inc5:
		; SSE: # BB#0:
		; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
		; SSE-NEXT: incl 20(%rdi)
		; SSE-NEXT: retq
		;
		; AVX-LABEL: merge_4i32_i32_45zz_inc5:
		; AVX: # BB#0:
		; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
		; AVX-NEXT: incl 20(%rdi)
		; AVX-NEXT: retq
		;
		; X32-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
		; X32-SSE1: # BB#0:
		; X32-SSE1-NEXT: pushl %edi
		; X32-SSE1-NEXT: .Lcfi18:
		; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
		; X32-SSE1-NEXT: pushl %esi
		; X32-SSE1-NEXT: .Lcfi19:
		; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
		; X32-SSE1-NEXT: .Lcfi20:
		; X32-SSE1-NEXT: .cfi_offset %esi, -12
		; X32-SSE1-NEXT: .Lcfi21:
		; X32-SSE1-NEXT: .cfi_offset %edi, -8
		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
		; X32-SSE1-NEXT: movl 16(%ecx), %edx
		; X32-SSE1-NEXT: movl 20(%ecx), %esi
		; X32-SSE1-NEXT: leal 1(%esi), %edi
		; X32-SSE1-NEXT: movl %edi, 20(%ecx)
		; X32-SSE1-NEXT: movl %esi, 4(%eax)
		; X32-SSE1-NEXT: movl %edx, (%eax)
		; X32-SSE1-NEXT: movl $0, 12(%eax)
		; X32-SSE1-NEXT: movl $0, 8(%eax)
		; X32-SSE1-NEXT: popl %esi
		; X32-SSE1-NEXT: popl %edi
		; X32-SSE1-NEXT: retl $4
		;
		; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
		; X32-SSE41: # BB#0:
		; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
		; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
		; X32-SSE41-NEXT: incl 20(%eax)
		; X32-SSE41-NEXT: retl
		%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
		%ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
		%val0 = load i32, i32* %ptr0
		%val1 = load i32, i32* %ptr1
		%inc = add i32 %val1, 1
		store i32 %inc, i32* %ptr1
		%res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
		%res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
		ret <4 x i32> %res1
		}

define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {		define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_23u567u9:		; SSE-LABEL: merge_8i16_i16_23u567u9:
; SSE: # BB#0:		; SSE: # BB#0:
; SSE-NEXT: movups 4(%rdi), %xmm0		; SSE-NEXT: movups 4(%rdi), %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX-LABEL: merge_8i16_i16_23u567u9:		; AVX-LABEL: merge_8i16_i16_23u567u9:
; AVX: # BB#0:		; AVX: # BB#0:
; AVX-NEXT: vmovups 4(%rdi), %xmm0		; AVX-NEXT: vmovups 4(%rdi), %xmm0
; AVX-NEXT: retq		; AVX-NEXT: retq
;		;
; X32-SSE1-LABEL: merge_8i16_i16_23u567u9:		; X32-SSE1-LABEL: merge_8i16_i16_23u567u9:
; X32-SSE1: # BB#0:		; X32-SSE1: # BB#0:
; X32-SSE1-NEXT: pushl %edi		; X32-SSE1-NEXT: pushl %edi
; X32-SSE1-NEXT: .Lcfi6:		; X32-SSE1-NEXT: .Lcfi22:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 8		; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: pushl %esi		; X32-SSE1-NEXT: pushl %esi
; X32-SSE1-NEXT: .Lcfi7:		; X32-SSE1-NEXT: .Lcfi23:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 12		; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
; X32-SSE1-NEXT: .Lcfi8:		; X32-SSE1-NEXT: .Lcfi24:
; X32-SSE1-NEXT: .cfi_offset %esi, -12		; X32-SSE1-NEXT: .cfi_offset %esi, -12
; X32-SSE1-NEXT: .Lcfi9:		; X32-SSE1-NEXT: .Lcfi25:
; X32-SSE1-NEXT: .cfi_offset %edi, -8		; X32-SSE1-NEXT: .cfi_offset %edi, -8
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE1-NEXT: movl 4(%ecx), %edx		; X32-SSE1-NEXT: movl 4(%ecx), %edx
; X32-SSE1-NEXT: movl 10(%ecx), %esi		; X32-SSE1-NEXT: movl 10(%ecx), %esi
; X32-SSE1-NEXT: movzwl 14(%ecx), %edi		; X32-SSE1-NEXT: movzwl 14(%ecx), %edi
; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx		; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx
; X32-SSE1-NEXT: movw %di, 10(%eax)		; X32-SSE1-NEXT: movw %di, 10(%eax)
▲ Show 20 Lines • Show All 116 Lines • ▼ Show 20 Lines
; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:		; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; AVX: # BB#0:		; AVX: # BB#0:
; AVX-NEXT: vmovups (%rdi), %xmm0		; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq		; AVX-NEXT: retq
;		;
; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:		; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; X32-SSE1: # BB#0:		; X32-SSE1: # BB#0:
; X32-SSE1-NEXT: pushl %ebp		; X32-SSE1-NEXT: pushl %ebp
; X32-SSE1-NEXT: .Lcfi10:		; X32-SSE1-NEXT: .Lcfi26:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 8		; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: pushl %ebx		; X32-SSE1-NEXT: pushl %ebx
; X32-SSE1-NEXT: .Lcfi11:		; X32-SSE1-NEXT: .Lcfi27:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 12		; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
; X32-SSE1-NEXT: pushl %edi		; X32-SSE1-NEXT: pushl %edi
; X32-SSE1-NEXT: .Lcfi12:		; X32-SSE1-NEXT: .Lcfi28:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 16		; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
; X32-SSE1-NEXT: pushl %esi		; X32-SSE1-NEXT: pushl %esi
; X32-SSE1-NEXT: .Lcfi13:		; X32-SSE1-NEXT: .Lcfi29:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 20		; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
; X32-SSE1-NEXT: .Lcfi14:		; X32-SSE1-NEXT: .Lcfi30:
; X32-SSE1-NEXT: .cfi_offset %esi, -20		; X32-SSE1-NEXT: .cfi_offset %esi, -20
; X32-SSE1-NEXT: .Lcfi15:		; X32-SSE1-NEXT: .Lcfi31:
; X32-SSE1-NEXT: .cfi_offset %edi, -16		; X32-SSE1-NEXT: .cfi_offset %edi, -16
; X32-SSE1-NEXT: .Lcfi16:		; X32-SSE1-NEXT: .Lcfi32:
; X32-SSE1-NEXT: .cfi_offset %ebx, -12		; X32-SSE1-NEXT: .cfi_offset %ebx, -12
; X32-SSE1-NEXT: .Lcfi17:		; X32-SSE1-NEXT: .Lcfi33:
; X32-SSE1-NEXT: .cfi_offset %ebp, -8		; X32-SSE1-NEXT: .cfi_offset %ebp, -8
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE1-NEXT: movzwl (%ecx), %ebp		; X32-SSE1-NEXT: movzwl (%ecx), %ebp
; X32-SSE1-NEXT: movl 3(%ecx), %esi		; X32-SSE1-NEXT: movl 3(%ecx), %esi
; X32-SSE1-NEXT: movl 7(%ecx), %edi		; X32-SSE1-NEXT: movl 7(%ecx), %edi
; X32-SSE1-NEXT: movzwl 11(%ecx), %ebx		; X32-SSE1-NEXT: movzwl 11(%ecx), %ebx
; X32-SSE1-NEXT: movb 13(%ecx), %dl		; X32-SSE1-NEXT: movb 13(%ecx), %dl
▲ Show 20 Lines • Show All 216 Lines • ▼ Show 20 Lines
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero		; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero		; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]		; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq		; AVX-NEXT: retq
;		;
; X32-SSE1-LABEL: merge_2i64_i64_12_volatile:		; X32-SSE1-LABEL: merge_2i64_i64_12_volatile:
; X32-SSE1: # BB#0:		; X32-SSE1: # BB#0:
; X32-SSE1-NEXT: pushl %edi		; X32-SSE1-NEXT: pushl %edi
; X32-SSE1-NEXT: .Lcfi18:		; X32-SSE1-NEXT: .Lcfi34:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 8		; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: pushl %esi		; X32-SSE1-NEXT: pushl %esi
; X32-SSE1-NEXT: .Lcfi19:		; X32-SSE1-NEXT: .Lcfi35:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 12		; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
; X32-SSE1-NEXT: .Lcfi20:		; X32-SSE1-NEXT: .Lcfi36:
; X32-SSE1-NEXT: .cfi_offset %esi, -12		; X32-SSE1-NEXT: .cfi_offset %esi, -12
; X32-SSE1-NEXT: .Lcfi21:		; X32-SSE1-NEXT: .Lcfi37:
; X32-SSE1-NEXT: .cfi_offset %edi, -8		; X32-SSE1-NEXT: .cfi_offset %edi, -8
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx		; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE1-NEXT: movl 8(%ecx), %edx		; X32-SSE1-NEXT: movl 8(%ecx), %edx
; X32-SSE1-NEXT: movl 12(%ecx), %esi		; X32-SSE1-NEXT: movl 12(%ecx), %esi
; X32-SSE1-NEXT: movl 16(%ecx), %edi		; X32-SSE1-NEXT: movl 16(%ecx), %edi
; X32-SSE1-NEXT: movl 20(%ecx), %ecx		; X32-SSE1-NEXT: movl 20(%ecx), %ecx
; X32-SSE1-NEXT: movl %ecx, 12(%eax)		; X32-SSE1-NEXT: movl %ecx, 12(%eax)
▲ Show 20 Lines • Show All 158 Lines • Show Last 20 Lines