This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Transform truncation from v8i32/v16i32 to v8i8/v16i8 into bitand and X86ISD::PACKUS operations during DAG combine.
ClosedPublic

Authored by congh on Nov 11 2015, 2:11 PM.

Details

Summary

This patch transforms truncations from v8i32/v16i32 to v8i8/v16i8 into bitand and X86ISD::PACKUS operations during DAG combine. We don't do this in the lowering phase because after type legalization the original truncation is turned into a BUILD_VECTOR whose elements are each extracted from a vector and then truncated individually, and it is difficult to perform this optimization on that form. This greatly improves the performance of these two truncations. For example, for the following IR:

define void @truncate_v16i32_to_v16i8(<16 x i32> %a) {

%1 = trunc <16 x i32> %a to <16 x i8>
store <16 x i8> %1, <16 x i8>* undef, align 4
ret void

}

Previously on SSE2 this was compiled into 33 instructions:

movdqa %xmm3, -24(%rsp)
movdqa %xmm1, -56(%rsp)
movdqa %xmm2, -40(%rsp)
movdqa %xmm0, -72(%rsp)
punpcklbw %xmm3, %xmm1
punpcklbw %xmm2, %xmm0
punpcklbw %xmm1, %xmm0
movd -20(%rsp), %xmm1
movd -52(%rsp), %xmm2
movd -16(%rsp), %xmm3
movd -48(%rsp), %xmm4
punpcklbw %xmm3, %xmm4
movd -36(%rsp), %xmm3
movd -68(%rsp), %xmm5
movd -32(%rsp), %xmm6
movd -64(%rsp), %xmm7
punpcklbw %xmm6, %xmm7
punpcklbw %xmm4, %xmm7
punpcklbw %xmm7, %xmm0
punpcklbw %xmm1, %xmm2
punpcklbw %xmm3, %xmm5
punpcklbw %xmm2, %xmm5
movd -12(%rsp), %xmm1
movd -44(%rsp), %xmm2
punpcklbw %xmm1, %xmm2
movd -28(%rsp), %xmm1
movd -60(%rsp), %xmm3
punpcklbw %xmm1, %xmm3
punpcklbw %xmm2, %xmm3
punpcklbw %xmm3, %xmm5
punpcklbw %xmm5, %xmm0
movdqu %xmm0, (%rax)
retq

and now it is compiled into 10 instructions:

movdqa LCPI0_0(%rip), %xmm4
pand %xmm4, %xmm3
pand %xmm4, %xmm2
packuswb %xmm3, %xmm2
pand %xmm4, %xmm1
pand %xmm4, %xmm0
packuswb %xmm1, %xmm0
packuswb %xmm2, %xmm0
movdqu %xmm0, (%rax)
retq

which saves 22 instructions (many of them memory operations).
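The pand + packuswb sequence above works because masking each i32 lane to its low byte guarantees that every intermediate word fits in 8 bits, so the unsigned-saturating packs never clamp. The following scalar C++ sketch models that reasoning on lane values only (it ignores how PACKUSWB interleaves its two register operands, and the function names are illustrative, not from the patch):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar PACKUSWB semantics: unsigned-saturating word -> byte narrow.
static uint8_t packus(uint16_t w) { return w > 0xFF ? 0xFF : uint8_t(w); }

// Scalar model of the combine: pand with 0xFF, then two PACKUSWB stages.
std::vector<uint8_t> trunc32to8(const std::vector<uint32_t> &in) {
  // Stage 0: pand - each i32 lane becomes 0x000000vv.
  // Viewed as i16 words, lane i contributes the pair {vv, 0}.
  std::vector<uint16_t> words;
  for (uint32_t v : in) {
    uint32_t m = v & 0xFFu;
    words.push_back(uint16_t(m));       // low word: vv
    words.push_back(uint16_t(m >> 16)); // high word: 0
  }
  // Stage 1: PACKUSWB - words to bytes, producing {vv, 0} byte pairs.
  std::vector<uint8_t> bytes1;
  for (uint16_t w : words)
    bytes1.push_back(packus(w));
  // Re-read the {vv, 0} byte pairs as little-endian i16 words: 0x00vv.
  std::vector<uint16_t> words2;
  for (std::size_t i = 0; i < bytes1.size(); i += 2)
    words2.push_back(uint16_t(bytes1[i] | (bytes1[i + 1] << 8)));
  // Stage 2: PACKUSWB again - yields the final truncated bytes.
  std::vector<uint8_t> out;
  for (uint16_t w : words2)
    out.push_back(packus(w));
  return out;
}
```

Because the mask runs first, the saturation in PACKUS is never triggered, which is exactly what makes a saturating pack usable as a plain truncation here.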

Diff Detail

Event Timeline

congh updated this revision to Diff 39971.Nov 11 2015, 2:11 PM
congh retitled this revision from to [X86][SSE] Transform truncation from v8i32/v16i32 to v8i8/v16i8 into bitand and X86ISD::PACKUS operations during DAG combine..
congh updated this object.
congh added reviewers: hfinkel, dexonsmith, RKSimon, davidxl.
congh added a subscriber: llvm-commits.
RKSimon edited edge metadata.Nov 14 2015, 7:34 AM

I've added the current codegen to the vector-trunc.ll tests for comparison so please can you rebase against that?

I wonder if it would be better to combine to bitcast/shuffle pairs instead of specific X86ISD nodes? And then focus on improving the existing shuffle lowering with PACKUS (e.g. I don't think we're making use of PACKUSDW at all yet).

> I've added the current codegen to the vector-trunc.ll tests for comparison so please can you rebase against that?

I have rebased my repo and all tests pass with this patch.

> I wonder if it would be better to combine to bitcast/shuffle pairs instead of specific X86ISD nodes? And then focus on improving the existing shuffle lowering with PACKUS (e.g. I don't think we're making use of PACKUSDW at all yet).

Without this patch, the truncation from v16i32 to v16i8 is converted after type legalization into many extract_vector_elt nodes, scalar truncations, and a BUILD_VECTOR, which is very difficult to lower into PACKUS (at least the lowering is far from elegant). So what you are suggesting is first combining it into a shuffle of a v64i8 that is bitcast from v16i32, right? I need to try this method and see if the instructions are easy to lower after type legalization.
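For illustration, the bitcast/shuffle view of this truncation is straightforward on a little-endian target: trunc <16 x i32> to <16 x i8> keeps byte 0 of every 4-byte lane, i.e. it is a shuffle of the v64i8 bitcast with mask <0, 4, 8, ..., 60>. A scalar C++ sketch (a hypothetical helper, not code from the patch):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// On a little-endian target, trunc <16 x i32> -> <16 x i8> is the same as
// bitcasting to <64 x i8> and shuffling out bytes 0, 4, 8, ..., 60.
void trunc_as_shuffle(const uint32_t in[16], uint8_t out[16]) {
  uint8_t bytes[64];
  std::memcpy(bytes, in, sizeof(bytes)); // the bitcast to v64i8
  for (int i = 0; i < 16; ++i)
    out[i] = bytes[4 * i]; // shuffle mask <0, 4, 8, ..., 60>
}
```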

I have attempted to combine the truncation from v16i32 to v16i8 into bitcasts and shuffles, and found that after type legalization they are still converted into extract_vector_elt nodes and a BUILD_VECTOR (shown below). In order to do the same optimization, we would have to match the pattern again on those nodes, and I think the resulting code would be tedious. So why not do the same thing earlier with a simpler approach?

t0: ch = EntryToken
  t6: v4i32,ch = CopyFromReg t0, Register:v4i32 %vreg2
t33: v16i8 = bitcast t6
  t8: v4i32,ch = CopyFromReg t0, Register:v4i32 %vreg3
t34: v16i8 = bitcast t8
  t2: v4i32,ch = CopyFromReg t0, Register:v4i32 %vreg0
t35: v16i8 = bitcast t2
  t4: v4i32,ch = CopyFromReg t0, Register:v4i32 %vreg1
t36: v16i8 = bitcast t4
      t37: i8 = extract_vector_elt t35, Constant:i64<0>
      t39: i8 = extract_vector_elt t35, Constant:i64<4>
      t41: i8 = extract_vector_elt t35, Constant:i64<8>
      t43: i8 = extract_vector_elt t35, Constant:i64<12>
      t44: i8 = extract_vector_elt t36, Constant:i64<0>
      t45: i8 = extract_vector_elt t36, Constant:i64<4>
      t46: i8 = extract_vector_elt t36, Constant:i64<8>
      t47: i8 = extract_vector_elt t36, Constant:i64<12>
      t48: i8 = extract_vector_elt t33, Constant:i64<0>
      t49: i8 = extract_vector_elt t33, Constant:i64<4>
      t50: i8 = extract_vector_elt t33, Constant:i64<8>
      t51: i8 = extract_vector_elt t33, Constant:i64<12>
      t52: i8 = extract_vector_elt t34, Constant:i64<0>
      t53: i8 = extract_vector_elt t34, Constant:i64<4>
      t54: i8 = extract_vector_elt t34, Constant:i64<8>
      t55: i8 = extract_vector_elt t34, Constant:i64<12>
    t56: v16i8 = BUILD_VECTOR t37, t39, t41, t43, t44, t45, t46, t47, t48, t49, t50, t51, t52, t53, t54, t55

> I've added the current codegen to the vector-trunc.ll tests for comparison so please can you rebase against that?

> I have rebased my repo and all tests pass with this patch.

Your patch still doesn't appear to be based on rL253132

congh updated this revision to Diff 40334.Nov 16 2015, 1:36 PM
congh edited edge metadata.

Update the patch after merging with trunk.

congh added a comment.Nov 16 2015, 1:37 PM

> I've added the current codegen to the vector-trunc.ll tests for comparison so please can you rebase against that?

> I have rebased my repo and all tests pass with this patch.

> Your patch still doesn't appear to be based on rL253132

Ah, sorry for forgetting to update the patch. Updated.

congh updated this revision to Diff 40335.Nov 16 2015, 1:39 PM

Fix commented code.

I think you need to rebase after rL253952

You should be able to support packing with packssdw as well - it would allow us to truncate vXi32 -> vXi16 on SSE2 targets

test/CodeGen/X86/vector-trunc.ll
366–367

Aren't the SSE2/SSSE3/SSE41 checks the same now? Shouldn't they just use SSE-CHECK? Try to use update_llc_test_checks.py if you can.

congh updated this revision to Diff 41113.Nov 25 2015, 12:13 AM

Update the patch according to Simon's comments.

The optimization in this patch now covers more truncations, from vXi32/vXi64 to vXi8/vXi16 for X >= 8.

> Update the patch according to Simon's comments.

> The optimization in this patch now covers more truncations, from vXi32/vXi64 to vXi8/vXi16 for X >= 8.

Agree

> I think you need to rebase after rL253952

> You should be able to support packing with packssdw as well - it would allow us to truncate vXi32 -> vXi16 on SSE2 targets

Agree. I have updated the patch to support this truncation as well. PTAL.

test/CodeGen/X86/vector-trunc.ll
366–367

Thanks so much for the advice to use update_llc_test_checks.py! It automatically combined the SSE2/SSSE3/SSE41 tests into SSE-only checks.

Please can you rebase this - your PAVG patches will have affected PerformTRUNCATECombine etc.

> Please can you rebase this - your PAVG patches will have affected PerformTRUNCATECombine etc.

OK. I just found that my use of PACKSS in this patch is incorrect. Before using PACKSS, I need to use a shift-left followed by an arithmetic shift-right to set the upper bits correctly for negative values. I will fix it soon.

congh updated this revision to Diff 41368.Nov 30 2015, 12:26 AM

Fix the bug when truncating v4i32 to v2i16 using PACKSS: we need to set the proper sign bits by using a shift-left followed by an arithmetic shift-right.

When truncating from v2i64, first use SHUFP to extract the lower i32 parts of 2 x v2i64 into 1 x v4i32.
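To illustrate the fix: PACKSS narrows with signed saturation, so feeding it a raw i32 lane would clamp any value outside [-32768, 32767] instead of truncating it. Pre-conditioning each lane with shl 16 + sar 16 replaces the lane with the sign-extension of its low 16 bits, a value the saturating pack then passes through unchanged. A scalar C++ sketch of the semantics (not the patch's actual DAG code):

```cpp
#include <cassert>
#include <cstdint>

// Scalar PACKSSDW semantics: signed-saturating i32 -> i16 narrow.
static int16_t packss(int32_t v) {
  if (v > INT16_MAX) return INT16_MAX;
  if (v < INT16_MIN) return INT16_MIN;
  return int16_t(v);
}

// Truncation via PACKSS: first sign-extend the low 16 bits in place with
// shl 16 + sar 16 so the saturating pack cannot clamp the value.
int16_t trunc_via_packss(int32_t v) {
  int32_t conditioned = int32_t(uint32_t(v) << 16) >> 16; // shl then sar
  return packss(conditioned);
}
```

Without the conditioning step, packss(0x0001FFFF) would clamp to 32767, whereas the intended truncation of the low 16 bits (0xFFFF) is -1.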

> Please can you rebase this - your PAVG patches will have affected PerformTRUNCATECombine etc.

I have rebased my patch and also fixed the bug in the patch (see the description of the patch update). PTAL. Thanks!

RKSimon added inline comments.Nov 30 2015, 5:36 AM
lib/Target/X86/X86ISelLowering.cpp
26228

I understand that packus(ymm) won't do what we want, but won't AVX2 still benefit in cases where packus(xmm) is used? Why not just early out if (!hasSSE2 || VT.getSizeInBits() > 128)?

26229

Please can you add AVX512 as a test target to prove that it's using vpmovdb etc.

26268

Please don't use domain switches, they can cause massive pipeline stalls. Why not just use DAG.getVectorShuffle()?

congh added inline comments.Nov 30 2015, 12:23 PM
lib/Target/X86/X86ISelLowering.cpp
26228

I have tested the instructions generated on AVX2 using your suggested method and found that we can save one instruction for the v16i32 to v16i8 truncation. However, in this case we could still use ymm to get better results: what we need to do is a packus on ymm followed by a shuffle of the result. This saves more than 40% of the instructions. I think this can be a follow-up patch.

26229

OK. I have updated the test file.

26268

Thanks for pointing out this problem! Unfortunately, there is no integer equivalent of shufps. Therefore, I could not find a better solution for the vXi64 to vXi16 truncation and have removed this case from the patch.

congh updated this revision to Diff 41424.Nov 30 2015, 12:29 PM

Stop using SHUFP, as it is a floating-point instruction and can cause a domain switch.

Also update the test case by adding AVX512 tests and the v16i64 to v16i8 truncation.

Almost there I think - just a suggestion for the trunc8i64_8i8 test

test/CodeGen/X86/vector-trunc.ll
337

Please can you change this to a store of <8 x i8> instead? Legalizing an <8 x i8> return means that it's technically an <8 x i16> with undef upper bytes - which makes the truncation shuffle code harder to track, especially pre-SSE41.

congh updated this revision to Diff 42356.Dec 9 2015, 4:49 PM

Update a test case as suggested by Simon.

congh updated this revision to Diff 42357.Dec 9 2015, 4:50 PM

The previous update is incorrect. Update it again.

congh added inline comments.Dec 9 2015, 4:52 PM
test/CodeGen/X86/vector-trunc.ll
337

You are right. I have updated this test case.

Ping again?

RKSimon accepted this revision.Dec 19 2015, 9:01 AM
RKSimon edited edge metadata.

LGTM with a couple of (optional) minor corrections.

lib/Target/X86/X86ISelLowering.cpp
26141

Maybe use the same naming convention for the out / in VTs?

OutVT + OutSVT
InVT + InSVT

Makes it easier to track.

26146

Can you please assert that OutSVT is i8 or i16 before this? Maybe use APInt to create the mask?

test/CodeGen/X86/vector-trunc.ll
156

It's a pity that the SSSE3/SSE41 codegen is still so poor - can you add a FIXME explaining what still needs doing?

This revision is now accepted and ready to land.Dec 19 2015, 9:01 AM
congh marked 2 inline comments as done.Dec 21 2015, 11:38 AM

Thanks a lot for the review, Simon!

test/CodeGen/X86/vector-trunc.ll
156

I'll adjust the conditions to let this SSSE3/SSE41 codegen be as good as SSE2.

This revision was automatically updated to reflect the committed changes.