This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Aggressively use PMADDWD for v4i32 multiplies with 17 or more leading zeros
ClosedPublic

Authored by RKSimon on Jan 18 2018, 11:44 AM.

Download Raw Diff

Details

Reviewers

craig.topper
pcordes
zvi
andreadb
spatel

Commits

rG9f551ad60423: [X86][SSE] Aggressively use PMADDWD for v4i32 multiplies with 17 or more…
rL323367: [X86][SSE] Aggressively use PMADDWD for v4i32 multiplies with 17 or more…

Summary

As discussed in D41484, using PMADDWD for 'zero-extended' vXi32 multiplies (operands with 17 or more leading zero bits per element) is nearly always a better option than PMULLD:
On SNB the resulting code isn't any faster, but it isn't any slower either, so we may as well keep it.
On KNL PMADDWD has only half the throughput, so I've disabled the transform there - ideally there'd be a better way to handle this.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon created this revision. · Jan 18 2018, 11:44 AM

craig.topper added inline comments. · Jan 18 2018, 7:47 PM

lib/Target/X86/X86ISelLowering.cpp
32606	Move the APInt inside the second if? Maybe combine the two ifs?
32607	Do you need an SSE2 check on the v4i32?

Merged outer two ifs, added SSE2 check.

We also now have KNL slow-pmulld.ll tests.

ping?

LGTM

This revision is now accepted and ready to land. · Jan 24 2018, 9:55 AM

LGTM with a minor request:

We added a special feature for Silvermont, SlowPMULLD. Can you please add a RUN: config to the tests with SlowPMULLD + SSE4.2 to represent this processor?

RKSimon mentioned this in rL323364: [X86][SSE] Add slow-pmulld attribute (silvermont-style) test. · Jan 24 2018, 11:10 AM

Closed by commit rL323367: [X86][SSE] Aggressively use PMADDWD for v4i32 multiplies with 17 or more… (authored by RKSimon). · Explain Why · Jan 24 2018, 11:24 AM

This revision was automatically updated to reflect the committed changes.

Does this make the equivalent code in LowerMUL dead?

RKSimon mentioned this in rL323540: [X86][SSE] Drop PMADDWD in lowerMul. · Jan 26 2018, 8:59 AM

Revision Contents

Path

Size

lib/

Target/

X86/

	X86ISelLowering.cpp
	X86ISelLowering.cpp (revision 322868)

26 lines

test/

CodeGen/

X86/

	promote.ll
	promote.ll (revision 322877)

4 lines

	shrink_vmul.ll
	shrink_vmul.ll (revision 322868)

64 lines

	slow-pmulld.ll
	slow-pmulld.ll (revision 322874)

192 lines

Diff 130468

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 32,422 Lines • ▼ Show 20 Lines	static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);		SDLoc DL(N);
SDValue N0 = N->getOperand(0);		SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);		SDValue N1 = N->getOperand(1);
EVT VT = N->getOperand(0).getValueType();		EVT VT = N->getOperand(0).getValueType();
unsigned NumElts = VT.getVectorNumElements();		unsigned NumElts = VT.getVectorNumElements();
if ((NumElts % 2) != 0)		if ((NumElts % 2) != 0)
return SDValue();		return SDValue();

// If the upper 17 bits of each element are zero then we can use PMADD.
APInt Mask17 = APInt::getHighBitsSet(32, 17);
if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
DAG.MaskedValueIsZero(N1, Mask17))
return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
DAG.getBitcast(MVT::v8i16, N1));

unsigned RegSize = 128;		unsigned RegSize = 128;
MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);		MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);		EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);

// Shrink the operands of mul.		// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);		SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);		SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);

▲ Show 20 Lines • Show All 155 Lines • ▼ Show 20 Lines
}		}

/// Optimize a single multiply with constant into two operations in order to		/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.		/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,		static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);

		// If the upper 17 bits of each element are zero then we can use PMADD, which
		// is always at least as quick as PMULLD, except on KNL.
		if (Subtarget.getProcFamily() != X86Subtarget::IntelKNL) {
		APInt Mask17 = APInt::getHighBitsSet(32, 17);
		[Inline comment by craig.topper (unsubmitted, not done)]: Move the APInt inside the second if? Maybe combine the two ifs?
		if (VT == MVT::v4i32 \|\| (VT == MVT::v8i32 && Subtarget.hasAVX2()) \|\|
		[Inline comment by craig.topper (unsubmitted, not done)]: Do you need an SSE2 check on the v4i32?
		(VT == MVT::v16i32 && Subtarget.hasBWI())) {
		SDValue N0 = N->getOperand(0);
		SDValue N1 = N->getOperand(1);
		if (DAG.MaskedValueIsZero(N0, Mask17) &&
		DAG.MaskedValueIsZero(N1, Mask17)) {
		unsigned NumElts = VT.getVectorNumElements();
		MVT WVT = MVT::getVectorVT(MVT::i16, 2 * NumElts);
		return DAG.getNode(X86ISD::VPMADDWD, SDLoc(N), VT,
		DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1));
		}
		}
		}

if (DCI.isBeforeLegalize() && VT.isVector())		if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);		return reduceVMULWidth(N, DAG, Subtarget);

if (!MulConstantOptimization)		if (!MulConstantOptimization)
return SDValue();		return SDValue();
// An imul is usually smaller than the alternative sequence.		// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction().optForMinSize())		if (DAG.getMachineFunction().getFunction().optForMinSize())
return SDValue();		return SDValue();
▲ Show 20 Lines • Show All 6,274 Lines • Show Last 20 Lines

test/CodeGen/X86/promote.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mcpu=corei7 \| FileCheck %s --check-prefixes=CHECK,X86			; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mcpu=corei7 \| FileCheck %s --check-prefixes=CHECK,X86
	; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 \| FileCheck %s --check-prefixes=CHECK,X64			; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 \| FileCheck %s --check-prefixes=CHECK,X64

	define i32 @mul_f(<4 x i8>* %A) {			define i32 @mul_f(<4 x i8>* %A) {
	; X86-LABEL: mul_f:			; X86-LABEL: mul_f:
	; X86: # %bb.0: # %entry			; X86: # %bb.0: # %entry
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-NEXT: pmulld %xmm0, %xmm0			; X86-NEXT: pmaddwd %xmm0, %xmm0
	; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]			; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
	; X86-NEXT: movd %xmm0, (%eax)			; X86-NEXT: movd %xmm0, (%eax)
	; X86-NEXT: xorl %eax, %eax			; X86-NEXT: xorl %eax, %eax
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: mul_f:			; X64-LABEL: mul_f:
	; X64: # %bb.0: # %entry			; X64: # %bb.0: # %entry
	; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-NEXT: pmulld %xmm0, %xmm0			; X64-NEXT: pmaddwd %xmm0, %xmm0
	; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]			; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
	; X64-NEXT: movd %xmm0, (%rax)			; X64-NEXT: movd %xmm0, (%rax)
	; X64-NEXT: xorl %eax, %eax			; X64-NEXT: xorl %eax, %eax
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%0 = load <4 x i8>, <4 x i8>* %A, align 8			%0 = load <4 x i8>, <4 x i8>* %A, align 8
	%mul = mul <4 x i8> %0, %0			%mul = mul <4 x i8> %0, %0
	store <4 x i8> %mul, <4 x i8>* undef			store <4 x i8> %mul, <4 x i8>* undef
	▲ Show 20 Lines • Show All 42 Lines • Show Last 20 Lines

test/CodeGen/X86/shrink_vmul.ll

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	; X86-AVX-NEXT: .cfi_def_cfa_offset 8			; X86-AVX-NEXT: .cfi_def_cfa_offset 8
	; X86-AVX-NEXT: .cfi_offset %esi, -8			; X86-AVX-NEXT: .cfi_offset %esi, -8
	; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx			; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-AVX-NEXT: movl c, %esi			; X86-AVX-NEXT: movl c, %esi
	; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0			; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
	; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]			; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)			; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
	; X86-AVX-NEXT: popl %esi			; X86-AVX-NEXT: popl %esi
	; X86-AVX-NEXT: retl			; X86-AVX-NEXT: retl
	;			;
	; X64-SSE-LABEL: mul_2xi8:			; X64-SSE-LABEL: mul_2xi8:
	; X64-SSE: # %bb.0: # %entry			; X64-SSE: # %bb.0: # %entry
	; X64-SSE-NEXT: movq {{.*}}(%rip), %rax			; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
	Show All 9 Lines
	; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)			; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
	; X64-SSE-NEXT: retq			; X64-SSE-NEXT: retq
	;			;
	; X64-AVX-LABEL: mul_2xi8:			; X64-AVX-LABEL: mul_2xi8:
	; X64-AVX: # %bb.0: # %entry			; X64-AVX: # %bb.0: # %entry
	; X64-AVX-NEXT: movq {{.*}}(%rip), %rax			; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
	; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0			; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
	; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]			; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)			; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
	; X64-AVX-NEXT: retq			; X64-AVX-NEXT: retq
	entry:			entry:
	%pre = load i32, i32* @c			%pre = load i32, i32* @c
	%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index			%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
	%tmp7 = bitcast i8* %tmp6 to <2 x i8>*			%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
	%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1			%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
	▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
	; X86-AVX-NEXT: .cfi_def_cfa_offset 8			; X86-AVX-NEXT: .cfi_def_cfa_offset 8
	; X86-AVX-NEXT: .cfi_offset %esi, -8			; X86-AVX-NEXT: .cfi_offset %esi, -8
	; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx			; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-AVX-NEXT: movl c, %esi			; X86-AVX-NEXT: movl c, %esi
	; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0			; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
	; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)			; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
	; X86-AVX-NEXT: popl %esi			; X86-AVX-NEXT: popl %esi
	; X86-AVX-NEXT: retl			; X86-AVX-NEXT: retl
	;			;
	; X64-SSE-LABEL: mul_4xi8:			; X64-SSE-LABEL: mul_4xi8:
	; X64-SSE: # %bb.0: # %entry			; X64-SSE: # %bb.0: # %entry
	; X64-SSE-NEXT: movq {{.*}}(%rip), %rax			; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
	; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero			; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; X64-SSE-NEXT: pxor %xmm1, %xmm1			; X64-SSE-NEXT: pxor %xmm1, %xmm1
	; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]			; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
	; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]			; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
	; X64-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero			; X64-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]			; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
	; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]			; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
	; X64-SSE-NEXT: pmaddwd %xmm0, %xmm2			; X64-SSE-NEXT: pmaddwd %xmm0, %xmm2
	; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)			; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
	; X64-SSE-NEXT: retq			; X64-SSE-NEXT: retq
	;			;
	; X64-AVX-LABEL: mul_4xi8:			; X64-AVX-LABEL: mul_4xi8:
	; X64-AVX: # %bb.0: # %entry			; X64-AVX: # %bb.0: # %entry
	; X64-AVX-NEXT: movq {{.*}}(%rip), %rax			; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
	; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0			; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
	; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)			; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
	; X64-AVX-NEXT: retq			; X64-AVX-NEXT: retq
	entry:			entry:
	%pre = load i32, i32* @c			%pre = load i32, i32* @c
	%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index			%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
	%tmp7 = bitcast i8* %tmp6 to <4 x i8>*			%tmp7 = bitcast i8* %tmp6 to <4 x i8>*
	%wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1			%wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
	%tmp8 = zext <4 x i8> %wide.load to <4 x i32>			%tmp8 = zext <4 x i8> %wide.load to <4 x i32>
	▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines
	; X86-AVX1-NEXT: .cfi_offset %esi, -8			; X86-AVX1-NEXT: .cfi_offset %esi, -8
	; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx			; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-AVX1-NEXT: movl c, %esi			; X86-AVX1-NEXT: movl c, %esi
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0			; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1			; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
	; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4)			; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4)
	; X86-AVX1-NEXT: popl %esi			; X86-AVX1-NEXT: popl %esi
	; X86-AVX1-NEXT: vzeroupper			; X86-AVX1-NEXT: vzeroupper
	; X86-AVX1-NEXT: retl			; X86-AVX1-NEXT: retl
	;			;
	; X86-AVX2-LABEL: mul_8xi8:			; X86-AVX2-LABEL: mul_8xi8:
	; X86-AVX2: # %bb.0: # %entry			; X86-AVX2: # %bb.0: # %entry
	; X86-AVX2-NEXT: pushl %esi			; X86-AVX2-NEXT: pushl %esi
	; X86-AVX2-NEXT: .cfi_def_cfa_offset 8			; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
	; X86-AVX2-NEXT: .cfi_offset %esi, -8			; X86-AVX2-NEXT: .cfi_offset %esi, -8
	; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx			; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-AVX2-NEXT: movl c, %esi			; X86-AVX2-NEXT: movl c, %esi
	; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0			; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
	; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)			; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
	; X86-AVX2-NEXT: popl %esi			; X86-AVX2-NEXT: popl %esi
	; X86-AVX2-NEXT: vzeroupper			; X86-AVX2-NEXT: vzeroupper
	; X86-AVX2-NEXT: retl			; X86-AVX2-NEXT: retl
	;			;
	; X64-SSE-LABEL: mul_8xi8:			; X64-SSE-LABEL: mul_8xi8:
	; X64-SSE: # %bb.0: # %entry			; X64-SSE: # %bb.0: # %entry
	; X64-SSE-NEXT: movq {{.*}}(%rip), %rax			; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
	Show All 11 Lines
	; X64-SSE-NEXT: retq			; X64-SSE-NEXT: retq
	;			;
	; X64-AVX1-LABEL: mul_8xi8:			; X64-AVX1-LABEL: mul_8xi8:
	; X64-AVX1: # %bb.0: # %entry			; X64-AVX1: # %bb.0: # %entry
	; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax			; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0			; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1			; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
	; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4)			; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4)
	; X64-AVX1-NEXT: vzeroupper			; X64-AVX1-NEXT: vzeroupper
	; X64-AVX1-NEXT: retq			; X64-AVX1-NEXT: retq
	;			;
	; X64-AVX2-LABEL: mul_8xi8:			; X64-AVX2-LABEL: mul_8xi8:
	; X64-AVX2: # %bb.0: # %entry			; X64-AVX2: # %bb.0: # %entry
	; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax			; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
	; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0			; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
	; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)			; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
	; X64-AVX2-NEXT: vzeroupper			; X64-AVX2-NEXT: vzeroupper
	; X64-AVX2-NEXT: retq			; X64-AVX2-NEXT: retq
	entry:			entry:
	%pre = load i32, i32* @c			%pre = load i32, i32* @c
	%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index			%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
	%tmp7 = bitcast i8* %tmp6 to <8 x i8>*			%tmp7 = bitcast i8* %tmp6 to <8 x i8>*
	%wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1			%wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
	▲ Show 20 Lines • Show All 58 Lines • ▼ Show 20 Lines
	; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx			; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-AVX1-NEXT: movl c, %esi			; X86-AVX1-NEXT: movl c, %esi
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0			; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1			; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2			; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
	; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3			; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
	; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2			; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
	; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)			; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
	; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)			; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
	; X86-AVX1-NEXT: popl %esi			; X86-AVX1-NEXT: popl %esi
	; X86-AVX1-NEXT: vzeroupper			; X86-AVX1-NEXT: vzeroupper
	; X86-AVX1-NEXT: retl			; X86-AVX1-NEXT: retl
	;			;
	; X86-AVX2-LABEL: mul_16xi8:			; X86-AVX2-LABEL: mul_16xi8:
	; X86-AVX2: # %bb.0: # %entry			; X86-AVX2: # %bb.0: # %entry
	; X86-AVX2-NEXT: pushl %esi			; X86-AVX2-NEXT: pushl %esi
	; X86-AVX2-NEXT: .cfi_def_cfa_offset 8			; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
	; X86-AVX2-NEXT: .cfi_offset %esi, -8			; X86-AVX2-NEXT: .cfi_offset %esi, -8
	; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx			; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-AVX2-NEXT: movl c, %esi			; X86-AVX2-NEXT: movl c, %esi
	; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0			; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
	; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1			; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
	; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)			; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
	; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)			; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
	; X86-AVX2-NEXT: popl %esi			; X86-AVX2-NEXT: popl %esi
	; X86-AVX2-NEXT: vzeroupper			; X86-AVX2-NEXT: vzeroupper
	; X86-AVX2-NEXT: retl			; X86-AVX2-NEXT: retl
	;			;
	; X64-SSE-LABEL: mul_16xi8:			; X64-SSE-LABEL: mul_16xi8:
	; X64-SSE: # %bb.0: # %entry			; X64-SSE: # %bb.0: # %entry
	Show All 24 Lines
	; X64-AVX1-LABEL: mul_16xi8:			; X64-AVX1-LABEL: mul_16xi8:
	; X64-AVX1: # %bb.0: # %entry			; X64-AVX1: # %bb.0: # %entry
	; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax			; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0			; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1			; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2			; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
	; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero			; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
	; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3			; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
	; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2			; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
	; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)			; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
	; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)			; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
	; X64-AVX1-NEXT: vzeroupper			; X64-AVX1-NEXT: vzeroupper
	; X64-AVX1-NEXT: retq			; X64-AVX1-NEXT: retq
	;			;
	; X64-AVX2-LABEL: mul_16xi8:			; X64-AVX2-LABEL: mul_16xi8:
	; X64-AVX2: # %bb.0: # %entry			; X64-AVX2: # %bb.0: # %entry
	; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax			; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
	; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0			; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
	; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero			; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1			; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
	; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)			; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
	; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)			; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
	; X64-AVX2-NEXT: vzeroupper			; X64-AVX2-NEXT: vzeroupper
	; X64-AVX2-NEXT: retq			; X64-AVX2-NEXT: retq
	entry:			entry:
	%pre = load i32, i32* @c			%pre = load i32, i32* @c
	%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index			%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
	%tmp7 = bitcast i8* %tmp6 to <16 x i8>*			%tmp7 = bitcast i8* %tmp6 to <16 x i8>*
	▲ Show 20 Lines • Show All 1,023 Lines • ▼ Show 20 Lines
	; X86-SSE-NEXT: retl			; X86-SSE-NEXT: retl
	;			;
	; X86-AVX-LABEL: mul_2xi8_varconst1:			; X86-AVX-LABEL: mul_2xi8_varconst1:
	; X86-AVX: # %bb.0: # %entry			; X86-AVX: # %bb.0: # %entry
	; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-AVX-NEXT: movl c, %edx			; X86-AVX-NEXT: movl c, %edx
	; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0			; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
	; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]			; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)			; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
	; X86-AVX-NEXT: retl			; X86-AVX-NEXT: retl
	;			;
	; X64-SSE-LABEL: mul_2xi8_varconst1:			; X64-SSE-LABEL: mul_2xi8_varconst1:
	; X64-SSE: # %bb.0: # %entry			; X64-SSE: # %bb.0: # %entry
	; X64-SSE-NEXT: movq {{.*}}(%rip), %rax			; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
	; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx			; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
	; X64-SSE-NEXT: movd %ecx, %xmm0			; X64-SSE-NEXT: movd %ecx, %xmm0
	; X64-SSE-NEXT: pxor %xmm1, %xmm1			; X64-SSE-NEXT: pxor %xmm1, %xmm1
	; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]			; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
	; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0			; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
	; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]			; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
	; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)			; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
	; X64-SSE-NEXT: retq			; X64-SSE-NEXT: retq
	;			;
	; X64-AVX-LABEL: mul_2xi8_varconst1:			; X64-AVX-LABEL: mul_2xi8_varconst1:
	; X64-AVX: # %bb.0: # %entry			; X64-AVX: # %bb.0: # %entry
	; X64-AVX-NEXT: movq {{.*}}(%rip), %rax			; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
	; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X64-AVX-NEXT: movl $255, %ecx			; X64-AVX-NEXT: movl $255, %ecx
	; X64-AVX-NEXT: vmovq %rcx, %xmm1			; X64-AVX-NEXT: vmovq %rcx, %xmm1
	; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]			; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
	; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0			; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
	; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]			; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)			; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
	; X64-AVX-NEXT: retq			; X64-AVX-NEXT: retq
	entry:			entry:
	%pre = load i32, i32* @c			%pre = load i32, i32* @c
	%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index			%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
	%tmp7 = bitcast i8* %tmp6 to <2 x i8>*			%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
	%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1			%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
	▲ Show 20 Lines • Show All 95 Lines • ▼ Show 20 Lines
	; X86-SSE-NEXT: retl			; X86-SSE-NEXT: retl
	;			;
	; X86-AVX-LABEL: mul_2xi8_varconst3:			; X86-AVX-LABEL: mul_2xi8_varconst3:
	; X86-AVX: # %bb.0: # %entry			; X86-AVX: # %bb.0: # %entry
	; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-AVX-NEXT: movl c, %edx			; X86-AVX-NEXT: movl c, %edx
	; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0			; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
	; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]			; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)			; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
	; X86-AVX-NEXT: retl			; X86-AVX-NEXT: retl
	;			;
	; X64-SSE-LABEL: mul_2xi8_varconst3:			; X64-SSE-LABEL: mul_2xi8_varconst3:
	; X64-SSE: # %bb.0: # %entry			; X64-SSE: # %bb.0: # %entry
	; X64-SSE-NEXT: movq {{.*}}(%rip), %rax			; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
	; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx			; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
	Show All 10 Lines
	;			;
	; X64-AVX-LABEL: mul_2xi8_varconst3:			; X64-AVX-LABEL: mul_2xi8_varconst3:
	; X64-AVX: # %bb.0: # %entry			; X64-AVX: # %bb.0: # %entry
	; X64-AVX-NEXT: movq {{.*}}(%rip), %rax			; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
	; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100			; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100
	; X64-AVX-NEXT: vmovq %rcx, %xmm1			; X64-AVX-NEXT: vmovq %rcx, %xmm1
	; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]			; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
	; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0			; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
	; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]			; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)			; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
	; X64-AVX-NEXT: retq			; X64-AVX-NEXT: retq
	entry:			entry:
	%pre = load i32, i32* @c			%pre = load i32, i32* @c
	%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index			%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
	%tmp7 = bitcast i8* %tmp6 to <2 x i8>*			%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
	%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1			%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
	▲ Show 20 Lines • Show All 631 Lines • ▼ Show 20 Lines
	; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload			; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
	; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload			; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
	; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload			; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
	; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload			; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload
	; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero			; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero
	; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007			; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007
	; X86-AVX1-NEXT: vmovd %eax, %xmm3			; X86-AVX1-NEXT: vmovd %eax, %xmm3
	; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]			; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
	; X86-AVX1-NEXT: vpmulld %xmm4, %xmm0, %xmm0			; X86-AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0
	; X86-AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1			; X86-AVX1-NEXT: vpmaddwd %xmm4, %xmm1, %xmm1
	; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0			; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1			; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1
	; X86-AVX1-NEXT: vmovd %xmm1, (%eax)			; X86-AVX1-NEXT: vmovd %xmm1, (%eax)
	; X86-AVX1-NEXT: vmovaps %ymm0, (%eax)			; X86-AVX1-NEXT: vmovaps %ymm0, (%eax)
	; X86-AVX1-NEXT: addl $16, %esp			; X86-AVX1-NEXT: addl $16, %esp
	; X86-AVX1-NEXT: popl %esi			; X86-AVX1-NEXT: popl %esi
	; X86-AVX1-NEXT: popl %edi			; X86-AVX1-NEXT: popl %edi
	; X86-AVX1-NEXT: popl %ebx			; X86-AVX1-NEXT: popl %ebx
	▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines
	; X86-AVX2-NEXT: divl %ecx			; X86-AVX2-NEXT: divl %ecx
	; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0			; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
	; X86-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0			; X86-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
	; X86-AVX2-NEXT: xorl %eax, %eax			; X86-AVX2-NEXT: xorl %eax, %eax
	; X86-AVX2-NEXT: xorl %edx, %edx			; X86-AVX2-NEXT: xorl %edx, %edx
	; X86-AVX2-NEXT: divl (%eax)			; X86-AVX2-NEXT: divl (%eax)
	; X86-AVX2-NEXT: vmovd %edx, %xmm1			; X86-AVX2-NEXT: vmovd %edx, %xmm1
	; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]			; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
	; X86-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0			; X86-AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
	; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007			; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007
	; X86-AVX2-NEXT: vmovd %eax, %xmm2			; X86-AVX2-NEXT: vmovd %eax, %xmm2
	; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1			; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
	; X86-AVX2-NEXT: vmovd %xmm1, (%eax)			; X86-AVX2-NEXT: vmovd %xmm1, (%eax)
	; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax)			; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax)
	; X86-AVX2-NEXT: popl %esi			; X86-AVX2-NEXT: popl %esi
	; X86-AVX2-NEXT: vzeroupper			; X86-AVX2-NEXT: vzeroupper
	; X86-AVX2-NEXT: retl			; X86-AVX2-NEXT: retl
	▲ Show 20 Lines • Show All 92 Lines • ▼ Show 20 Lines
	; X64-AVX1-NEXT: xorl %eax, %eax			; X64-AVX1-NEXT: xorl %eax, %eax
	; X64-AVX1-NEXT: xorl %edx, %edx			; X64-AVX1-NEXT: xorl %edx, %edx
	; X64-AVX1-NEXT: divl %ebp			; X64-AVX1-NEXT: divl %ebp
	; X64-AVX1-NEXT: vmovd %edx, %xmm0			; X64-AVX1-NEXT: vmovd %edx, %xmm0
	; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0			; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
	; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0			; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
	; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0			; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
	; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]			; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
	; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0			; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
	; X64-AVX1-NEXT: vmovd %esi, %xmm2			; X64-AVX1-NEXT: vmovd %esi, %xmm2
	; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2			; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
	; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2			; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2
	; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2			; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
	; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1			; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
	; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0			; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; X64-AVX1-NEXT: vmovd %r8d, %xmm1			; X64-AVX1-NEXT: vmovd %r8d, %xmm1
	; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007			; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007
	; X64-AVX1-NEXT: vmovd %eax, %xmm2			; X64-AVX1-NEXT: vmovd %eax, %xmm2
	; X64-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1			; X64-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
	; X64-AVX1-NEXT: vmovd %xmm1, (%rax)			; X64-AVX1-NEXT: vmovd %xmm1, (%rax)
	; X64-AVX1-NEXT: vmovaps %ymm0, (%rax)			; X64-AVX1-NEXT: vmovaps %ymm0, (%rax)
	; X64-AVX1-NEXT: popq %rbx			; X64-AVX1-NEXT: popq %rbx
	▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines
	; X64-AVX2-NEXT: divl %ecx			; X64-AVX2-NEXT: divl %ecx
	; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0			; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
	; X64-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0			; X64-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
	; X64-AVX2-NEXT: xorl %eax, %eax			; X64-AVX2-NEXT: xorl %eax, %eax
	; X64-AVX2-NEXT: xorl %edx, %edx			; X64-AVX2-NEXT: xorl %edx, %edx
	; X64-AVX2-NEXT: divl (%rax)			; X64-AVX2-NEXT: divl (%rax)
	; X64-AVX2-NEXT: vmovd %edx, %xmm1			; X64-AVX2-NEXT: vmovd %edx, %xmm1
	; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]			; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
	; X64-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0			; X64-AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
	; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007			; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007
	; X64-AVX2-NEXT: vmovd %eax, %xmm2			; X64-AVX2-NEXT: vmovd %eax, %xmm2
	; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1			; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
	; X64-AVX2-NEXT: vmovd %xmm1, (%rax)			; X64-AVX2-NEXT: vmovd %xmm1, (%rax)
	; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)			; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
	; X64-AVX2-NEXT: vzeroupper			; X64-AVX2-NEXT: vzeroupper
	; X64-AVX2-NEXT: retq			; X64-AVX2-NEXT: retq
	%tmp = load <9 x i32>, <9 x i32>* undef, align 64			%tmp = load <9 x i32>, <9 x i32>* undef, align 64
	%rem = urem <9 x i32> zeroinitializer, %tmp			%rem = urem <9 x i32> zeroinitializer, %tmp
	%mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem			%mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
	store <9 x i32> %mul, <9 x i32>* undef, align 64			store <9 x i32> %mul, <9 x i32>* undef, align 64
	ret void			ret void
	}			}

test/CodeGen/X86/slow-pmulld.ll

	Show All 23 Lines
	; CHECK64: # %bb.0:			; CHECK64: # %bb.0:
	; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0			; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
	; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0			; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
	; CHECK64-NEXT: retq			; CHECK64-NEXT: retq
	;			;
	; SSE4-32-LABEL: test_mul_v4i32_v4i8:			; SSE4-32-LABEL: test_mul_v4i32_v4i8:
	; SSE4-32: # %bb.0:			; SSE4-32: # %bb.0:
	; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0			; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
	; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0			; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
	; SSE4-32-NEXT: retl			; SSE4-32-NEXT: retl
	;			;
	; SSE4-64-LABEL: test_mul_v4i32_v4i8:			; SSE4-64-LABEL: test_mul_v4i32_v4i8:
	; SSE4-64: # %bb.0:			; SSE4-64: # %bb.0:
	; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0			; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
	; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0			; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
	; SSE4-64-NEXT: retq			; SSE4-64-NEXT: retq
	;			;
	; AVX-32-LABEL: test_mul_v4i32_v4i8:			; AVX-32-LABEL: test_mul_v4i32_v4i8:
	; AVX-32: # %bb.0:			; AVX-32: # %bb.0:
	; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0			; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
	; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]			; AVX-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
	; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
	; AVX-32-NEXT: retl			; AVX-32-NEXT: retl
	;			;
	; AVX-64-LABEL: test_mul_v4i32_v4i8:			; AVX-64-LABEL: test_mul_v4i32_v4i8:
	; AVX-64: # %bb.0:			; AVX-64: # %bb.0:
	; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0			; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
	; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]			; AVX-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
	; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
	; AVX-64-NEXT: retq			; AVX-64-NEXT: retq
	%z = zext <4 x i8> %A to <4 x i32>			%z = zext <4 x i8> %A to <4 x i32>
	%m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>			%m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
	ret <4 x i32> %m			ret <4 x i32> %m
	}			}

	define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {			define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
	; CHECK32-LABEL: test_mul_v8i32_v8i8:			; CHECK32-LABEL: test_mul_v8i32_v8i8:
	Show All 24 Lines
	;			;
	; SSE4-32-LABEL: test_mul_v8i32_v8i8:			; SSE4-32-LABEL: test_mul_v8i32_v8i8:
	; SSE4-32: # %bb.0:			; SSE4-32: # %bb.0:
	; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0			; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
	; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero			; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
	; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero			; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
	; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]			; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
	; SSE4-32-NEXT: pmulld %xmm2, %xmm0			; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
	; SSE4-32-NEXT: pmulld %xmm2, %xmm1			; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
	; SSE4-32-NEXT: retl			; SSE4-32-NEXT: retl
	;			;
	; SSE4-64-LABEL: test_mul_v8i32_v8i8:			; SSE4-64-LABEL: test_mul_v8i32_v8i8:
	; SSE4-64: # %bb.0:			; SSE4-64: # %bb.0:
	; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0			; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
	; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero			; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
	; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero			; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
	; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]			; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
	; SSE4-64-NEXT: pmulld %xmm2, %xmm0			; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
	; SSE4-64-NEXT: pmulld %xmm2, %xmm1			; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
	; SSE4-64-NEXT: retq			; SSE4-64-NEXT: retq
	;			;
	; AVX-32-LABEL: test_mul_v8i32_v8i8:			; AVX-32-LABEL: test_mul_v8i32_v8i8:
	; AVX-32: # %bb.0:			; AVX-32: # %bb.0:
	; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0			; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
	; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero			; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]			; AVX-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
	; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
	; AVX-32-NEXT: retl			; AVX-32-NEXT: retl
	;			;
	; AVX-64-LABEL: test_mul_v8i32_v8i8:			; AVX-64-LABEL: test_mul_v8i32_v8i8:
	; AVX-64: # %bb.0:			; AVX-64: # %bb.0:
	; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0			; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
	; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero			; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]			; AVX-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
	; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
	; AVX-64-NEXT: retq			; AVX-64-NEXT: retq
	%z = zext <8 x i8> %A to <8 x i32>			%z = zext <8 x i8> %A to <8 x i32>
	%m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>			%m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
	ret <8 x i32> %m			ret <8 x i32> %m
	}			}

	define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {			define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
	; CHECK32-LABEL: test_mul_v16i32_v16i8:			; CHECK32-LABEL: test_mul_v16i32_v16i8:
	▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines
	; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]			; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
	; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero			; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
	; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]			; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
	; SSE4-32-NEXT: pmulld %xmm4, %xmm0			; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
	; SSE4-32-NEXT: pmulld %xmm4, %xmm1			; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
	; SSE4-32-NEXT: pmulld %xmm4, %xmm2			; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
	; SSE4-32-NEXT: pmulld %xmm4, %xmm3			; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3
	; SSE4-32-NEXT: retl			; SSE4-32-NEXT: retl
	;			;
	; SSE4-64-LABEL: test_mul_v16i32_v16i8:			; SSE4-64-LABEL: test_mul_v16i32_v16i8:
	; SSE4-64: # %bb.0:			; SSE4-64: # %bb.0:
	; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]			; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
	; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero			; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
	; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]			; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
	; SSE4-64-NEXT: pmulld %xmm4, %xmm0			; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
	; SSE4-64-NEXT: pmulld %xmm4, %xmm1			; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
	; SSE4-64-NEXT: pmulld %xmm4, %xmm2			; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
	; SSE4-64-NEXT: pmulld %xmm4, %xmm3			; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
	; SSE4-64-NEXT: retq			; SSE4-64-NEXT: retq
	;			;
	; AVX2-32-LABEL: test_mul_v16i32_v16i8:			; AVX2-32-LABEL: test_mul_v16i32_v16i8:
	; AVX2-32: # %bb.0:			; AVX2-32: # %bb.0:
	; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero			; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
	; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero			; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
	; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]			; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
	; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0			; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
	; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1			; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
	; AVX2-32-NEXT: retl			; AVX2-32-NEXT: retl
	;			;
	; AVX2-64-LABEL: test_mul_v16i32_v16i8:			; AVX2-64-LABEL: test_mul_v16i32_v16i8:
	; AVX2-64: # %bb.0:			; AVX2-64: # %bb.0:
	; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero			; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
	; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero			; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
	; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]			; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
	; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0			; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
	; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1			; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
	; AVX2-64-NEXT: retq			; AVX2-64-NEXT: retq
	;			;
	; AVX512-32-LABEL: test_mul_v16i32_v16i8:			; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8:
	; AVX512-32: # %bb.0:			; AVX512DQ-32: # %bb.0:
	; AVX512-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero			; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
	; AVX512-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0			; AVX512DQ-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
	; AVX512-32-NEXT: retl			; AVX512DQ-32-NEXT: retl
	;			;
	; AVX512-64-LABEL: test_mul_v16i32_v16i8:			; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8:
	; AVX512-64: # %bb.0:			; AVX512DQ-64: # %bb.0:
	; AVX512-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero			; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
	; AVX512-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0			; AVX512DQ-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
	; AVX512-64-NEXT: retq			; AVX512DQ-64-NEXT: retq
				;
				; AVX512BW-32-LABEL: test_mul_v16i32_v16i8:
				; AVX512BW-32: # %bb.0:
				; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
				; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
				; AVX512BW-32-NEXT: retl
				;
				; AVX512BW-64-LABEL: test_mul_v16i32_v16i8:
				; AVX512BW-64: # %bb.0:
				; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
				; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
				; AVX512BW-64-NEXT: retq
	%z = zext <16 x i8> %A to <16 x i32>			%z = zext <16 x i8> %A to <16 x i32>
	%m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>			%m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
	ret <16 x i32> %m			ret <16 x i32> %m
	}			}

	define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {			define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
	; CHECK32-LABEL: test_mul_v4i32_v4i16:			; CHECK32-LABEL: test_mul_v4i32_v4i16:
	; CHECK32: # %bb.0:			; CHECK32: # %bb.0:
	▲ Show 20 Lines • Show All 222 Lines • ▼ Show 20 Lines
	;			;
	; MinSize Tests			; MinSize Tests
	;			;

	define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {			define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
	; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize:			; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize:
	; CHECK32: # %bb.0:			; CHECK32: # %bb.0:
	; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0			; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
	; CHECK32-NEXT: pmulld {{\.LCPI.*}}, %xmm0			; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
	; CHECK32-NEXT: retl			; CHECK32-NEXT: retl
	;			;
	; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize:			; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize:
	; CHECK64: # %bb.0:			; CHECK64: # %bb.0:
	; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0			; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
	; CHECK64-NEXT: pmulld {{.*}}(%rip), %xmm0			; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
	; CHECK64-NEXT: retq			; CHECK64-NEXT: retq
	;			;
	; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize:			; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize:
	; SSE4-32: # %bb.0:			; SSE4-32: # %bb.0:
	; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0			; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
	; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0			; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
	; SSE4-32-NEXT: retl			; SSE4-32-NEXT: retl
	;			;
	; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize:			; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize:
	; SSE4-64: # %bb.0:			; SSE4-64: # %bb.0:
	; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0			; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
	; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0			; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
	; SSE4-64-NEXT: retq			; SSE4-64-NEXT: retq
	;			;
	; AVX-32-LABEL: test_mul_v4i32_v4i8_minsize:			; AVX-32-LABEL: test_mul_v4i32_v4i8_minsize:
	; AVX-32: # %bb.0:			; AVX-32: # %bb.0:
	; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0			; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
	; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]			; AVX-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
	; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
	; AVX-32-NEXT: retl			; AVX-32-NEXT: retl
	;			;
	; AVX-64-LABEL: test_mul_v4i32_v4i8_minsize:			; AVX-64-LABEL: test_mul_v4i32_v4i8_minsize:
	; AVX-64: # %bb.0:			; AVX-64: # %bb.0:
	; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0			; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
	; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]			; AVX-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
	; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
	; AVX-64-NEXT: retq			; AVX-64-NEXT: retq
	%z = zext <4 x i8> %A to <4 x i32>			%z = zext <4 x i8> %A to <4 x i32>
	%m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>			%m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
	ret <4 x i32> %m			ret <4 x i32> %m
	}			}

	define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {			define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
	; CHECK32-LABEL: test_mul_v8i32_v8i8_minsize:			; CHECK32-LABEL: test_mul_v8i32_v8i8_minsize:
	; CHECK32: # %bb.0:			; CHECK32: # %bb.0:
	; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0			; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
	; CHECK32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]			; CHECK32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
	; CHECK32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; CHECK32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero			; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
	; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero			; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
	; CHECK32-NEXT: pmulld %xmm2, %xmm0			; CHECK32-NEXT: pmaddwd %xmm2, %xmm0
	; CHECK32-NEXT: pmulld %xmm2, %xmm1			; CHECK32-NEXT: pmaddwd %xmm2, %xmm1
	; CHECK32-NEXT: retl			; CHECK32-NEXT: retl
	;			;
	; CHECK64-LABEL: test_mul_v8i32_v8i8_minsize:			; CHECK64-LABEL: test_mul_v8i32_v8i8_minsize:
	; CHECK64: # %bb.0:			; CHECK64: # %bb.0:
	; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0			; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
	; CHECK64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]			; CHECK64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
	; CHECK64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; CHECK64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero			; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
	; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero			; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
	; CHECK64-NEXT: pmulld %xmm2, %xmm0			; CHECK64-NEXT: pmaddwd %xmm2, %xmm0
	; CHECK64-NEXT: pmulld %xmm2, %xmm1			; CHECK64-NEXT: pmaddwd %xmm2, %xmm1
	; CHECK64-NEXT: retq			; CHECK64-NEXT: retq
	;			;
	; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:			; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
	; SSE4-32: # %bb.0:			; SSE4-32: # %bb.0:
	; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0			; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
	; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero			; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
	; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero			; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
	; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]			; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
	; SSE4-32-NEXT: pmulld %xmm2, %xmm0			; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
	; SSE4-32-NEXT: pmulld %xmm2, %xmm1			; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
	; SSE4-32-NEXT: retl			; SSE4-32-NEXT: retl
	;			;
	; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:			; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:
	; SSE4-64: # %bb.0:			; SSE4-64: # %bb.0:
	; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0			; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
	; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero			; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
	; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero			; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
	; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]			; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
	; SSE4-64-NEXT: pmulld %xmm2, %xmm0			; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
	; SSE4-64-NEXT: pmulld %xmm2, %xmm1			; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
	; SSE4-64-NEXT: retq			; SSE4-64-NEXT: retq
	;			;
	; AVX-32-LABEL: test_mul_v8i32_v8i8_minsize:			; AVX-32-LABEL: test_mul_v8i32_v8i8_minsize:
	; AVX-32: # %bb.0:			; AVX-32: # %bb.0:
	; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0			; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
	; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero			; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]			; AVX-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
	; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
	; AVX-32-NEXT: retl			; AVX-32-NEXT: retl
	;			;
	; AVX-64-LABEL: test_mul_v8i32_v8i8_minsize:			; AVX-64-LABEL: test_mul_v8i32_v8i8_minsize:
	; AVX-64: # %bb.0:			; AVX-64: # %bb.0:
	; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0			; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
	; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero			; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]			; AVX-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
	; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
	; AVX-64-NEXT: retq			; AVX-64-NEXT: retq
	%z = zext <8 x i8> %A to <8 x i32>			%z = zext <8 x i8> %A to <8 x i32>
	%m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>			%m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
	ret <8 x i32> %m			ret <8 x i32> %m
	}			}

	define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {			define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
	; CHECK32-LABEL: test_mul_v16i32_v16i8_minsize:			; CHECK32-LABEL: test_mul_v16i32_v16i8_minsize:
	; CHECK32: # %bb.0:			; CHECK32: # %bb.0:
	; CHECK32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]			; CHECK32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
	; CHECK32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]			; CHECK32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
	; CHECK32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]			; CHECK32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
	; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; CHECK32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; CHECK32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero			; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
	; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero			; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
	; CHECK32-NEXT: pmulld %xmm5, %xmm0			; CHECK32-NEXT: pmaddwd %xmm5, %xmm0
	; CHECK32-NEXT: pmulld %xmm5, %xmm1			; CHECK32-NEXT: pmaddwd %xmm5, %xmm1
	; CHECK32-NEXT: pmulld %xmm5, %xmm2			; CHECK32-NEXT: pmaddwd %xmm5, %xmm2
	; CHECK32-NEXT: pmulld %xmm5, %xmm3			; CHECK32-NEXT: pmaddwd %xmm5, %xmm3
	; CHECK32-NEXT: retl			; CHECK32-NEXT: retl
	;			;
	; CHECK64-LABEL: test_mul_v16i32_v16i8_minsize:			; CHECK64-LABEL: test_mul_v16i32_v16i8_minsize:
	; CHECK64: # %bb.0:			; CHECK64: # %bb.0:
	; CHECK64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]			; CHECK64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
	; CHECK64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]			; CHECK64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
	; CHECK64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]			; CHECK64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
	; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; CHECK64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; CHECK64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero			; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
	; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero			; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
	; CHECK64-NEXT: pmulld %xmm5, %xmm0			; CHECK64-NEXT: pmaddwd %xmm5, %xmm0
	; CHECK64-NEXT: pmulld %xmm5, %xmm1			; CHECK64-NEXT: pmaddwd %xmm5, %xmm1
	; CHECK64-NEXT: pmulld %xmm5, %xmm2			; CHECK64-NEXT: pmaddwd %xmm5, %xmm2
	; CHECK64-NEXT: pmulld %xmm5, %xmm3			; CHECK64-NEXT: pmaddwd %xmm5, %xmm3
	; CHECK64-NEXT: retq			; CHECK64-NEXT: retq
	;			;
	; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:			; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:
	; SSE4-32: # %bb.0:			; SSE4-32: # %bb.0:
	; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]			; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
	; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero			; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
	; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]			; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
	; SSE4-32-NEXT: pmulld %xmm4, %xmm0			; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
	; SSE4-32-NEXT: pmulld %xmm4, %xmm1			; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
	; SSE4-32-NEXT: pmulld %xmm4, %xmm2			; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
	; SSE4-32-NEXT: pmulld %xmm4, %xmm3			; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3
	; SSE4-32-NEXT: retl			; SSE4-32-NEXT: retl
	;			;
	; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize:			; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize:
	; SSE4-64: # %bb.0:			; SSE4-64: # %bb.0:
	; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]			; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
	; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero			; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
	; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero			; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
	; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]			; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
	; SSE4-64-NEXT: pmulld %xmm4, %xmm0			; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
	; SSE4-64-NEXT: pmulld %xmm4, %xmm1			; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
	; SSE4-64-NEXT: pmulld %xmm4, %xmm2			; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
	; SSE4-64-NEXT: pmulld %xmm4, %xmm3			; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
	; SSE4-64-NEXT: retq			; SSE4-64-NEXT: retq
	;			;
	; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:			; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
	; AVX2-32: # %bb.0:			; AVX2-32: # %bb.0:
	; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero			; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
	; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero			; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
	; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]			; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
	; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0			; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
	; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1			; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
	; AVX2-32-NEXT: retl			; AVX2-32-NEXT: retl
	;			;
	; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:			; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
	; AVX2-64: # %bb.0:			; AVX2-64: # %bb.0:
	; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero			; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
	; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero			; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
	; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]			; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
	; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0			; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
	; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1			; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
	; AVX2-64-NEXT: retq			; AVX2-64-NEXT: retq
	;			;
	; AVX512-32-LABEL: test_mul_v16i32_v16i8_minsize:			; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize:
	; AVX512-32: # %bb.0:			; AVX512DQ-32: # %bb.0:
	; AVX512-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero			; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
	; AVX512-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0			; AVX512DQ-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
	; AVX512-32-NEXT: retl			; AVX512DQ-32-NEXT: retl
	;			;
	; AVX512-64-LABEL: test_mul_v16i32_v16i8_minsize:			; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize:
	; AVX512-64: # %bb.0:			; AVX512DQ-64: # %bb.0:
	; AVX512-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero			; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
	; AVX512-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0			; AVX512DQ-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
	; AVX512-64-NEXT: retq			; AVX512DQ-64-NEXT: retq
				;
				; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize:
				; AVX512BW-32: # %bb.0:
				; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
				; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
				; AVX512BW-32-NEXT: retl
				;
				; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize:
				; AVX512BW-64: # %bb.0:
				; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
				; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
				; AVX512BW-64-NEXT: retq
	%z = zext <16 x i8> %A to <16 x i32>			%z = zext <16 x i8> %A to <16 x i32>
	%m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>			%m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
	ret <16 x i32> %m			ret <16 x i32> %m
	}			}

	define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {			define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
	; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize:			; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize:
	; CHECK32: # %bb.0:			; CHECK32: # %bb.0:
	▲ Show 20 Lines • Show All 205 Lines • Show Last 20 Lines