This is an archive of the discontinued LLVM Phabricator instance.

[TargetLowering] Add support for non-uniform vectors to BuildExactSDIV
ClosedPublic

Authored by RKSimon on Aug 7 2018, 9:12 AM.

Download Raw Diff

Details

Reviewers

spatel
efriedma
craig.topper
andreadb

Commits

rGa272fa9b0c32: [TargetLowering] Add support for non-uniform vectors to BuildExactSDIV
rL339756: [TargetLowering] Add support for non-uniform vectors to BuildExactSDIV

Summary

This patch refactors the existing BuildExactSDIV implementation to support non-uniform constant vector denominators.

I've ended up duplicating much of the scalar/vector code pattern that I did for TargetLowering::BuildUDIV (D49248) - calling a 'BuildSDIVPattern' helper - does anyone have any suggestions on how I could reduce this further? I'm going to end up doing the same again for TargetLowering::BuildSDIV as well when I get around to it.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon created this revision. Aug 7 2018, 9:12 AM

Is it possible to break this into a NFC refactor and a separate patch to handle non-uniform denominators?

In D50392#1191041, @greened wrote:

> Is it possible to break this into a NFC refactor and a separate patch to handle non-uniform denominators?

Sure, pulling out the BuildSDIVPattern helper and the codegen is doable as an NFC. That would leave the patch mainly as the if(isVector()) .. else .. block

> Sure, pulling out the BuildSDIVPattern helper and the codegen is doable as an NFC. That would leave the patch mainly as the if(isVector()) .. else .. block

The intent would be to have the largish code changes not result in test changes and then the smaller functional change would be reflected in the tests.

RKSimon mentioned this in rL339246: [TargetLowering] Remove APInt divisor argument from BuildExactSDIV (NFCI). Aug 8 2018, 7:00 AM

RKSimon mentioned this in rL339346: [TargetLowering] Add BuildSDIVPattern helper to BuildExactSDIV (NFCI). Aug 9 2018, 6:56 AM

Rebased.

I've refactored the approach slightly to demonstrate ISD::matchUnaryPredicate being used to iterate over a scalar or the elements of a vector - this could be used in TargetLowering::BuildUDIV as well to make the equivalent code easier to understand.

ping?

craig.topper added inline comments. Aug 14 2018, 12:48 PM

test/CodeGen/X86/sdiv-exact.ll
2	Why are the feature flags between 32-bit and 64-bit mismatched? This results in making 64-bit look way better than 32-bit in the modified test cases. For example test5 where 64-bit is one instruction.

RKSimon added inline comments. Aug 14 2018, 1:05 PM

test/CodeGen/X86/sdiv-exact.ll
2	Laziness mainly - I didn't see much gain from SSE2/AVX2 codegen tests for both 32/64 - it's easy enough to add if you think it's useful?

LGTM

test/CodeGen/X86/sdiv-exact.ll
2	I agree there's probably not much value. I just had to go up here to figure out why 64-bit looked so much better.

This revision is now accepted and ready to land. Aug 14 2018, 1:24 PM

Closed by commit rL339756: [TargetLowering] Add support for non-uniform vectors to BuildExactSDIV (authored by RKSimon). · Explain Why · Aug 15 2018, 2:35 AM

This revision was automatically updated to reflect the committed changes.

RKSimon mentioned this in rL339758: [TargetLowering] Minor refactor to TargetLowering::BuildUDIV to merge… Aug 15 2018, 3:11 AM

Revision Contents

Path

Size

lib/

CodeGen/

SelectionDAG/

	TargetLowering.cpp
	TargetLowering.cpp (revision 339346)

36 lines

test/

CodeGen/

X86/

	sdiv-exact.ll
	sdiv-exact.ll (revision 339335)

172 lines

Diff 159922

lib/CodeGen/SelectionDAG/TargetLowering.cpp

	Show First 20 Lines • Show All 3,432 Lines • ▼ Show 20 Lines
	/// Given an exact SDIV by a constant, create a multiplication			/// Given an exact SDIV by a constant, create a multiplication
	/// with the multiplicative inverse of the constant.			/// with the multiplicative inverse of the constant.
	static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,			static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
	const SDLoc &dl, SelectionDAG &DAG,			const SDLoc &dl, SelectionDAG &DAG,
	SmallVectorImpl<SDNode *> &Created) {			SmallVectorImpl<SDNode *> &Created) {
	SDValue Op0 = N->getOperand(0);			SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);			SDValue Op1 = N->getOperand(1);
	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);
				EVT SVT = VT.getScalarType();
	EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());			EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
				EVT ShSVT = ShVT.getScalarType();

	auto BuildSDIVPattern = [](APInt Divisor, unsigned &Shift, APInt &Factor) {
	bool UseSRA = false;			bool UseSRA = false;
	Shift = Divisor.countTrailingZeros();			SmallVector<SDValue, 16> Shifts, Factors;

				auto BuildSDIVPattern = [&](ConstantSDNode *C) {
				if (C->isNullValue())
				return false;
				APInt Divisor = C->getAPIntValue();
				unsigned Shift = Divisor.countTrailingZeros();
	if (Shift) {			if (Shift) {
	Divisor.ashrInPlace(Shift);			Divisor.ashrInPlace(Shift);
	UseSRA = true;			UseSRA = true;
	}			}
	// Calculate the multiplicative inverse, using Newton's method.			// Calculate the multiplicative inverse, using Newton's method.
	APInt t;			APInt t;
	Factor = Divisor;			APInt Factor = Divisor;
	while ((t = Divisor * Factor) != 1)			while ((t = Divisor * Factor) != 1)
	Factor *= APInt(Divisor.getBitWidth(), 2) - t;			Factor *= APInt(Divisor.getBitWidth(), 2) - t;
	return UseSRA;			Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT));
				Factors.push_back(DAG.getConstant(Factor, dl, SVT));
				return true;
	};			};

	ConstantSDNode *C = isConstOrConstSplat(Op1);			// Collect all magic values from the build vector.
	if (!C \|\| C->isNullValue())			if (!ISD::matchUnaryPredicate(Op1, BuildSDIVPattern))
	return SDValue();			return SDValue();

	APInt FactorVal;			SDValue Shift, Factor;
	unsigned ShiftVal;			if (VT.isVector()) {
	bool UseSRA = BuildSDIVPattern(C->getAPIntValue(), ShiftVal, FactorVal);			Shift = DAG.getBuildVector(ShVT, dl, Shifts);
	SDValue Shift = DAG.getConstant(ShiftVal, dl, ShVT);			Factor = DAG.getBuildVector(VT, dl, Factors);
	SDValue Factor = DAG.getConstant(FactorVal, dl, VT);			} else {
				Shift = Shifts[0];
				Factor = Factors[0];
				}

	SDValue Res = Op0;			SDValue Res = Op0;

	// Shift the value upfront if it is even, so the LSB is one.			// Shift the value upfront if it is even, so the LSB is one.
	if (UseSRA) {			if (UseSRA) {
	// TODO: For UDIV use SRL instead of SRA.			// TODO: For UDIV use SRL instead of SRA.
	SDNodeFlags Flags;			SDNodeFlags Flags;
	Flags.setExact(true);			Flags.setExact(true);
	▲ Show 20 Lines • Show All 1,034 Lines • Show Last 20 Lines

test/CodeGen/X86/sdiv-exact.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 \| FileCheck %s --check-prefix=X86			; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 \| FileCheck %s --check-prefix=X86
				craig.topperUnsubmitted Not Done Reply Inline Actions Why are the features flags between 32-bit and 64-bit mismatched? This results in making 64-bit look way better than 32-bit in the modified test cases. For example test5 where 64-bit is one instruction. craig.topper: Why are the features flags between 32-bit and 64-bit mismatched? This results in making 64-bit…
				RKSimonAuthorUnsubmitted Not Done Reply Inline Actions Laziness mainly - I didn't see much gain from SSE2/AVX2 codegen tests for both 32/64 - its easy enough to add if you think its useful? RKSimon: Laziness mainly - I didn't see much gain from SSE2/AVX2 codegen tests for both 32/64 - its easy…
				craig.topperUnsubmitted Not Done Reply Inline Actions I agree there's probably not much value. I just had to go up here to figure out why 64-bit looked so much better. craig.topper: I agree there's probably not much value. I just had to go up here to figure out why 64-bit…
	; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 \| FileCheck %s --check-prefix=X64			; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 \| FileCheck %s --check-prefix=X64

	define i32 @test1(i32 %x) {			define i32 @test1(i32 %x) {
	; X86-LABEL: test1:			; X86-LABEL: test1:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %eax # imm = 0xC28F5C29			; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %eax # imm = 0xC28F5C29
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	▲ Show 20 Lines • Show All 64 Lines • ▼ Show 20 Lines
	; X64-NEXT: retq			; X64-NEXT: retq
	%div = sdiv exact <4 x i32> %x, <i32 25, i32 25, i32 25, i32 25>			%div = sdiv exact <4 x i32> %x, <i32 25, i32 25, i32 25, i32 25>
	ret <4 x i32> %div			ret <4 x i32> %div
	}			}

	define <4 x i32> @test5(<4 x i32> %x) {			define <4 x i32> @test5(<4 x i32> %x) {
	; X86-LABEL: test5:			; X86-LABEL: test5:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]			; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3264175145,3264175145]
	; X86-NEXT: movd %xmm1, %eax			; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
	; X86-NEXT: imull $-1030792151, %eax, %eax # imm = 0xC28F5C29			; X86-NEXT: pmuludq %xmm1, %xmm0
	; X86-NEXT: movd %eax, %xmm1			; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]			; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
	; X86-NEXT: movd %xmm2, %eax			; X86-NEXT: pmuludq %xmm2, %xmm1
	; X86-NEXT: imull $-1030792151, %eax, %eax # imm = 0xC28F5C29			; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
	; X86-NEXT: movd %eax, %xmm2			; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	; X86-NEXT: movd %xmm0, %eax
	; X86-NEXT: sarl $3, %eax
	; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
	; X86-NEXT: movd %eax, %xmm1
	; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
	; X86-NEXT: movd %xmm0, %eax
	; X86-NEXT: sarl $3, %eax
	; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
	; X86-NEXT: movd %eax, %xmm0
	; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
	; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
	; X86-NEXT: movdqa %xmm1, %xmm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: test5:			; X64-LABEL: test5:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: vpextrd $1, %xmm0, %eax			; X64-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
	; X64-NEXT: sarl $3, %eax
	; X64-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
	; X64-NEXT: vmovd %xmm0, %ecx
	; X64-NEXT: sarl $3, %ecx
	; X64-NEXT: imull $-1431655765, %ecx, %ecx # imm = 0xAAAAAAAB
	; X64-NEXT: vmovd %ecx, %xmm1
	; X64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
	; X64-NEXT: vpextrd $2, %xmm0, %eax
	; X64-NEXT: imull $-1030792151, %eax, %eax # imm = 0xC28F5C29
	; X64-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
	; X64-NEXT: vpextrd $3, %xmm0, %eax
	; X64-NEXT: imull $-1030792151, %eax, %eax # imm = 0xC28F5C29
	; X64-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%div = sdiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>			%div = sdiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>
	ret <4 x i32> %div			ret <4 x i32> %div
	}			}

	define <4 x i32> @test6(<4 x i32> %x) {			define <4 x i32> @test6(<4 x i32> %x) {
	; X86-LABEL: test6:			; X86-LABEL: test6:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]			; X86-NEXT: movdqa %xmm0, %xmm1
	; X86-NEXT: movd %xmm1, %eax			; X86-NEXT: psrad $3, %xmm1
	; X86-NEXT: sarl %eax			; X86-NEXT: psrad $1, %xmm0
	; X86-NEXT: imull $-991146299, %eax, %eax # imm = 0xC4EC4EC5			; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
	; X86-NEXT: movd %eax, %xmm1			; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3303820997,3303820997]
	; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]			; X86-NEXT: movapd %xmm0, %xmm1
	; X86-NEXT: movd %xmm2, %eax			; X86-NEXT: pmuludq %xmm2, %xmm1
	; X86-NEXT: sarl %eax			; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
	; X86-NEXT: imull $-991146299, %eax, %eax # imm = 0xC4EC4EC5			; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
	; X86-NEXT: movd %eax, %xmm2			; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]			; X86-NEXT: pmuludq %xmm0, %xmm2
	; X86-NEXT: movd %xmm0, %eax			; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
	; X86-NEXT: sarl $3, %eax
	; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
	; X86-NEXT: movd %eax, %xmm1
	; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
	; X86-NEXT: movd %xmm0, %eax
	; X86-NEXT: sarl $3, %eax
	; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
	; X86-NEXT: movd %eax, %xmm0
	; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]			; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
	; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
	; X86-NEXT: movdqa %xmm1, %xmm0			; X86-NEXT: movdqa %xmm1, %xmm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: test6:			; X64-LABEL: test6:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: vpextrd $1, %xmm0, %eax			; X64-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
	; X64-NEXT: sarl $3, %eax			; X64-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
	; X64-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
	; X64-NEXT: vmovd %xmm0, %ecx
	; X64-NEXT: sarl $3, %ecx
	; X64-NEXT: imull $-1431655765, %ecx, %ecx # imm = 0xAAAAAAAB
	; X64-NEXT: vmovd %ecx, %xmm1
	; X64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
	; X64-NEXT: vpextrd $2, %xmm0, %eax
	; X64-NEXT: sarl %eax
	; X64-NEXT: imull $-991146299, %eax, %eax # imm = 0xC4EC4EC5
	; X64-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
	; X64-NEXT: vpextrd $3, %xmm0, %eax
	; X64-NEXT: sarl %eax
	; X64-NEXT: imull $-991146299, %eax, %eax # imm = 0xC4EC4EC5
	; X64-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%div = sdiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>			%div = sdiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>
	ret <4 x i32> %div			ret <4 x i32> %div
	}			}

	define <4 x i32> @test7(<4 x i32> %x) {			define <4 x i32> @test7(<4 x i32> %x) {
	; X86-LABEL: test7:			; X86-LABEL: test7:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]			; X86-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,1749801491,1749801491]
	; X86-NEXT: movd %xmm1, %eax			; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
	; X86-NEXT: imull $1749801491, %eax, %eax # imm = 0x684BDA13			; X86-NEXT: pmuludq %xmm1, %xmm0
	; X86-NEXT: movd %eax, %xmm1			; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]			; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
	; X86-NEXT: movd %xmm2, %eax			; X86-NEXT: pmuludq %xmm2, %xmm1
	; X86-NEXT: imull $1749801491, %eax, %eax # imm = 0x684BDA13			; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
	; X86-NEXT: movd %eax, %xmm2			; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	; X86-NEXT: movd %xmm0, %eax
	; X86-NEXT: imull $-1030792151, %eax, %eax # imm = 0xC28F5C29
	; X86-NEXT: movd %eax, %xmm1
	; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
	; X86-NEXT: movd %xmm0, %eax
	; X86-NEXT: imull $-1030792151, %eax, %eax # imm = 0xC28F5C29
	; X86-NEXT: movd %eax, %xmm0
	; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
	; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
	; X86-NEXT: movdqa %xmm1, %xmm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: test7:			; X64-LABEL: test7:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: vpextrd $1, %xmm0, %eax			; X64-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
	; X64-NEXT: imull $-1030792151, %eax, %eax # imm = 0xC28F5C29
	; X64-NEXT: vmovd %xmm0, %ecx
	; X64-NEXT: imull $-1030792151, %ecx, %ecx # imm = 0xC28F5C29
	; X64-NEXT: vmovd %ecx, %xmm1
	; X64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
	; X64-NEXT: vpextrd $2, %xmm0, %eax
	; X64-NEXT: imull $1749801491, %eax, %eax # imm = 0x684BDA13
	; X64-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
	; X64-NEXT: vpextrd $3, %xmm0, %eax
	; X64-NEXT: imull $1749801491, %eax, %eax # imm = 0x684BDA13
	; X64-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%div = sdiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>			%div = sdiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>
	ret <4 x i32> %div			ret <4 x i32> %div
	}			}

	define <4 x i32> @test8(<4 x i32> %x) {			define <4 x i32> @test8(<4 x i32> %x) {
	; X86-LABEL: test8:			; X86-LABEL: test8:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; X86-NEXT: movdqa %xmm0, %xmm1
	; X86-NEXT: movd %xmm1, %eax			; X86-NEXT: psrad $3, %xmm1
	; X86-NEXT: sarl $3, %eax			; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
	; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB			; X86-NEXT: movdqa {{.*#+}} xmm2 = [1,1,2863311531,2863311531]
	; X86-NEXT: movd %eax, %xmm1			; X86-NEXT: movapd %xmm1, %xmm0
	; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]			; X86-NEXT: pmuludq %xmm2, %xmm0
	; X86-NEXT: movaps %xmm0, %xmm2			; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X86-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]			; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
	; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]			; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; X86-NEXT: movd %xmm1, %eax			; X86-NEXT: pmuludq %xmm1, %xmm2
	; X86-NEXT: sarl $3, %eax			; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
	; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB			; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; X86-NEXT: movd %eax, %xmm1
	; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
	; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: test8:			; X64-LABEL: test8:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: vpextrd $2, %xmm0, %eax			; X64-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
	; X64-NEXT: sarl $3, %eax			; X64-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
	; X64-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
	; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm1
	; X64-NEXT: vpextrd $3, %xmm0, %eax
	; X64-NEXT: sarl $3, %eax
	; X64-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
	; X64-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%div = sdiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>			%div = sdiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>
	ret <4 x i32> %div			ret <4 x i32> %div
	}			}