This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Add cost model for BSWAP of vectors
ClosedPublic

Authored by RKSimon on Jun 20 2016, 10:47 AM.

Download Raw Diff

Details

Reviewers

spatel
delena
andreadb
mkuper
hfinkel

Commits

rG356e823b51c3: [X86][SSE] Add cost model for BSWAP of vectors
rL273217: [X86][SSE] Add cost model for BSWAP of vectors

Summary

The BSWAP of vector types is quite efficiently implemented using vector shuffles on SSE/AVX targets; we should reflect the typical cost of this to encourage vectorization.

Also, we're not making much use of the intrinsic costings on any target - for instance, why do we not use this for CTPOP instead of the rather limited getPopcntSupport() approach? CTLZ/CTTZ would probably benefit as well.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 61267. Jun 20 2016, 10:47 AM

RKSimon retitled this revision from to [X86][SSE] Add cost model for BSWAP of vectors.

RKSimon updated this object.

RKSimon added reviewers: hfinkel, delena, mkuper, andreadb, spatel.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a subscriber: llvm-commits.

I assume the costs are based on the lowering we have in test/CodeGen/X86/bswap-vector.ll ?
In any case, this LGTM.

(Out of curiosity, do you see this get hit in practice?)

This revision is now accepted and ready to land. Jun 20 2016, 11:02 AM

In D21521#462314, @mkuper wrote:

I assume the costs are based on the lowering we have in test/CodeGen/X86/bswap-vector.ll ?
In any case, this LGTM.

(Out of curiosity, do you see this get hit in practice?)

Thanks Michael - yes, the costs are based on the codegen from bswap-vector.ll.

I have a couple of examples where this patch (along with enabling the loadcombine pass) can help vectorize the loading/conversion of big endian float data.

Closed by commit rL273217: [X86][SSE] Add cost model for BSWAP of vectors (authored by RKSimon). · Explain Why · Jun 20 2016, 4:15 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

X86/

	X86TargetTransformInfo.cpp
	X86TargetTransformInfo.cpp (revision 273167)

27 lines

test/

Analysis/

CostModel/

X86/

	bswap.ll
	bswap.ll (revision 273167)

68 lines

	scalarize.ll
	scalarize.ll (revision 273167)

8 lines

Transforms/

SLPVectorizer/

X86/

	bswap.ll
	bswap.ll (revision 273167)

202 lines

Diff 61267

lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 945 Lines • ▼ Show 20 Lines	static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::i32, 3 },		{ ISD::BITREVERSE, MVT::i32, 3 },
{ ISD::BITREVERSE, MVT::i16, 3 },		{ ISD::BITREVERSE, MVT::i16, 3 },
{ ISD::BITREVERSE, MVT::i8, 3 }		{ ISD::BITREVERSE, MVT::i8, 3 }
};		};
static const CostTblEntry AVX2CostTbl[] = {		static const CostTblEntry AVX2CostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 5 },		{ ISD::BITREVERSE, MVT::v4i64, 5 },
{ ISD::BITREVERSE, MVT::v8i32, 5 },		{ ISD::BITREVERSE, MVT::v8i32, 5 },
{ ISD::BITREVERSE, MVT::v16i16, 5 },		{ ISD::BITREVERSE, MVT::v16i16, 5 },
{ ISD::BITREVERSE, MVT::v32i8, 5 }		{ ISD::BITREVERSE, MVT::v32i8, 5 },
		{ ISD::BSWAP, MVT::v4i64, 1 },
		{ ISD::BSWAP, MVT::v8i32, 1 },
		{ ISD::BSWAP, MVT::v16i16, 1 }
};		};
static const CostTblEntry AVX1CostTbl[] = {		static const CostTblEntry AVX1CostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 10 },		{ ISD::BITREVERSE, MVT::v4i64, 10 },
{ ISD::BITREVERSE, MVT::v8i32, 10 },		{ ISD::BITREVERSE, MVT::v8i32, 10 },
{ ISD::BITREVERSE, MVT::v16i16, 10 },		{ ISD::BITREVERSE, MVT::v16i16, 10 },
{ ISD::BITREVERSE, MVT::v32i8, 10 }		{ ISD::BITREVERSE, MVT::v32i8, 10 },
		{ ISD::BSWAP, MVT::v4i64, 4 },
		{ ISD::BSWAP, MVT::v8i32, 4 },
		{ ISD::BSWAP, MVT::v16i16, 4 }
};		};
static const CostTblEntry SSSE3CostTbl[] = {		static const CostTblEntry SSSE3CostTbl[] = {
{ ISD::BITREVERSE, MVT::v2i64, 5 },		{ ISD::BITREVERSE, MVT::v2i64, 5 },
{ ISD::BITREVERSE, MVT::v4i32, 5 },		{ ISD::BITREVERSE, MVT::v4i32, 5 },
{ ISD::BITREVERSE, MVT::v8i16, 5 },		{ ISD::BITREVERSE, MVT::v8i16, 5 },
{ ISD::BITREVERSE, MVT::v16i8, 5 }		{ ISD::BITREVERSE, MVT::v16i8, 5 },
		{ ISD::BSWAP, MVT::v2i64, 1 },
		{ ISD::BSWAP, MVT::v4i32, 1 },
		{ ISD::BSWAP, MVT::v8i16, 1 }
		};
		static const CostTblEntry SSE2CostTbl[] = {
		{ ISD::BSWAP, MVT::v2i64, 7 },
		{ ISD::BSWAP, MVT::v4i32, 7 },
		{ ISD::BSWAP, MVT::v8i16, 7 }
};		};

unsigned ISD = ISD::DELETED_NODE;		unsigned ISD = ISD::DELETED_NODE;
switch (IID) {		switch (IID) {
default:		default:
break;		break;
case Intrinsic::bitreverse:		case Intrinsic::bitreverse:
ISD = ISD::BITREVERSE;		ISD = ISD::BITREVERSE;
break;		break;
		case Intrinsic::bswap:
		ISD = ISD::BSWAP;
		break;
}		}

// Legalize the type.		// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);		std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
MVT MTy = LT.second;		MVT MTy = LT.second;

// Attempt to lookup cost.		// Attempt to lookup cost.
if (ST->hasXOP())		if (ST->hasXOP())
if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))		if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
return LT.first * Entry->Cost;		return LT.first * Entry->Cost;

if (ST->hasAVX2())		if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))		if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
return LT.first * Entry->Cost;		return LT.first * Entry->Cost;

if (ST->hasAVX())		if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))		if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;		return LT.first * Entry->Cost;

if (ST->hasSSSE3())		if (ST->hasSSSE3())
if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))		if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
return LT.first * Entry->Cost;		return LT.first * Entry->Cost;

		if (ST->hasSSE2())
		if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
		return LT.first * Entry->Cost;

return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);		return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
}		}

int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,		int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Value *> Args, FastMathFlags FMF) {		ArrayRef<Value *> Args, FastMathFlags FMF) {
return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);		return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
}		}

▲ Show 20 Lines • Show All 583 Lines • Show Last 20 Lines

test/Analysis/CostModel/X86/bswap.ll

	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=SSE2			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=SSE42			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=SSE42
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=AVX			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=AVX2			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX1
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze \| FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2

	; Verify the cost of vector bswap instructions.			; Verify the cost of vector bswap instructions.

	declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)			declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
	declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)			declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
	declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)			declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)

	declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)			declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
	declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)			declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
	declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)			declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)

	define <2 x i64> @var_bswap_v2i64(<2 x i64> %a) {			define <2 x i64> @var_bswap_v2i64(<2 x i64> %a) {
	; CHECK: 'Cost Model Analysis' for function 'var_bswap_v2i64':			; CHECK: 'Cost Model Analysis' for function 'var_bswap_v2i64':
	; SSE2: Found an estimated cost of 6 for instruction: %bswap			; SSE2: Found an estimated cost of 7 for instruction: %bswap
	; SSE42: Found an estimated cost of 6 for instruction: %bswap			; SSE42: Found an estimated cost of 1 for instruction: %bswap
	; AVX: Found an estimated cost of 6 for instruction: %bswap			; AVX: Found an estimated cost of 1 for instruction: %bswap
	; AVX2: Found an estimated cost of 6 for instruction: %bswap			; XOP: Found an estimated cost of 1 for instruction: %bswap
	; XOP: Found an estimated cost of 6 for instruction: %bswap
	%bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a)			%bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a)
	ret <2 x i64> %bswap			ret <2 x i64> %bswap
	}			}

	define <4 x i64> @var_bswap_v4i64(<4 x i64> %a) {			define <4 x i64> @var_bswap_v4i64(<4 x i64> %a) {
	; CHECK: 'Cost Model Analysis' for function 'var_bswap_v4i64':			; CHECK: 'Cost Model Analysis' for function 'var_bswap_v4i64':
	; SSE2: Found an estimated cost of 12 for instruction: %bswap			; SSE2: Found an estimated cost of 14 for instruction: %bswap
	; SSE42: Found an estimated cost of 12 for instruction: %bswap			; SSE42: Found an estimated cost of 2 for instruction: %bswap
	; AVX: Found an estimated cost of 12 for instruction: %bswap			; AVX1: Found an estimated cost of 4 for instruction: %bswap
	; AVX2: Found an estimated cost of 12 for instruction: %bswap			; AVX2: Found an estimated cost of 1 for instruction: %bswap
	; XOP: Found an estimated cost of 12 for instruction: %bswap			; XOPAVX1: Found an estimated cost of 4 for instruction: %bswap
				; XOPAVX2: Found an estimated cost of 1 for instruction: %bswap
	%bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a)			%bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a)
	ret <4 x i64> %bswap			ret <4 x i64> %bswap
	}			}

	define <4 x i32> @var_bswap_v4i32(<4 x i32> %a) {			define <4 x i32> @var_bswap_v4i32(<4 x i32> %a) {
	; CHECK: 'Cost Model Analysis' for function 'var_bswap_v4i32':			; CHECK: 'Cost Model Analysis' for function 'var_bswap_v4i32':
	; SSE2: Found an estimated cost of 12 for instruction: %bswap			; SSE2: Found an estimated cost of 7 for instruction: %bswap
	; SSE42: Found an estimated cost of 12 for instruction: %bswap			; SSE42: Found an estimated cost of 1 for instruction: %bswap
	; AVX: Found an estimated cost of 12 for instruction: %bswap			; AVX: Found an estimated cost of 1 for instruction: %bswap
	; AVX2: Found an estimated cost of 12 for instruction: %bswap			; XOP: Found an estimated cost of 1 for instruction: %bswap
	; XOP: Found an estimated cost of 12 for instruction: %bswap
	%bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a)			%bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a)
	ret <4 x i32> %bswap			ret <4 x i32> %bswap
	}			}

	define <8 x i32> @var_bswap_v8i32(<8 x i32> %a) {			define <8 x i32> @var_bswap_v8i32(<8 x i32> %a) {
	; CHECK: 'Cost Model Analysis' for function 'var_bswap_v8i32':			; CHECK: 'Cost Model Analysis' for function 'var_bswap_v8i32':
	; SSE2: Found an estimated cost of 24 for instruction: %bswap			; SSE2: Found an estimated cost of 14 for instruction: %bswap
	; SSE42: Found an estimated cost of 24 for instruction: %bswap			; SSE42: Found an estimated cost of 2 for instruction: %bswap
	; AVX: Found an estimated cost of 24 for instruction: %bswap			; AVX1: Found an estimated cost of 4 for instruction: %bswap
	; AVX2: Found an estimated cost of 24 for instruction: %bswap			; AVX2: Found an estimated cost of 1 for instruction: %bswap
	; XOP: Found an estimated cost of 24 for instruction: %bswap			; XOPAVX1: Found an estimated cost of 4 for instruction: %bswap
				; XOPAVX2: Found an estimated cost of 1 for instruction: %bswap
	%bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a)			%bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a)
	ret <8 x i32> %bswap			ret <8 x i32> %bswap
	}			}

	define <8 x i16> @var_bswap_v8i16(<8 x i16> %a) {			define <8 x i16> @var_bswap_v8i16(<8 x i16> %a) {
	; CHECK: 'Cost Model Analysis' for function 'var_bswap_v8i16':			; CHECK: 'Cost Model Analysis' for function 'var_bswap_v8i16':
	; SSE2: Found an estimated cost of 24 for instruction: %bswap			; SSE2: Found an estimated cost of 7 for instruction: %bswap
	; SSE42: Found an estimated cost of 24 for instruction: %bswap			; SSE42: Found an estimated cost of 1 for instruction: %bswap
	; AVX: Found an estimated cost of 24 for instruction: %bswap			; AVX: Found an estimated cost of 1 for instruction: %bswap
	; AVX2: Found an estimated cost of 24 for instruction: %bswap			; XOP: Found an estimated cost of 1 for instruction: %bswap
	; XOP: Found an estimated cost of 24 for instruction: %bswap
	%bswap = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a)			%bswap = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a)
	ret <8 x i16> %bswap			ret <8 x i16> %bswap
	}			}

	define <16 x i16> @var_bswap_v16i16(<16 x i16> %a) {			define <16 x i16> @var_bswap_v16i16(<16 x i16> %a) {
	; CHECK: 'Cost Model Analysis' for function 'var_bswap_v16i16':			; CHECK: 'Cost Model Analysis' for function 'var_bswap_v16i16':
	; SSE2: Found an estimated cost of 48 for instruction: %bswap			; SSE2: Found an estimated cost of 14 for instruction: %bswap
	; SSE42: Found an estimated cost of 48 for instruction: %bswap			; SSE42: Found an estimated cost of 2 for instruction: %bswap
	; AVX: Found an estimated cost of 48 for instruction: %bswap			; AVX1: Found an estimated cost of 4 for instruction: %bswap
	; AVX2: Found an estimated cost of 48 for instruction: %bswap			; AVX2: Found an estimated cost of 1 for instruction: %bswap
	; XOP: Found an estimated cost of 48 for instruction: %bswap			; XOPAVX1: Found an estimated cost of 4 for instruction: %bswap
				; XOPAVX2: Found an estimated cost of 1 for instruction: %bswap
	%bswap = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a)			%bswap = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a)
	ret <16 x i16> %bswap			ret <16 x i16> %bswap
	}			}
	No newline at end of file

test/Analysis/CostModel/X86/scalarize.ll

	Show All 15 Lines
	declare %i4 @llvm.cttz.v4i32(%i4)			declare %i4 @llvm.cttz.v4i32(%i4)
	declare %i8 @llvm.cttz.v2i64(%i8)			declare %i8 @llvm.cttz.v2i64(%i8)

	; CHECK32-LABEL: test_scalarized_intrinsics			; CHECK32-LABEL: test_scalarized_intrinsics
	; CHECK64-LABEL: test_scalarized_intrinsics			; CHECK64-LABEL: test_scalarized_intrinsics
	define void @test_scalarized_intrinsics() {			define void @test_scalarized_intrinsics() {
	%r1 = add %i8 undef, undef			%r1 = add %i8 undef, undef

	; CHECK32: cost of 12 {{.*}}bswap.v4i32			; CHECK32: cost of 1 {{.*}}bswap.v4i32
	; CHECK64: cost of 12 {{.*}}bswap.v4i32			; CHECK64: cost of 1 {{.*}}bswap.v4i32
	%r2 = call %i4 @llvm.bswap.v4i32(%i4 undef)			%r2 = call %i4 @llvm.bswap.v4i32(%i4 undef)
	; CHECK32: cost of 10 {{.*}}bswap.v2i64			; CHECK32: cost of 1 {{.*}}bswap.v2i64
	; CHECK64: cost of 6 {{.*}}bswap.v2i64			; CHECK64: cost of 1 {{.*}}bswap.v2i64
	%r3 = call %i8 @llvm.bswap.v2i64(%i8 undef)			%r3 = call %i8 @llvm.bswap.v2i64(%i8 undef)

	; CHECK32: cost of 12 {{.*}}cttz.v4i32			; CHECK32: cost of 12 {{.*}}cttz.v4i32
	; CHECK64: cost of 12 {{.*}}cttz.v4i32			; CHECK64: cost of 12 {{.*}}cttz.v4i32
	%r4 = call %i4 @llvm.cttz.v4i32(%i4 undef)			%r4 = call %i4 @llvm.cttz.v4i32(%i4 undef)
	; CHECK32: cost of 10 {{.*}}cttz.v2i64			; CHECK32: cost of 10 {{.*}}cttz.v2i64
	; CHECK64: cost of 6 {{.*}}cttz.v2i64			; CHECK64: cost of 6 {{.*}}cttz.v2i64
	%r5 = call %i8 @llvm.cttz.v2i64(%i8 undef)			%r5 = call %i8 @llvm.cttz.v2i64(%i8 undef)

	; CHECK32: ret			; CHECK32: ret
	; CHECK64: ret			; CHECK64: ret
	ret void			ret void
	}			}

test/Transforms/SLPVectorizer/X86/bswap.ll

Show All 11 Lines
@src16 = common global [16 x i16] zeroinitializer, align 32		@src16 = common global [16 x i16] zeroinitializer, align 32
@dst16 = common global [16 x i16] zeroinitializer, align 32		@dst16 = common global [16 x i16] zeroinitializer, align 32

declare i64 @llvm.bswap.i64(i64)		declare i64 @llvm.bswap.i64(i64)
declare i32 @llvm.bswap.i32(i32)		declare i32 @llvm.bswap.i32(i32)
declare i16 @llvm.bswap.i16(i16)		declare i16 @llvm.bswap.i16(i16)

define void @bswap_2i64() #0 {		define void @bswap_2i64() #0 {
; CHECK-LABEL: @bswap_2i64(		; SSE-LABEL: @bswap_2i64(
; CHECK-NEXT: [[LD0:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8		; SSE-NEXT: [[LD0:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
; CHECK-NEXT: [[LD1:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8		; SSE-NEXT: [[LD1:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
; CHECK-NEXT: [[BSWAP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD0]])		; SSE-NEXT: [[BSWAP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD0]])
; CHECK-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]])		; SSE-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]])
; CHECK-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8		; SSE-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
; CHECK-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8		; SSE-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
; CHECK-NEXT: ret void		; SSE-NEXT: ret void
		;
		; AVX-LABEL: @bswap_2i64(
		; AVX-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
		; AVX-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP1]])
		; AVX-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
		; AVX-NEXT: ret void
;		;
%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8		%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
%ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8		%ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
%bswap0 = call i64 @llvm.bswap.i64(i64 %ld0)		%bswap0 = call i64 @llvm.bswap.i64(i64 %ld0)
%bswap1 = call i64 @llvm.bswap.i64(i64 %ld1)		%bswap1 = call i64 @llvm.bswap.i64(i64 %ld1)
store i64 %bswap0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8		store i64 %bswap0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
store i64 %bswap1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8		store i64 %bswap1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
ret void		ret void
}		}

define void @bswap_4i64() #0 {		define void @bswap_4i64() #0 {
; CHECK-LABEL: @bswap_4i64(		; SSE-LABEL: @bswap_4i64(
; CHECK-NEXT: [[LD0:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4		; SSE-NEXT: [[LD0:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
; CHECK-NEXT: [[LD1:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4		; SSE-NEXT: [[LD1:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
; CHECK-NEXT: [[LD2:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4		; SSE-NEXT: [[LD2:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
; CHECK-NEXT: [[LD3:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4		; SSE-NEXT: [[LD3:%.]] = load i64, i64 getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
; CHECK-NEXT: [[BSWAP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD0]])		; SSE-NEXT: [[BSWAP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD0]])
; CHECK-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]])		; SSE-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]])
; CHECK-NEXT: [[BSWAP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD2]])		; SSE-NEXT: [[BSWAP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD2]])
; CHECK-NEXT: [[BSWAP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD3]])		; SSE-NEXT: [[BSWAP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD3]])
; CHECK-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4		; SSE-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
; CHECK-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4		; SSE-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
; CHECK-NEXT: store i64 [[BSWAP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4		; SSE-NEXT: store i64 [[BSWAP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
; CHECK-NEXT: store i64 [[BSWAP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4		; SSE-NEXT: store i64 [[BSWAP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
; CHECK-NEXT: ret void		; SSE-NEXT: ret void
		;
		; AVX-LABEL: @bswap_4i64(
		; AVX-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
		; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> [[TMP1]])
		; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
		; AVX-NEXT: ret void
;		;
%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4		%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
%ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4		%ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
%ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4		%ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
%ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4		%ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
%bswap0 = call i64 @llvm.bswap.i64(i64 %ld0)		%bswap0 = call i64 @llvm.bswap.i64(i64 %ld0)
%bswap1 = call i64 @llvm.bswap.i64(i64 %ld1)		%bswap1 = call i64 @llvm.bswap.i64(i64 %ld1)
%bswap2 = call i64 @llvm.bswap.i64(i64 %ld2)		%bswap2 = call i64 @llvm.bswap.i64(i64 %ld2)
%bswap3 = call i64 @llvm.bswap.i64(i64 %ld3)		%bswap3 = call i64 @llvm.bswap.i64(i64 %ld3)
store i64 %bswap0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4		store i64 %bswap0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
store i64 %bswap1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4		store i64 %bswap1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
store i64 %bswap2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4		store i64 %bswap2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
store i64 %bswap3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4		store i64 %bswap3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
ret void		ret void
}		}

define void @bswap_4i32() #0 {		define void @bswap_4i32() #0 {
; CHECK-LABEL: @bswap_4i32(		; CHECK-LABEL: @bswap_4i32(
; CHECK-NEXT: [[LD0:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4		; CHECK-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
; CHECK-NEXT: [[LD1:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4		; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: [[LD2:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4		; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
; CHECK-NEXT: [[LD3:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
; CHECK-NEXT: [[BSWAP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD0]])
; CHECK-NEXT: [[BSWAP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD1]])
; CHECK-NEXT: [[BSWAP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD2]])
; CHECK-NEXT: [[BSWAP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD3]])
; CHECK-NEXT: store i32 [[BSWAP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
; CHECK-NEXT: store i32 [[BSWAP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
; CHECK-NEXT: store i32 [[BSWAP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
; CHECK-NEXT: store i32 [[BSWAP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4		%ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
%ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4		%ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
%ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4		%ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
%ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4		%ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
%bswap0 = call i32 @llvm.bswap.i32(i32 %ld0)		%bswap0 = call i32 @llvm.bswap.i32(i32 %ld0)
%bswap1 = call i32 @llvm.bswap.i32(i32 %ld1)		%bswap1 = call i32 @llvm.bswap.i32(i32 %ld1)
%bswap2 = call i32 @llvm.bswap.i32(i32 %ld2)		%bswap2 = call i32 @llvm.bswap.i32(i32 %ld2)
%bswap3 = call i32 @llvm.bswap.i32(i32 %ld3)		%bswap3 = call i32 @llvm.bswap.i32(i32 %ld3)
store i32 %bswap0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4		store i32 %bswap0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
store i32 %bswap1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4		store i32 %bswap1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
store i32 %bswap2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4		store i32 %bswap2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
store i32 %bswap3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4		store i32 %bswap3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @bswap_8i32() #0 {		define void @bswap_8i32() #0 {
; CHECK-LABEL: @bswap_8i32(		; SSE-LABEL: @bswap_8i32(
; CHECK-NEXT: [[LD0:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2		; SSE-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
; CHECK-NEXT: [[LD1:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2		; SSE-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
; CHECK-NEXT: [[LD2:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2		; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: [[LD3:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2		; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT: [[LD4:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2		; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
; CHECK-NEXT: [[LD5:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2		; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
; CHECK-NEXT: [[LD6:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2		; SSE-NEXT: ret void
; CHECK-NEXT: [[LD7:%.]] = load i32, i32 getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2		;
; CHECK-NEXT: [[BSWAP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD0]])		; AVX-LABEL: @bswap_8i32(
; CHECK-NEXT: [[BSWAP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD1]])		; AVX-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
; CHECK-NEXT: [[BSWAP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD2]])		; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> [[TMP1]])
; CHECK-NEXT: [[BSWAP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD3]])		; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
; CHECK-NEXT: [[BSWAP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD4]])		; AVX-NEXT: ret void
; CHECK-NEXT: [[BSWAP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD5]])
; CHECK-NEXT: [[BSWAP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD6]])
; CHECK-NEXT: [[BSWAP7:%.*]] = call i32 @llvm.bswap.i32(i32 [[LD7]])
; CHECK-NEXT: store i32 [[BSWAP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
; CHECK-NEXT: store i32 [[BSWAP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
; CHECK-NEXT: store i32 [[BSWAP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
; CHECK-NEXT: store i32 [[BSWAP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
; CHECK-NEXT: store i32 [[BSWAP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
; CHECK-NEXT: store i32 [[BSWAP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
; CHECK-NEXT: store i32 [[BSWAP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
; CHECK-NEXT: store i32 [[BSWAP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
; CHECK-NEXT: ret void
;		;
%ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2		%ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
%ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2		%ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
%ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2		%ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
%ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2		%ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
%ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2		%ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
%ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2		%ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
%ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2		%ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
Show All 14 Lines	;
store i32 %bswap5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2		store i32 %bswap5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
store i32 %bswap6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2		store i32 %bswap6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
store i32 %bswap7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2		store i32 %bswap7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
ret void		ret void
}		}

define void @bswap_8i16() #0 {		define void @bswap_8i16() #0 {
; CHECK-LABEL: @bswap_8i16(		; CHECK-LABEL: @bswap_8i16(
; CHECK-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]])
; CHECK-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; CHECK-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
; CHECK-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
; CHECK-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
; CHECK-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
; CHECK-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
; CHECK-NEXT: [[BSWAP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD0]])
; CHECK-NEXT: [[BSWAP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD1]])
; CHECK-NEXT: [[BSWAP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD2]])
; CHECK-NEXT: [[BSWAP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD3]])
; CHECK-NEXT: [[BSWAP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD4]])
; CHECK-NEXT: [[BSWAP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD5]])
; CHECK-NEXT: [[BSWAP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD6]])
; CHECK-NEXT: [[BSWAP7:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD7]])
; CHECK-NEXT: store i16 [[BSWAP0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
; CHECK-NEXT: store i16 [[BSWAP1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
; CHECK-NEXT: store i16 [[BSWAP2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
; CHECK-NEXT: store i16 [[BSWAP3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
; CHECK-NEXT: store i16 [[BSWAP4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
; CHECK-NEXT: store i16 [[BSWAP5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
; CHECK-NEXT: store i16 [[BSWAP6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
; CHECK-NEXT: store i16 [[BSWAP7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2		%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
Show All 14 Lines	;
store i16 %bswap4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2		store i16 %bswap4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
store i16 %bswap5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2		store i16 %bswap5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
store i16 %bswap6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2		store i16 %bswap6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
store i16 %bswap7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2		store i16 %bswap7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
ret void		ret void
}		}

define void @bswap_16i16() #0 {		define void @bswap_16i16() #0 {
; CHECK-LABEL: @bswap_16i16(		; SSE-LABEL: @bswap_16i16(
; CHECK-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
; CHECK-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]])
; CHECK-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2		; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP2]])
; CHECK-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2		; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; CHECK-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2		; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
; CHECK-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2		; SSE-NEXT: ret void
; CHECK-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2		;
; CHECK-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2		; AVX-LABEL: @bswap_16i16(
; CHECK-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2		; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; CHECK-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2		; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> [[TMP1]])
; CHECK-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2		; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; CHECK-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2		; AVX-NEXT: ret void
; CHECK-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
; CHECK-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
; CHECK-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
; CHECK-NEXT: [[BSWAP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD0]])
; CHECK-NEXT: [[BSWAP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD1]])
; CHECK-NEXT: [[BSWAP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD2]])
; CHECK-NEXT: [[BSWAP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD3]])
; CHECK-NEXT: [[BSWAP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD4]])
; CHECK-NEXT: [[BSWAP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD5]])
; CHECK-NEXT: [[BSWAP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD6]])
; CHECK-NEXT: [[BSWAP7:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD7]])
; CHECK-NEXT: [[BSWAP8:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD8]])
; CHECK-NEXT: [[BSWAP9:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD9]])
; CHECK-NEXT: [[BSWAP10:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD10]])
; CHECK-NEXT: [[BSWAP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD11]])
; CHECK-NEXT: [[BSWAP12:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD12]])
; CHECK-NEXT: [[BSWAP13:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD13]])
; CHECK-NEXT: [[BSWAP14:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD14]])
; CHECK-NEXT: [[BSWAP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[LD15]])
; CHECK-NEXT: store i16 [[BSWAP0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
; CHECK-NEXT: store i16 [[BSWAP1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
; CHECK-NEXT: store i16 [[BSWAP2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
; CHECK-NEXT: store i16 [[BSWAP3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
; CHECK-NEXT: store i16 [[BSWAP4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
; CHECK-NEXT: store i16 [[BSWAP5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
; CHECK-NEXT: store i16 [[BSWAP6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
; CHECK-NEXT: store i16 [[BSWAP7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
; CHECK-NEXT: store i16 [[BSWAP8]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2
; CHECK-NEXT: store i16 [[BSWAP9]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2
; CHECK-NEXT: store i16 [[BSWAP10]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
; CHECK-NEXT: store i16 [[BSWAP11]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
; CHECK-NEXT: store i16 [[BSWAP12]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
; CHECK-NEXT: store i16 [[BSWAP13]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
; CHECK-NEXT: store i16 [[BSWAP14]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
; CHECK-NEXT: store i16 [[BSWAP15]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
; CHECK-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2		%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2		%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Add cost model for BSWAP of vectors — Closed, Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 61267

lib/Target/X86/X86TargetTransformInfo.cpp

test/Analysis/CostModel/X86/bswap.ll

test/Analysis/CostModel/X86/scalarize.ll

test/Transforms/SLPVectorizer/X86/bswap.ll

[X86][SSE] Add cost model for BSWAP of vectors
ClosedPublic