Diff 343101

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 2,686 Lines • ▼ Show 20 Lines	X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets		static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
{ ISD::CTPOP, MVT::i64, 1 },		{ ISD::CTPOP, MVT::i64, 1 },
};		};
static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets		static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
{ ISD::CTPOP, MVT::i32, 1 },		{ ISD::CTPOP, MVT::i32, 1 },
{ ISD::CTPOP, MVT::i16, 1 },		{ ISD::CTPOP, MVT::i16, 1 },
{ ISD::CTPOP, MVT::i8, 1 },		{ ISD::CTPOP, MVT::i8, 1 },
};		};
static const CostTblEntry X64CostTbl[] = { // 64-bit targets		static const CostTblEntry X64CostTbl[] = { // 64-bit targets
		Lint (pre-merge checks, clang-format): please reformat the code. Suggested diff: - static const CostTblEntry X64CostTbl[] = { // 64-bit targets - { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV - { ISD::BITREVERSE, MVT::i64, 14 }, - { ISD::BSWAP, MVT::i64, 1 }, - { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV - { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH - { ISD::CTPOP, MVT::i64, 10 }, - { ISD::SADDO, MVT::i64, 1 }, - { ISD::UADDO, MVT::i64, 1 }, - { ISD::UMULO, MVT::i64, 2 }, // mulq + seto (55 diff lines are omitted. See full path.)
{ ISD::ABS, MVT::i64, 2 }, // SUB+CMOV		{ ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i64, 14 },		{ ISD::BITREVERSE, MVT::i64, 14 },
		{ ISD::BSWAP, MVT::i64, 1 },
{ ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV		{ ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH		{ ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i64, 10 },		{ ISD::CTPOP, MVT::i64, 10 },
{ ISD::SADDO, MVT::i64, 1 },		{ ISD::SADDO, MVT::i64, 1 },
{ ISD::UADDO, MVT::i64, 1 },		{ ISD::UADDO, MVT::i64, 1 },
{ ISD::UMULO, MVT::i64, 2 }, // mulq + seto		{ ISD::UMULO, MVT::i64, 2 }, // mulq + seto
};		};
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets		static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::ABS, MVT::i32, 2 }, // SUB+CMOV		{ ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
{ ISD::ABS, MVT::i16, 2 }, // SUB+CMOV		{ ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i32, 14 },		{ ISD::BITREVERSE, MVT::i32, 14 },
{ ISD::BITREVERSE, MVT::i16, 14 },		{ ISD::BITREVERSE, MVT::i16, 14 },
{ ISD::BITREVERSE, MVT::i8, 11 },		{ ISD::BITREVERSE, MVT::i8, 11 },
		{ ISD::BSWAP, MVT::i32, 1 },
		{ ISD::BSWAP, MVT::i16, 1 }, // MOVZX + ROL by 8
{ ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV		{ ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV		{ ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV		{ ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH		{ ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH		{ ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH		{ ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i32, 8 },		{ ISD::CTPOP, MVT::i32, 8 },
{ ISD::CTPOP, MVT::i16, 9 },		{ ISD::CTPOP, MVT::i16, 9 },
▲ Show 20 Lines • Show All 195 Lines • ▼ Show 20 Lines	if (ST->hasPOPCNT()) {
if (ST->is64Bit())		if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))		if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
return adjustTableCost(*Entry, LT.first, ICA.getFlags());		return adjustTableCost(*Entry, LT.first, ICA.getFlags());

if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))		if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
return adjustTableCost(*Entry, LT.first, ICA.getFlags());		return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}		}

		if (ST->hasMOVBE()) {
		if (const Instruction *II = ICA.getInst()) {
		if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
		craig.topper [inline comment, Done]: At least on Intel Core CPUs, MOVBE isn't optimized. It's a load or store and a bswap operation. Maybe it's optimized on Atom/Silvermont/Goldmont? It was added to that line of CPUs first, possibly because those CPUs have been used in networking equipment.
		lebedev.ri (author) [inline comment, Done]: Looking at actual AMD Zen3 measurements, `movbe r<-m` is `1` uop, while `movbe m<-r` is `2`, which is actually a regression from Zen2/Zen1, as per https://www.agner.org/optimize/instruction_tables.pdf. As per that table, both are really slow on Haswell/Broadwell/Skylake, but fast on Silvermont/Goldmont/KNL. So I think we could mark `movbe r<-m` on AMD's at least.
		lebedev.ri (author) [inline comment, Done]: So I think we could mark `movbe r<-m` as free on AMD's at least.
		return TTI::TCC_Free;
		if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
		if (LI->hasOneUse())
		return TTI::TCC_Free;
		}
		}
		}

// TODO - add BMI (TZCNT) scalar handling		// TODO - add BMI (TZCNT) scalar handling

if (ST->is64Bit())		if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))		if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
return adjustTableCost(*Entry, LT.first, ICA.getFlags());		return adjustTableCost(*Entry, LT.first, ICA.getFlags());

if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))		if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
return adjustTableCost(*Entry, LT.first, ICA.getFlags());		return adjustTableCost(*Entry, LT.first, ICA.getFlags());
▲ Show 20 Lines • Show All 1,894 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/X86/bswap-store.ll

	; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X64			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X64			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X64
	; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X86			; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
	; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X86			; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X86

	declare i16 @llvm.bswap.i16(i16)			declare i16 @llvm.bswap.i16(i16)
	declare i32 @llvm.bswap.i32(i32)			declare i32 @llvm.bswap.i32(i32)
	declare i64 @llvm.bswap.i64(i64)			declare i64 @llvm.bswap.i64(i64)
	declare i128 @llvm.bswap.i128(i128)			declare i128 @llvm.bswap.i128(i128)

	define void @var_bswap_store_i16(i16 %a, i16* %dst) {			define void @var_bswap_store_i16(i16 %a, i16* %dst) {
	; NOMOVBE-LABEL: 'var_bswap_store_i16'			; NOMOVBE-LABEL: 'var_bswap_store_i16'
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;			;
	; MOVBE-LABEL: 'var_bswap_store_i16'			; MOVBE-LABEL: 'var_bswap_store_i16'
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)			; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1			; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void			; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;			;
	%bswap = call i16 @llvm.bswap.i16(i16 %a)			%bswap = call i16 @llvm.bswap.i16(i16 %a)
	store i16 %bswap, i16* %dst, align 1			store i16 %bswap, i16* %dst, align 1

	ret void			ret void
	}			}
	define void @var_bswap_store_i16_extrause(i16 %a, i16* %dst) {			define void @var_bswap_store_i16_extrause(i16 %a, i16* %dst) {
	; NOMOVBE-LABEL: 'var_bswap_store_i16_extrause'			; ALL-LABEL: 'var_bswap_store_i16_extrause'
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void			; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;
	; MOVBE-LABEL: 'var_bswap_store_i16_extrause'
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;			;
	%bswap = call i16 @llvm.bswap.i16(i16 %a)			%bswap = call i16 @llvm.bswap.i16(i16 %a)
	store i16 %bswap, i16* %dst, align 1			store i16 %bswap, i16* %dst, align 1

	%bswap2 = shl i16 %bswap, 2 ; incur an extra use to the bswap			%bswap2 = shl i16 %bswap, 2 ; incur an extra use to the bswap

	ret void			ret void
	}			}

	define void @var_bswap_store_i32(i32 %a, i32* %dst) {			define void @var_bswap_store_i32(i32 %a, i32* %dst) {
	; ALL-LABEL: 'var_bswap_store_i32'			; NOMOVBE-LABEL: 'var_bswap_store_i32'
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
	; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; MOVBE-LABEL: 'var_bswap_store_i32'
				; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
				; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
				; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;			;
	%bswap = call i32 @llvm.bswap.i32(i32 %a)			%bswap = call i32 @llvm.bswap.i32(i32 %a)
	store i32 %bswap, i32* %dst, align 1			store i32 %bswap, i32* %dst, align 1

	ret void			ret void
	}			}
	define void @var_bswap_store_i32_extrause(i32 %a, i32* %dst) {			define void @var_bswap_store_i32_extrause(i32 %a, i32* %dst) {
	; ALL-LABEL: 'var_bswap_store_i32_extrause'			; ALL-LABEL: 'var_bswap_store_i32_extrause'
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i32 %bswap, 2			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i32 %bswap, 2
	; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void			; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;			;
	%bswap = call i32 @llvm.bswap.i32(i32 %a)			%bswap = call i32 @llvm.bswap.i32(i32 %a)
	store i32 %bswap, i32* %dst, align 1			store i32 %bswap, i32* %dst, align 1

	%bswap2 = shl i32 %bswap, 2 ; incur an extra use to the bswap			%bswap2 = shl i32 %bswap, 2 ; incur an extra use to the bswap

	ret void			ret void
	}			}

	define void @var_bswap_store_i64(i64 %a, i64* %dst) {			define void @var_bswap_store_i64(i64 %a, i64* %dst) {
	; X64-LABEL: 'var_bswap_store_i64'
	craig.topper [inline comment, Not Done]: These check lines vanished and were not replaced.
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %bswap, i64* %dst, align 1
	; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;
	; X86-LABEL: 'var_bswap_store_i64'
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
	; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1
	; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;
	%bswap = call i64 @llvm.bswap.i64(i64 %a)			%bswap = call i64 @llvm.bswap.i64(i64 %a)
	store i64 %bswap, i64* %dst, align 1			store i64 %bswap, i64* %dst, align 1

	ret void			ret void
	}			}
	define void @var_bswap_store_i64_extrause(i64 %a, i64* %dst) {			define void @var_bswap_store_i64_extrause(i64 %a, i64* %dst) {
	; X64-LABEL: 'var_bswap_store_i64_extrause'			; X64-LABEL: 'var_bswap_store_i64_extrause'
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)			; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %bswap, i64* %dst, align 1			; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %bswap, i64* %dst, align 1
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i64 %bswap, 2			; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i64 %bswap, 2
	; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void			; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;			;
	; X86-LABEL: 'var_bswap_store_i64_extrause'			; X86-LABEL: 'var_bswap_store_i64_extrause'
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)			; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
	; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1			; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1
	; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i64 %bswap, 2			; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i64 %bswap, 2
	; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void			; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;			;
	%bswap = call i64 @llvm.bswap.i64(i64 %a)			%bswap = call i64 @llvm.bswap.i64(i64 %a)
	store i64 %bswap, i64* %dst, align 1			store i64 %bswap, i64* %dst, align 1

	%bswap2 = shl i64 %bswap, 2 ; incur an extra use to the bswap			%bswap2 = shl i64 %bswap, 2 ; incur an extra use to the bswap

	ret void			ret void
	}			}

	define void @var_bswap_store_i128(i128 %a, i128* %dst) {			define void @var_bswap_store_i128(i128 %a, i128* %dst) {
	; X64-LABEL: 'var_bswap_store_i128'
	craig.topper [inline comment, Not Done]: Same here.
	; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
	; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1
	; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;
	; X86-LABEL: 'var_bswap_store_i128'
	; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1
	; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;
	%bswap = call i128 @llvm.bswap.i128(i128 %a)			%bswap = call i128 @llvm.bswap.i128(i128 %a)
	store i128 %bswap, i128* %dst, align 1			store i128 %bswap, i128* %dst, align 1

	ret void			ret void
	}			}
	define void @var_bswap_store_i128_extrause(i128 %a, i128* %dst) {			define void @var_bswap_store_i128_extrause(i128 %a, i128* %dst) {
	; X64-LABEL: 'var_bswap_store_i128_extrause'			; X64-LABEL: 'var_bswap_store_i128_extrause'
	; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)			; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
	; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1			; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1
	; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i128 %bswap, 2			; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i128 %bswap, 2
	; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void			; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;			;
	; X86-LABEL: 'var_bswap_store_i128_extrause'			; X86-LABEL: 'var_bswap_store_i128_extrause'
	; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)			; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1			; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap2 = shl i128 %bswap, 2			; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap2 = shl i128 %bswap, 2
	; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void			; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	;			;
	%bswap = call i128 @llvm.bswap.i128(i128 %a)			%bswap = call i128 @llvm.bswap.i128(i128 %a)
	store i128 %bswap, i128* %dst, align 1			store i128 %bswap, i128* %dst, align 1

	%bswap2 = shl i128 %bswap, 2 ; incur an extra use to the bswap			%bswap2 = shl i128 %bswap, 2 ; incur an extra use to the bswap

	ret void			ret void
	}			}

llvm/test/Analysis/CostModel/X86/bswap.ll

	; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X64			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X64			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X64
	; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X86			; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
	; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X86			; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X86

	declare i16 @llvm.bswap.i16(i16)			declare i16 @llvm.bswap.i16(i16)
	declare i32 @llvm.bswap.i32(i32)			declare i32 @llvm.bswap.i32(i32)
	declare i64 @llvm.bswap.i64(i64)			declare i64 @llvm.bswap.i64(i64)
	declare i128 @llvm.bswap.i128(i128)			declare i128 @llvm.bswap.i128(i128)

	; Verify the cost of scalar bswap instructions.			; Verify the cost of scalar bswap instructions.

	define i16 @var_bswap_i16(i16 %a) {			define i16 @var_bswap_i16(i16 %a) {
	; NOMOVBE-LABEL: 'var_bswap_i16'			; ALL-LABEL: 'var_bswap_i16'
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap			; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
	;
	; MOVBE-LABEL: 'var_bswap_i16'
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
	;			;
	%bswap = call i16 @llvm.bswap.i16(i16 %a)			%bswap = call i16 @llvm.bswap.i16(i16 %a)
	ret i16 %bswap			ret i16 %bswap
	}			}

	define i32 @var_bswap_i32(i32 %a) {			define i32 @var_bswap_i32(i32 %a) {
	; ALL-LABEL: 'var_bswap_i32'			; ALL-LABEL: 'var_bswap_i32'
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
	; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap			; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
	;			;
	%bswap = call i32 @llvm.bswap.i32(i32 %a)			%bswap = call i32 @llvm.bswap.i32(i32 %a)
	ret i32 %bswap			ret i32 %bswap
	}			}

	define i64 @var_bswap_i64(i64 %a) {			define i64 @var_bswap_i64(i64 %a) {
	; X64-LABEL: 'var_bswap_i64'			; X64-LABEL: 'var_bswap_i64'
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)			; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
	; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap			; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
	;			;
	; X86-LABEL: 'var_bswap_i64'			; X86-LABEL: 'var_bswap_i64'
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)			; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
	; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap			; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
	;			;
	%bswap = call i64 @llvm.bswap.i64(i64 %a)			%bswap = call i64 @llvm.bswap.i64(i64 %a)
	ret i64 %bswap			ret i64 %bswap
	}			}

	define i128 @var_bswap_i128(i128 %a) {			define i128 @var_bswap_i128(i128 %a) {
	; X64-LABEL: 'var_bswap_i128'			; X64-LABEL: 'var_bswap_i128'
	; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)			; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
	; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap			; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
	;			;
	; X86-LABEL: 'var_bswap_i128'			; X86-LABEL: 'var_bswap_i128'
	; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)			; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
	; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap			; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
	;			;
	%bswap = call i128 @llvm.bswap.i128(i128 %a)			%bswap = call i128 @llvm.bswap.i128(i128 %a)
	ret i128 %bswap			ret i128 %bswap
	}			}

llvm/test/Analysis/CostModel/X86/load-bswap.ll

	; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X64			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
	; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X64			; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X64
	; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X86			; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze \| FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
	; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X86			; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe \| FileCheck %s --check-prefixes=ALL,MOVBE,X86

	declare i16 @llvm.bswap.i16(i16)			declare i16 @llvm.bswap.i16(i16)
	declare i32 @llvm.bswap.i32(i32)			declare i32 @llvm.bswap.i32(i32)
	declare i64 @llvm.bswap.i64(i64)			declare i64 @llvm.bswap.i64(i64)
	declare i128 @llvm.bswap.i128(i128)			declare i128 @llvm.bswap.i128(i128)

	define i16 @var_load_bswap_i16(i16* %src) {			define i16 @var_load_bswap_i16(i16* %src) {
	; NOMOVBE-LABEL: 'var_load_bswap_i16'			; NOMOVBE-LABEL: 'var_load_bswap_i16'
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
	;			;
	; MOVBE-LABEL: 'var_load_bswap_i16'			; MOVBE-LABEL: 'var_load_bswap_i16'
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1			; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)			; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap			; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
	;			;
	%a = load i16, i16* %src, align 1			%a = load i16, i16* %src, align 1
	%bswap = call i16 @llvm.bswap.i16(i16 %a)			%bswap = call i16 @llvm.bswap.i16(i16 %a)

	ret i16 %bswap			ret i16 %bswap
	}			}
	define i16 @var_load_bswap_i16_extrause(i16* %src, i16* %clobberdst) {			define i16 @var_load_bswap_i16_extrause(i16* %src, i16* %clobberdst) {
	; NOMOVBE-LABEL: 'var_load_bswap_i16_extrause'			; ALL-LABEL: 'var_load_bswap_i16_extrause'
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1
	; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap			; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
	;
	; MOVBE-LABEL: 'var_load_bswap_i16_extrause'
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1
	; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
	;			;
	%a = load i16, i16* %src, align 1			%a = load i16, i16* %src, align 1
	%bswap = call i16 @llvm.bswap.i16(i16 %a)			%bswap = call i16 @llvm.bswap.i16(i16 %a)

	%a2 = shl i16 %a, 2 ; incur an extra use to the load			%a2 = shl i16 %a, 2 ; incur an extra use to the load
	store i16 %a2, i16* %clobberdst, align 1			store i16 %a2, i16* %clobberdst, align 1

	ret i16 %bswap			ret i16 %bswap
	}			}

	define i32 @var_load_bswap_i32(i32* %src) {			define i32 @var_load_bswap_i32(i32* %src) {
	; ALL-LABEL: 'var_load_bswap_i32'			; NOMOVBE-LABEL: 'var_load_bswap_i32'
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
	; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap			; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
				;
				; MOVBE-LABEL: 'var_load_bswap_i32'
				; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
				; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
				; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
	;			;
	%a = load i32, i32* %src, align 1			%a = load i32, i32* %src, align 1
	%bswap = call i32 @llvm.bswap.i32(i32 %a)			%bswap = call i32 @llvm.bswap.i32(i32 %a)

	ret i32 %bswap			ret i32 %bswap
	}			}
	define i32 @var_load_bswap_i32_extrause(i32* %src, i32* %clobberdst) {			define i32 @var_load_bswap_i32_extrause(i32* %src, i32* %clobberdst) {
	; ALL-LABEL: 'var_load_bswap_i32_extrause'			; ALL-LABEL: 'var_load_bswap_i32_extrause'
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i32 %a, 2			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i32 %a, 2
	; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %a2, i32* %clobberdst, align 1			; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %a2, i32* %clobberdst, align 1
	; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap			; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
	;			;
	%a = load i32, i32* %src, align 1			%a = load i32, i32* %src, align 1
	%bswap = call i32 @llvm.bswap.i32(i32 %a)			%bswap = call i32 @llvm.bswap.i32(i32 %a)

	%a2 = shl i32 %a, 2 ; incur an extra use to the load			%a2 = shl i32 %a, 2 ; incur an extra use to the load
	store i32 %a2, i32* %clobberdst, align 1			store i32 %a2, i32* %clobberdst, align 1

	ret i32 %bswap			ret i32 %bswap
	}			}

	define i64 @var_load_bswap_i64(i64* %src) {			define i64 @var_load_bswap_i64(i64* %src) {
	; X64-LABEL: 'var_load_bswap_i64'
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i64, i64* %src, align 1
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
	; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
	;
	; X86-LABEL: 'var_load_bswap_i64'
	; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
	; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
	;
	RKSimon (inline comment, not done): Script being stupid again - you'll need to fix your prefixes
	lebedev.ri (author, inline comment, not done): UGH, i thought i did :( We need to disable FileCheck strict mode for x86 tests.
	%a = load i64, i64* %src, align 1			%a = load i64, i64* %src, align 1
	%bswap = call i64 @llvm.bswap.i64(i64 %a)			%bswap = call i64 @llvm.bswap.i64(i64 %a)

	ret i64 %bswap			ret i64 %bswap
	}			}
	define i64 @var_load_bswap_i64_extrause(i64* %src, i64* %clobberdst) {			define i64 @var_load_bswap_i64_extrause(i64* %src, i64* %clobberdst) {
	; X64-LABEL: 'var_load_bswap_i64_extrause'			; X64-LABEL: 'var_load_bswap_i64_extrause'
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i64, i64* %src, align 1			; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i64, i64* %src, align 1
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)			; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i64 %a, 2			; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i64 %a, 2
	; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %a2, i64* %clobberdst, align 1			; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %a2, i64* %clobberdst, align 1
	; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap			; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
	;			;
	; X86-LABEL: 'var_load_bswap_i64_extrause'			; X86-LABEL: 'var_load_bswap_i64_extrause'
	; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1			; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)			; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
	; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i64 %a, 2			; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i64 %a, 2
	; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %a2, i64* %clobberdst, align 1			; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %a2, i64* %clobberdst, align 1
	; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap			; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
	;			;
	%a = load i64, i64* %src, align 1			%a = load i64, i64* %src, align 1
	%bswap = call i64 @llvm.bswap.i64(i64 %a)			%bswap = call i64 @llvm.bswap.i64(i64 %a)

	%a2 = shl i64 %a, 2 ; incur an extra use to the load			%a2 = shl i64 %a, 2 ; incur an extra use to the load
	store i64 %a2, i64* %clobberdst, align 1			store i64 %a2, i64* %clobberdst, align 1

	ret i64 %bswap			ret i64 %bswap
	}			}

	define i128 @var_load_bswap_i128(i128* %src) {			define i128 @var_load_bswap_i128(i128* %src) {
	; X64-LABEL: 'var_load_bswap_i128'
	craig.topper (inline comment, not done): And here
	; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1
	; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
	; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
	;
	; X86-LABEL: 'var_load_bswap_i128'
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1
	; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
	; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
	;
	%a = load i128, i128* %src, align 1			%a = load i128, i128* %src, align 1
	%bswap = call i128 @llvm.bswap.i128(i128 %a)			%bswap = call i128 @llvm.bswap.i128(i128 %a)

	ret i128 %bswap			ret i128 %bswap
	}			}
	define i128 @var_load_bswap_i128_extrause(i128* %src, i128* %clobberdst) {			define i128 @var_load_bswap_i128_extrause(i128* %src, i128* %clobberdst) {
	; X64-LABEL: 'var_load_bswap_i128_extrause'			; X64-LABEL: 'var_load_bswap_i128_extrause'
	; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1			; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1
	; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)			; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
	; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i128 %a, 2			; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i128 %a, 2
	; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %a2, i128* %clobberdst, align 1			; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %a2, i128* %clobberdst, align 1
	; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap			; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
	;			;
	; X86-LABEL: 'var_load_bswap_i128_extrause'			; X86-LABEL: 'var_load_bswap_i128_extrause'
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1			; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1
	; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)			; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a2 = shl i128 %a, 2			; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a2 = shl i128 %a, 2
	; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %a2, i128* %clobberdst, align 1			; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %a2, i128* %clobberdst, align 1
	; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap			; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
	;			;
	%a = load i128, i128* %src, align 1			%a = load i128, i128* %src, align 1
	%bswap = call i128 @llvm.bswap.i128(i128 %a)			%bswap = call i128 @llvm.bswap.i128(i128 %a)

	%a2 = shl i128 %a, 2 ; incur an extra use to the load			%a2 = shl i128 %a, 2 ; incur an extra use to the load
	store i128 %a2, i128* %clobberdst, align 1			store i128 %a2, i128* %clobberdst, align 1

	ret i128 %bswap			ret i128 %bswap
	}			}

llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll

	Show All 36 Lines
	; SSE-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false)			; SSE-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false)
	; SSE-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8			; SSE-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
	; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8			; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
	; SSE-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8			; SSE-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
	; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8			; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
	; SSE-NEXT: ret void			; SSE-NEXT: ret void
	;			;
	; SLM-LABEL: @abs_v8i64(			; SLM-LABEL: @abs_v8i64(
	; SLM-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8			; SLM-NEXT: [[A0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
	; SLM-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8			; SLM-NEXT: [[A1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
	; SLM-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8			; SLM-NEXT: [[A2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
	; SLM-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8			; SLM-NEXT: [[A3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
	; SLM-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false)			; SLM-NEXT: [[A4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
	; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false)			; SLM-NEXT: [[A5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
	; SLM-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false)			; SLM-NEXT: [[A6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
	; SLM-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false)			; SLM-NEXT: [[A7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
	; SLM-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8			; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.abs.i64(i64 [[A0]], i1 false)
	; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8			; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.abs.i64(i64 [[A1]], i1 false)
	; SLM-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8			; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.abs.i64(i64 [[A2]], i1 false)
	; SLM-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8			; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.abs.i64(i64 [[A3]], i1 false)
				; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.abs.i64(i64 [[A4]], i1 false)
				; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.abs.i64(i64 [[A5]], i1 false)
				; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.abs.i64(i64 [[A6]], i1 false)
				; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.abs.i64(i64 [[A7]], i1 false)
				; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
				; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
				; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
				; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
				; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
				; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
				; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
				; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
	; SLM-NEXT: ret void			; SLM-NEXT: ret void
	;			;
	; AVX-LABEL: @abs_v8i64(			; AVX-LABEL: @abs_v8i64(
	; AVX-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8			; AVX-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
	; AVX-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP1]], i1 false)			; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP1]], i1 false)
	; AVX-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP2]], i1 false)			; AVX-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP2]], i1 false)
	; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8			; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
	▲ Show 20 Lines • Show All 518 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll

	Show First 20 Lines • Show All 56 Lines • ▼ Show 20 Lines
	; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8			; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
	; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8			; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
	; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8			; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
	; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8			; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
	; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8			; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
	; SSE-NEXT: ret void			; SSE-NEXT: ret void
	;			;
	; SLM-LABEL: @add_v8i64(			; SLM-LABEL: @add_v8i64(
	; SLM-NEXT: [[A0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8			; SLM-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
	; SLM-NEXT: [[A1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8			; SLM-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
	; SLM-NEXT: [[A2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8			; SLM-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
	; SLM-NEXT: [[A3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8			; SLM-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
	; SLM-NEXT: [[A4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8			; SLM-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
	; SLM-NEXT: [[A5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8			; SLM-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
	; SLM-NEXT: [[A6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8			; SLM-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
	; SLM-NEXT: [[A7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8			; SLM-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
	; SLM-NEXT: [[B0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8			; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
	; SLM-NEXT: [[B1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8			; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
	; SLM-NEXT: [[B2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8			; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
	; SLM-NEXT: [[B3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8			; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
	; SLM-NEXT: [[B4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8			; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
	; SLM-NEXT: [[B5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8			; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
	; SLM-NEXT: [[B6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8			; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
	; SLM-NEXT: [[B7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8			; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
	; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]])
	; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]])
	; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]])
	; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]])
	; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]])
	; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]])
	; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]])
	; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]])
	; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
	; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
	; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
	; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
	; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
	; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
	; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
	; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
	; SLM-NEXT: ret void			; SLM-NEXT: ret void
	;			;
				; AVX512-LABEL: @add_v8i64(
				; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
				; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
				; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
				; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
				; AVX512-NEXT: ret void
				;
				RKSimon (inline comment, done): why are these sat math tests changing?
				lebedev.ri (author, inline comment, done): Oops :)
	; AVX1-LABEL: @add_v8i64(			; AVX1-LABEL: @add_v8i64(
	; AVX1-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])			; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
	; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])			; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
	; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])			; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
	; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])			; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
	; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8			; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
	; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8			; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
	; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8			; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
	; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8			; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
	; AVX1-NEXT: ret void			; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @add_v8i64(			; AVX2-LABEL: @add_v8i64(
	; AVX2-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8			; AVX2-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
	; AVX2-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX2-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX2-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8			; AVX2-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
	; AVX2-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX2-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])			; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
	; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])			; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
	; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8			; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
	; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX2-NEXT: ret void			; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @add_v8i64(
	; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
	; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
	; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
	; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
	; AVX512-NEXT: ret void
	;
	; AVX256BW-LABEL: @add_v8i64(			; AVX256BW-LABEL: @add_v8i64(
	; AVX256BW-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8			; AVX256BW-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
	; AVX256BW-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX256BW-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX256BW-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8			; AVX256BW-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
	; AVX256BW-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX256BW-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])			; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
	; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])			; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
	; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8			; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
	; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX256BW-NEXT: ret void			; AVX256BW-NEXT: ret void
	;
	%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8			%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
	%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8			%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
	%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8			%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
	%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8			%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
	%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8			%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
	%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8			%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
	%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8			%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
	%a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8			%a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
	▲ Show 20 Lines • Show All 654 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll

	Show All 20 Lines
	@c8 = common global [64 x i8] zeroinitializer, align 64			@c8 = common global [64 x i8] zeroinitializer, align 64

	declare i64 @llvm.uadd.sat.i64(i64, i64)			declare i64 @llvm.uadd.sat.i64(i64, i64)
	declare i32 @llvm.uadd.sat.i32(i32, i32)			declare i32 @llvm.uadd.sat.i32(i32, i32)
	declare i16 @llvm.uadd.sat.i16(i16, i16)			declare i16 @llvm.uadd.sat.i16(i16, i16)
	declare i8 @llvm.uadd.sat.i8 (i8 , i8 )			declare i8 @llvm.uadd.sat.i8 (i8 , i8 )

	define void @add_v8i64() {			define void @add_v8i64() {
	; SSE-LABEL: @add_v8i64(
	craig.topper (inline comment, unsubmitted, not done): And here
	; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
	; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
	; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
	; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
	; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
	; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
	; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
	; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
	; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
	; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
	; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
	; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
	; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
	; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
	; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
	; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
	; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]])
	; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]])
	; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]])
	; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]])
	; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]])
	; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]])
	; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]])
	; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]])
	; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
	; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
	; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
	; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
	; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
	; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
	; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
	; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
	; SSE-NEXT: ret void
	;
	; AVX-LABEL: @add_v8i64(			; AVX-LABEL: @add_v8i64(
	; AVX-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8			; AVX-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
	; AVX-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8			; AVX-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
	; AVX-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])			; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
	; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])			; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
	; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8			; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
	▲ Show 20 Lines • Show All 615 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll

	Show First 20 Lines • Show All 56 Lines • ▼ Show 20 Lines
	; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8			; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
	; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8			; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
	; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8			; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
	; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8			; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
	; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8			; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
	; SSE-NEXT: ret void			; SSE-NEXT: ret void
	;			;
	; SLM-LABEL: @sub_v8i64(			; SLM-LABEL: @sub_v8i64(
	; SLM-NEXT: [[A0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8			; SLM-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
	; SLM-NEXT: [[A1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8			; SLM-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
	; SLM-NEXT: [[A2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8			; SLM-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
	; SLM-NEXT: [[A3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8			; SLM-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
	; SLM-NEXT: [[A4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8			; SLM-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
	; SLM-NEXT: [[A5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8			; SLM-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
	; SLM-NEXT: [[A6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8			; SLM-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
	; SLM-NEXT: [[A7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8			; SLM-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
	; SLM-NEXT: [[B0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8			; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
	; SLM-NEXT: [[B1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8			; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
	; SLM-NEXT: [[B2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8			; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
	; SLM-NEXT: [[B3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8			; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
	; SLM-NEXT: [[B4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8			; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
	; SLM-NEXT: [[B5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8			; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
	; SLM-NEXT: [[B6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8			; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
	; SLM-NEXT: [[B7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8			; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
	; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]])
	; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]])
	; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]])
	; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]])
	; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]])
	; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]])
	; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]])
	; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]])
	; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
	; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
	; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
	; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
	; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
	; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
	; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
	; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
	; SLM-NEXT: ret void			; SLM-NEXT: ret void
	;			;
				; AVX512-LABEL: @sub_v8i64(
				; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
				; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
				; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
				; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
				; AVX512-NEXT: ret void
				;
	; AVX1-LABEL: @sub_v8i64(			; AVX1-LABEL: @sub_v8i64(
	; AVX1-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8			; AVX1-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
	; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])			; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
	; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])			; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
	; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])			; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
	; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])			; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
	; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8			; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
	; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8			; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
	; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8			; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
	; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8			; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
	; AVX1-NEXT: ret void			; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @sub_v8i64(			; AVX2-LABEL: @sub_v8i64(
	; AVX2-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8			; AVX2-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
	; AVX2-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX2-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX2-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8			; AVX2-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
	; AVX2-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX2-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])			; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
	; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])			; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
	; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8			; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
	; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX2-NEXT: ret void			; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @sub_v8i64(
	; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
	; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
	; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
	; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
	; AVX512-NEXT: ret void
	;
	; AVX256BW-LABEL: @sub_v8i64(			; AVX256BW-LABEL: @sub_v8i64(
	; AVX256BW-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8			; AVX256BW-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
	; AVX256BW-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX256BW-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX256BW-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8			; AVX256BW-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
	; AVX256BW-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX256BW-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])			; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
	; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])			; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
	; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8			; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
	; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX256BW-NEXT: ret void			; AVX256BW-NEXT: ret void
	;
	%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8			%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
	%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8			%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
	%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8			%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
	%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8			%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
	%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8			%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
	%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8			%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
	%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8			%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
	%a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8			%a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
	▲ Show 20 Lines • Show All 654 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

	Show All 20 Lines
	@c8 = common global [64 x i8] zeroinitializer, align 64			@c8 = common global [64 x i8] zeroinitializer, align 64

	declare i64 @llvm.usub.sat.i64(i64, i64)			declare i64 @llvm.usub.sat.i64(i64, i64)
	declare i32 @llvm.usub.sat.i32(i32, i32)			declare i32 @llvm.usub.sat.i32(i32, i32)
	declare i16 @llvm.usub.sat.i16(i16, i16)			declare i16 @llvm.usub.sat.i16(i16, i16)
	declare i8 @llvm.usub.sat.i8 (i8 , i8 )			declare i8 @llvm.usub.sat.i8 (i8 , i8 )

	define void @sub_v8i64() {			define void @sub_v8i64() {
	; SSE-LABEL: @sub_v8i64(
	; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
	; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
	; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
	; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
	; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
	; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
	; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
	; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
	; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
	; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
	; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
	; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
	; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
	; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
	; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
	; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
	; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
	; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
	; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
	; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
	; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
	; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
	; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
	; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
	; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
	; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
	; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
	; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
	; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
	; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
	; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
	; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
	; SSE-NEXT: ret void
	;
	; AVX-LABEL: @sub_v8i64(			; AVX-LABEL: @sub_v8i64(
	; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8			; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
	; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8			; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
	; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8			; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])			; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
	; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])			; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
	; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8			; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
	▲ Show 20 Lines • Show All 615 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll

	Show All 16 Lines
	@dst8 = common global [32 x i8] zeroinitializer, align 32			@dst8 = common global [32 x i8] zeroinitializer, align 32

	declare i64 @llvm.bitreverse.i64(i64)			declare i64 @llvm.bitreverse.i64(i64)
	declare i32 @llvm.bitreverse.i32(i32)			declare i32 @llvm.bitreverse.i32(i32)
	declare i16 @llvm.bitreverse.i16(i16)			declare i16 @llvm.bitreverse.i16(i16)
	declare i8 @llvm.bitreverse.i8(i8)			declare i8 @llvm.bitreverse.i8(i8)

	define void @bitreverse_2i64() #0 {			define void @bitreverse_2i64() #0 {
	; CHECK-LABEL: @bitreverse_2i64(			; SSE-LABEL: @bitreverse_2i64(
	; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8			; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
	; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])			; SSE-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
	; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8			; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
	; CHECK-NEXT: ret void			; SSE-NEXT: ret void
				;
				RKSimonUnsubmitted Not Done Reply Inline Actions AVX? RKSimon: AVX?
				; XOP-LABEL: @bitreverse_2i64(
				; XOP-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
				; XOP-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
				; XOP-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
				; XOP-NEXT: ret void
	;			;
	%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8			%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
	%ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8			%ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
	%bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)			%bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)
	%bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)			%bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)
	store i64 %bitreverse0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8			store i64 %bitreverse0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
	store i64 %bitreverse1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8			store i64 %bitreverse1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
	ret void			ret void
	▲ Show 20 Lines • Show All 398 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Improve costmodel for scalar byte swaps
ClosedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Revision Contents

Diff 343101

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

llvm/test/Analysis/CostModel/X86/bswap-store.ll

llvm/test/Analysis/CostModel/X86/bswap.ll

llvm/test/Analysis/CostModel/X86/load-bswap.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Improve costmodel for scalar byte swaps
ClosedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Revision Contents

Diff 343101

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

llvm/test/Analysis/CostModel/X86/bswap-store.ll

llvm/test/Analysis/CostModel/X86/bswap.ll

llvm/test/Analysis/CostModel/X86/load-bswap.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll

[X86] Improve costmodel for scalar byte swaps
ClosedPublic