This is an archive of the discontinued LLVM Phabricator instance.

[X86] Use native shuffle vector for the perm2f128 intrinsics
ClosedPublic

Authored by craig.topper on Sep 14 2017, 11:36 PM.

Download Raw Diff

Details

Reviewers

RKSimon
zvi
igorb

Commits

rG8cd7b0cd2cec: [X86] Use native shuffle vector for the perm2f128 intrinsics
rC313418: [X86] Use native shuffle vector for the perm2f128 intrinsics
rL313418: [X86] Use native shuffle vector for the perm2f128 intrinsics

Summary

This patch replaces the perm2f128 intrinsics with native shuffle vectors.

This uses a pretty simple approach to allocate source 0 to the lower half input and source 1 to the upper half input. Then it's just a matter of filling in the indices to use either the lower or upper half of that specific source. This can result in the same source being used by both operands. InstCombine or SelectionDAGBuilder should be able to clean that up.

Diff Detail

Repository: rL LLVM

Event Timeline

craig.topper created this revision. Sep 14 2017, 11:36 PM

craig.topper edited subscribers, added: cfe-commits; removed: llvm-commits. Sep 14 2017, 11:39 PM

_mm256_permute2x128_si256 ?

Also, there currently isn't any testing of the zero vector case.

Convert the AVX2 integer intrinsic as well.

LGTM - please can you update the avx-intrinsics-fast-isel.ll/avx2-intrinsics-fast-isel.ll cases to match the *-builtins.c as well (either now or if/when you add the intrinsics to autoupgrade).

This revision is now accepted and ready to land. Sep 15 2017, 11:24 AM

Closed by commit rL313418: [X86] Use native shuffle vector for the perm2f128 intrinsics (authored by ctopper). · Explain Why · Sep 15 2017, 4:02 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

cfe/

trunk/

lib/

CodeGen/

CGBuiltin.cpp

39 lines

test/

CodeGen/

avx-builtins.c

6 lines

avx2-builtins.c

4 lines

Diff 115518

cfe/trunk/lib/CodeGen/CGBuiltin.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,917 Lines • ▼ Show 20 Lines	case X86::BI__builtin_ia32_palignr512_mask: {

// If this isn't a masked builtin, just return the align operation.		// If this isn't a masked builtin, just return the align operation.
if (Ops.size() == 3)		if (Ops.size() == 3)
return Align;		return Align;

return EmitX86Select(*this, Ops[4], Align, Ops[3]);		return EmitX86Select(*this, Ops[4], Align, Ops[3]);
}		}

		case X86::BI__builtin_ia32_vperm2f128_pd256:
		case X86::BI__builtin_ia32_vperm2f128_ps256:
		case X86::BI__builtin_ia32_vperm2f128_si256:
		case X86::BI__builtin_ia32_permti256: {
		unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
		unsigned NumElts = Ops[0]->getType()->getVectorNumElements();

		// This takes a very simple approach since there are two lanes and a
		// shuffle can have 2 inputs. So we reserve the first input for the first
		// lane and the second input for the second lane. This may result in
		// duplicate sources, but this can be dealt with in the backend.

		Value *OutOps[2];
		uint32_t Indices[8];
		for (unsigned l = 0; l != 2; ++l) {
		// Determine the source for this lane.
		if (Imm & (1 << ((l * 4) + 3)))
		OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
		else if (Imm & (1 << ((l * 4) + 1)))
		OutOps[l] = Ops[1];
		else
		OutOps[l] = Ops[0];

		for (unsigned i = 0; i != NumElts/2; ++i) {
		// Start with ith element of the source for this lane.
		unsigned Idx = (l * NumElts) + i;
		// If bit 0 of the immediate half is set, switch to the high half of
		// the source.
		if (Imm & (1 << (l * 4)))
		Idx += NumElts/2;
		Indices[(l * (NumElts/2)) + i] = Idx;
		}
		}

		return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
		makeArrayRef(Indices, NumElts),
		"vperm");
		}

case X86::BI__builtin_ia32_movnti:		case X86::BI__builtin_ia32_movnti:
case X86::BI__builtin_ia32_movnti64:		case X86::BI__builtin_ia32_movnti64:
case X86::BI__builtin_ia32_movntsd:		case X86::BI__builtin_ia32_movntsd:
case X86::BI__builtin_ia32_movntss: {		case X86::BI__builtin_ia32_movntss: {
llvm::MDNode *Node = llvm::MDNode::get(		llvm::MDNode *Node = llvm::MDNode::get(
getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));		getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));

Value *Ptr = Ops[0];		Value *Ptr = Ops[0];
▲ Show 20 Lines • Show All 1,648 Lines • Show Last 20 Lines

cfe/trunk/test/CodeGen/avx-builtins.c

	Show First 20 Lines • Show All 672 Lines • ▼ Show 20 Lines
	__m256 test_mm256_permute_ps(__m256 A) {			__m256 test_mm256_permute_ps(__m256 A) {
	// CHECK-LABEL: test_mm256_permute_ps			// CHECK-LABEL: test_mm256_permute_ps
	// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>			// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
	return _mm256_permute_ps(A, 0x1b);			return _mm256_permute_ps(A, 0x1b);
	}			}

	__m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) {			__m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) {
	// CHECK-LABEL: test_mm256_permute2f128_pd			// CHECK-LABEL: test_mm256_permute2f128_pd
	// CHECK: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %{{.}}, <4 x double> %{{.}}, i8 49)			// CHECK: shufflevector <4 x double> %{{.}}, <4 x double> %{{.}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
	return _mm256_permute2f128_pd(A, B, 0x31);			return _mm256_permute2f128_pd(A, B, 0x31);
	}			}

	__m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) {			__m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) {
	// CHECK-LABEL: test_mm256_permute2f128_ps			// CHECK-LABEL: test_mm256_permute2f128_ps
	// CHECK: call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %{{.}}, <8 x float> %{{.}}, i8 19)			// CHECK: shufflevector <8 x float> %{{.}}, <8 x float> %{{.}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
	return _mm256_permute2f128_ps(A, B, 0x13);			return _mm256_permute2f128_ps(A, B, 0x13);
	}			}

	__m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) {			__m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) {
	// CHECK-LABEL: test_mm256_permute2f128_si256			// CHECK-LABEL: test_mm256_permute2f128_si256
	// CHECK: call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %{{.}}, <8 x i32> %{{.}}, i8 32)			// CHECK: shufflevector <8 x i32> %{{.}}, <8 x i32> %{{.}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
	return _mm256_permute2f128_si256(A, B, 0x20);			return _mm256_permute2f128_si256(A, B, 0x20);
	}			}

	__m128d test_mm_permutevar_pd(__m128d A, __m128i B) {			__m128d test_mm_permutevar_pd(__m128d A, __m128i B) {
	// CHECK-LABEL: test_mm_permutevar_pd			// CHECK-LABEL: test_mm_permutevar_pd
	// CHECK: call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %{{.}}, <2 x i64> %{{.}})			// CHECK: call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %{{.}}, <2 x i64> %{{.}})
	return _mm_permutevar_pd(A, B);			return _mm_permutevar_pd(A, B);
	}			}
	▲ Show 20 Lines • Show All 776 Lines • Show Last 20 Lines

cfe/trunk/test/CodeGen/avx2-builtins.c

	Show First 20 Lines • Show All 901 Lines • ▼ Show 20 Lines
	__m256i test_mm256_packs_epu32(__m256i a, __m256i b) {			__m256i test_mm256_packs_epu32(__m256i a, __m256i b) {
	// CHECK-LABEL: test_mm256_packs_epu32			// CHECK-LABEL: test_mm256_packs_epu32
	// CHECK: call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %{{.}}, <8 x i32> %{{.}})			// CHECK: call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %{{.}}, <8 x i32> %{{.}})
	return _mm256_packus_epi32(a, b);			return _mm256_packus_epi32(a, b);
	}			}

	__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {			__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
	// CHECK-LABEL: test_mm256_permute2x128_si256			// CHECK-LABEL: test_mm256_permute2x128_si256
	// CHECK: call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %{{.}}, <4 x i64> %{{.}}, i8 49)			// CHECK: shufflevector <4 x i64> zeroinitializer, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
	return _mm256_permute2x128_si256(a, b, 0x31);			return _mm256_permute2x128_si256(a, b, 0x38);
	}			}

	__m256i test_mm256_permute4x64_epi64(__m256i a) {			__m256i test_mm256_permute4x64_epi64(__m256i a) {
	// CHECK-LABEL: test_mm256_permute4x64_epi64			// CHECK-LABEL: test_mm256_permute4x64_epi64
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <4 x i32> <i32 3, i32 0, i32 2, i32 0>			// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
	return _mm256_permute4x64_epi64(a, 35);			return _mm256_permute4x64_epi64(a, 35);
	}			}

	▲ Show 20 Lines • Show All 323 Lines • Show Last 20 Lines