This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Allow vectorization of packed types
ClosedPublic

Authored by arsenm on May 1 2017, 4:11 PM.

Download Raw Diff

Details

Reviewers

kzhuravl
cfang

Diff Detail

Event Timeline

arsenm created this revision. May 1 2017, 4:11 PM

Herald added subscribers: t-tye, tpr, dstuttard and 4 others. · View Herald Transcript · May 1 2017, 4:11 PM

arsenm added a parent revision: D32714: SLPVectorizer: Clamp slp-min-reg-size to target maximum. May 1 2017, 4:11 PM

arsenm added a child revision: D32730: LV: Don't insert runtime ptr checks on divergent targets. May 1 2017, 6:53 PM

Add a few more tests

Add missing test file

Use new TTI hook

ping

LGTM.

This revision is now accepted and ready to land. Jun 20 2017, 12:26 PM

cfang accepted this revision. Jun 20 2017, 1:14 PM

cfang added inline comments.

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
261	The interleaving was disabled based on the SHOC DeviceMemory readLocalMemory test. We requested CQE to do a complete performance measurement around this, and the results were very positive. The major reason to disable it was a concern about register usage. I remember that I re-measured DeviceMemory performance later, when the new waitcnt insertion was introduced, and it turned out that it does not matter for DeviceMemory readLocalMemory if we enable it! Not sure about the other tests that CQE found beneficial when it is disabled.

r305844

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

AMDGPUTargetTransformInfo.h

2 lines

AMDGPUTargetTransformInfo.cpp

22 lines

test/

Transforms/

LoopVectorize/

AMDGPU/

packed-math.ll

34 lines

SLPVectorizer/

AMDGPU/

packed-math.ll

195 lines

simplebb.ll

Diff 100010

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Show First 20 Lines • Show All 69 Lines • ▼ Show 20 Lines	public:

void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);		void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);

TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {		TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");		assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
return TTI::PSK_FastHardware;		return TTI::PSK_FastHardware;
}		}

		unsigned getHardwareNumberOfRegisters(bool Vector);
unsigned getNumberOfRegisters(bool Vector);		unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);		unsigned getRegisterBitWidth(bool Vector);
		unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;		unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const;		unsigned AddrSpace) const;
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const;		unsigned AddrSpace) const;
Show All 37 Lines

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Show First 20 Lines • Show All 178 Lines • ▼ Show 20 Lines	for (const Instruction &I : *BB) {
DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"		DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
<< L << " due to " << GEP << '\n');		<< L << " due to " << GEP << '\n');
if (UP.Threshold >= MaxBoost)		if (UP.Threshold >= MaxBoost)
return;		return;
}		}
}		}
}		}

unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {		unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) {
if (Vec)		// The concept of vector registers doesn't really exist. Some packed vector
return 0;		// operations operate on the normal 32-bit registers.

// Number of VGPRs on SI.		// Number of VGPRs on SI.
if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)		if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return 256;		return 256;

return 4 * 128; // XXX - 4 channels. Should these count as vector instead?		return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}		}

		unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
		// This is really the number of registers to fill when vectorizing /
		// interleaving loops, so we lie to avoid trying to use all registers.
		return getHardwareNumberOfRegisters(Vec) >> 3;
		}

unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {		unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
return Vector ? 0 : 32;		return 32;
		}

		unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
		return 32;
}		}

unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {		unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
AMDGPUAS AS = ST->getAMDGPUAS();		AMDGPUAS AS = ST->getAMDGPUAS();
if (AddrSpace == AS.GLOBAL_ADDRESS ||		if (AddrSpace == AS.GLOBAL_ADDRESS ||
AddrSpace == AS.CONSTANT_ADDRESS ||		AddrSpace == AS.CONSTANT_ADDRESS ||
AddrSpace == AS.FLAT_ADDRESS)		AddrSpace == AS.FLAT_ADDRESS)
return 128;		return 128;
Show All 34 Lines
bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,		bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);		return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}		}

unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {		unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Disable unrolling if the loop is not vectorized.		// Disable unrolling if the loop is not vectorized.
		// TODO: Enable this again.
if (VF == 1)		if (VF == 1)
		cfangUnsubmitted Not Done Reply Inline Actions The interleaving was disabled based on the SHOC DeviceMemory readLocalMemory test. We requested CQE to do a complete performance measurement around this, and the results were very positive. The major reason to disable it was a concern about register usage. I remember that I re-measured DeviceMemory performance later, when the new waitcnt insertion was introduced, and it turned out that it does not matter for DeviceMemory readLocalMemory if we enable it! Not sure about the other tests that CQE found beneficial when it is disabled. cfang: The interleaving was disabled based on the SHOC DeviceMemory readLocalMemory test. We requested CQE…
return 1;		return 1;

// Semi-arbitrary large amount.		return 8;
return 64;
}		}

int AMDGPUTTIImpl::getArithmeticInstrCost(		int AMDGPUTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,		unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,		TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {		TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {

EVT OrigTy = TLI->getValueType(DL, Ty);		EVT OrigTy = TLI->getValueType(DL, Ty);
▲ Show 20 Lines • Show All 251 Lines • Show Last 20 Lines

test/Transforms/LoopVectorize/AMDGPU/packed-math.ll

This file was added.

				; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
				; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s
				; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s

				; GCN-LABEL: @vectorize_v2f16_loop(
				; GFX9: vector.body:
				; GFX9: phi <2 x half>
				; GFX9: load <2 x half>
				; GFX9: fadd fast <2 x half>

				; GFX9: middle.block:
				; GFX9: fadd fast <2 x half>

				; VI: phi half
				; VI: phi load half
				; VI: fadd fast half
				define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) {
				entry:
				br label %for.body

				for.body:
				%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
				%q.04 = phi half [ 0.0, %entry ], [ %add, %for.body ]
				%arrayidx = getelementptr inbounds half, half addrspace(1)* %s, i64 %indvars.iv
				%0 = load half, half addrspace(1)* %arrayidx, align 2
				%add = fadd fast half %q.04, %0
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond = icmp eq i64 %indvars.iv.next, 256
				br i1 %exitcond, label %for.end, label %for.body

				for.end:
				%add.lcssa = phi half [ %add, %for.body ]
				ret half %add.lcssa
				}

test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll

This file was added.

				; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
				; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s

				; FIXME: Should still like to vectorize the memory operations for VI

				; Simple 3-pair chain with loads and stores
				; GCN-LABEL: @test1_as_3_3_3_v2f16(
				; GFX9: load <2 x half>, <2 x half> addrspace(3)*
				; GFX9: load <2 x half>, <2 x half> addrspace(3)*
				; GFX9: fmul <2 x half>
				; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
				; GFX9: ret

				; VI: load half
				; VI: load half
				define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
				%i0 = load half, half addrspace(3)* %a, align 2
				%i1 = load half, half addrspace(3)* %b, align 2
				%mul = fmul half %i0, %i1
				%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
				%i3 = load half, half addrspace(3)* %arrayidx3, align 2
				%arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
				%i4 = load half, half addrspace(3)* %arrayidx4, align 2
				%mul5 = fmul half %i3, %i4
				store half %mul, half addrspace(3)* %c, align 2
				%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
				store half %mul5, half addrspace(3)* %arrayidx5, align 2
				ret void
				}

				; GCN-LABEL: @test1_as_3_0_0(
				; GFX9: load <2 x half>, <2 x half> addrspace(3)*
				; GFX9: load <2 x half>, <2 x half>*
				; GFX9: fmul <2 x half>
				; GFX9: store <2 x half> %{{.*}}, <2 x half>* %
				; GFX9: ret

				; VI: load half
				; VI: load half
				define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
				%i0 = load half, half addrspace(3)* %a, align 2
				%i1 = load half, half* %b, align 2
				%mul = fmul half %i0, %i1
				%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
				%i3 = load half, half addrspace(3)* %arrayidx3, align 2
				%arrayidx4 = getelementptr inbounds half, half* %b, i64 1
				%i4 = load half, half* %arrayidx4, align 2
				%mul5 = fmul half %i3, %i4
				store half %mul, half* %c, align 2
				%arrayidx5 = getelementptr inbounds half, half* %c, i64 1
				store half %mul5, half* %arrayidx5, align 2
				ret void
				}

				; GCN-LABEL: @test1_as_0_0_3_v2f16(
				; GFX9: load <2 x half>, <2 x half>*
				; GFX9: load <2 x half>, <2 x half>*
				; GFX9: fmul <2 x half>
				; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
				; GFX9: ret

				; VI: load half
				; VI: load half
				define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
				%i0 = load half, half* %a, align 2
				%i1 = load half, half* %b, align 2
				%mul = fmul half %i0, %i1
				%arrayidx3 = getelementptr inbounds half, half* %a, i64 1
				%i3 = load half, half* %arrayidx3, align 2
				%arrayidx4 = getelementptr inbounds half, half* %b, i64 1
				%i4 = load half, half* %arrayidx4, align 2
				%mul5 = fmul half %i3, %i4
				store half %mul, half addrspace(3)* %c, align 2
				%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
				store half %mul5, half addrspace(3)* %arrayidx5, align 2
				ret void
				}

				; GCN-LABEL: @test1_fma_v2f16(
				; GFX9: load <2 x half>
				; GFX9: load <2 x half>
				; GFX9: load <2 x half>
				; GFX9: call <2 x half> @llvm.fma.v2f16(
				; GFX9: store <2 x half>
				define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
				%i0 = load half, half addrspace(3)* %a, align 2
				%i1 = load half, half addrspace(3)* %b, align 2
				%i2 = load half, half addrspace(3)* %c, align 2
				%fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
				%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
				%i3 = load half, half addrspace(3)* %arrayidx3, align 2
				%arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
				%i4 = load half, half addrspace(3)* %arrayidx4, align 2
				%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
				%i5 = load half, half addrspace(3)* %arrayidx5, align 2
				%fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
				store half %fma0, half addrspace(3)* %d, align 2
				%arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
				store half %fma1, half addrspace(3)* %arrayidx6, align 2
				ret void
				}

				; GCN-LABEL: @mul_scalar_v2f16(
				; GFX9: load <2 x half>
				; GFX9: fmul <2 x half>
				; GFX9: store <2 x half>
				define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
				%i0 = load half, half addrspace(3)* %a, align 2
				%mul = fmul half %i0, %scalar
				%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
				%i3 = load half, half addrspace(3)* %arrayidx3, align 2
				%mul5 = fmul half %i3, %scalar
				store half %mul, half addrspace(3)* %c, align 2
				%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
				store half %mul5, half addrspace(3)* %arrayidx5, align 2
				ret void
				}

				; GCN-LABEL: @fabs_v2f16
				; GFX9: load <2 x half>
				; GFX9: call <2 x half> @llvm.fabs.v2f16(
				; GFX9: store <2 x half>
				define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
				%i0 = load half, half addrspace(3)* %a, align 2
				%fabs0 = call half @llvm.fabs.f16(half %i0)
				%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
				%i3 = load half, half addrspace(3)* %arrayidx3, align 2
				%fabs1 = call half @llvm.fabs.f16(half %i3)
				store half %fabs0, half addrspace(3)* %c, align 2
				%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
				store half %fabs1, half addrspace(3)* %arrayidx5, align 2
				ret void
				}

				; GCN-LABEL: @test1_fabs_fma_v2f16(
				; GFX9: load <2 x half>
				; GFX9: call <2 x half> @llvm.fabs.v2f16(
				; GFX9: call <2 x half> @llvm.fma.v2f16(
				; GFX9: store <2 x half>
				define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
				%i0 = load half, half addrspace(3)* %a, align 2
				%i1 = load half, half addrspace(3)* %b, align 2
				%i2 = load half, half addrspace(3)* %c, align 2
				%i0.fabs = call half @llvm.fabs.f16(half %i0)

				%fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
				%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
				%i3 = load half, half addrspace(3)* %arrayidx3, align 2
				%arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
				%i4 = load half, half addrspace(3)* %arrayidx4, align 2
				%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
				%i5 = load half, half addrspace(3)* %arrayidx5, align 2
				%i3.fabs = call half @llvm.fabs.f16(half %i3)

				%fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
				store half %fma0, half addrspace(3)* %d, align 2
				%arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
				store half %fma1, half addrspace(3)* %arrayidx6, align 2
				ret void
				}

				; FIXME: Should do vector load and extract component for fabs
				; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
				; GFX9: load half
				; GFX9: call half @llvm.fabs.f16(
				; GFX9: load <2 x half>
				; GFX9: load half
				; GFX9: load <2 x half>
				; GFX9: call <2 x half> @llvm.fma.v2f16(
				; GFX9: store <2 x half>
				define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
				%i0 = load half, half addrspace(3)* %a, align 2
				%i1 = load half, half addrspace(3)* %b, align 2
				%i2 = load half, half addrspace(3)* %c, align 2
				%i1.fabs = call half @llvm.fabs.f16(half %i1)

				%fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
				%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
				%i3 = load half, half addrspace(3)* %arrayidx3, align 2
				%arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
				%i4 = load half, half addrspace(3)* %arrayidx4, align 2
				%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
				%i5 = load half, half addrspace(3)* %arrayidx5, align 2
				%fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
				store half %fma0, half addrspace(3)* %d, align 2
				%arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
				store half %fma1, half addrspace(3)* %arrayidx6, align 2
				ret void
				}

				declare half @llvm.fabs.f16(half) #1
				declare half @llvm.fma.f16(half, half, half) #1

				attributes #0 = { nounwind }
				attributes #1 = { nounwind readnone }

test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll

This file was deleted.

	; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s
	; XFAIL: *
	;
	; FIXME: If this test expects to be vectorized, the TTI must indicate that the target
	; has vector registers of the expected width.
	; Currently, it says there are 8 vector registers that are 32-bits wide.

	target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"


	; Simple 3-pair chain with loads and stores
	define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
	; CHECK-LABEL: @test1_as_3_3_3(
	; CHECK: load <2 x double>, <2 x double> addrspace(3)*
	; CHECK: load <2 x double>, <2 x double> addrspace(3)*
	; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* %
	; CHECK: ret
	%i0 = load double, double addrspace(3)* %a, align 8
	%i1 = load double, double addrspace(3)* %b, align 8
	%mul = fmul double %i0, %i1
	%arrayidx3 = getelementptr inbounds double, double addrspace(3)* %a, i64 1
	%i3 = load double, double addrspace(3)* %arrayidx3, align 8
	%arrayidx4 = getelementptr inbounds double, double addrspace(3)* %b, i64 1
	%i4 = load double, double addrspace(3)* %arrayidx4, align 8
	%mul5 = fmul double %i3, %i4
	store double %mul, double addrspace(3)* %c, align 8
	%arrayidx5 = getelementptr inbounds double, double addrspace(3)* %c, i64 1
	store double %mul5, double addrspace(3)* %arrayidx5, align 8
	ret void
	}

	define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
	; CHECK-LABEL: @test1_as_3_0_0(
	; CHECK: load <2 x double>, <2 x double> addrspace(3)*
	; CHECK: load <2 x double>, <2 x double>*
	; CHECK: store <2 x double> %{{.*}}, <2 x double>* %
	; CHECK: ret
	%i0 = load double, double addrspace(3)* %a, align 8
	%i1 = load double, double* %b, align 8
	%mul = fmul double %i0, %i1
	%arrayidx3 = getelementptr inbounds double, double addrspace(3)* %a, i64 1
	%i3 = load double, double addrspace(3)* %arrayidx3, align 8
	%arrayidx4 = getelementptr inbounds double, double* %b, i64 1
	%i4 = load double, double* %arrayidx4, align 8
	%mul5 = fmul double %i3, %i4
	store double %mul, double* %c, align 8
	%arrayidx5 = getelementptr inbounds double, double* %c, i64 1
	store double %mul5, double* %arrayidx5, align 8
	ret void
	}

	define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
	; CHECK-LABEL: @test1_as_0_0_3(
	; CHECK: load <2 x double>, <2 x double>*
	; CHECK: load <2 x double>, <2 x double>*
	; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* %
	; CHECK: ret
	%i0 = load double, double* %a, align 8
	%i1 = load double, double* %b, align 8
	%mul = fmul double %i0, %i1
	%arrayidx3 = getelementptr inbounds double, double* %a, i64 1
	%i3 = load double, double* %arrayidx3, align 8
	%arrayidx4 = getelementptr inbounds double, double* %b, i64 1
	%i4 = load double, double* %arrayidx4, align 8
	%mul5 = fmul double %i3, %i4
	store double %mul, double addrspace(3)* %c, align 8
	%arrayidx5 = getelementptr inbounds double, double addrspace(3)* %c, i64 1
	store double %mul5, double addrspace(3)* %arrayidx5, align 8
	ret void
	}

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Allow vectorization of packed types · Closed · Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 100010

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

test/Transforms/LoopVectorize/AMDGPU/packed-math.ll

test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll

test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll

AMDGPU: Allow vectorization of packed types
ClosedPublic