This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Stop wasting argument registers with v3i32/v3f32
ClosedPublic

Authored by arsenm on Jul 9 2018, 1:45 AM.

Download Raw Diff

Details

Reviewers

tstellar
kzhuravl
rampitec
mareko

Summary

Since v4i32/v4f32 are legal, SelectionDAGBuilder promotes v3i32/v3f32
arguments to v4i32/v4f32, which consumes an additional register.
In addition to wasting argument space, this produces extra
instructions, since the 4th vector component now appears to hold
a meaningful value to most combines.

Diff Detail

Event Timeline

arsenm created this revision.Jul 9 2018, 1:45 AM

Herald added subscribers: t-tye, tpr, dstuttard and 3 others. · View Herald TranscriptJul 9 2018, 1:45 AM

Typo in summary: v4i32/v4f32 -> v3i32/v3f32.

In D49065#1155767, @tstellar wrote:

Typo in summary: v4i32/v4f32 -> v3i32/v3f32.

I think it's right but I can reword it to be less ambiguous sounding

Can you please add tests for <3 x i64> and <3 x double>?

mareko added inline comments.Jul 9 2018, 6:33 PM

test/CodeGen/AMDGPU/ret.ll
165 ↗	(On Diff #154548)	This can't be changed. <3 x i32> is a valid function argument type meaning 3 input VGPRs. There is no wasted space. It declares exactly 3 VGPRs. The VGPR indices are hardcoded in the hardware and can't be adjusted.

arsenm mentioned this in D49128: AMDGPU: Properly handle shader inputs with split arguments.Jul 11 2018, 11:24 PM

Update for D49128

arsenm added a parent revision: D49128: AMDGPU: Properly handle shader inputs with split arguments.Jul 12 2018, 12:36 AM

arsenm added a child revision: D49254: AMDGPU: Scalarize vector argument types to calls.Jul 12 2018, 10:36 AM

This patch along with the prerequisite patch doesn't break Mesa.

In D49065#1161120, @mareko wrote:

This patch along with the prerequisite patch doesn't break Mesa.

What's the prerequisite patch?

LGTM

This revision is now accepted and ready to land.Jul 13 2018, 9:46 AM

In D49065#1161236, @michel.daenzer wrote:

In D49065#1161120, @mareko wrote:

This patch along with the prerequisite patch doesn't break Mesa.

What's the prerequisite patch?

D49128

I can confirm Mesa now works with this patch; however, it still breaks 9 lit tests for me:

LLVM :: CodeGen/AMDGPU/fceil.ll
LLVM :: CodeGen/AMDGPU/fmaxnum.ll
LLVM :: CodeGen/AMDGPU/fpext.ll
LLVM :: CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
LLVM :: CodeGen/AMDGPU/insert_vector_elt.ll
LLVM :: CodeGen/AMDGPU/kernel-args.ll
LLVM :: CodeGen/AMDGPU/max.ll
LLVM :: CodeGen/AMDGPU/store-global.ll
LLVM :: CodeGen/AMDGPU/store-private.ll

r338197

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIISelLowering.h

11 lines

SIISelLowering.cpp

41 lines

test/

CodeGen/

AMDGPU/

call-argument-types.ll

64 lines

22 lines

11 lines

39 lines

39 lines

Diff 155124

lib/Target/AMDGPU/SIISelLowering.h

	Show All 19 Lines
	#include "SIInstrInfo.h"			#include "SIInstrInfo.h"

	namespace llvm {			namespace llvm {

	class SITargetLowering final : public AMDGPUTargetLowering {			class SITargetLowering final : public AMDGPUTargetLowering {
	private:			private:
	const SISubtarget *Subtarget;			const SISubtarget *Subtarget;

				public:
				MVT getRegisterTypeForCallingConv(LLVMContext &Context,
				EVT VT) const override;
				unsigned getNumRegistersForCallingConv(LLVMContext &Context,
				EVT VT) const override;

				unsigned getVectorTypeBreakdownForCallingConv(
				LLVMContext &Context, EVT VT, EVT &IntermediateVT,
				unsigned &NumIntermediates, MVT &RegisterVT) const override;

				private:
	SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,			SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
	SDValue Chain, uint64_t Offset) const;			SDValue Chain, uint64_t Offset) const;
	SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;			SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
	SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,			SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
	const SDLoc &SL, SDValue Chain,			const SDLoc &SL, SDValue Chain,
	uint64_t Offset, unsigned Align, bool Signed,			uint64_t Offset, unsigned Align, bool Signed,
	const ISD::InputArg *Arg = nullptr) const;			const ISD::InputArg *Arg = nullptr) const;

	▲ Show 20 Lines • Show All 281 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 687 Lines • ▼ Show 20 Lines
	}			}

	bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {			bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
	// SI has some legal vector types, but no legal vector operations. Say no			// SI has some legal vector types, but no legal vector operations. Say no
	// shuffles are legal in order to prefer scalarizing some vector operations.			// shuffles are legal in order to prefer scalarizing some vector operations.
	return false;			return false;
	}			}

				// Pick the register type used to pass a value of type VT.
				//
				// A 3-element vector whose elements are 32 bits wide is passed using its
				// scalar element type as the register type. Every other type is handled
				// by the generic TargetLowering implementation.
				MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
				                                                    EVT VT) const {
				  // Only 3-element vectors get special treatment here.
				  if (!VT.isVector() || VT.getVectorNumElements() != 3)
				    return TargetLowering::getRegisterTypeForCallingConv(Context, VT);

				  const EVT EltVT = VT.getScalarType();
				  if (EltVT.getSizeInBits() != 32)
				    return TargetLowering::getRegisterTypeForCallingConv(Context, VT);

				  // Pass each element in a register of its own scalar type.
				  return EltVT.getSimpleVT();
				}

				// Report how many registers are needed to pass a value of type VT.
				//
				// A 3-element vector with 32-bit elements uses exactly three registers, one
				// per element. Every other type is handled by the generic TargetLowering
				// implementation.
				unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
				                                                         EVT VT) const {
				  const bool IsThreeElementVector =
				      VT.isVector() && VT.getVectorNumElements() == 3;

				  // getScalarType() is only queried once VT is known to be a 3-vector.
				  if (IsThreeElementVector && VT.getScalarType().getSizeInBits() == 32)
				    return 3;

				  return TargetLowering::getNumRegistersForCallingConv(Context, VT);
				}

				// Describe how a vector type VT is split into pieces for argument passing.
				//
				// For a 3-element vector whose elements are 32 or 64 bits wide, the vector
				// is broken into three intermediates of the scalar element type, and that
				// same scalar type is reported as the register type. All other types are
				// handled by the generic TargetLowering implementation.
				//
				// Outputs (written only in the 3-element case; otherwise left to the base
				// implementation):
				//   IntermediateVT   - the scalar element type.
				//   NumIntermediates - 3.
				//   RegisterVT       - the scalar element type as a simple MVT.
				// Returns NumIntermediates in the 3-element case, otherwise whatever the
				// base implementation returns.
				//
				// NOTE(review): this accepts 64-bit elements, whereas
				// getRegisterTypeForCallingConv and getNumRegistersForCallingConv only
				// special-case 32-bit elements -- confirm the difference is intended.
				// NOTE(review): VT.isVector() is not checked here; presumably callers only
				// pass vector types -- verify against callers.
				unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
				    LLVMContext &Context, EVT VT, EVT &IntermediateVT,
				    unsigned &NumIntermediates, MVT &RegisterVT) const {

				  if (VT.getVectorNumElements() == 3) {
				    EVT ScalarVT = VT.getScalarType();
				    const unsigned ScalarBits = ScalarVT.getSizeInBits();
				    if (ScalarBits == 32 || ScalarBits == 64) {
				      RegisterVT = ScalarVT.getSimpleVT();
				      IntermediateVT = RegisterVT;
				      NumIntermediates = 3;
				      return NumIntermediates;
				    }
				  }

				  return TargetLowering::getVectorTypeBreakdownForCallingConv(
				      Context, VT, IntermediateVT, NumIntermediates, RegisterVT);
				}

	bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,			bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	const CallInst &CI,			const CallInst &CI,
	MachineFunction &MF,			MachineFunction &MF,
	unsigned IntrID) const {			unsigned IntrID) const {
	if (const AMDGPU::RsrcIntrinsic *RsrcIntr =			if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
	AMDGPU::lookupRsrcIntrinsic(IntrID)) {			AMDGPU::lookupRsrcIntrinsic(IntrID)) {
	AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),			AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
	(Intrinsic::ID)IntrID);			(Intrinsic::ID)IntrID);
	▲ Show 20 Lines • Show All 7,655 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/call-argument-types.ll

	Show All 11 Lines
	declare void @external_void_func_i8_zeroext(i8 zeroext) #0			declare void @external_void_func_i8_zeroext(i8 zeroext) #0

	declare void @external_void_func_i16(i16) #0			declare void @external_void_func_i16(i16) #0
	declare void @external_void_func_i16_signext(i16 signext) #0			declare void @external_void_func_i16_signext(i16 signext) #0
	declare void @external_void_func_i16_zeroext(i16 zeroext) #0			declare void @external_void_func_i16_zeroext(i16 zeroext) #0

	declare void @external_void_func_i32(i32) #0			declare void @external_void_func_i32(i32) #0
	declare void @external_void_func_i64(i64) #0			declare void @external_void_func_i64(i64) #0
				declare void @external_void_func_v2i64(<2 x i64>) #0
				declare void @external_void_func_v3i64(<3 x i64>) #0
				declare void @external_void_func_v4i64(<4 x i64>) #0

	declare void @external_void_func_f16(half) #0			declare void @external_void_func_f16(half) #0
	declare void @external_void_func_f32(float) #0			declare void @external_void_func_f32(float) #0
	declare void @external_void_func_f64(double) #0			declare void @external_void_func_f64(double) #0

	declare void @external_void_func_v2i16(<2 x i16>) #0			declare void @external_void_func_v2i16(<2 x i16>) #0
	declare void @external_void_func_v2f16(<2 x half>) #0			declare void @external_void_func_v2f16(<2 x half>) #0

	declare void @external_void_func_v2i32(<2 x i32>) #0			declare void @external_void_func_v2i32(<2 x i32>) #0
	declare void @external_void_func_v3i32(<3 x i32>) #0			declare void @external_void_func_v3i32(<3 x i32>) #0
				declare void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
	declare void @external_void_func_v4i32(<4 x i32>) #0			declare void @external_void_func_v4i32(<4 x i32>) #0
	declare void @external_void_func_v8i32(<8 x i32>) #0			declare void @external_void_func_v8i32(<8 x i32>) #0
	declare void @external_void_func_v16i32(<16 x i32>) #0			declare void @external_void_func_v16i32(<16 x i32>) #0
	declare void @external_void_func_v32i32(<32 x i32>) #0			declare void @external_void_func_v32i32(<32 x i32>) #0
	declare void @external_void_func_v32i32_i32(<32 x i32>, i32) #0			declare void @external_void_func_v32i32_i32(<32 x i32>, i32) #0

	; return value and argument			; return value and argument
	declare i32 @external_i32_func_i32(i32) #0			declare i32 @external_i32_func_i32(i32) #0
	▲ Show 20 Lines • Show All 212 Lines • ▼ Show 20 Lines
	; GCN-DAG: v_mov_b32_e32 v1, [[K1]]			; GCN-DAG: v_mov_b32_e32 v1, [[K1]]
	; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}			; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
	; GCN-NEXT: s_endpgm			; GCN-NEXT: s_endpgm
	define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {			define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
	call void @external_void_func_i64(i64 123)			call void @external_void_func_i64(i64 123)
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}test_call_external_void_func_v2i64:
				; GCN: buffer_load_dwordx4 v[0:3]
				; GCN: s_waitcnt
				; GCN-NEXT: s_swappc_b64
				define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
				%val = load <2 x i64>, <2 x i64> addrspace(1)* null
				call void @external_void_func_v2i64(<2 x i64> %val)
				ret void
				}

				; GCN-LABEL: {{^}}test_call_external_void_func_v3i64:
				; GCN: buffer_load_dwordx4 v[0:3]
				; GCN: v_mov_b32_e32 v4, s
				; GCN: v_mov_b32_e32 v5, s
				; GCN: s_waitcnt
				; GCN-NEXT: s_swappc_b64
				define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
				%load = load <2 x i64>, <2 x i64> addrspace(1)* null
				%val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>

				call void @external_void_func_v3i64(<3 x i64> %val)
				ret void
				}

				; FIXME: Immediates should fold directly into v_mov_b32s
				; GCN-LABEL: {{^}}test_call_external_void_func_v4i64:
				; GCN: buffer_load_dwordx4 v[0:3]
				; GCN: v_mov_b32_e32 v4, s
				; GCN: v_mov_b32_e32 v5, s
				; GCN: v_mov_b32_e32 v6, s
				; GCN: v_mov_b32_e32 v7, s

				; GCN: s_waitcnt
				; GCN-NEXT: s_swappc_b64
				define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
				%load = load <2 x i64>, <2 x i64> addrspace(1)* null
				%val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				call void @external_void_func_v4i64(<4 x i64> %val)
				ret void
				}

	; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm:			; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm:
	; VI: v_mov_b32_e32 v0, 0x4400			; VI: v_mov_b32_e32 v0, 0x4400
	; CI: v_mov_b32_e32 v0, 4.0			; CI: v_mov_b32_e32 v0, 4.0
	; GCN-NOT: v0			; GCN-NOT: v0
	; GCN: s_swappc_b64			; GCN: s_swappc_b64
	define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {			define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
	call void @external_void_func_f16(half 4.0)			call void @external_void_func_f16(half 4.0)
	ret void			ret void
	▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	; GCN: s_waitcnt			; GCN: s_waitcnt
	; GCN-NEXT: s_swappc_b64			; GCN-NEXT: s_swappc_b64
	define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {			define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
	%val = load <2 x i32>, <2 x i32> addrspace(1)* undef			%val = load <2 x i32>, <2 x i32> addrspace(1)* undef
	call void @external_void_func_v2i32(<2 x i32> %val)			call void @external_void_func_v2i32(<2 x i32> %val)
	ret void			ret void
	}			}

	; FIXME: Passing 4th
	; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:			; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
	; HSA-DAG: s_mov_b32 s33, s9			; HSA-DAG: s_mov_b32 s33, s9
	; MESA-DAG: s_mov_b32 s33, s3{{$}}			; MESA-DAG: s_mov_b32 s33, s3{{$}}

	; GCN-DAG: v_mov_b32_e32 v0			; GCN-DAG: v_mov_b32_e32 v0, 3
	; GCN-DAG: v_mov_b32_e32 v1			; GCN-DAG: v_mov_b32_e32 v1, 4
	; GCN-DAG: v_mov_b32_e32 v2			; GCN-DAG: v_mov_b32_e32 v2, 5
	; GCN-DAG: v_mov_b32_e32 v3			; GCN-NOT: v3

	; GCN: s_swappc_b64			; GCN: s_swappc_b64
	define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {			define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
	call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)			call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_i32:
				; GCN-DAG: v_mov_b32_e32 v0, 3
				; GCN-DAG: v_mov_b32_e32 v1, 4
				; GCN-DAG: v_mov_b32_e32 v2, 5
				; GCN-DAG: v_mov_b32_e32 v3, 6
				define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
				call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
				ret void
				}

	; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:			; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
	; GCN: buffer_load_dwordx4 v[0:3]			; GCN: buffer_load_dwordx4 v[0:3]
	; GCN: s_waitcnt			; GCN: s_waitcnt
	; GCN-NEXT: s_swappc_b64			; GCN-NEXT: s_swappc_b64
	define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {			define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
	%val = load <4 x i32>, <4 x i32> addrspace(1)* undef			%val = load <4 x i32>, <4 x i32> addrspace(1)* undef
	call void @external_void_func_v4i32(<4 x i32> %val)			call void @external_void_func_v4i32(<4 x i32> %val)
	ret void			ret void
	▲ Show 20 Lines • Show All 188 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/fmaxnum.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s

	declare float @llvm.maxnum.f32(float, float) #0			declare float @llvm.maxnum.f32(float, float) #0
	declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0			declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
				declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #0
	declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0			declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
	declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0			declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0
	declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0			declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0

	declare double @llvm.maxnum.f64(double, double)			declare double @llvm.maxnum.f64(double, double)

	; FUNC-LABEL: @test_fmax_f32			; FUNC-LABEL: @test_fmax_f32
	; SI: v_max_f32_e32			; SI: v_max_f32_e32
	Show All 14 Lines
	; EG: MAX_DX10 {{.*}}[[OUT]]			; EG: MAX_DX10 {{.*}}[[OUT]]
	; EG: MAX_DX10 {{.*}}[[OUT]]			; EG: MAX_DX10 {{.*}}[[OUT]]
	define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {			define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
	%val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0			%val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
	store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8			store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
	ret void			ret void
	}			}

				; FUNC-LABEL: {{^}}test_fmax_v3f32:
				; SI: v_max_f32_e32
				; SI: v_max_f32_e32
				; SI: v_max_f32_e32
				; SI-NOT: v_max_f32
				define amdgpu_kernel void @test_fmax_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b) nounwind {
				%val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0
				store <3 x float> %val, <3 x float> addrspace(1)* %out, align 16
				ret void
				}

	; FUNC-LABEL: @test_fmax_v4f32			; FUNC-LABEL: @test_fmax_v4f32
	; SI: v_max_f32_e32			; SI: v_max_f32_e32
	; SI: v_max_f32_e32			; SI: v_max_f32_e32
	; SI: v_max_f32_e32			; SI: v_max_f32_e32
	; SI: v_max_f32_e32			; SI: v_max_f32_e32

	; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]			; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
	; EG: MAX_DX10 {{.*}}[[OUT]]			; EG: MAX_DX10 {{.*}}[[OUT]]
	▲ Show 20 Lines • Show All 231 Lines • ▼ Show 20 Lines
	; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]			; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
	; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}			; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
	define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {			define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
	%val = call float @llvm.maxnum.f32(float 99.0, float %a) #0			%val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
	store float %val, float addrspace(1)* %out, align 4			store float %val, float addrspace(1)* %out, align 4
	ret void			ret void
	}			}

				; FUNC-LABEL: {{^}}test_func_fmax_v3f32:
				; SI: v_max_f32_e32
				; SI: v_max_f32_e32
				; SI: v_max_f32_e32
				; SI-NOT: v_max_f32
				define <3 x float> @test_func_fmax_v3f32(<3 x float> %a, <3 x float> %b) nounwind {
				%val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0
				ret <3 x float> %val
				}

	attributes #0 = { nounwind readnone }			attributes #0 = { nounwind readnone }

test/CodeGen/AMDGPU/fminnum.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s
	; RUN: llc -march=r600 -mcpu=cypress < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s			; RUN: llc -march=r600 -mcpu=cypress < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s

	declare float @llvm.minnum.f32(float, float) #0			declare float @llvm.minnum.f32(float, float) #0
	declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0			declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
				declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #0
	declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0			declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0
	declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0			declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0
	declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0			declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0

	; FUNC-LABEL: @test_fmin_f32			; FUNC-LABEL: @test_fmin_f32
	; SI: v_min_f32_e32			; SI: v_min_f32_e32

	; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]			; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
	▲ Show 20 Lines • Show All 258 Lines • ▼ Show 20 Lines
	; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]			; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
	; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}			; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
	define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {			define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
	%val = call float @llvm.minnum.f32(float 99.0, float %a) #0			%val = call float @llvm.minnum.f32(float 99.0, float %a) #0
	store float %val, float addrspace(1)* %out, align 4			store float %val, float addrspace(1)* %out, align 4
	ret void			ret void
	}			}

				; FUNC-LABEL: {{^}}test_func_fmin_v3f32:
				; SI: v_min_f32_e32
				; SI: v_min_f32_e32
				; SI: v_min_f32_e32
				; SI-NOT: v_min_f32
				define <3 x float> @test_func_fmin_v3f32(<3 x float> %a, <3 x float> %b) nounwind {
				%val = call <3 x float> @llvm.minnum.v3f32(<3 x float> %a, <3 x float> %b) #0
				ret <3 x float> %val
				}

	attributes #0 = { nounwind readnone }			attributes #0 = { nounwind readnone }

test/CodeGen/AMDGPU/function-args.ll

	Show First 20 Lines • Show All 733 Lines • ▼ Show 20 Lines
	; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:128{{$}}			; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:128{{$}}
	define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {			define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
	store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef			store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
	store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef			store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef
	store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef			store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef
	ret void			ret void
	}			}

				; Make sure VGPR v3 isn't a wasted register because of 3-element vector types being promoted to 4-element vectors
				; GCN-LABEL: {{^}}void_func_v3f32_wasted_reg:
				; GCN: s_waitcnt
				; GCN: ds_write_b32 v{{[0-9]+}}, v0
				; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
				; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
				; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
				; GCN-NEXT: s_waitcnt
				; GCN-NEXT: s_setpc_b64
				define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
				%arg0.0 = extractelement <3 x float> %arg0, i32 0
				%arg0.1 = extractelement <3 x float> %arg0, i32 1
				%arg0.2 = extractelement <3 x float> %arg0, i32 2
				store volatile float %arg0.0, float addrspace(3)* undef
				store volatile float %arg0.1, float addrspace(3)* undef
				store volatile float %arg0.2, float addrspace(3)* undef
				store volatile i32 %arg1, i32 addrspace(3)* undef
				ret void
				}

				; GCN-LABEL: {{^}}void_func_v3i32_wasted_reg:
				; GCN: s_waitcnt
				; GCN: ds_write_b32 v{{[0-9]+}}, v0
				; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
				; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
				; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
				; GCN-NEXT: s_waitcnt
				; GCN-NEXT: s_setpc_b64
				define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
				%arg0.0 = extractelement <3 x i32> %arg0, i32 0
				%arg0.1 = extractelement <3 x i32> %arg0, i32 1
				%arg0.2 = extractelement <3 x i32> %arg0, i32 2
				store volatile i32 %arg0.0, i32 addrspace(3)* undef
				store volatile i32 %arg0.1, i32 addrspace(3)* undef
				store volatile i32 %arg0.2, i32 addrspace(3)* undef
				store volatile i32 %arg1, i32 addrspace(3)* undef
				ret void
				}

	; Check there is no crash.			; Check there is no crash.
	; GCN-LABEL: {{^}}void_func_v16i8:			; GCN-LABEL: {{^}}void_func_v16i8:
	define void @void_func_v16i8(<16 x i8> %arg0) #0 {			define void @void_func_v16i8(<16 x i8> %arg0) #0 {
	store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef			store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef
	ret void			ret void
	}			}

	; Check there is no crash.			; Check there is no crash.
	; GCN-LABEL: {{^}}void_func_v32i32_v16i8:			; GCN-LABEL: {{^}}void_func_v32i32_v16i8:
	define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {			define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
	store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef			store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
	store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef			store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef
	ret void			ret void
	}			}

	attributes #0 = { nounwind }			attributes #0 = { nounwind }

test/CodeGen/AMDGPU/function-returns.ll

	Show First 20 Lines • Show All 525 Lines • ▼ Show 20 Lines
	; GFX9: s_waitcnt vmcnt(0)			; GFX9: s_waitcnt vmcnt(0)
	; GFX9-NEXT: s_setpc_b64			; GFX9-NEXT: s_setpc_b64
	define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {			define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
	%ptr = load volatile { i32, <32 x i32> } addrspace(1), { i32, <32 x i32> } addrspace(1) addrspace(4)* undef			%ptr = load volatile { i32, <32 x i32> } addrspace(1), { i32, <32 x i32> } addrspace(1) addrspace(4)* undef
	%val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr			%val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr
	ret { i32, <32 x i32> }%val			ret { i32, <32 x i32> }%val
	}			}

				; Make sure the last struct component is returned in v3, not v4.
				; GCN-LABEL: {{^}}v3i32_struct_func_void_wasted_reg:
				; GCN: ds_read_b32 v0,
				; GCN: ds_read_b32 v1,
				; GCN: ds_read_b32 v2,
				; GCN: ds_read_b32 v3,
				define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
				%load0 = load volatile i32, i32 addrspace(3)* undef
				%load1 = load volatile i32, i32 addrspace(3)* undef
				%load2 = load volatile i32, i32 addrspace(3)* undef
				%load3 = load volatile i32, i32 addrspace(3)* undef

				%insert.0 = insertelement <3 x i32> undef, i32 %load0, i32 0
				%insert.1 = insertelement <3 x i32> %insert.0, i32 %load1, i32 1
				%insert.2 = insertelement <3 x i32> %insert.1, i32 %load2, i32 2
				%insert.3 = insertvalue { <3 x i32>, i32 } undef, <3 x i32> %insert.2, 0
				%insert.4 = insertvalue { <3 x i32>, i32 } %insert.3, i32 %load3, 1
				ret { <3 x i32>, i32 } %insert.4
				}

				; GCN-LABEL: {{^}}v3f32_struct_func_void_wasted_reg:
				; GCN: ds_read_b32 v0,
				; GCN: ds_read_b32 v1,
				; GCN: ds_read_b32 v2,
				; GCN: ds_read_b32 v3,
				define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
				%load0 = load volatile float, float addrspace(3)* undef
				%load1 = load volatile float, float addrspace(3)* undef
				%load2 = load volatile float, float addrspace(3)* undef
				%load3 = load volatile i32, i32 addrspace(3)* undef

				%insert.0 = insertelement <3 x float> undef, float %load0, i32 0
				%insert.1 = insertelement <3 x float> %insert.0, float %load1, i32 1
				%insert.2 = insertelement <3 x float> %insert.1, float %load2, i32 2
				%insert.3 = insertvalue { <3 x float>, i32 } undef, <3 x float> %insert.2, 0
				%insert.4 = insertvalue { <3 x float>, i32 } %insert.3, i32 %load3, 1
				ret { <3 x float>, i32 } %insert.4
				}

	attributes #0 = { nounwind }			attributes #0 = { nounwind }