diff --git a/clang/lib/Basic/Targets/AMDGPU.h b/clang/lib/Basic/Targets/AMDGPU.h --- a/clang/lib/Basic/Targets/AMDGPU.h +++ b/clang/lib/Basic/Targets/AMDGPU.h @@ -115,6 +115,9 @@ return getTriple().getArch() == llvm::Triple::amdgcn ? 64 : 32; } + bool hasBFloat16Type() const override { return isAMDGCN(getTriple()); } + const char *getBFloat16Mangling() const override { return "u6__bf16"; }; + const char *getClobbers() const override { return ""; } ArrayRef getGCCRegNames() const override; diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -365,6 +365,12 @@ !isAMDGCN(Triple)); UseAddrSpaceMapMangling = true; + if (isAMDGCN(Triple)) { + // __bf16 is always available as a load/store only type on AMDGCN. + BFloat16Width = BFloat16Align = 16; + BFloat16Format = &llvm::APFloat::BFloat(); + } + HasLegalHalfType = true; HasFloat16 = true; WavefrontSize = GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32 ? 32 : 64; diff --git a/clang/test/CodeGenCUDA/amdgpu-bf16.cu b/clang/test/CodeGenCUDA/amdgpu-bf16.cu new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/amdgpu-bf16.cu @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// REQUIRES: x86-registered-target + +// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "amdgcn-amd-amdhsa" \ +// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -emit-llvm -o - %s | FileCheck %s + +#include "Inputs/cuda.h" + +// CHECK-LABEL: @_Z8test_argPu6__bf16u6__bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[BF16:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// CHECK-NEXT: [[BF16_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BF16]] to ptr +// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[IN:%.*]], ptr [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store bfloat [[TMP0]], ptr [[BF16_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[BF16_ASCAST]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_arg(__bf16 *out, __bf16 in) { + __bf16 bf16 = in; + *out = bf16; +} + +// CHECK-LABEL: @_Z9test_loadPu6__bf16S_( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[BF16:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// CHECK-NEXT: [[BF16_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BF16]] to ptr +// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[IN:%.*]], ptr [[IN_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IN_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[TMP0]], align 2 +// CHECK-NEXT: store bfloat [[TMP1]], ptr [[BF16_ASCAST]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[BF16_ASCAST]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP2]], ptr [[TMP3]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_load(__bf16 *out, __bf16 *in) { + __bf16 bf16 = *in; + *out = bf16; +} + +// CHECK-LABEL: @_Z8test_retu6__bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// CHECK-NEXT: store bfloat [[IN:%.*]], ptr [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: ret bfloat [[TMP0]] +// +__device__ __bf16 test_ret( __bf16 in) { + return in; +} + +// CHECK-LABEL: @_Z9test_callu6__bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// CHECK-NEXT: store bfloat [[IN:%.*]], ptr [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[CALL:%.*]] = call contract noundef bfloat @_Z8test_retu6__bf16(bfloat noundef [[TMP0]]) #[[ATTR1:[0-9]+]] +// CHECK-NEXT: ret bfloat [[CALL]] +// +__device__ __bf16 test_call( __bf16 in) { + return test_ret(in); +} + + +// CHECK-LABEL: @_Z15test_vec_assignv( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VEC2_A:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) +// CHECK-NEXT: [[VEC2_B:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) +// CHECK-NEXT: [[VEC4_A:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) +// CHECK-NEXT: [[VEC4_B:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) +// CHECK-NEXT: [[VEC8_A:%.*]] = alloca <8 x bfloat>, align 16, addrspace(5) +// CHECK-NEXT: [[VEC8_B:%.*]] = alloca <8 x bfloat>, align 16, addrspace(5) +// CHECK-NEXT: [[VEC16_A:%.*]] = alloca <16 x bfloat>, align 32, addrspace(5) +// CHECK-NEXT: [[VEC16_B:%.*]] = alloca <16 x bfloat>, align 32, addrspace(5) +// CHECK-NEXT: [[VEC2_A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC2_A]] to ptr +// CHECK-NEXT: [[VEC2_B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC2_B]] to ptr +// CHECK-NEXT: [[VEC4_A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4_A]] to ptr +// CHECK-NEXT: [[VEC4_B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4_B]] to ptr +// CHECK-NEXT: [[VEC8_A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC8_A]] to ptr +// CHECK-NEXT: [[VEC8_B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC8_B]] to ptr +// CHECK-NEXT: [[VEC16_A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC16_A]] to ptr +// CHECK-NEXT: [[VEC16_B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC16_B]] to ptr +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x bfloat>, ptr [[VEC2_B_ASCAST]], align 4 +// CHECK-NEXT: store <2 x bfloat> [[TMP0]], ptr [[VEC2_A_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr [[VEC4_B_ASCAST]], align 8 +// CHECK-NEXT: store <4 x bfloat> [[TMP1]], ptr [[VEC4_A_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x bfloat>, ptr [[VEC8_B_ASCAST]], align 16 +// CHECK-NEXT: store <8 x bfloat> [[TMP2]], ptr [[VEC8_A_ASCAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x bfloat>, ptr [[VEC16_B_ASCAST]], align 32 +// CHECK-NEXT: store <16 x bfloat> [[TMP3]], ptr [[VEC16_A_ASCAST]], align 32 +// CHECK-NEXT: ret void +// +__device__ void test_vec_assign() { + typedef __attribute__((ext_vector_type(2))) __bf16 bf16_x2; + bf16_x2 vec2_a, vec2_b; + vec2_a = vec2_b; + + typedef __attribute__((ext_vector_type(4))) __bf16 bf16_x4; + bf16_x4 vec4_a, vec4_b; + vec4_a = vec4_b; + + typedef __attribute__((ext_vector_type(8))) __bf16 bf16_x8; + bf16_x8 vec8_a, vec8_b; + vec8_a = vec8_b; + + typedef __attribute__((ext_vector_type(16))) __bf16 bf16_x16; + bf16_x16 vec16_a, vec16_b; + vec16_a = vec16_b; +} diff --git a/clang/test/SemaCUDA/amdgpu-bf16.cu b/clang/test/SemaCUDA/amdgpu-bf16.cu new file mode 100644 --- /dev/null +++ b/clang/test/SemaCUDA/amdgpu-bf16.cu @@ -0,0 +1,99 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: x86-registered-target + +// RUN: %clang_cc1 "-triple" "x86_64-unknown-linux-gnu" "-aux-triple" "amdgcn-amd-amdhsa"\ +// RUN: "-target-cpu" "x86-64" -fsyntax-only -verify=amdgcn %s +// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "amdgcn-amd-amdhsa"\ +// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -fsyntax-only -verify=amdgcn %s + +// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "r600-unknown-unknown"\ +// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -fsyntax-only -verify=amdgcn,r600 %s + +// AMDGCN has storage-only support for bf16. R600 does not support it should error out when +// it's the main target. + +#include "Inputs/cuda.h" + +// There should be no errors on using the type itself, or when loading/storing values for amdgcn. +// r600 should error on all uses of the type. + +// r600-error@+1 {{__bf16 is not supported on this target}} +typedef __attribute__((ext_vector_type(2))) __bf16 bf16_x2; +// r600-error@+1 {{__bf16 is not supported on this target}} +typedef __attribute__((ext_vector_type(4))) __bf16 bf16_x4; +// r600-error@+1 {{__bf16 is not supported on this target}} +typedef __attribute__((ext_vector_type(8))) __bf16 bf16_x8; +// r600-error@+1 {{__bf16 is not supported on this target}} +typedef __attribute__((ext_vector_type(16))) __bf16 bf16_x16; + +// r600-error@+1 2 {{__bf16 is not supported on this target}} +__device__ void test(bool b, __bf16 *out, __bf16 in) { + __bf16 bf16 = in; // r600-error {{__bf16 is not supported on this target}} + + bf16 + bf16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__bf16')}} + bf16 - bf16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__bf16')}} + bf16 * bf16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__bf16')}} + bf16 / bf16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__bf16')}} + + __fp16 fp16; + + bf16 + fp16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__fp16')}} + fp16 + bf16; // amdgcn-error {{invalid operands to binary expression ('__fp16' and '__bf16')}} + bf16 - fp16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__fp16')}} + fp16 - bf16; // amdgcn-error {{invalid operands to binary expression ('__fp16' and '__bf16')}} + bf16 * fp16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__fp16')}} + fp16 * bf16; // amdgcn-error {{invalid operands to binary expression ('__fp16' and '__bf16')}} + bf16 / fp16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__fp16')}} + fp16 / bf16; // amdgcn-error {{invalid operands to binary expression ('__fp16' and '__bf16')}} + bf16 = fp16; // amdgcn-error {{assigning to '__bf16' from incompatible type '__fp16'}} + fp16 = bf16; // amdgcn-error {{assigning to '__fp16' from incompatible type '__bf16'}} + bf16 + (b ? fp16 : bf16); // amdgcn-error {{incompatible operand types ('__fp16' and '__bf16')}} + *out = bf16; + + // amdgcn-error@+1 {{static_cast from '__bf16' to 'unsigned short' is not allowed}} + unsigned short u16bf16 = static_cast(bf16); + // amdgcn-error@+2 {{C-style cast from 'unsigned short' to '__bf16' is not allowed}} + // r600-error@+1 {{__bf16 is not supported on this target}} + bf16 = (__bf16)u16bf16; + + // amdgcn-error@+1 {{static_cast from '__bf16' to 'float' is not allowed}} + float f32bf16 = static_cast(bf16); + // amdgcn-error@+2 {{C-style cast from 'float' to '__bf16' is not allowed}} + // r600-error@+1 {{__bf16 is not supported on this target}} + bf16 = (__bf16)f32bf16; + + // amdgcn-error@+1 {{static_cast from '__bf16' to 'double' is not allowed}} + double f64bf16 = static_cast(bf16); + // amdgcn-error@+2 {{C-style cast from 'double' to '__bf16' is not allowed}} + // r600-error@+1 {{__bf16 is not supported on this target}} + bf16 = (__bf16)f64bf16; + + // r600-error@+1 {{__bf16 is not supported on this target}} + typedef __attribute__((ext_vector_type(2))) __bf16 bf16_x2; + bf16_x2 vec2_a, vec2_b; + vec2_a = vec2_b; + + // r600-error@+1 {{__bf16 is not supported on this target}} + typedef __attribute__((ext_vector_type(4))) __bf16 bf16_x4; + bf16_x4 vec4_a, vec4_b; + vec4_a = vec4_b; + + // r600-error@+1 {{__bf16 is not supported on this target}} + typedef __attribute__((ext_vector_type(8))) __bf16 bf16_x8; + bf16_x8 vec8_a, vec8_b; + vec8_a = vec8_b; + + // r600-error@+1 {{__bf16 is not supported on this target}} + typedef __attribute__((ext_vector_type(16))) __bf16 bf16_x16; + bf16_x16 vec16_a, vec16_b; + vec16_a = vec16_b; +} + +// r600-error@+1 2 {{__bf16 is not supported on this target}} +__bf16 hostfn(__bf16 a) { + return a; +} + +// r600-error@+2 {{__bf16 is not supported on this target}} +// r600-error@+1 {{vector size not an integral multiple of component size}} +typedef __bf16 foo __attribute__((__vector_size__(16), __aligned__(16))); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -17,6 +17,8 @@ // Calling convention for SI def CC_SI_Gfx : CallingConv<[ + CCIfType<[v2bf16], CCBitConvertToType>, + // 0-3 are reserved for the stack buffer descriptor // 30-31 are reserved for the return address // 32 is reserved for the stack pointer @@ -42,7 +44,7 @@ def RetCC_SI_Gfx : CallingConv<[ CCIfType<[i1], CCPromoteToType>, CCIfType<[i1, i16], CCIfExtend>>, - + CCIfType<[v2bf16], CCBitConvertToType>, CCIfNotInReg; def CC_SI_SHADER : CallingConv<[ + CCIfType<[v2bf16], CCBitConvertToType>, CCIfInReg>>, + CCIfType<[v2bf16], CCBitConvertToType>, CCIfType<[i32, i16] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, @@ -183,6 +187,7 @@ CCIfByVal>, CCIfType<[i1], CCPromoteToType>, CCIfType<[i8, i16], CCIfExtend>>, + CCIfType<[v2bf16], CCBitConvertToType>, CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, @@ -195,6 +200,7 @@ def RetCC_AMDGPU_Func : CallingConv<[ CCIfType<[i1], CCPromoteToType>, CCIfType<[i1, i16], CCIfExtend>>, + CCIfType<[v2bf16], CCBitConvertToType>, CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -163,6 +163,7 @@ Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); @@ -178,6 +179,7 @@ setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); @@ -272,6 +274,7 @@ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + setTruncStoreAction(MVT::f32, MVT::bf16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand); @@ -280,6 +283,7 @@ setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand); setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand); + setTruncStoreAction(MVT::f64, MVT::bf16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -163,6 +163,9 @@ SDValue lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const; SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; SDValue performUCharToFloatCombine(SDNode *N, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -475,6 +475,9 @@ setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); + setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Custom); + setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Custom); + if (Subtarget->has16BitInsts()) { setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT}, @@ -846,8 +849,11 @@ EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); if (Size == 16) { - if (Subtarget->has16BitInsts()) - return VT.isInteger() ? MVT::v2i16 : MVT::v2f16; + if (Subtarget->has16BitInsts()) { + if (VT.isInteger()) + return MVT::v2i16; + return (ScalarVT == MVT::bf16 ? MVT::v2bf16 : MVT::v2f16); + } return VT.isInteger() ? MVT::i32 : MVT::f32; } @@ -900,7 +906,10 @@ // support, but unless we can properly handle 3-vectors, it will be still be // inconsistent. if (Size == 16 && Subtarget->has16BitInsts()) { - RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; + if (VT.isInteger()) + RegisterVT = MVT::v2i16; + else + RegisterVT = (ScalarVT == MVT::bf16 ? MVT::v2bf16 : MVT::v2f16); IntermediateVT = RegisterVT; NumIntermediates = (NumElts + 1) / 2; return NumIntermediates; @@ -4780,6 +4789,10 @@ return lowerXMUL_LOHI(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::BF16_TO_FP: + return lowerBF16_TO_FP(Op, DAG); + case ISD::FP_TO_BF16: + return lowerFP_TO_BF16(Op, DAG); } return SDValue(); } @@ -5131,6 +5144,10 @@ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); return; } + case ISD::FP_TO_BF16: + if (SDValue LoweredNode = lowerFP_TO_BF16(SDValue(N, 0), DAG)) + Results.push_back(LoweredNode); + return; default: break; } @@ -5531,6 +5548,35 @@ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } +SDValue SITargetLowering::lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const { + // bf16 is illegal and gets softened to i16 for storage, with float being used + // for arithmetic. + // + // After softening, fp_to_bf16 can be emitted, but with a i16 VT instead. + // This lowers them into (i16 (bitconvert (f32 (fptrunc x)))) + if (Op.getValueType() != MVT::i16) + return SDValue(); + + SDLoc SL(Op); + return DAG.getNode( + ISD::BITCAST, SL, MVT::i16, + DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Op->getOperand(0), + DAG.getIntPtrConstant(0, SL, /*isTarget=*/true))); +} + +SDValue SITargetLowering::lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const { + // Same as lowerFP_TO_BF16 + // Lowers (bf16_to_fp (i16 x)) to (f32 (fp_extend (f16(bitconvert x)))) + if (!Op.getValueType().isFloatingPoint() || + Op->getOperand(0).getValueType() != MVT::i16) + return SDValue(); + + SDLoc SL(Op); + return DAG.getNode( + ISD::FP_EXTEND, SL, MVT::f32, + DAG.getNode(ISD::BITCAST, SL, MVT::f16, Op->getOperand(0))); +} + SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const { if (Subtarget->hasApertureRegs()) { diff --git a/llvm/test/CodeGen/AMDGPU/bf16-ops.ll b/llvm/test/CodeGen/AMDGPU/bf16-ops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bf16-ops.ll @@ -0,0 +1,32 @@ +; RUN: not llc < %s -march=amdgcn -mcpu=hawaii +; RUN: not llc < %s -march=amdgcn -mcpu=tonga +; RUN: not llc < %s -march=amdgcn -mcpu=gfx900 +; RUN: not llc < %s -march=amdgcn -mcpu=gfx1010 + +; TODO: Add GlobalISel tests, currently it silently miscompiles as GISel does not handle BF16 at all. + +; We only have storage-only BF16 support so check codegen fails if we attempt to do operations on bfloats. + +define void @test_fneg(bfloat %a, ptr addrspace(1) %out) { + %result = fneg bfloat %a + store bfloat %result, ptr addrspace(1) %out + ret void +} + +define void @test_fabs(bfloat %a, ptr addrspace(1) %out) { + %result = fabs bfloat %a + store bfloat %result, ptr addrspace(1) %out + ret void +} + +define void @test_add(bfloat %a, bfloat %b, ptr addrspace(1) %out) { + %result = fadd bfloat %a, %b + store bfloat %result, ptr addrspace(1) %out + ret void +} + +define void @test_mul(bfloat %a, bfloat %b, ptr addrspace(1) %out) { + %result = fmul bfloat %a, %b + store bfloat %result, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -0,0 +1,2998 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -check-prefixes=GCN +; RUN: llc < %s -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefixes=GFX7 +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX8 +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX9 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10 + +; We only have storage-only BF16 support. We can load/store those values as we treat them as u16, but +; we don't support operations on them. As such, codegen is expected to fail for any operation other +; than simple load/stores. + +define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_load_store: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_load_store: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_load_store: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load bfloat, ptr addrspace(1) %in + store bfloat %val, ptr addrspace(1) %out + ret void +} + +define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_load_store_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_load_store_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_load_store_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load <2 x bfloat>, ptr addrspace(1) %in + store <2 x bfloat> %val, ptr addrspace(1) %out + ret void +} + +define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_load_store_v4bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_load_store_v4bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_load_store_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load <4 x bfloat>, ptr addrspace(1) %in + store <4 x bfloat> %val, ptr addrspace(1) %out + ret void +} + +define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_load_store_v8bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_load_store_v8bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_load_store_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store_v8bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load <8 x bfloat>, ptr addrspace(1) %in + store <8 x bfloat> %val, ptr addrspace(1) %out + ret void +} + +define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_load_store_v16bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_load_store_v16bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_load_store_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store_v16bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load <16 x bfloat>, ptr addrspace(1) %in + store <16 x bfloat> %val, ptr addrspace(1) %out + ret void +} + +define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_arg_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: flat_store_short v[1:2], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: global_store_short v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_arg_store: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: global_store_short v[1:2], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store bfloat %in, ptr addrspace(1) %out + ret void +} + +define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_arg_store_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_dword v[1:2], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_arg_store_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store <2 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) { +; GCN-LABEL: test_arg_store_v3bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store_v3bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store_v3bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_store_short v[2:3], v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store_v3bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_arg_store_v3bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out + ret void +} + +define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_arg_store_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v6 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store_v4bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store_v4bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_arg_store_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store <4 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_arg_store_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GCN-NEXT: v_or_b32_e32 v3, v6, v2 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v10, v7 +; GCN-NEXT: v_or_b32_e32 v0, v0, v11 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store_v8bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store_v8bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_arg_store_v8bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store <8 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_arg_store_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v18, v15 +; GCN-NEXT: v_or_b32_e32 v0, v0, v19 +; GCN-NEXT: v_or_b32_e32 v7, v14, v20 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store_v16bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v15 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v14 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v13 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v13, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v12, v7, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v11 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_or_b32_e32 v11, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_or_b32_e32 v10, v7, v0 +; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store_v16bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_arg_store_v16bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store <16 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_inreg_arg_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s39, 0xf000 +; GCN-NEXT: s_mov_b32 s38, 0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, s4 +; GCN-NEXT: s_mov_b32 s36, s38 +; GCN-NEXT: s_mov_b32 s37, s38 +; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_inreg_arg_store: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s4 +; GFX7-NEXT: s_mov_b32 s38, 0 +; GFX7-NEXT: s_mov_b32 s39, 0xf000 +; GFX7-NEXT: s_mov_b32 s36, s38 +; GFX7-NEXT: s_mov_b32 s37, s38 +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_inreg_arg_store: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, s4 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_inreg_arg_store: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, s4 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_inreg_arg_store: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, s4 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store bfloat %in, ptr addrspace(1) %out + ret void +} + +define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { +; GCN-LABEL: test_byval: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_byval: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_byval: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GFX8-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_byval: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GFX9-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_byval: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GFX10-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store bfloat %val, ptr addrspace(5) %bv + %retval = load bfloat, ptr addrspace(5) %bv + ret bfloat %retval +} + +define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { +; GCN-LABEL: test_sret: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_sret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_sret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_sret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_sret: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store bfloat %val, ptr addrspace(5) %sret + ret void +} + +define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_bitcast_from_bfloat: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_bitcast_from_bfloat: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_bitcast_from_bfloat: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_bitcast_from_bfloat: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_bitcast_from_bfloat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load bfloat, ptr addrspace(1) %in + %val_int = bitcast bfloat %val to i16 + store i16 %val_int, ptr addrspace(1) %out + ret void +} + +define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN-LABEL: test_bitcast_to_bfloat: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_bitcast_to_bfloat: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_bitcast_to_bfloat: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v2, v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_bitcast_to_bfloat: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_bitcast_to_bfloat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_ushort v2, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load i16, ptr addrspace(1) %in + %val_fp = bitcast i16 %val to bfloat + store bfloat %val_fp, ptr addrspace(1) %out + ret void +} + +define bfloat @test_ret(bfloat %in) { +; GCN-LABEL: test_ret: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ret: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_ret: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + ret bfloat %in +} + +define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) { +; GCN-LABEL: test_ret_v2bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_ret_v2bf16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ret_v2bf16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ret_v2bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_ret_v2bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + ret <2 x bfloat> %in +} + +define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) { +; GCN-LABEL: test_ret_v3bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_ret_v3bf16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ret_v3bf16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ret_v3bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_ret_v3bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + ret <3 x bfloat> %in +} + +define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) { +; GCN-LABEL: test_ret_v4bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_ret_v4bf16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ret_v4bf16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ret_v4bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_ret_v4bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + ret <4 x bfloat> %in +} + +define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) { +; GCN-LABEL: test_ret_v8bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_ret_v8bf16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ret_v8bf16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ret_v8bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_ret_v8bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + ret <8 x bfloat> %in +} + +define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) { +; GCN-LABEL: test_ret_v16bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_ret_v16bf16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ret_v16bf16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ret_v16bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_ret_v16bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + ret <16 x bfloat> %in +} + +define void @test_call(bfloat %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v2, s33, 2 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v2, s30, 0 +; GCN-NEXT: v_writelane_b32 v2, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v2, 1 +; GCN-NEXT: v_readlane_b32 s30, v2, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v2, 2 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_call: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: v_writelane_b32 v2, s33, 2 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v2, s30, 0 +; GFX7-NEXT: v_writelane_b32 v2, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s31, v2, 1 +; GFX7-NEXT: v_readlane_b32 s30, v2, 0 +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s33, v2, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_call: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_writelane_b32 v2, s33, 2 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v2, s30, 0 +; GFX8-NEXT: v_writelane_b32 v2, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_readlane_b32 s31, v2, 1 +; GFX8-NEXT: v_readlane_b32 s30, v2, 0 +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s33, v2, 2 +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_call: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v2, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v2, s30, 0 +; GFX9-NEXT: v_writelane_b32 v2, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_readlane_b32 s31, v2, 1 +; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s33, v2, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_call: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v2, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v2, s30, 0 +; GFX10-NEXT: v_writelane_b32 v2, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_readlane_b32 s31, v2, 1 +; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: v_readlane_b32 s33, v2, 2 +; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call bfloat @test_arg_store(bfloat %in) + store volatile bfloat %result, ptr addrspace(5) %out + ret void +} + +define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call_v2bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v3, s33, 2 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v3, s30, 0 +; GCN-NEXT: v_writelane_b32 v3, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 2, v2 +; GCN-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v3, 1 +; GCN-NEXT: v_readlane_b32 s30, v3, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v3, 2 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_call_v2bf16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: v_writelane_b32 v3, s33, 2 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v3, s30, 0 +; GFX7-NEXT: v_writelane_b32 v3, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 2, v2 +; GFX7-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s31, v3, 1 +; GFX7-NEXT: v_readlane_b32 s30, v3, 0 +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: v_readlane_b32 s33, v3, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_call_v2bf16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_writelane_b32 v2, s33, 2 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v2, s30, 0 +; GFX8-NEXT: v_writelane_b32 v2, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v2, 1 +; GFX8-NEXT: v_readlane_b32 s30, v2, 0 +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: v_readlane_b32 s33, v2, 2 +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_call_v2bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v2, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v2, s30, 0 +; GFX9-NEXT: v_writelane_b32 v2, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v2, 1 +; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: v_readlane_b32 s33, v2, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_call_v2bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v2, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v2, s30, 0 +; GFX10-NEXT: v_writelane_b32 v2, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readlane_b32 s31, v2, 1 +; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: v_readlane_b32 s33, v2, 2 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in) + store volatile <2 x bfloat> %result, ptr addrspace(5) %out + ret void +} + +define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call_v3bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v4, s33, 2 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v4, s30, 0 +; GCN-NEXT: v_writelane_b32 v4, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_short v2, v5, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v4, 1 +; GCN-NEXT: v_readlane_b32 s30, v4, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v4, 2 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_call_v3bf16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: v_writelane_b32 v4, s33, 2 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v4, s30, 0 +; GFX7-NEXT: v_writelane_b32 v4, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_readlane_b32 s31, v4, 1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v3 +; GFX7-NEXT: buffer_store_short v2, v1, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s30, v4, 0 +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: v_readlane_b32 s33, v4, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_call_v3bf16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_writelane_b32 v3, s33, 2 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v3, s30, 0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v2 +; GFX8-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v3, 1 +; GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: v_readlane_b32 s33, v3, 2 +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_call_v3bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v3, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v4 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: v_readlane_b32 s33, v3, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_call_v3bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v3, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v3, s30, 0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v4 +; GFX10-NEXT: v_writelane_b32 v3, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readlane_b32 s31, v3, 1 +; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: v_readlane_b32 s33, v3, 2 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in) + store volatile <3 x bfloat> %result, ptr addrspace(5) %out + ret void +} + +define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call_v4bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v5, s33, 2 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v5, s30, 0 +; GCN-NEXT: v_writelane_b32 v5, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 6, v4 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 4, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v4 +; GCN-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v2, v7, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v5, 1 +; GCN-NEXT: v_readlane_b32 s30, v5, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v5, 2 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_call_v4bf16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: v_writelane_b32 v5, s33, 2 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v5, s30, 0 +; GFX7-NEXT: v_writelane_b32 v5, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 6, v4 +; GFX7-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4 +; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4 +; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s31, v5, 1 +; GFX7-NEXT: v_readlane_b32 s30, v5, 0 +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: v_readlane_b32 s33, v5, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_call_v4bf16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_writelane_b32 v3, s33, 2 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v3, s30, 0 +; GFX8-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: buffer_store_short v1, v6, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v2 +; GFX8-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 +; GFX8-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v3, 1 +; GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: v_readlane_b32 s33, v3, 2 +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_call_v4bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v3, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: v_readlane_b32 s33, v3, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_call_v4bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v3, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v3, s30, 0 +; GFX10-NEXT: v_writelane_b32 v3, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readlane_b32 s31, v3, 1 +; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: v_readlane_b32 s33, v3, 2 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in) + store volatile <4 x bfloat> %result, ptr addrspace(5) %out + ret void +} + +define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call_v8bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v9, s33, 2 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v9, s30, 0 +; GCN-NEXT: v_writelane_b32 v9, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 14, v8 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 12, v8 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v8 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 8, v8 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 6, v8 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v8 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 2, v8 +; GCN-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v6, v11, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v4, v13, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v14, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v1, v16, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v9, 1 +; GCN-NEXT: v_readlane_b32 s30, v9, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v9, 2 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_call_v8bf16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: v_writelane_b32 v9, s33, 2 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v9, s30, 0 +; GFX7-NEXT: v_writelane_b32 v9, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 14, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v8 +; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v8 +; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8 +; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s31, v9, 1 +; GFX7-NEXT: v_readlane_b32 s30, v9, 0 +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: v_readlane_b32 s33, v9, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_call_v8bf16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_writelane_b32 v5, s33, 2 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v5, s30, 0 +; GFX8-NEXT: v_writelane_b32 v5, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 12, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: buffer_store_short v3, v10, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX8-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v4 +; GFX8-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX8-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v4 +; GFX8-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 +; GFX8-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v5, 1 +; GFX8-NEXT: v_readlane_b32 s30, v5, 0 +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: v_readlane_b32 s33, v5, 2 +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_call_v8bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v5, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v5, s30, 0 +; GFX9-NEXT: v_writelane_b32 v5, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v5, 1 +; GFX9-NEXT: v_readlane_b32 s30, v5, 0 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: v_readlane_b32 s33, v5, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_call_v8bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v5, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v5, s30, 0 +; GFX10-NEXT: v_writelane_b32 v5, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen offset:12 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen offset:8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readlane_b32 s31, v5, 1 +; GFX10-NEXT: v_readlane_b32 s30, v5, 0 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: v_readlane_b32 s33, v5, 2 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in) + store volatile <8 x bfloat> %result, ptr addrspace(5) %out + ret void +} + +define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call_v16bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v17, s33, 2 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v17, s30, 0 +; GCN-NEXT: v_writelane_b32 v17, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 30, v16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 28, v16 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 26, v16 +; GCN-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 22, v16 +; GCN-NEXT: buffer_store_short v14, v19, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 18, v16 +; GCN-NEXT: buffer_store_short v13, v20, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 14, v16 +; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16 +; GCN-NEXT: buffer_store_short v11, v18, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 6, v16 +; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16 +; GCN-NEXT: buffer_store_short v9, v19, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v7, v20, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v18, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v17, 1 +; GCN-NEXT: v_readlane_b32 s30, v17, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v17, 2 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_call_v16bf16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: v_writelane_b32 v17, s33, 2 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v17, s30, 0 +; GFX7-NEXT: v_writelane_b32 v17, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 30, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: buffer_store_short v14, v15, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v14, vcc, 26, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX7-NEXT: buffer_store_short v13, v14, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v13, vcc, 24, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: buffer_store_short v12, v13, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v12, vcc, 22, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: buffer_store_short v11, v12, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v11, vcc, 20, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: buffer_store_short v10, v11, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 18, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: buffer_store_short v9, v10, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 16, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: buffer_store_short v8, v9, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 14, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v16 +; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v16 +; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16 +; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s31, v17, 1 +; GFX7-NEXT: v_readlane_b32 s30, v17, 0 +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: v_readlane_b32 s33, v17, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_call_v16bf16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_writelane_b32 v9, s33, 2 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v9, s30, 0 +; GFX8-NEXT: v_writelane_b32 v9, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 28, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX8-NEXT: buffer_store_short v7, v18, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX8-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GFX8-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX8-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 12, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX8-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX8-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX8-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 30, v8 +; GFX8-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 26, v8 +; GFX8-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 22, v8 +; GFX8-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 18, v8 +; GFX8-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v8 +; GFX8-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX8-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v8 +; GFX8-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v8 +; GFX8-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v9, 1 +; GFX8-NEXT: v_readlane_b32 s30, v9, 0 +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: v_readlane_b32 s33, v9, 2 +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_call_v16bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v9, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v9, s30, 0 +; GFX9-NEXT: v_writelane_b32 v9, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen offset:28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v6, v8, s[0:3], 0 offen offset:24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v5, v8, s[0:3], 0 offen offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v4, v8, s[0:3], 0 offen offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v3, v8, s[0:3], 0 offen offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v2, v8, s[0:3], 0 offen offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v9, 1 +; GFX9-NEXT: v_readlane_b32 s30, v9, 0 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: v_readlane_b32 s33, v9, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_call_v16bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v9, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v9, s30, 0 +; GFX10-NEXT: v_writelane_b32 v9, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen offset:28 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v6, v8, s[0:3], 0 offen offset:24 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v5, v8, s[0:3], 0 offen offset:20 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v4, v8, s[0:3], 0 offen offset:16 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v3, v8, s[0:3], 0 offen offset:12 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v2, v8, s[0:3], 0 offen offset:8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readlane_b32 s31, v9, 1 +; GFX10-NEXT: v_readlane_b32 s30, v9, 0 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: v_readlane_b32 s33, v9, 2 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in) + store volatile <16 x bfloat> %result, ptr addrspace(5) %out + ret void +} + +define bfloat @test_alloca_load_store_ret(bfloat %in) { +; GCN-LABEL: test_alloca_load_store_ret: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_alloca_load_store_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_alloca_load_store_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_alloca_load_store_ret: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_alloca_load_store_ret: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +entry: + %in.addr = alloca bfloat, align 2, addrspace(5) + store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2 + %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2 + ret bfloat %loaded +} + +define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { +; GCN-LABEL: test_overflow_stack: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0 +; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x60, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x5c, v0 +; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 +; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 +; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0 +; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 +; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v24, vcc, 52, v0 +; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v23, vcc, 48, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 +; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; GCN-NEXT: buffer_store_dword v21, v30, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 +; GCN-NEXT: buffer_store_dword v20, v26, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 20, v0 +; GCN-NEXT: buffer_store_dword v19, v29, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v19, vcc, 16, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 12, v0 +; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 +; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_overflow_stack: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x74, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; GFX7-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; GFX7-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; GFX7-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; GFX7-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; GFX7-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; GFX7-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; GFX7-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; GFX7-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; GFX7-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; GFX7-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; GFX7-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; GFX7-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; GFX7-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; GFX7-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; GFX7-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; GFX7-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 +; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_overflow_stack: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x7c, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x78, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x74, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; GFX8-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; GFX8-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; GFX8-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; GFX8-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; GFX8-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; GFX8-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; GFX8-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; GFX8-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; GFX8-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; GFX8-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; GFX8-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; GFX8-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 +; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_overflow_stack: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_overflow_stack: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 +; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 +; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 +; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 +; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 +; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 +; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 +; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 +; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:116 +; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %ins.0 = insertvalue { <32 x i32>, bfloat } undef, <32 x i32> %b, 0 + %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 + ret { <32 x i32>, bfloat } %ins.1 +}